def _get_default_options(output_type='sframe'):
    """
    Return information about the default options.

    Parameters
    ----------
    output_type : str, optional

        The output can be of the following types.

        - `sframe`: A table describing each option used in the model.
        - `json`: A list of option dictionaries.

        | Each dictionary/row in the JSON/SFrame object describes the
          following parameters of the given model.

        +------------------+-------------------------------------------------------+
        |      Name        |                  Description                          |
        +==================+=======================================================+
        | name             | Name of the option used in the model.                 |
        +------------------+-------------------------------------------------------+
        | description      | A detailed description of the option used.            |
        +------------------+-------------------------------------------------------+
        | type             | Option type.                                          |
        +------------------+-------------------------------------------------------+
        | default_value    | The default value for the option.                     |
        +------------------+-------------------------------------------------------+
        | possible_values  | List of acceptable values (CATEGORICAL only)          |
        +------------------+-------------------------------------------------------+
        | lower_bound      | Smallest acceptable value for this option (REAL only) |
        +------------------+-------------------------------------------------------+
        | upper_bound      | Largest acceptable value for this option (REAL only)  |
        +------------------+-------------------------------------------------------+

    Returns
    -------
    out : JSON/SFrame
        Each row in the output SFrame corresponds to a parameter and includes
        columns for default values, lower and upper bounds, description, and
        type.
    """

    _check_categorical_option_type('output_type', output_type,
                                   ['json', 'sframe'])
    import graphlab as _gl
    sf = _gl.SFrame({
        'name': ['model'],
        'default_value': ['auto'],
        'lower_bound': [None],
        'upper_bound': [None],
        'parameter_type': ['Model or String'],
        'possible_values': [None],
    })
    if output_type == "sframe":
        return sf
    else:
        return [row for row in sf]
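
# A minimal usage sketch (not part of the original module): it assumes GraphLab
# Create is installed and that the _get_default_options helper above is in
# scope. It shows the difference between the two output types: 'sframe' returns
# a tabular SFrame, while 'json' returns a plain list of row dictionaries.
def _demo_default_options():
    defaults_sf = _get_default_options(output_type='sframe')   # tabular view
    defaults_rows = _get_default_options(output_type='json')   # list of dicts
    for row in defaults_rows:
        # Iterating an SFrame yields one dict per row, keyed by column name.
        print(row['name'], row['default_value'])
    return defaults_sf
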
    def predict(self, dataset, output_type='class', missing_value_action='auto'):
        """
        A flexible and advanced prediction API.

        The target column is provided during
        :func:`~graphlab.boosted_trees.create`. If the target column is in the
        `dataset` it will be ignored.

        Parameters
        ----------
        dataset : SFrame
          A dataset that has the same columns that were used during training.
          If the target column exists in ``dataset`` it will be ignored
          while making predictions.

        output_type : {'probability', 'margin', 'class'}, optional
          If output_type is 'probability', predict outputs the probability of
          the positive class, in the range [0, 1]. If 'margin', it outputs the
          margin score before transforming to a probability with the logistic
          function. If 'class', it outputs the predicted class label.

        missing_value_action : str, optional
            Action to perform when missing values are encountered. Can be
            one of:

            - 'auto': By default, the model treats missing values as is.
            - 'impute': Proceed with evaluation by filling in the missing
              values with the mean of the training data. Missing
              values are also imputed if an entire column of data is
              missing during evaluation.
            - 'error': Do not proceed with evaluation and terminate with
              an error message.


        Returns
        -------
        out : SArray
           Predicted target value for each example (i.e. row) in the dataset.

        See Also
        ----------
        create, evaluate, classify

        Examples
        --------
        >>> m.predict(testdata)
        >>> m.predict(testdata, output_type='probability')
        >>> m.predict(testdata, output_type='margin')
        """
        _mt._get_metric_tracker().track('toolkit.classifier.boosted_trees_classifier.predict')

        _check_categorical_option_type('output_type', output_type, ['class', 'margin', 'probability'])
        return super(_Classifier, self).predict(dataset,
                                                output_type=output_type,
                                                missing_value_action=missing_value_action)
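
# A hedged sketch (not part of the original class) of the relationship stated
# in the docstring above: for binary classification, the 'probability' output
# is the logistic transform of the 'margin' output. `model` and `testdata` are
# assumed to be a trained binary boosted trees classifier and a test SFrame
# with the training schema.
import math

def _max_margin_probability_gap(model, testdata):
    margins = model.predict(testdata, output_type='margin')
    probs = model.predict(testdata, output_type='probability')
    # probability = 1 / (1 + exp(-margin)) for the positive class
    recovered = margins.apply(lambda m: 1.0 / (1.0 + math.exp(-m)))
    # Should be close to zero, up to floating point error.
    return max(abs(p - r) for p, r in zip(probs, recovered))
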
    def get_default_options(output_type='sframe'):
        """
        Return information about the default options.

        Parameters
        ----------
        output_type : str, optional

            The output can be of the following types.

            - `sframe`: A table describing each option used in the model.
            - `json`: A list of option dictionaries.


        Returns
        -------
        out : SFrame
            Each row in the output SFrame corresponds to a parameter and includes
            columns for default values, lower and upper bounds, description, and
            type.
        """
        _check_categorical_option_type('output_type', output_type,
                                       ['json', 'sframe'])
        out = _gl.SFrame({
            'name': [
                'features', 'excluded_features', 'output_column_prefix',
                'transform_function', 'transform_function_name'
            ],
            'default_value': ['None', 'None', 'None', 'lambda x: x', 'none'],
            'parameter_type':
            ['list[str]', 'list[func]', 'str', 'function', 'str'],
            'lower_bound': ['None', 'None', 'None', 'None', 'None'],
            'upper_bound': ['None', 'None', 'None', 'None', 'None'],
            'description': [
                'Features to include in transformation.',
                'Features to exclude from transformation.',
                'Prefix of the output column.',
                'Column transformation function.',
                'Column transformation description.'
            ]
        })
        if output_type == "sframe":
            return out
        else:
            return {
                row['name']: {
                    "default_value": row['default_value'],
                    "description": row['description'],
                    "upper_bound": row['upper_bound'],
                    "lower_bound": row['lower_bound'],
                    "parameter_type": row['parameter_type']
                }
                for row in out
            }
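
# A brief usage sketch (assumes the get_default_options helper above is
# accessible in scope): the 'json' branch returns a dict keyed by option name,
# which makes single-option lookups straightforward.
def _describe_option(name):
    opts = get_default_options(output_type='json')
    info = opts[name]
    return '%s (default: %s) -- %s' % (name,
                                       info['default_value'],
                                       info['description'])

# e.g. _describe_option('output_column_prefix')
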
    def predict(self, dataset, output_type='class'):
        """
        A flexible and advanced prediction API.

        The target column is provided during
        :func:`~graphlab.boosted_trees.create`. If the target column is in the
        `dataset` it will be ignored.

        Parameters
        ----------
        dataset : SFrame
          A dataset that has the same columns that were used during training.
          If the target column exists in ``dataset`` it will be ignored
          while making predictions.

        output_type : {'probability', 'margin', 'class'}, optional
          If output_type is 'probability', predict outputs the probability of
          the positive class, in the range [0, 1]. If 'margin', it outputs the
          margin score before transforming to a probability with the logistic
          function. If 'class', it outputs the predicted class label.

        Returns
        -------
        out : SArray
           Predicted target value for each example (i.e. row) in the dataset.

        See Also
        ----------
        create, evaluate, classify

        Examples
        --------
        >>> m.predict(testdata)
        >>> m.predict(testdata, output_type='probability')
        >>> m.predict(testdata, output_type='margin')
        """
        _mt._get_metric_tracker().track('toolkit.classifier.boosted_trees_classifier.predict')

        _check_categorical_option_type('output_type', output_type, ['class', 'margin', 'probability'])
        return super(_Classifier, self).predict(dataset,
                                                output_type=output_type)
def recall(targets, predictions, average='macro'):
    r"""
    Compute the recall score for classification tasks. The recall score
    quantifies the ability of a classifier to predict `positive` examples.
    Recall can be interpreted as the probability that a randomly selected
    `positive` example is correctly identified by the classifier. The score
    is in the range [0,1] with 0 being the worst, and 1 being perfect.


    The recall score is defined as the ratio:
        .. math::
            \frac{tp}{tp + fn}

    where `tp` is the number of true positives and `fn` the number of false
    negatives.

    Parameters
    ----------
    targets : SArray
        Ground truth class labels. The SArray can be of any type.

    predictions : SArray
        The prediction that corresponds to each target value.  This SArray must
        have the same length as ``targets`` and must be of the same type
        as the ``targets`` SArray.

    average : string, [None, 'macro' (default), 'micro']
        Metric averaging strategies for multiclass classification. Averaging
        strategies can be one of the following:

            - None: No averaging is performed and a single metric is returned
              for each class.
            - 'micro': Calculate metrics globally by counting the total true
              positives, false negatives, and false positives.
            - 'macro': Calculate metrics for each label and find their
              unweighted mean. This does not take label imbalance into account.

    Returns
    -------
    out : float (for binary classification) or dict[float]
        Score for the positive class (for binary classification) or an average
        score for each class for multi-class classification.  If
        `average=None`, then a dictionary is returned where the key is the
        class label and the value is the score for the corresponding class
        label.

    Notes
    -----
     - For binary classification, when the target label is of type "string",
       then the labels are sorted alphanumerically and the largest label is
       chosen as the "positive" label.  For example, if the classifier labels
       are {"cat", "dog"}, then "dog" is chosen as the positive label for the
       binary classification case.

    See Also
    --------
    confusion_matrix, accuracy, precision, f1_score

    Examples
    --------

    .. sourcecode:: python

        # Targets and Predictions
        >>> targets = graphlab.SArray([0, 1, 2, 3, 0, 1, 2, 3])
        >>> predictions = graphlab.SArray([1, 0, 2, 1, 3, 1, 2, 1])

        # Micro average of the recall scores for each class.
        >>> graphlab.evaluation.recall(targets, predictions,
        ...                            average = 'micro')
        0.375

        # Macro average of the recall scores for each class.
        >>> graphlab.evaluation.recall(targets, predictions,
        ...                            average = 'macro')
        0.375

        # Recall score for each class.
        >>> graphlab.evaluation.recall(targets, predictions,
        ...                            average = None)
        {0: 0.0, 1: 0.5, 2: 1.0, 3: 0.0}

    This metric also works for string classes.

    .. sourcecode:: python

        # Targets and Predictions
        >>> targets = graphlab.SArray(
        ...      ["cat", "dog", "foosa", "snake", "cat", "dog", "foosa", "snake"])
        >>> predictions = graphlab.SArray(
        ...      ["dog", "cat", "foosa", "dog", "snake", "dog", "cat", "dog"])

        # Micro average of the recall scores for each class.
        >>> graphlab.evaluation.recall(targets, predictions,
        ...                            average = 'micro')
        0.25

        # Macro average of the recall scores for each class.
        >>> graphlab.evaluation.recall(targets, predictions,
        ...                            average = 'macro')
        0.25

        # Recall score for each class.
        >>> graphlab.evaluation.recall(targets, predictions,
        ...                            average = None)
        {'cat': 0.0, 'dog': 0.5, 'foosa': 0.5, 'snake': 0.0}
    """
    _mt._get_metric_tracker().track('evaluation.recall')
    _supervised_evaluation_error_checking(targets, predictions)
    _check_categorical_option_type('average', average,
                         ['micro', 'macro', None])
    _check_same_type_not_float(targets, predictions)
    opts = {"average": average}
    return _graphlab.extensions._supervised_streaming_evaluator(targets,
                          predictions, "recall", opts)
def fbeta_score(targets, predictions, beta=1.0, average='macro'):
    r"""
    Compute the F-beta score. The F-beta score is the weighted harmonic mean of
    precision and recall. The score lies in the range [0,1] with 1 being ideal
    and 0 being the worst.

    The `beta` value is the weight given to `precision` vs. `recall` in the
    combined score. `beta=0` considers only precision; as `beta` increases, more
    weight is given to recall, with `beta > 1` favoring recall over precision.

    The F-beta score is defined as:

        .. math::
            f_{\beta} = (1 + \beta^2) \times \frac{(p \times r)}{(\beta^2 p + r)}

    Where :math:`p` is the precision and :math:`r` is the recall.

    Parameters
    ----------
    targets : SArray
        An SArray of ground truth class labels. Can be of any type except
        float.

    predictions : SArray
        The prediction that corresponds to each target value.  This SArray must
        have the same length as ``targets`` and must be of the same type
        as the ``targets`` SArray.

    beta : float
        Weight of the `precision` term in the harmonic mean.

    average : string, [None, 'macro' (default), 'micro']
        Metric averaging strategies for multiclass classification. Averaging
        strategies can be one of the following:

            - None: No averaging is performed and a single metric is returned
              for each class.
            - 'micro': Calculate metrics globally by counting the total true
              positives, false negatives and false positives.
            - 'macro': Calculate metrics for each label, and find their
              unweighted mean. This does not take label imbalance into account.

        For a more precise definition of `micro` and `macro` averaging refer
        to [1] below.

    Returns
    -------
    out : float (for binary classification) or dict[float] (for multi-class, average=None)
        Score for the positive class (for binary classification) or an average
        score for each class for multi-class classification.  If
        `average=None`, then a dictionary is returned where the key is the
        class label and the value is the score for the corresponding class
        label.

    Notes
    -----
     - For binary classification, if the target label is of type "string",
       then the labels are sorted alphanumerically and the largest label is
       chosen as the "positive" label.  For example, if the classifier labels
       are {"cat", "dog"}, then "dog" is chosen as the positive label for the
       binary classification case.


    See Also
    --------
    confusion_matrix, accuracy, precision, recall, f1_score

    Examples
    --------

    .. sourcecode:: python

        # Targets and Predictions
        >>> targets = graphlab.SArray([0, 1, 2, 3, 0, 1, 2, 3])
        >>> predictions = graphlab.SArray([1, 0, 2, 1, 3, 1, 0, 1])

        # Micro average of the F-Beta score
        >>> graphlab.evaluation.fbeta_score(targets, predictions,
        ...                                 beta=2.0, average = 'micro')
        0.25

        # Macro average of the F-Beta score
        >>> graphlab.evaluation.fbeta_score(targets, predictions,
        ...                                 beta=2.0, average = 'macro')
        0.24305555555555558

        # F-Beta score for each class.
        >>> graphlab.evaluation.fbeta_score(targets, predictions,
        ...                                 beta=2.0, average = None)
        {0: 0.0, 1: 0.4166666666666667, 2: 0.5555555555555556, 3: 0.0}

    This metric also works when the targets are of type `str`

    .. sourcecode:: python

        # Targets and Predictions
        >>> targets = graphlab.SArray(
        ...      ["cat", "dog", "foosa", "snake", "cat", "dog", "foosa", "snake"])
        >>> predictions = graphlab.SArray(
        ...      ["dog", "cat", "foosa", "dog", "snake", "dog", "cat", "dog"])

        # Micro average of the F-Beta score
        >>> graphlab.evaluation.fbeta_score(targets, predictions,
        ...                                 beta=2.0, average = 'micro')
        0.25

        # Macro average of the F-Beta score
        >>> graphlab.evaluation.fbeta_score(targets, predictions,
        ...                                 beta=2.0, average = 'macro')
        0.24305555555555558

        # F-Beta score for each class.
        >>> graphlab.evaluation.fbeta_score(targets, predictions,
        ...                                 beta=2.0, average = None)
        {'cat': 0.0, 'dog': 0.4166666666666667, 'foosa': 0.5555555555555556, 'snake': 0.0}

    References
    ----------
    - [1] Sokolova, Marina, and Guy Lapalme. "A systematic analysis of
      performance measures for classification tasks." Information Processing &
      Management 45.4 (2009): 427-437.

    """
    _mt._get_metric_tracker().track('evaluation.fbeta_score')
    _supervised_evaluation_error_checking(targets, predictions)
    _check_categorical_option_type('average', average,
                         ['micro', 'macro', None])
    _check_same_type_not_float(targets, predictions)

    opts = {"beta"    : beta,
            "average" : average}
    return _graphlab.extensions._supervised_streaming_evaluator(targets,
                          predictions, "fbeta_score", opts)
def auc(targets, predictions, average='macro'):
    r"""
    Compute the area under the ROC curve for the given targets and predictions.

    Parameters
    ----------
    targets : SArray
        An SArray containing the observed values. For binary classification,
        the alpha-numerically first category is considered the reference
        category.

    predictions : SArray
        Prediction probability that corresponds to each target value. This must
        be of same length as ``targets``.

    average : string, [None, 'macro' (default)]
        Metric averaging strategies for multiclass classification. Averaging
        strategies can be one of the following:

            - None: No averaging is performed and a single metric is returned
              for each class.
            - 'macro': Calculate metrics for each label, and find their
              unweighted mean. This does not take label imbalance into account.

    Returns
    -------
    out : float (for binary classification) or dict[float]
        Score for the positive class (for binary classification) or an average
        score for each class for multi-class classification.  If
        `average=None`, then a dictionary is returned where the key is the
        class label and the value is the score for the corresponding class
        label.

    See Also
    --------
    roc_curve, confusion_matrix

    Examples
    --------
    .. sourcecode:: python

        >>> targets = graphlab.SArray([0, 1, 1, 0])
        >>> predictions = graphlab.SArray([0.1, 0.35, 0.7, 0.99])

        # Calculate the auc-score
        >>> auc =  graphlab.evaluation.auc(targets, predictions)
        0.5

    This metric also works when the targets are strings (Here "cat" is chosen
    as the reference class).

    .. sourcecode:: python

        >>> targets = graphlab.SArray(["cat", "dog", "dog", "cat"])
        >>> predictions = graphlab.SArray([0.1, 0.35, 0.7, 0.99])

        # Calculate the auc-score
        >>> auc =  graphlab.evaluation.auc(targets, predictions)
        0.5


    For the multi-class setting, the auc-score can be averaged.

    .. sourcecode:: python

        # Targets and Predictions
        >>> targets     = graphlab.SArray([ 1, 0, 2, 1])
        >>> predictions = graphlab.SArray([[.1, .8, 0.1],
        ...                                [.9, .1, 0.0],
        ...                                [.8, .1, 0.1],
        ...                                [.3, .6, 0.1]])

        #  Macro average of the scores for each class.
        >>> graphlab.evaluation.auc(targets, predictions, average = 'macro')
        0.8888888888888888

        # Scores for each class.
        >>> graphlab.evaluation.auc(targets, predictions, average = None)
        {0: 1.0, 1: 1.0, 2: 0.6666666666666666}

    This metric also works for "string" targets in the multi-class setting

    .. sourcecode:: python

        # Targets and Predictions
        >>> targets     = graphlab.SArray([ "dog", "cat", "foosa", "dog"])
        >>> predictions = graphlab.SArray([[.1, .8, 0.1],
        ...                                [.9, .1, 0.0],
        ...                                [.8, .1, 0.1],
        ...                                [.3, .6, 0.1]])

        # Macro average.
        >>> auc =  graphlab.evaluation.auc(targets, predictions)
        0.8888888888888888

        # Score for each class.
        >>> auc =  graphlab.evaluation.auc(targets, predictions, average=None)
        {'cat': 1.0, 'dog': 1.0, 'foosa': 0.6666666666666666}

    """
    _mt._get_metric_tracker().track('evaluation.auc')
    _supervised_evaluation_error_checking(targets, predictions)
    _check_categorical_option_type('average', average,
                         ['macro', None])
    _check_prob_and_prob_vector(predictions)
    _check_target_not_float(targets)
    opts = {"average": average,
            "binary": predictions.dtype() in [int, float]}
    return _graphlab.extensions._supervised_streaming_evaluator(targets,
                      predictions, "auc", opts)
def create(dataset,
           num_topics=10,
           initial_topics=None,
           alpha=None,
           beta=.1,
           num_iterations=10,
           num_burnin=5,
           associations=None,
           verbose=False,
           print_interval=10,
           validation_set=None,
           method='auto'):
    """
    Create a topic model from the given data set. A topic model assumes each
    document is a mixture of a set of topics, where for each topic some words
    are more likely than others. This method learns such a topic model for the
    given document collection.

    Parameters
    ----------
    dataset : SArray of type dict or SFrame with a single column of type dict
        A bag of words representation of a document corpus.
        Each element is a dictionary representing a single document, where
        the keys are words and the values are the number of times that word
        occurs in that document.

    num_topics : int, optional
        The number of topics to learn.

    initial_topics : SFrame, optional
        An SFrame with a column of unique words representing the vocabulary
        and a column of dense vectors representing
        probability of that word given each topic. When provided,
        these values are used to initialize the algorithm.

    alpha : float, optional
        Hyperparameter that controls the diversity of topics in a document.
        Smaller values encourage fewer topics per document.
        Provided value must be positive. Default value is 50/num_topics.

    beta : float, optional
        Hyperparameter that controls the diversity of words in a topic.
        Smaller values encourage fewer words per topic. Provided value
        must be positive.

    num_iterations : int, optional
        The number of iterations to perform.

    num_burnin : int, optional
        The number of iterations to perform when inferring the topics for
        documents at prediction time.

    verbose : bool, optional
        When True, print most probable words for each topic while printing
        progress.

    print_interval : int, optional
        The number of iterations to wait between progress reports.

    associations : SFrame, optional
        An SFrame with two columns named "word" and "topic" containing words
        and the topic id that the word should be associated with. These words
        are not considered during learning.

    validation_set : SArray of type dict or SFrame with a single column
        A bag of words representation of a document corpus, similar to the
        format required for `dataset`. This will be used to monitor model
        performance during training. Each document in the provided validation
    set is randomly split: the first portion is used to estimate which topic
        each document belongs to, and the second portion is used to estimate
        the model's performance at predicting the unseen words in the test data.

    method : {'cgs', 'alias'}, optional
        The algorithm used for learning the model.

        - *cgs:* Collapsed Gibbs sampling
        - *alias:* AliasLDA method.

    Returns
    -------
    out : TopicModel
        A fitted topic model. This can be used with
        :py:func:`~TopicModel.get_topics()` and
        :py:func:`~TopicModel.predict()`. While fitting is in progress, several
        metrics are shown, including:

        +------------------+---------------------------------------------------+
        |      Field       | Description                                       |
        +==================+===================================================+
        | Elapsed Time     | The number of elapsed seconds.                    |
        +------------------+---------------------------------------------------+
        | Tokens/second    | The number of unique words processed per second   |
        +------------------+---------------------------------------------------+
        | Est. Perplexity  | An estimate of the model's ability to model the   |
        |                  | training data. See the documentation on evaluate. |
        +------------------+---------------------------------------------------+

    See Also
    --------
    TopicModel, TopicModel.get_topics, TopicModel.predict,
    graphlab.SArray.dict_trim_by_keys, TopicModel.evaluate

    References
    ----------
    - `Wikipedia - Latent Dirichlet allocation
      <http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation>`_

    - Alias method: Li, A. et al. (2014) `Reducing the Sampling Complexity of
      Topic Models. <http://www.sravi.org/pubs/fastlda-kdd2014.pdf>`_.
      KDD 2014.

    Examples
    --------
    The following example includes an SArray of documents, where
    each element represents a document in "bag of words" representation
    -- a dictionary with word keys and whose values are the number of times
    that word occurred in the document:

    >>> docs = graphlab.SArray('https://static.turi.com/datasets/nytimes')

    Once in this form, it is straightforward to learn a topic model.

    >>> m = graphlab.topic_model.create(docs)

    It is also easy to create a new topic model from an old one -- whether
    it was created using GraphLab Create or another package.

    >>> m2 = graphlab.topic_model.create(docs, initial_topics=m['topics'])

    To manually fix several words to always be assigned to a topic, use
    the `associations` argument. The following will ensure that topic 0
    has the most probability for each of the provided words:

    >>> from graphlab import SFrame
    >>> associations = SFrame({'word':['hurricane', 'wind', 'storm'],
                               'topic': [0, 0, 0]})
    >>> m = graphlab.topic_model.create(docs,
                                        associations=associations)

    More advanced usage allows you to control aspects of the model and the
    learning method.

    >>> import graphlab as gl
    >>> m = gl.topic_model.create(docs,
                                  num_topics=20,       # number of topics
                                  num_iterations=10,   # algorithm parameters
                                  alpha=.01, beta=.1)  # hyperparameters

    To evaluate the model's ability to generalize, we can create a train/test
    split where a portion of the words in each document are held out from
    training.

    >>> train, test = gl.text_analytics.random_split(docs, .8)
    >>> m = gl.topic_model.create(train)
    >>> results = m.evaluate(test)
    >>> print results['perplexity']

    """
    _mt._get_metric_tracker().track('toolkit.text.topic_model.create')

    dataset = _check_input(dataset)

    _check_categorical_option_type("method", method, ['auto', 'cgs', 'alias'])
    if method == 'cgs' or method == 'auto':
        model_name = 'cgs_topic_model'
    else:
        model_name = 'alias_topic_model'

    # If associations are provided, check they are in the proper format
    if associations is None:
        associations = _graphlab.SFrame({'word': [], 'topic': []})
    if isinstance(associations, _graphlab.SFrame) and \
       associations.num_rows() > 0:
        assert set(associations.column_names()) == set(['word', 'topic']), \
            "Provided associations must be an SFrame containing a word column\
             and a topic column."
        assert associations['word'].dtype() == str, \
            "Words must be strings."
        assert associations['topic'].dtype() == int, \
            "Topic ids must be of int type."
    if alpha is None:
        alpha = float(50) / num_topics

    if validation_set is not None:
        _check_input(validation_set)  # Must be a single column
        if isinstance(validation_set, _graphlab.SFrame):
            column_name = validation_set.column_names()[0]
            validation_set = validation_set[column_name]
        (validation_train, validation_test) = _random_split(validation_set)
    else:
        validation_train = _SArray()
        validation_test = _SArray()

    opts = {'model_name': model_name,
            'data': dataset,
            'verbose': verbose,
            'num_topics': num_topics,
            'num_iterations': num_iterations,
            'print_interval': print_interval,
            'alpha': alpha,
            'beta': beta,
            'num_burnin': num_burnin,
            'associations': associations}

    # Initialize the model with basic parameters
    response = _graphlab.toolkits._main.run("text_topicmodel_init", opts)
    m = TopicModel(response['model'])

    # If initial_topics provided, load it into the model
    if isinstance(initial_topics, _graphlab.SFrame):
        assert set(['vocabulary', 'topic_probabilities']) ==              \
               set(initial_topics.column_names()),                        \
            "The provided initial_topics does not have the proper format, \
             e.g. wrong column names."
        observed_topics = initial_topics['topic_probabilities'].apply(lambda x: len(x))
        assert all(observed_topics == num_topics),                        \
            "Provided num_topics value does not match the number of provided initial_topics."

        # Rough estimate of total number of words
        weight = dataset.size() * 1000

        opts = {'model': m.__proxy__,
                'topics': initial_topics['topic_probabilities'],
                'vocabulary': initial_topics['vocabulary'],
                'weight': weight}
        response = _graphlab.toolkits._main.run("text_topicmodel_set_topics", opts)
        m = TopicModel(response['model'])

    # Train the model on the given data set and retrieve predictions
    opts = {'model': m.__proxy__,
            'data': dataset,
            'verbose': verbose,
            'validation_train': validation_train,
            'validation_test': validation_test}

    response = _graphlab.toolkits._main.run("text_topicmodel_train", opts)
    m = TopicModel(response['model'])

    return m
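
# A hedged sketch (not from the original source) of preparing the bag-of-words
# input that create() expects. It assumes GraphLab Create is installed, an
# SFrame `sf` with a hypothetical raw-text column named 'text', and the
# count_words / stopwords helpers in graphlab.text_analytics.
def _prepare_docs(sf):
    import graphlab as gl
    # Turn each document into a {word: count} dict, the format described above.
    docs = gl.text_analytics.count_words(sf['text'])
    # Optionally drop stopwords; dict_trim_by_keys is referenced in "See Also".
    docs = docs.dict_trim_by_keys(gl.text_analytics.stopwords(), exclude=True)
    return docs

# model = graphlab.topic_model.create(_prepare_docs(sf), num_topics=20)
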
def roc_curve(targets, predictions, average=None):
    r"""
    Compute an ROC curve for the given targets and predictions. For
    multi-class classification, an ROC curve is computed for each class (see
    the examples below).

    Parameters
    ----------
    targets : SArray
        An SArray containing the observed values. For binary classification,
        the alpha-numerically first category is considered the reference
        category.

    predictions : SArray
        The prediction that corresponds to each target value.  This vector must
        have the same length as ``targets``. Target scores can either be
        probability estimates of the positive class, confidence values, or
        binary decisions.

    average : string, [None (default)]
        Metric averaging strategies for multiclass classification. Averaging
        strategies can be one of the following:

            - None: No averaging is performed and a single metric is returned
              for each class.

    Returns
    -------
    out : SFrame
        Each row represents the predictive performance at a given cutoff
        threshold, where all predictions above that cutoff are considered
        "positive". The following columns describe the performance:

        threshold : The prediction cutoff used for this row.
        tpr       : True positive rate, the number of true positives divided
                    by the number of positives.
        fpr       : False positive rate, the number of false positives
                    divided by the number of negatives.
        p         : Total number of positive values.
        n         : Total number of negative values.
        class     : Reference class for this ROC curve (multi-class input
                    only).

    See Also
    --------
    confusion_matrix, auc

    References
    ----------
    `An introduction to ROC analysis. Tom Fawcett.
    <https://ccrma.stanford.edu/workshops/mir2009/references/ROCintro.pdf>`_

    Notes
    -----
     - For binary classification, when the target label is of type "string",
       then the labels are sorted alphanumerically and the largest label is
       chosen as the "positive" label.  For example, if the classifier labels
       are {"cat", "dog"}, then "dog" is chosen as the positive label for the
       binary classification case.
     - For multi-class classification, when the target label is of type
       "string", then the probability vector is assumed to be a vector of
       probabilities of classes as sorted alphanumerically. Hence, for the
       probability vector [0.1, 0.2, 0.7] for a dataset with classes "cat",
       "dog", and "rat"; the 0.1 corresponds to "cat", the 0.2 to "dog" and the
       0.7 to "rat".
     - The ROC curve is computed using a binning approximation with 1M bins and
       is hence accurate only to the 5th decimal.


    Examples
    --------
    .. sourcecode:: python

        >>> targets = graphlab.SArray([0, 1, 1, 0])
        >>> predictions = graphlab.SArray([0.1, 0.35, 0.7, 0.99])

        # Calculate the roc-curve.
        >>> roc_curve =  graphlab.evaluation.roc_curve(targets, predictions)
        +-------------------+-----+-----+---+---+
        |     threshold     | fpr | tpr | p | n |
        +-------------------+-----+-----+---+---+
        |        0.0        | 1.0 | 1.0 | 2 | 2 |
        | 9.99999974738e-06 | 1.0 | 1.0 | 2 | 2 |
        | 1.99999994948e-05 | 1.0 | 1.0 | 2 | 2 |
        | 2.99999992421e-05 | 1.0 | 1.0 | 2 | 2 |
        | 3.99999989895e-05 | 1.0 | 1.0 | 2 | 2 |
        | 4.99999987369e-05 | 1.0 | 1.0 | 2 | 2 |
        | 5.99999984843e-05 | 1.0 | 1.0 | 2 | 2 |
        | 7.00000018696e-05 | 1.0 | 1.0 | 2 | 2 |
        |  7.9999997979e-05 | 1.0 | 1.0 | 2 | 2 |
        | 9.00000013644e-05 | 1.0 | 1.0 | 2 | 2 |
        +-------------------+-----+-----+---+---+
        [100001 rows x 5 columns]

    For the multi-class setting, an ROC curve is returned for each class.

    .. sourcecode:: python

        # Targets and Predictions
        >>> targets     = graphlab.SArray([ 1, 0, 2, 1])
        >>> predictions = graphlab.SArray([[.1, .8, 0.1],
        ...                                [.9, .1, 0.0],
        ...                                [.8, .1, 0.1],
        ...                                [.3, .6, 0.1]])

        # Compute the ROC curve.
        >>> roc_curve = graphlab.evaluation.roc_curve(targets, predictions)
        +-----------+-----+-----+---+---+-------+
        | threshold | fpr | tpr | p | n | class |
        +-----------+-----+-----+---+---+-------+
        |    0.0    | 1.0 | 1.0 | 1 | 3 |   0   |
        |   1e-05   | 1.0 | 1.0 | 1 | 3 |   0   |
        |   2e-05   | 1.0 | 1.0 | 1 | 3 |   0   |
        |   3e-05   | 1.0 | 1.0 | 1 | 3 |   0   |
        |   4e-05   | 1.0 | 1.0 | 1 | 3 |   0   |
        |   5e-05   | 1.0 | 1.0 | 1 | 3 |   0   |
        |   6e-05   | 1.0 | 1.0 | 1 | 3 |   0   |
        |   7e-05   | 1.0 | 1.0 | 1 | 3 |   0   |
        |   8e-05   | 1.0 | 1.0 | 1 | 3 |   0   |
        |   9e-05   | 1.0 | 1.0 | 1 | 3 |   0   |
        +-----------+-----+-----+---+---+-------+
        [300003 rows x 6 columns]

    This metric also works for string classes.

    .. sourcecode:: python

        # Targets and Predictions
        >>> targets     = graphlab.SArray(["cat", "dog", "foosa", "dog"])
        >>> predictions = graphlab.SArray([[.1, .8, 0.1],
        ...                                [.9, .1, 0.0],
        ...                                [.8, .1, 0.1],
        ...                                [.3, .6, 0.1]])

        # Compute the ROC curve.
        >>> roc_curve = graphlab.evaluation.roc_curve(targets, predictions)
        +-----------+-----+-----+---+---+-------+
        | threshold | fpr | tpr | p | n | class |
        +-----------+-----+-----+---+---+-------+
        |    0.0    | 1.0 | 1.0 | 1 | 3 |  cat  |
        |   1e-05   | 1.0 | 1.0 | 1 | 3 |  cat  |
        |   2e-05   | 1.0 | 1.0 | 1 | 3 |  cat  |
        |   3e-05   | 1.0 | 1.0 | 1 | 3 |  cat  |
        |   4e-05   | 1.0 | 1.0 | 1 | 3 |  cat  |
        |   5e-05   | 1.0 | 1.0 | 1 | 3 |  cat  |
        |   6e-05   | 1.0 | 1.0 | 1 | 3 |  cat  |
        |   7e-05   | 1.0 | 1.0 | 1 | 3 |  cat  |
        |   8e-05   | 1.0 | 1.0 | 1 | 3 |  cat  |
        |   9e-05   | 1.0 | 1.0 | 1 | 3 |  cat  |
        +-----------+-----+-----+---+---+-------+
        [300003 rows x 6 columns]
    """
    _mt._get_metric_tracker().track('evaluation.roc_curve')
    _supervised_evaluation_error_checking(targets, predictions)
    _check_categorical_option_type('average', average, [None])
    _check_prob_and_prob_vector(predictions)
    _check_target_not_float(targets)
    opts = {"average": average,
            "binary": predictions.dtype() in [int, float]}
    return _graphlab.extensions._supervised_streaming_evaluator(targets,
                       predictions, "roc_curve", opts)
    def predict(self, dataset, output_type='class', missing_value_action='auto'):
        """
        A flexible and advanced prediction API.

        The target column is provided during
        :func:`~graphlab.boosted_trees.create`. If the target column is in the
        `dataset` it will be ignored.

        Parameters
        ----------
        dataset : SFrame
          A dataset that has the same columns that were used during training.
          If the target column exists in ``dataset`` it will be ignored
          while making predictions.

        output_type : {'probability', 'margin', 'class', 'probability_vector'}, optional.
            Form of the predictions which are one of:

            - 'probability': Prediction probability associated with the True
               class (not applicable for multi-class classification)
            - 'margin': Margin associated with the prediction (not applicable
              for multi-class classification)
            - 'probability_vector': Prediction probability associated with each
              class as a vector. The probability of the first class (sorted
              alphanumerically by name of the class in the training set) is in
              position 0 of the vector, the second in position 1 and so on.
            - 'class': Class prediction. For multi-class classification, this
               returns the class with maximum probability.

        missing_value_action : str, optional
            Action to perform when missing values are encountered. Can be
            one of:

            - 'auto': By default, the model treats missing values as is.
            - 'impute': Proceed with evaluation by filling in the missing
              values with the mean of the training data. Missing
              values are also imputed if an entire column of data is
              missing during evaluation.
            - 'error': Do not proceed with evaluation and terminate with
              an error message.


        Returns
        -------
        out : SArray
           Predicted target value for each example (i.e. row) in the dataset.

        See Also
        ----------
        create, evaluate, classify

        Examples
        --------
        >>> m.predict(testdata)
        >>> m.predict(testdata, output_type='probability')
        >>> m.predict(testdata, output_type='margin')
        """
        _mt._get_metric_tracker().track('toolkit.classifier.boosted_trees_classifier.predict')
        _check_categorical_option_type('output_type', output_type,
                ['class', 'margin', 'probability', 'probability_vector'])
        return super(_Classifier, self).predict(dataset,
                                                output_type=output_type,
                                                missing_value_action=missing_value_action)
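
# A hedged sketch (not part of the original class): per the docstring above,
# position i of a 'probability_vector' prediction corresponds to the i-th class
# sorted alphanumerically, so the 'class' output can be recovered with an
# argmax. `model`, `testdata`, and `class_labels` are assumed to exist.
def _class_from_probability_vector(model, testdata, class_labels):
    labels = sorted(class_labels)  # alphanumeric class order
    probs = model.predict(testdata, output_type='probability_vector')
    # argmax over each probability vector, then map the index back to its label
    return probs.apply(lambda v: labels[max(range(len(v)), key=lambda i: v[i])])
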
    def predict_topk(self,
                     dataset,
                     output_type="probability",
                     k=3,
                     missing_value_action='auto'):
        """
        Return top-k predictions for the ``dataset``, using the trained model.
        Predictions are returned as an SFrame with three columns: `row_id`,
        `class`, and either `probability`, `margin`, or `rank`, depending on
        the ``output_type`` parameter. The input dataset must have the same
        columns as the data used to train the model.

        Parameters
        ----------
        dataset : SFrame
            A dataset that has the same columns that were used during training.
            If the target column exists in ``dataset`` it will be ignored
            while making predictions.

        output_type : {'probability', 'rank', 'margin'}, optional
            Choose the return type of the prediction:

            - `probability`: Probability associated with each label in the prediction.
            - `rank`       : Rank associated with each label in the prediction.
            - `margin`     : Margin associated with each label in the prediction.

        k : int, optional
            Number of classes to return for each input example.

        missing_value_action : str, optional
            Action to perform when missing values are encountered. Can be
            one of:

            - 'auto': By default, the model treats missing values as is.
            - 'impute': Proceed with evaluation by filling in the missing
              values with the mean of the training data. Missing
              values are also imputed if an entire column of data is
              missing during evaluation.
            - 'error': Do not proceed with evaluation and terminate with
              an error message.

        Returns
        -------
        out : SFrame
            An SFrame with model predictions.

        See Also
        --------
        predict, classify, evaluate

        Examples
        --------
        >>> pred = m.predict_topk(validation_data, k=3)
        >>> pred
        +--------+-------+-------------------+
        | row_id | class |   probability     |
        +--------+-------+-------------------+
        |   0    |   4   |   0.995623886585  |
        |   0    |   9   |  0.0038311756216  |
        |   0    |   7   | 0.000301006948575 |
        |   1    |   1   |   0.928708016872  |
        |   1    |   3   |  0.0440889261663  |
        |   1    |   2   |  0.0176190119237  |
        |   2    |   3   |   0.996967732906  |
        |   2    |   2   |  0.00151345680933 |
        |   2    |   7   | 0.000637513934635 |
        |   3    |   1   |   0.998070061207  |
        |  ...   |  ...  |        ...        |
        +--------+-------+-------------------+
        [35688 rows x 3 columns]
        """
        _mt._get_metric_tracker().track(
            'toolkit.classifier.random_forest_classifier.predict_topk')
        _check_categorical_option_type('output_type', output_type,
                                       ['rank', 'margin', 'probability'])
        if missing_value_action == 'auto':
            missing_value_action = _sl.select_default_missing_value_policy(
                self, 'predict')

        # Low latency path
        if isinstance(dataset, list):
            return _graphlab.extensions._fast_predict_topk(
                self.__proxy__, dataset, output_type, missing_value_action, k)
        if isinstance(dataset, dict):
            return _graphlab.extensions._fast_predict_topk(
                self.__proxy__, [dataset], output_type, missing_value_action,
                k)

        options = dict()
        options.update({
            'model': self.__proxy__,
            'model_name': self.__name__,
            'dataset': dataset,
            'output_type': output_type,
            'topk': k,
            'missing_value_action': missing_value_action
        })
        target = _graphlab.toolkits._main.run(
            'supervised_learning_predict_topk', options)
        return _map_unity_proxy_to_object(target['predicted'])
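
# A hedged usage sketch (not from the original source): `model` is assumed to
# be a trained classifier exposing the predict_topk method above. Passing a
# single dict (or a list of dicts) takes the low-latency branch instead of the
# SFrame path; the feature names below are purely hypothetical.
def _fast_top2(model):
    single_row = {'feature_1': 3.2, 'feature_2': 'red'}  # hypothetical features
    return model.predict_topk(single_row, output_type='rank', k=2)
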
    def predict(self,
                dataset,
                output_type='class',
                missing_value_action='auto'):
        """
        A flexible and advanced prediction API.

        The target column is provided during
        :func:`~graphlab.random_forest.create`. If the target column is in the
        `dataset` it will be ignored.

        Parameters
        ----------
        dataset : SFrame
          A dataset that has the same columns that were used during training.
          If the target column exists in ``dataset`` it will be ignored
          while making predictions.

        output_type : {'probability', 'margin', 'class', 'probability_vector'}, optional.
            Form of the predictions which are one of:

            - 'probability': Prediction probability associated with the True
               class (not applicable for multi-class classification)
            - 'margin': Margin associated with the prediction (not applicable
              for multi-class classification)
            - 'probability_vector': Prediction probability associated with each
              class as a vector. The probability of the first class (sorted
              alphanumerically by name of the class in the training set) is in
              position 0 of the vector, the second in position 1 and so on.
            - 'class': Class prediction. For multi-class classification, this
               returns the class with maximum probability.

        missing_value_action : str, optional
            Action to perform when missing values are encountered. Can be
            one of:

            - 'auto': By default, the model treats missing values as is.
            - 'impute': Proceed with evaluation by filling in the missing
              values with the mean of the training data. Missing
              values are also imputed if an entire column of data is
              missing during evaluation.
            - 'error': Do not proceed with evaluation and terminate with
              an error message.

        Returns
        -------
        out : SArray
           Predicted target value for each example (i.e. row) in the dataset.

        See Also
        ----------
        create, evaluate, classify

        Examples
        --------
        >>> m.predict(testdata)
        >>> m.predict(testdata, output_type='probability')
        >>> m.predict(testdata, output_type='margin')
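
        The handling of missing feature values can also be set explicitly, as
        described under ``missing_value_action`` above (a usage sketch; output
        omitted):

        >>> m.predict(testdata, missing_value_action='error')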
        """
        _mt._get_metric_tracker().track(
            'toolkit.classifier.random_forest_classifier.predict')
        _check_categorical_option_type(
            'output_type', output_type,
            ['class', 'margin', 'probability', 'probability_vector'])
        return super(_Classifier,
                     self).predict(dataset,
                                   output_type=output_type,
                                   missing_value_action=missing_value_action)
    def predict_topk(self, dataset, output_type="probability", k=3):
        """
        Return top-k predictions for the ``dataset``, using the trained model.
        Predictions are returned as an SFrame with three columns: `row_id`,
        `class`, and `probability`, `margin`, or `rank`, depending on the
        ``output_type`` parameter. The input dataset must have the same columns
        that were used during training of the model.

        Parameters
        ----------
        dataset : SFrame
            A dataset that has the same columns that were used during training.
            If the target column exists in ``dataset`` it will be ignored
            while making predictions.

        output_type : {'probability', 'rank', 'margin'}, optional
            Choose the return type of the prediction:

            - `probability`: Probability associated with each label in the prediction.
            - `rank`       : Rank associated with each label in the prediction.
            - `margin`     : Margin associated with each label in the prediction.

        k : int, optional
            Number of classes to return for each input example.

        Returns
        -------
        out : SFrame
            An SFrame with model predictions.

        See Also
        --------
        predict, classify, evaluate

        Examples
        --------
        >>> pred = m.predict_topk(validation_data, k=3)
        >>> pred
        +--------+-------+-------------------+
        | row_id | class |   probability     |
        +--------+-------+-------------------+
        |   0    |   4   |   0.995623886585  |
        |   0    |   9   |  0.0038311756216  |
        |   0    |   7   | 0.000301006948575 |
        |   1    |   1   |   0.928708016872  |
        |   1    |   3   |  0.0440889261663  |
        |   1    |   2   |  0.0176190119237  |
        |   2    |   3   |   0.996967732906  |
        |   2    |   2   |  0.00151345680933 |
        |   2    |   7   | 0.000637513934635 |
        |   3    |   1   |   0.998070061207  |
        |  ...   |  ...  |        ...        |
        +--------+-------+-------------------+
        [35688 rows x 3 columns]
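
        The rank of each of the top-k classes can be requested instead of the
        probability (a usage sketch; output omitted):

        >>> pred_ranks = m.predict_topk(validation_data, output_type='rank', k=3)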
        """
        _mt._get_metric_tracker().track('toolkit.classifier.boosted_trees_classifier.predict_topk')
        _raise_error_if_not_sframe(dataset, "dataset")
        _check_categorical_option_type('output_type', output_type, ['rank', 'margin', 'probability'])
        options = dict()
        options.update({'model': self.__proxy__,
                        'model_name': self.__name__,
                        'dataset': dataset,
                        'output_type': output_type,
                        'topk': k,
                        'missing_value_action': 'error'})
        target = _graphlab.toolkits._main.run('supervised_learning_predict_topk', options)
        return _map_unity_proxy_to_object(target['predicted'])
def roc_curve(targets, predictions, average=None):
    r"""
    Compute an ROC curve for the given targets and predictions. For binary
    classification a single curve is returned; for multi-class classification,
    one curve is returned per class (see the examples below).

    Parameters
    ----------
    targets : SArray
        An SArray containing the observed values. For binary classification,
        the alpha-numerically first category is considered the reference
        category.

    predictions : SArray
        The prediction that corresponds to each target value. This vector must
        have the same length as ``targets``. Target scores can be probability
        estimates of the positive class, confidence values, or binary
        decisions.

    average : string, [None (default)]
        Metric averaging strategies for multiclass classification. Averaging
        strategies can be one of the following:

            - None: No averaging is performed and a single metric is returned
              for each class.

    Returns
    -------
    out : SFrame
        Each row represents the predictive performance when using a given
        cutoff threshold, where all predictions above that cutoff are
        considered "positive". The following columns describe that performance
        (a small worked example follows this list):

            - threshold : Prediction cutoff value for this row.
            - tpr   : True positive rate, the number of true positives divided by the number of positives.
            - fpr   : False positive rate, the number of false positives divided by the number of negatives.
            - p     : Total number of positive values.
            - n     : Total number of negative values.
            - class : Reference class for this ROC curve (present only in the multi-class case).
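
        As a minimal plain-Python sketch of how ``tpr`` and ``fpr`` arise at a
        single cutoff (independent of the binned computation used here):

        >>> targets     = [0, 1, 1, 0]
        >>> predictions = [0.1, 0.35, 0.7, 0.99]
        >>> cutoff = 0.5
        >>> tp = sum(1 for t, s in zip(targets, predictions) if t == 1 and s >= cutoff)
        >>> fp = sum(1 for t, s in zip(targets, predictions) if t == 0 and s >= cutoff)
        >>> p = sum(1 for t in targets if t == 1)
        >>> n = sum(1 for t in targets if t == 0)
        >>> (tp / float(p), fp / float(n))   # (tpr, fpr)
        (0.5, 0.5)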

    See Also
    --------
    confusion_matrix, auc

    References
    ----------
    `An introduction to ROC analysis. Tom Fawcett.
    <https://ccrma.stanford.edu/workshops/mir2009/references/ROCintro.pdf>`_

    Notes
    -----
     - For binary classification, when the target label is of type "string",
       then the labels are sorted alphanumerically and the largest label is
       chosen as the "positive" label.  For example, if the classifier labels
       are {"cat", "dog"}, then "dog" is chosen as the positive label for the
       binary classification case.
     - For multi-class classification, when the target label is of type
       "string", then the probability vector is assumed to be a vector of
       probabilities of classes as sorted alphanumerically. Hence, for the
       probability vector [0.1, 0.2, 0.7] for a dataset with classes "cat",
       "dog", and "rat"; the 0.1 corresponds to "cat", the 0.2 to "dog" and the
       0.7 to "rat".
     - The ROC curve is computed using a binning approximation with 1M bins and
       is hence accurate only to the 5th decimal.


    Examples
    --------
    .. sourcecode:: python

        >>> targets = graphlab.SArray([0, 1, 1, 0])
        >>> predictions = graphlab.SArray([0.1, 0.35, 0.7, 0.99])

        # Calculate the roc-curve.
        >>> roc_curve =  graphlab.evaluation.roc_curve(targets, predictions)
        +-------------------+-----+-----+---+---+
        |     threshold     | fpr | tpr | p | n |
        +-------------------+-----+-----+---+---+
        |        0.0        | 1.0 | 1.0 | 2 | 2 |
        | 9.99999974738e-06 | 1.0 | 1.0 | 2 | 2 |
        | 1.99999994948e-05 | 1.0 | 1.0 | 2 | 2 |
        | 2.99999992421e-05 | 1.0 | 1.0 | 2 | 2 |
        | 3.99999989895e-05 | 1.0 | 1.0 | 2 | 2 |
        | 4.99999987369e-05 | 1.0 | 1.0 | 2 | 2 |
        | 5.99999984843e-05 | 1.0 | 1.0 | 2 | 2 |
        | 7.00000018696e-05 | 1.0 | 1.0 | 2 | 2 |
        |  7.9999997979e-05 | 1.0 | 1.0 | 2 | 2 |
        | 9.00000013644e-05 | 1.0 | 1.0 | 2 | 2 |
        +-------------------+-----+-----+---+---+
        [100001 rows x 5 columns]

    For the multi-class setting, an ROC curve is returned for each class, using
    the per-class probability-vector predictions.

    .. sourcecode:: python

        # Targets and Predictions
        >>> targets     = graphlab.SArray([ 1, 0, 2, 1])
        >>> predictions = graphlab.SArray([[.1, .8, 0.1],
        ...                                [.9, .1, 0.0],
        ...                                [.8, .1, 0.1],
        ...                                [.3, .6, 0.1]])

        # Compute the ROC curve.
        >>> roc_curve = graphlab.evaluation.roc_curve(targets, predictions)
        +-----------+-----+-----+---+---+-------+
        | threshold | fpr | tpr | p | n | class |
        +-----------+-----+-----+---+---+-------+
        |    0.0    | 1.0 | 1.0 | 1 | 3 |   0   |
        |   1e-05   | 1.0 | 1.0 | 1 | 3 |   0   |
        |   2e-05   | 1.0 | 1.0 | 1 | 3 |   0   |
        |   3e-05   | 1.0 | 1.0 | 1 | 3 |   0   |
        |   4e-05   | 1.0 | 1.0 | 1 | 3 |   0   |
        |   5e-05   | 1.0 | 1.0 | 1 | 3 |   0   |
        |   6e-05   | 1.0 | 1.0 | 1 | 3 |   0   |
        |   7e-05   | 1.0 | 1.0 | 1 | 3 |   0   |
        |   8e-05   | 1.0 | 1.0 | 1 | 3 |   0   |
        |   9e-05   | 1.0 | 1.0 | 1 | 3 |   0   |
        +-----------+-----+-----+---+---+-------+
        [300003 rows x 6 columns]

    This metric also works for string classes.

    .. sourcecode:: python

        # Targets and Predictions
        >>> targets     = graphlab.SArray(["cat", "dog", "foosa", "dog"])
        >>> predictions = graphlab.SArray([[.1, .8, 0.1],
        ...                                [.9, .1, 0.0],
        ...                                [.8, .1, 0.1],
        ...                                [.3, .6, 0.1]])

        # Compute the ROC curve.
        >>> roc_curve = graphlab.evaluation.roc_curve(targets, predictions)
        +-----------+-----+-----+---+---+-------+
        | threshold | fpr | tpr | p | n | class |
        +-----------+-----+-----+---+---+-------+
        |    0.0    | 1.0 | 1.0 | 1 | 3 |  cat  |
        |   1e-05   | 1.0 | 1.0 | 1 | 3 |  cat  |
        |   2e-05   | 1.0 | 1.0 | 1 | 3 |  cat  |
        |   3e-05   | 1.0 | 1.0 | 1 | 3 |  cat  |
        |   4e-05   | 1.0 | 1.0 | 1 | 3 |  cat  |
        |   5e-05   | 1.0 | 1.0 | 1 | 3 |  cat  |
        |   6e-05   | 1.0 | 1.0 | 1 | 3 |  cat  |
        |   7e-05   | 1.0 | 1.0 | 1 | 3 |  cat  |
        |   8e-05   | 1.0 | 1.0 | 1 | 3 |  cat  |
        |   9e-05   | 1.0 | 1.0 | 1 | 3 |  cat  |
        +-----------+-----+-----+---+---+-------+
        [300003 rows x 6 columns]
    """
    _mt._get_metric_tracker().track('evaluation.roc_curve')
    _supervised_evaluation_error_checking(targets, predictions)
    _check_categorical_option_type('average', average, [None])
    _check_prob_and_prob_vector(predictions)
    _check_target_not_float(targets)
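    # Scalar (int/float) predictions are treated as binary scores; vector
    # predictions are treated as per-class probability vectors.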
    opts = {"average": average,
            "binary": predictions.dtype() in [int, float]}
    return _graphlab.extensions._supervised_streaming_evaluator(targets,
                       predictions, "roc_curve", opts)
def auc(targets, predictions, average='macro'):
    r"""
    Compute the area under the ROC curve for the given targets and predictions.
    For multi-class classification, a score is computed for each class and may
    be averaged (see the `average` parameter).

    Parameters
    ----------
    targets : SArray
        An SArray containing the observed values. For binary classification,
        the alpha-numerically first category is considered the reference
        category.

    predictions : SArray
        The prediction that corresponds to each target value. This vector must
        have the same length as ``targets``. Target scores can be probability
        estimates of the positive class, confidence values, or binary
        decisions.

    average : string, [None, 'macro' (default)]
        Metric averaging strategies for multiclass classification. Averaging
        strategies can be one of the following:

            - None: No averaging is performed and a single metric is returned
              for each class.
            - 'macro': Calculate metrics for each label, and find their
              unweighted mean. This does not take label imbalance into account.

    Returns
    -------
    out : float (for binary classification) or dict[float]
        Score for the positive class (for binary classification) or an average
        score for each class for multi-class classification.  If
        `average=None`, then a dictionary is returned where the key is the
        class label and the value is the score for the corresponding class
        label.

    See Also
    --------
    roc_curve, confusion_matrix

    Examples
    --------
    .. sourcecode:: python

        >>> targets = graphlab.SArray([0, 1, 1, 0])
        >>> predictions = graphlab.SArray([0.1, 0.35, 0.7, 0.99])

        # Calculate the auc-score
        >>> auc =  graphlab.evaluation.auc(targets, predictions)
        0.5

    This metric also works when the targets are strings (Here "cat" is chosen
    as the reference class).

    .. sourcecode:: python

        >>> targets = graphlab.SArray(["cat", "dog", "dog", "cat"])
        >>> predictions = graphlab.SArray([0.1, 0.35, 0.7, 0.99])

        # Calculate the auc-score
        >>> auc =  graphlab.evaluation.auc(targets, predictions)
        0.5


    For the multi-class setting, the auc-score can be averaged.

    .. sourcecode:: python

        # Targets and Predictions
        >>> targets     = graphlab.SArray([ 1, 0, 2, 1])
        >>> predictions = graphlab.SArray([[.1, .8, 0.1],
        ...                                [.9, .1, 0.0],
        ...                                [.8, .1, 0.1],
        ...                                [.3, .6, 0.1]])

        # Macro average of the scores for each class.
        >>> graphlab.evaluation.auc(targets, predictions, average = 'macro')
        0.8888888888888888

        # Score for each class.
        >>> graphlab.evaluation.auc(targets, predictions, average = None)
        {0: 1.0, 1: 1.0, 2: 0.6666666666666666}

    This metric also works for "string" targets in the multi-class setting.

    .. sourcecode:: python

        # Targets and Predictions
        >>> targets     = graphlab.SArray([ "dog", "cat", "foosa", "dog"])
        >>> predictions = graphlab.SArray([[.1, .8, 0.1],
                                           [.9, .1, 0.0],
                                           [.8, .1, 0.1],
                                           [.3, .6, 0.1]])

        # Macro average.
        >>> auc =  graphlab.evaluation.auc(targets, predictions)
        0.8888888888888888

        # Score for each class.
        >>> auc =  graphlab.evaluation.auc(targets, predictions, average=None)
        {'cat': 1.0, 'dog': 1.0, 'foosa': 0.6666666666666666}

    """
    _mt._get_metric_tracker().track('evaluation.auc')
    _supervised_evaluation_error_checking(targets, predictions)
    _check_categorical_option_type('average', average,
                         ['macro', None])
    _check_prob_and_prob_vector(predictions)
    _check_target_not_float(targets)
    opts = {"average": average,
            "binary": predictions.dtype() in [int, float]}
    return _graphlab.extensions._supervised_streaming_evaluator(targets,
                      predictions, "auc", opts)
    def predict(self, dataset, output_type='class', missing_value_action='auto'):
        """
        Return predictions for ``dataset``, using the trained support vector
        machine (SVM) model. Predictions can be generated as class labels (0 or
        1), or margins (i.e. the distance of the observations from the
        hyperplane separating the classes). By default, the predict method
        returns class labels.

        For each new example in ``dataset``, the margin---also known as the
        linear predictor---is the inner product of the example and the model
        coefficients plus the intercept term. Predicted classes are obtained by
        thresholding the margins at 0.
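
        As a rough illustration of the thresholding step only (using a made-up
        margin value, not output from any particular model):

        >>> margin = 1.7                    # hypothetical linear predictor
        >>> 1 if margin > 0 else 0          # predicted class
        1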

        Parameters
        ----------
        dataset : SFrame | dict
            Dataset of new observations. Must include columns with the same
            names as the features used for model training, but does not require
            a target column. Additional columns are ignored.

        output_type : {'margin', 'class'}, optional
            Form of the predictions which are one of:

            - 'margin': Distance of the observations from the hyperplane
              separating the classes.
            - 'class': Class prediction.

        missing_value_action : str, optional
            Action to perform when missing values are encountered. This can be
            one of:

            - 'auto': Default to 'impute'
            - 'impute': Proceed with evaluation by filling in the missing
              values with the mean of the training data. Missing
              values are also imputed if an entire column of data is
              missing during evaluation.
            - 'error' : Do not proceed with prediction and terminate with
              an error message.

        Returns
        -------
        out : SArray
            An SArray with model predictions.

        See Also
        ----------
        create, evaluate, classify

        Examples
        ----------
        >>> data =  graphlab.SFrame('http://s3.amazonaws.com/dato-datasets/regression/houses.csv')

        >>> data['is_expensive'] = data['price'] > 30000
        >>> model = graphlab.svm_classifier.create(data,
                                  target='is_expensive',
                                  features=['bath', 'bedroom', 'size'])

        >>> class_predictions = model.predict(data)
        >>> margin_predictions = model.predict(data, output_type='margin')

        """

        _mt._get_metric_tracker().track('toolkit.classifier.svm_classifier.predict')
        _check_categorical_option_type('output_type', output_type,
                                               ['class', 'margin'])
        return super(_Classifier, self).predict(dataset,
                                                output_type=output_type,
                                                missing_value_action=missing_value_action)
def recall(targets, predictions, average='macro'):
    r"""
    Compute the recall score for classification tasks. The recall score
    quantifies the ability of a classifier to predict `positive` examples.
    Recall can be interpreted as the probability that a randomly selected
    `positive` example is correctly identified by the classifier. The score
    is in the range [0,1] with 0 being the worst, and 1 being perfect.


    The recall score is defined as the ratio:
        .. math::
            \frac{tp}{tp + fn}

    where `tp` is the number of true positives and `fn` the number of false
    negatives.
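
    As a minimal plain-Python sketch of this ratio (independent of the
    GraphLab implementation, using made-up binary labels):

    .. sourcecode:: python

        >>> targets     = [1, 0, 1, 1]
        >>> predictions = [1, 0, 0, 1]
        >>> tp = sum(1 for t, p in zip(targets, predictions) if t == 1 and p == 1)
        >>> fn = sum(1 for t, p in zip(targets, predictions) if t == 1 and p == 0)
        >>> tp / float(tp + fn)
        0.6666666666666666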

    Parameters
    ----------
    targets : SArray
        Ground truth class labels. The SArray can be of any type.

    predictions : SArray
        The prediction that corresponds to each target value.  This SArray must
        have the same length as ``targets`` and must be of the same type
        as the ``targets`` SArray.

    average : string, [None, 'macro' (default), 'micro']
        Metric averaging strategies for multiclass classification. Averaging
        strategies can be one of the following:

            - None: No averaging is performed and a single metric is returned
              for each class.
            - 'micro': Calculate metrics globally by counting the total true
              positives, false negatives, and false positives.
            - 'macro': Calculate metrics for each label and find their
              unweighted mean. This does not take label imbalance into account.

    Returns
    -------
    out : float (for binary classification) or dict[float]
        Score for the positive class (for binary classification) or an average
        score for each class for multi-class classification.  If
        `average=None`, then a dictionary is returned where the key is the
        class label and the value is the score for the corresponding class
        label.

    Notes
    -----
     - For binary classification, when the target label is of type "string",
       then the labels are sorted alphanumerically and the largest label is
       chosen as the "positive" label.  For example, if the classifier labels
       are {"cat", "dog"}, then "dog" is chosen as the positive label for the
       binary classification case.

    See Also
    --------
    confusion_matrix, accuracy, precision, f1_score

    Examples
    --------

    .. sourcecode:: python

        # Targets and Predictions
        >>> targets = graphlab.SArray([0, 1, 2, 3, 0, 1, 2, 3])
        >>> predictions = graphlab.SArray([1, 0, 2, 1, 3, 1, 2, 1])

        # Micro average of the recall scores for each class.
        >>> graphlab.evaluation.recall(targets, predictions,
        ...                            average = 'micro')
        0.375

        # Macro average of the recall scores for each class.
        >>> graphlab.evaluation.recall(targets, predictions,
        ...                            average = 'macro')
        0.375

        # Recall score for each class.
        >>> graphlab.evaluation.recall(targets, predictions,
        ...                            average = None)
        {0: 0.0, 1: 0.5, 2: 1.0, 3: 0.0}

    This metric also works for string classes.

    .. sourcecode:: python

        # Targets and Predictions
        >>> targets = graphlab.SArray(
        ...      ["cat", "dog", "foosa", "snake", "cat", "dog", "foosa", "snake"])
        >>> predictions = graphlab.SArray(
        ...      ["dog", "cat", "foosa", "dog", "snake", "dog", "cat", "dog"])

        # Micro average of the recall scores for each class.
        >>> graphlab.evaluation.recall(targets, predictions,
        ...                            average = 'micro')
        0.25

        # Macro average of the recall scores for each class.
        >>> graphlab.evaluation.recall(targets, predictions,
        ...                            average = 'macro')
        0.25

        # Recall score for each class.
        >>> graphlab.evaluation.recall(targets, predictions,
        ...                            average = None)
        {'cat': 0.0, 'dog': 0.5, 'foosa': 0.5, 'snake': 0.0}
    """
    _mt._get_metric_tracker().track('evaluation.recall')
    _supervised_evaluation_error_checking(targets, predictions)
    _check_categorical_option_type('average', average,
                         ['micro', 'macro', None])
    _check_same_type_not_float(targets, predictions)
    opts = {"average": average}
    return _graphlab.extensions._supervised_streaming_evaluator(targets,
                          predictions, "recall", opts)
def create(dataset,
           num_topics=10,
           initial_topics=None,
           alpha=None,
           beta=.1,
           num_iterations=10,
           associations=None,
           verbose=False,
           print_interval=10,
           validation_set=None,
           method='auto'):
    """
    Create a topic model from the given data set. A topic model assumes each
    document is a mixture of a set of topics, where for each topic some words
    are more likely than others. This method learns such a topic model for the
    given document collection.

    Parameters
    ----------
    dataset : SArray of type dict or SFrame with a single column of type dict
        A bag of words representation of a document corpus.
        Each element is a dictionary representing a single document, where
        the keys are words and the values are the number of times that word
        occurs in that document.

    num_topics : int, optional
        The number of topics to learn.

    initial_topics : SFrame, optional
        An SFrame with a column of unique words representing the vocabulary
        and a column of dense vectors representing
        probability of that word given each topic. When provided,
        these values are used to initialize the algorithm.

    num_iterations : int, optional
        The number of iterations to perform.

    alpha : float, optional
        Hyperparameter that controls the diversity of topics in a document.
        Smaller values encourage fewer topics per document.
        Provided value must be positive. Default value is 50/num_topics.

    beta : float, optional
        Hyperparameter that controls the diversity of words in a topic.
        Smaller values encourage fewer words per topic. Provided value
        must be positive.

    verbose : bool, optional
        When True, print most probable words for each topic while printing
        progress.

    print_interval : int, optional
        The number of iterations to wait between progress reports.

    associations : SFrame, optional
        An SFrame with two columns named "word" and "topic" containing words
        and the topic id that the word should be associated with. These words
        are not considered during learning.

    validation_set : SArray of type dict or SFrame with a single column
        A bag of words representation of a document corpus, similar to the
        format required for `dataset`. This will be used to monitor model
        performance during training. Each document in the provided validation
        set is randomly split: the first portion is used to estimate which topic
        each document belongs to, and the second portion is used to estimate
        the model's performance at predicting the unseen words in the test data.

    method : {'cgs', 'alias'}, optional
        The algorithm used for learning the model.

        - *cgs:* Collapsed Gibbs sampling
        - *alias:* AliasLDA method.

    Returns
    -------
    out : TopicModel
        A fitted topic model. This can be used with
        :py:func:`~TopicModel.get_topics()` and
        :py:func:`~TopicModel.predict()`. While fitting is in progress, several
        metrics are shown, including:

        +------------------+---------------------------------------------------+
        |      Field       | Description                                       |
        +==================+===================================================+
        | Elapsed Time     | The number of elapsed seconds.                    |
        +------------------+---------------------------------------------------+
        | Tokens/second    | The number of unique words processed per second   |
        +------------------+---------------------------------------------------+
        | Est. Perplexity  | An estimate of the model's ability to model the   |
        |                  | training data. See the documentation on evaluate. |
        +------------------+---------------------------------------------------+

    See Also
    --------
    TopicModel, TopicModel.get_topics, TopicModel.predict,
    graphlab.SArray.dict_trim_by_keys

    References
    ----------
    - `Wikipedia - Latent Dirichlet allocation
      <http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation>`_

    - Alias method: Li, A. et al. (2014) `Reducing the Sampling Complexity of
      Topic Models. <http://www.sravi.org/pubs/fastlda-kdd2014.pdf>`_.
      KDD 2014.

    Examples
    --------
    The following example includes an SArray of documents, where
    each element represents a document in "bag of words" representation
    -- a dictionary with word keys and whose values are the number of times
    that word occurred in the document:

    >>> docs = graphlab.SArray('http://s3.amazonaws.com/GraphLab-Datasets/nytimes')

    Once in this form, it is straightforward to learn a topic model.

    >>> m = graphlab.topic_model.create(docs)

    It is also easy to create a new topic model from an old one  -- whether
    it was created using GraphLab Create or another package.

    >>> m2 = graphlab.topic_model.create(docs, initial_topics=m['topics'])

    To manually fix several words to always be assigned to a topic, use
    the `associations` argument. The following will ensure that topic 0
    has the most probability for each of the provided words:

    >>> from graphlab import SFrame
    >>> associations = SFrame({'word':['hurricane', 'wind', 'storm'],
                               'topic': [0, 0, 0]})
    >>> m = graphlab.topic_model.create(docs,
                                        associations=associations)

    More advanced usage allows you  to control aspects of the model and the
    learning method.

    >>> import graphlab as gl
    >>> m = gl.topic_model.create(docs,
                                  num_topics=20,       # number of topics
                                  num_iterations=10,   # algorithm parameters
                                  alpha=.01, beta=.1)  # hyperparameters
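
    A held-out corpus can be passed via ``validation_set`` (in the same
    bag-of-words format as ``dataset``) to monitor model quality during
    training. A usage sketch, assuming ``graphlab.text_analytics.random_split``
    is available for splitting the corpus:

    >>> train_docs, valid_docs = graphlab.text_analytics.random_split(docs, 0.8)
    >>> m = graphlab.topic_model.create(train_docs,
    ...                                 validation_set=valid_docs)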
    """
    _mt._get_metric_tracker().track('toolkit.text.topic_model.create')

    dataset = _check_input(dataset)

    _check_categorical_option_type("method", method, ['auto', 'cgs', 'alias'])
    if method == 'cgs' or method == 'auto':
        model_name = 'cgs_topic_model'
    else:
        model_name = 'alias_topic_model'

    # If associations are provided, check they are in the proper format
    if associations is None:
        associations = _graphlab.SFrame({'word': [], 'topic': []})
    if isinstance(associations, _graphlab.SFrame) and \
       associations.num_rows() > 0:
        assert set(associations.column_names()) == set(['word', 'topic']), \
            "Provided associations must be an SFrame containing a word column\
             and a topic column."
        assert associations['word'].dtype() == str, \
            "Words must be strings."
        assert associations['topic'].dtype() == int, \
            "Topic ids must be of int type."
    if alpha is None:
        alpha = float(50) / num_topics

    if validation_set is not None:
        _check_input(validation_set)  # Must be a single column
        if isinstance(validation_set, _graphlab.SFrame):
            column_name = validation_set.column_names()[0]
            validation_set = validation_set[column_name]
        (validation_train, validation_test) = _random_split(validation_set)
    else:
        validation_train = _SArray()
        validation_test = _SArray()

    opts = {'model_name': model_name,
            'data': dataset,
            'verbose': verbose,
            'num_topics': num_topics,
            'num_iterations': num_iterations,
            'print_interval': print_interval,
            'alpha': alpha,
            'beta': beta,
            'associations': associations}

    # Initialize the model with basic parameters
    response = _graphlab.toolkits._main.run("text_topicmodel_init", opts)
    m = TopicModel(response['model'])

    # If initial_topics provided, load it into the model
    if isinstance(initial_topics, _graphlab.SFrame):
        assert set(['vocabulary', 'topic_probabilities']) ==              \
               set(initial_topics.column_names()),                        \
            "The provided initial_topics does not have the proper format, \
             e.g. wrong column names."
        observed_topics = initial_topics['topic_probabilities'].apply(lambda x: len(x))
        assert all(observed_topics == num_topics),                        \
            "Provided num_topics value does not match the number of provided initial_topics."

        # Rough estimate of total number of words
        weight = dataset.size() * 1000

        opts = {'model': m.__proxy__,
                'topics': initial_topics['topic_probabilities'],
                'vocabulary': initial_topics['vocabulary'],
                'weight': weight}
        response = _graphlab.toolkits._main.run("text_topicmodel_set_topics", opts)
        m = TopicModel(response['model'])

    # Train the model on the given data set and retrieve predictions
    opts = {'model': m.__proxy__,
            'data': dataset,
            'verbose': verbose,
            'validation_train': validation_train,
            'validation_test': validation_test}

    response = _graphlab.toolkits._main.run("text_topicmodel_train", opts)
    m = TopicModel(response['model'])

    return m
    def get_topics(self, topic_ids=None, num_words=5, cdf_cutoff=1.0,
                   output_type='topic_probabilities'):

        """
        Get the words associated with a given topic. The score column is the
        probability of choosing that word given that you have chosen a
        particular topic.

        Parameters
        ----------
        topic_ids : list of int, optional
            The topics to retrieve words. Topic ids are zero-based.
            Throws an error if greater than or equal to m['num_topics'], or
            if the requested topic name is not present.

        num_words : int, optional
            The number of words to show.

        cdf_cutoff : float, optional
            Allows one to only show the most probable words whose cumulative
            probability is below this cutoff. For example if there exist
            three words where

            .. math::
               p(word_1 | topic_k) = .1

               p(word_2 | topic_k) = .2

               p(word_3 | topic_k) = .05

            then setting :math:`cdf_{cutoff}=.3` would return only
            :math:`word_1` and :math:`word_2` since
            :math:`p(word_1 | topic_k) + p(word_2 | topic_k) <= cdf_{cutoff}`

        output_type : {'topic_probabilities' | 'topic_words'}, optional
            Determine the type of desired output. See below.

        Returns
        -------
        out : SFrame
            If output_type is 'topic_probabilities', then the returned value is
            an SFrame with a column of words ranked by a column of scores for
            each topic. Otherwise, the returned value is a SArray where
            each element is a list of the most probable words for each topic.

        Examples
        --------
        Get the highest ranked words for all topics.

        >>> docs = graphlab.SArray('http://s3.amazonaws.com/GraphLab-Datasets/nips-text')
        >>> m = graphlab.topic_model.create(docs,
                                            num_iterations=50)
        >>> m.get_topics()
        +-------+----------+-----------------+
        | topic |   word   |      score      |
        +-------+----------+-----------------+
        |   0   |   cell   |  0.028974400831 |
        |   0   |  input   | 0.0259470208503 |
        |   0   |  image   | 0.0215721599763 |
        |   0   |  visual  | 0.0173635081992 |
        |   0   |  object  | 0.0172447874156 |
        |   1   | function | 0.0482834508265 |
        |   1   |  input   | 0.0456270024091 |
        |   1   |  point   | 0.0302662839454 |
        |   1   |  result  | 0.0239474934631 |
        |   1   | problem  | 0.0231750116011 |
        |  ...  |   ...    |       ...       |
        +-------+----------+-----------------+

        Get the highest ranked words for topics 0 and 1 and show 15 words per
        topic.

        >>> m.get_topics([0, 1], num_words=15)
        +-------+----------+------------------+
        | topic |   word   |      score       |
        +-------+----------+------------------+
        |   0   |   cell   |  0.028974400831  |
        |   0   |  input   | 0.0259470208503  |
        |   0   |  image   | 0.0215721599763  |
        |   0   |  visual  | 0.0173635081992  |
        |   0   |  object  | 0.0172447874156  |
        |   0   | response | 0.0139740298286  |
        |   0   |  layer   | 0.0122585145062  |
        |   0   | features | 0.0115343177265  |
        |   0   | feature  | 0.0103530459301  |
        |   0   | spatial  | 0.00823387994361 |
        |  ...  |   ...    |       ...        |
        +-------+----------+------------------+

        If one wants to instead just get the top words per topic, one may
        change the format of the output as follows.

        >>> topics = m.get_topics(output_type='topic_words')
        dtype: list
        Rows: 10
        [['cell', 'image', 'input', 'object', 'visual'],
         ['algorithm', 'data', 'learning', 'method', 'set'],
         ['function', 'input', 'point', 'problem', 'result'],
         ['model', 'output', 'pattern', 'set', 'unit'],
         ['action', 'learning', 'net', 'problem', 'system'],
         ['error', 'function', 'network', 'parameter', 'weight'],
         ['information', 'level', 'neural', 'threshold', 'weight'],
         ['control', 'field', 'model', 'network', 'neuron'],
         ['hidden', 'layer', 'system', 'training', 'vector'],
         ['component', 'distribution', 'local', 'model', 'optimal']]
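
        The ``cdf_cutoff`` option described above can be combined with these
        calls (a usage sketch, reusing the model ``m`` from this example;
        output omitted):

        >>> m.get_topics([0, 1], num_words=10, cdf_cutoff=0.5)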
        """
        _mt._get_metric_tracker().track('toolkit.text.topic_model.get_topics')

        _check_categorical_option_type('output_type', output_type,
            ['topic_probabilities', 'topic_words'])

        if topic_ids is None:
            topic_ids = list(range(self.get('num_topics')))

        assert isinstance(topic_ids, list), \
            "The provided topic_ids is not a list."

        if any([type(x) == str for x in topic_ids]):
            raise ValueError(
                "Only integer topic_ids can be used at this point in time.")
        if not all([x >= 0 and x < self['num_topics'] for x in topic_ids]):
            raise ValueError(
                "Topic id values must be non-negative and less than the "
                "number of topics used to fit the model.")

        opts = {'model': self.__proxy__,
                'topic_ids': topic_ids,
                'num_words': num_words,
                'cdf_cutoff': cdf_cutoff}
        response = _graphlab.toolkits._main.run('text_topicmodel_get_topic',
                                               opts)
        ret = _map_unity_proxy_to_object(response['top_words'])

        if output_type != 'topic_probabilities':
            sa = ret.unstack(['word','score'], 'word')['word'].dict_keys()
            ret = _SFrame({'words': sa})

        return ret
    def predict_topk(self, dataset, output_type="probability", k=3, missing_value_action='auto'):
        """
        Return top-k predictions for the ``dataset``, using the trained model.
        Predictions are returned as an SFrame with three columns: `row_id`,
        `class`, and `probability`, `margin`, or `rank`, depending on the
        ``output_type`` parameter. The input dataset must have the same columns
        that were used during training of the model.

        Parameters
        ----------
        dataset : SFrame
            A dataset that has the same columns that were used during training.
            If the target column exists in ``dataset`` it will be ignored
            while making predictions.

        output_type : {'probability', 'rank', 'margin'}, optional
            Choose the return type of the prediction:

            - `probability`: Probability associated with each label in the prediction.
            - `rank`       : Rank associated with each label in the prediction.
            - `margin`     : Margin associated with each label in the prediction.

        k : int, optional
            Number of classes to return for each input example.

        missing_value_action : str, optional
            Action to perform when missing values are encountered. Can be
            one of:

            - 'auto': Default to 'impute'
            - 'impute': Proceed with evaluation by filling in the missing
              values with the mean of the training data. Missing
              values are also imputed if an entire column of data is
              missing during evaluation.
            - 'error': Do not proceed with evaluation and terminate with
              an error message.

        Returns
        -------
        out : SFrame
            An SFrame with model predictions.

        See Also
        --------
        predict, classify, evaluate

        Examples
        --------
        >>> pred = m.predict_topk(validation_data, k=3)
        >>> pred
        +--------+-------+-------------------+
        | row_id | class |   probability     |
        +--------+-------+-------------------+
        |   0    |   4   |   0.995623886585  |
        |   0    |   9   |  0.0038311756216  |
        |   0    |   7   | 0.000301006948575 |
        |   1    |   1   |   0.928708016872  |
        |   1    |   3   |  0.0440889261663  |
        |   1    |   2   |  0.0176190119237  |
        |   2    |   3   |   0.996967732906  |
        |   2    |   2   |  0.00151345680933 |
        |   2    |   7   | 0.000637513934635 |
        |   3    |   1   |   0.998070061207  |
        |  ...   |  ...  |        ...        |
        +--------+-------+-------------------+
        [35688 rows x 3 columns]
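
        A single observation can also be passed as a dictionary (or several as
        a list of dictionaries) to use the low-latency path implemented below
        (a usage sketch; the feature names are placeholders):

        >>> m.predict_topk({'feature_1': 3.0, 'feature_2': 'a'}, k=2)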
        """
        _mt._get_metric_tracker().track('toolkit.classifier.logistic_classifier.predict_topk')
        _check_categorical_option_type('output_type', output_type,
                                       ['rank', 'margin', 'probability'])
        _check_categorical_option_type('missing_value_action', missing_value_action,
                                       ['auto', 'impute', 'error'])
        if missing_value_action == 'auto':
            missing_value_action = 'impute'

        # Low latency path
        if isinstance(dataset, list):
            return _graphlab.extensions._fast_predict_topk(self.__proxy__, dataset,
                    output_type, missing_value_action, k)
        if isinstance(dataset, dict):
            return _graphlab.extensions._fast_predict_topk(self.__proxy__, [dataset],
                    output_type, missing_value_action, k)
        # Batch path: send the full SFrame through the supervised learning
        # toolkit. (missing_value_action has already been resolved above.)
        _raise_error_if_not_sframe(dataset, "dataset")
        options = dict()
        options.update({'model': self.__proxy__,
                        'model_name': self.__name__,
                        'dataset': dataset,
                        'output_type': output_type,
                        'topk': k,
                        'missing_value_action': missing_value_action})
        target = _graphlab.toolkits._main.run(
                  'supervised_learning_predict_topk', options)
        return _map_unity_proxy_to_object(target['predicted'])
    def get_topics(self, topic_ids=None, num_words=5, cdf_cutoff=1.0,
                   output_type='topic_probabilities'):

        """
        Get the words associated with a given topic. The score column is the
        probability of choosing that word given that you have chosen a
        particular topic.

        Parameters
        ----------
        topic_ids : list of int, optional
            The topics to retrieve words. Topic ids are zero-based.
            Throws an error if greater than or equal to m['num_topics'], or
            if the requested topic name is not present.

        num_words : int, optional
            The number of words to show.

        cdf_cutoff : float, optional
            Allows one to only show the most probable words whose cumulative
            probability is below this cutoff. For example if there exist
            three words where

            .. math::
               p(word_1 | topic_k) = .1

               p(word_2 | topic_k) = .2

               p(word_3 | topic_k) = .05

            then setting :math:`cdf_{cutoff}=.3` would return only
            :math:`word_1` and :math:`word_2` since
            :math:`p(word_1 | topic_k) + p(word_2 | topic_k) <= cdf_{cutoff}`

        output_type : {'topic_probabilities' | 'topic_words'}, optional
            Determine the type of desired output. See below.

        Returns
        -------
        out : SFrame
            If output_type is 'topic_probabilities', then the returned value is
            an SFrame with a column of words ranked by a column of scores for
            each topic. Otherwise, the returned value is a SArray where
            each element is a list of the most probable words for each topic.

        Examples
        --------
        Get the highest ranked words for all topics.

        >>> docs = graphlab.SArray('https://static.turi.com/datasets/nips-text')
        >>> m = graphlab.topic_model.create(docs,
                                            num_iterations=50)
        >>> m.get_topics()
        +-------+----------+-----------------+
        | topic |   word   |      score      |
        +-------+----------+-----------------+
        |   0   |   cell   |  0.028974400831 |
        |   0   |  input   | 0.0259470208503 |
        |   0   |  image   | 0.0215721599763 |
        |   0   |  visual  | 0.0173635081992 |
        |   0   |  object  | 0.0172447874156 |
        |   1   | function | 0.0482834508265 |
        |   1   |  input   | 0.0456270024091 |
        |   1   |  point   | 0.0302662839454 |
        |   1   |  result  | 0.0239474934631 |
        |   1   | problem  | 0.0231750116011 |
        |  ...  |   ...    |       ...       |
        +-------+----------+-----------------+

        Get the highest ranked words for topics 0 and 1 and show 15 words per
        topic.

        >>> m.get_topics([0, 1], num_words=15)
        +-------+----------+------------------+
        | topic |   word   |      score       |
        +-------+----------+------------------+
        |   0   |   cell   |  0.028974400831  |
        |   0   |  input   | 0.0259470208503  |
        |   0   |  image   | 0.0215721599763  |
        |   0   |  visual  | 0.0173635081992  |
        |   0   |  object  | 0.0172447874156  |
        |   0   | response | 0.0139740298286  |
        |   0   |  layer   | 0.0122585145062  |
        |   0   | features | 0.0115343177265  |
        |   0   | feature  | 0.0103530459301  |
        |   0   | spatial  | 0.00823387994361 |
        |  ...  |   ...    |       ...        |
        +-------+----------+------------------+

        If one wants to instead just get the top words per topic, one may
        change the format of the output as follows.

        >>> topics = m.get_topics(output_type='topic_words')
        dtype: list
        Rows: 10
        [['cell', 'image', 'input', 'object', 'visual'],
         ['algorithm', 'data', 'learning', 'method', 'set'],
         ['function', 'input', 'point', 'problem', 'result'],
         ['model', 'output', 'pattern', 'set', 'unit'],
         ['action', 'learning', 'net', 'problem', 'system'],
         ['error', 'function', 'network', 'parameter', 'weight'],
         ['information', 'level', 'neural', 'threshold', 'weight'],
         ['control', 'field', 'model', 'network', 'neuron'],
         ['hidden', 'layer', 'system', 'training', 'vector'],
         ['component', 'distribution', 'local', 'model', 'optimal']]
        """
        _mt._get_metric_tracker().track('toolkit.text.topic_model.get_topics')

        _check_categorical_option_type('output_type', output_type,
            ['topic_probabilities', 'topic_words'])

        if topic_ids is None:
            topic_ids = list(range(self.get('num_topics')))

        assert isinstance(topic_ids, list), \
            "The provided topic_ids is not a list."

        if any([type(x) == str for x in topic_ids]):
            raise ValueError("Only integer topic_ids can be used at this point in time.")
        if not all([x >= 0 and x < self['num_topics'] for x in topic_ids]):
            raise ValueError("Topic id values must be non-negative and less than the " + \
                "number of topics used to fit the model.")

        opts = {'model': self.__proxy__,
                'topic_ids': topic_ids,
                'num_words': num_words,
                'cdf_cutoff': cdf_cutoff}
        response = _graphlab.toolkits._main.run('text_topicmodel_get_topic',
                                               opts)
        ret = _map_unity_proxy_to_object(response['top_words'])

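        # For 'topic_words' output, gather each topic's (word, score) pairs,
        # sort the words by descending score, and return one ranked list per
        # topic.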
        def sort_wordlist_by_prob(z):
            words = sorted(z.items(), key=_operator.itemgetter(1), reverse=True)
            return [word for (word, prob) in words]

        if output_type != 'topic_probabilities':
            ret = ret.groupby('topic',
                    {'word': _graphlab.aggregate.CONCAT('word', 'score')})
            words = ret.sort('topic')['word'].apply(sort_wordlist_by_prob)
            ret = _SFrame({'words': words})

        return ret
    def predict(self,
                dataset,
                output_type='class',
                missing_value_action='auto'):
        """
        Return predictions for ``dataset``, using the trained support vector
        machine (SVM) model. Predictions can be generated as class labels (0 or
        1), or margins (i.e. the distance of the observations from the
        hyperplane separating the classes). By default, the predict method
        returns class labels.

        For each new example in ``dataset``, the margin---also known as the
        linear predictor---is the inner product of the example and the model
        coefficients plus the intercept term. Predicted classes are obtained by
        thresholding the margins at 0.

        Parameters
        ----------
        dataset : SFrame | dict
            Dataset of new observations. Must include columns with the same
            names as the features used for model training, but does not require
            a target column. Additional columns are ignored.

        output_type : {'margin', 'class'}, optional
            Form of the predictions which are one of:

            - 'margin': Distance of the observations from the hyperplane
              separating the classes.
            - 'class': Class prediction.

        missing_value_action : str, optional
            Action to perform when missing values are encountered. This can be
            one of:

            - 'auto': Default to 'impute'
            - 'impute': Proceed with evaluation by filling in the missing
              values with the mean of the training data. Missing
              values are also imputed if an entire column of data is
              missing during evaluation.
            - 'error' : Do not proceed with prediction and terminate with
              an error message.

        Returns
        -------
        out : SArray
            An SArray with model predictions.

        See Also
        ----------
        create, evaluate, classify

        Examples
        ----------
        >>> data =  graphlab.SFrame('https://static.turi.com/datasets/regression/houses.csv')

        >>> data['is_expensive'] = data['price'] > 30000
        >>> model = graphlab.svm_classifier.create(data,
                                  target='is_expensive',
                                  features=['bath', 'bedroom', 'size'])

        >>> class_predictions = model.predict(data)
        >>> margin_predictions = model.predict(data, output_type='margin')

        """

        _mt._get_metric_tracker().track(
            'toolkit.classifier.svm_classifier.predict')
        _check_categorical_option_type('output_type', output_type,
                                       ['class', 'margin'])
        return super(_Classifier,
                     self).predict(dataset,
                                   output_type=output_type,
                                   missing_value_action=missing_value_action)