def predict(self, dataset):
        """
        Use the trained :class:`~graphlab.vowpal_wabbit.VowpalWabbitModel` to make
        predictions about the target column that was provided during
        :func:`~graphlab.vowpal_wabbit.create`.

        Parameters
        ----------
        dataset : SFrame
            A dataset that has the same columns that were used during training.
            If the target column exists in ``dataset``, it is ignored when
            making predictions.

        Returns
        -------
        out : SArray
            Predicted target value for each example (i.e. row) in the dataset.

        See Also
        --------
        evaluate
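
        Examples
        --------
        Assuming ``m`` is a model trained with
        :func:`~graphlab.vowpal_wabbit.create` and ``data`` is an SFrame
        containing the same columns used during training:

        >>> pred = m.predict(data)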
        """
        _mt._get_metric_tracker().track('toolkit.vowpal_wabbit.predict')

        opts = {'model': self.__proxy__,
                'data': dataset}
        response = _graphlab.toolkits._main.run("vw_predict", opts)

        # Convert predictions to an SArray
        return _SArray(None, _proxy=response['predictions'])
    def predict(self, dataset, output_type='assignment', num_burnin=None):
        """
        Use the model to predict topics for each document. The provided
        `dataset` should be an SArray where each element is a dict
        representing a single document in bag-of-words format: the keys
        are words and the values are their corresponding counts. If `dataset`
        is an SFrame, it must contain a single column of dict type.

        The current implementation infers topics for each document using the
        topic estimates learned when the model was created. This is done via
        Gibbs sampling.

        Parameters
        ----------
        dataset : SArray, SFrame of type dict
            A set of documents to use for making predictions.

        output_type : str, optional
            The type of output desired. This can be either

            - assignment: the returned values are integers in [0, num_topics)
            - probability: each returned prediction is a vector of length
              num_topics, where element k represents the probability that
              the document belongs to topic k.

        num_burnin : int, optional
            The number of iterations of Gibbs sampling to perform when
            inferring the topics for documents at prediction time.
            If provided this will override the burnin value set during
            training.

        Returns
        -------
        out : SArray
            Predictions for each document, formatted according to
            ``output_type``.

        See Also
        --------
        evaluate

        Examples
        --------
        Make predictions about which topic each document belongs to.

        >>> docs = graphlab.SArray('https://static.turi.com/datasets/nips-text')
        >>> m = graphlab.topic_model.create(docs)
        >>> pred = m.predict(docs)

        If one is interested in the probability of each topic:

        >>> pred = m.predict(docs, output_type='probability')
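
        To run more Gibbs sampling iterations when inferring topics at
        prediction time:

        >>> pred = m.predict(docs, num_burnin=20)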

        Notes
        -----
        For each unique word w in a document d, we sample an assignment to
        topic k with probability proportional to

        .. math::
            p(z_{dw} = k) \propto (n_{d,k} + \\alpha) * \Phi_{w,k}

        where

        - :math:`n_{d,k}` is the number of other words in document :math:`d`
          that have been assigned to topic :math:`k`,
        - :math:`\Phi_{w,k}` is the probability under the model of choosing
          word :math:`w` given that the word is of topic :math:`k`. This is
          the matrix returned by calling `m['topics']`.

        This represents a collapsed Gibbs sampler for the document assignments
        while keeping the topics learned during training fixed.
        This process is done in parallel across all documents, ``num_burnin``
        times per document.
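
        As a rough sketch (not the toolkit's implementation), the sampling
        distribution for a single word can be computed as follows, where
        ``n_dk``, ``phi_wk``, and ``alpha`` are hypothetical stand-ins for
        the quantities above::

            import numpy as np
            num_topics, alpha = 5, 0.1
            n_dk = np.array([3., 0., 1., 0., 2.])      # topic counts in document d
            phi_wk = np.ones(num_topics) / num_topics  # p(word | topic k), illustrative
            p = (n_dk + alpha) * phi_wk                # unnormalized p(z_dw = k)
            p = p / p.sum()                            # normalize
            k = np.random.choice(num_topics, p=p)      # sampled topic assignment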

        """
        _mt._get_metric_tracker().track('toolkit.text.topic_model.predict')

        dataset = _check_input(dataset)

        if num_burnin is None:
            num_burnin = self.get('num_burnin')

        opts = {'model': self.__proxy__,
                'data': dataset,
                'num_burnin': num_burnin}
        response = _graphlab.toolkits._main.run("text_topicmodel_predict", opts)
        preds = _SArray(None, _proxy=response['predictions'])

        # Get most likely topic if probabilities are not requested
        if output_type not in ['probability', 'probabilities', 'prob']:
            # equivalent to numpy.argmax(x)
            preds = preds.apply(lambda x: max(_izip(x, _xrange(len(x))))[1])

        return preds
def create(dataset,
           num_topics=10,
           initial_topics=None,
           alpha=None,
           beta=.1,
           num_iterations=10,
           num_burnin=5,
           associations=None,
           verbose=False,
           print_interval=10,
           validation_set=None,
           method='auto'):
    """
    Create a topic model from the given data set. A topic model assumes each
    document is a mixture of a set of topics, where for each topic some words
    are more likely than others. This method learns such a model for the
    given document collection.

    Parameters
    ----------
    dataset : SArray of type dict or SFrame with a single column of type dict
        A bag of words representation of a document corpus.
        Each element is a dictionary representing a single document, where
        the keys are words and the values are the number of times that word
        occurs in that document.

    num_topics : int, optional
        The number of topics to learn.

    initial_topics : SFrame, optional
        An SFrame with a column of unique words representing the vocabulary
        and a column of dense vectors representing
        probability of that word given each topic. When provided,
        these values are used to initialize the algorithm.

    alpha : float, optional
        Hyperparameter that controls the diversity of topics in a document.
        Smaller values encourage fewer topics per document.
        Provided value must be positive. Default value is 50/num_topics.

    beta : float, optional
        Hyperparameter that controls the diversity of words in a topic.
        Smaller values encourage fewer words per topic. Provided value
        must be positive.

    num_iterations : int, optional
        The number of iterations to perform.

    num_burnin : int, optional
        The number of iterations to perform when inferring the topics for
        documents at prediction time.

    verbose : bool, optional
        When True, print most probable words for each topic while printing
        progress.

    print_interval : int, optional
        The number of iterations to wait between progress reports.

    associations : SFrame, optional
        An SFrame with two columns named "word" and "topic" containing words
        and the topic id that the word should be associated with. These words
        are not considered during learning.

    validation_set : SArray of type dict or SFrame with a single column
        A bag of words representation of a document corpus, similar to the
        format required for `dataset`. This will be used to monitor model
        performance during training. Each document in the provided validation
        set is randomly split: the first portion is used to estimate which topic
        each document belongs to, and the second portion is used to estimate
        the model's performance at predicting the unseen words in the test data.

    method : {'cgs', 'alias'}, optional
        The algorithm used for learning the model.

        - *cgs:* Collapsed Gibbs sampling
        - *alias:* AliasLDA method.

    Returns
    -------
    out : TopicModel
        A fitted topic model. This can be used with
        :py:func:`~TopicModel.get_topics()` and
        :py:func:`~TopicModel.predict()`. While fitting is in progress, several
        metrics are shown, including:

        +------------------+---------------------------------------------------+
        |      Field       | Description                                       |
        +==================+===================================================+
        | Elapsed Time     | The number of elapsed seconds.                    |
        +------------------+---------------------------------------------------+
        | Tokens/second    | The number of unique words processed per second.  |
        +------------------+---------------------------------------------------+
        | Est. Perplexity  | An estimate of the model's ability to model the   |
        |                  | training data. See the documentation on evaluate. |
        +------------------+---------------------------------------------------+

    See Also
    --------
    TopicModel, TopicModel.get_topics, TopicModel.predict,
    graphlab.SArray.dict_trim_by_keys, TopicModel.evaluate

    References
    ----------
    - `Wikipedia - Latent Dirichlet allocation
      <http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation>`_

    - Alias method: Li, A. et al. (2014) `Reducing the Sampling Complexity of
      Topic Models. <http://www.sravi.org/pubs/fastlda-kdd2014.pdf>`_.
      KDD 2014.

    Examples
    --------
    The following example includes an SArray of documents, where
    each element represents a document in "bag of words" representation
    -- a dictionary with word keys and whose values are the number of times
    that word occurred in the document:

    >>> docs = graphlab.SArray('https://static.turi.com/datasets/nytimes')

    Once in this form, it is straightforward to learn a topic model.

    >>> m = graphlab.topic_model.create(docs)

    It is also easy to create a new topic model from an old one -- whether
    it was created using GraphLab Create or another package.

    >>> m2 = graphlab.topic_model.create(docs, initial_topics=m['topics'])

    To manually fix several words to always be assigned to a topic, use
    the `associations` argument. The following will ensure that topic 0
    has the most probability for each of the provided words:

    >>> from graphlab import SFrame
    >>> associations = SFrame({'word':['hurricane', 'wind', 'storm'],
                               'topic': [0, 0, 0]})
    >>> m = graphlab.topic_model.create(docs,
                                        associations=associations)

    More advanced usage allows you to control aspects of the model and the
    learning method.

    >>> import graphlab as gl
    >>> m = gl.topic_model.create(docs,
                                  num_topics=20,       # number of topics
                                  num_iterations=10,   # algorithm parameters
                                  alpha=.01, beta=.1)  # hyperparameters

    To evaluate the model's ability to generalize, we can create a train/test
    split where a portion of the words in each document are held out from
    training.

    >>> train, test = gl.text_analytics.random_split(docs, .8)
    >>> m = gl.topic_model.create(train)
    >>> results = m.evaluate(test)
    >>> print results['perplexity']
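
    To monitor generalization while training, provide a validation set of
    documents in the same bag-of-words format (here, the held-out ``test``
    documents from the split above):

    >>> m = gl.topic_model.create(train, validation_set=test)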

    """
    _mt._get_metric_tracker().track('toolkit.text.topic_model.create')

    dataset = _check_input(dataset)

    _check_categorical_option_type("method", method, ['auto', 'cgs', 'alias'])
    if method == 'cgs' or method == 'auto':
        model_name = 'cgs_topic_model'
    else:
        model_name = 'alias_topic_model'

    # If associations are provided, check they are in the proper format
    if associations is None:
        associations = _graphlab.SFrame({'word': [], 'topic': []})
    if isinstance(associations, _graphlab.SFrame) and \
       associations.num_rows() > 0:
        assert set(associations.column_names()) == set(['word', 'topic']), \
            "Provided associations must be an SFrame containing a word " \
            "column and a topic column."
        assert associations['word'].dtype() == str, \
            "Words must be strings."
        assert associations['topic'].dtype() == int, \
            "Topic ids must be of int type."
    if alpha is None:
        alpha = float(50) / num_topics

    if validation_set is not None:
        _check_input(validation_set)  # Must be a single column
        if isinstance(validation_set, _graphlab.SFrame):
            column_name = validation_set.column_names()[0]
            validation_set = validation_set[column_name]
        (validation_train, validation_test) = _random_split(validation_set)
    else:
        validation_train = _SArray()
        validation_test = _SArray()

    opts = {'model_name': model_name,
            'data': dataset,
            'verbose': verbose,
            'num_topics': num_topics,
            'num_iterations': num_iterations,
            'print_interval': print_interval,
            'alpha': alpha,
            'beta': beta,
            'num_burnin': num_burnin,
            'associations': associations}

    # Initialize the model with basic parameters
    response = _graphlab.toolkits._main.run("text_topicmodel_init", opts)
    m = TopicModel(response['model'])

    # If initial_topics provided, load it into the model
    if isinstance(initial_topics, _graphlab.SFrame):
        assert set(['vocabulary', 'topic_probabilities']) == \
               set(initial_topics.column_names()), \
            "The provided initial_topics does not have the proper format, " \
            "e.g. wrong column names."
        observed_topics = initial_topics['topic_probabilities'].apply(lambda x: len(x))
        assert all(observed_topics == num_topics), \
            "The probability vectors in initial_topics must have length num_topics."

        # Rough estimate of the total number of words in the corpus,
        # assuming on the order of 1000 words per document
        weight = dataset.size() * 1000

        opts = {'model': m.__proxy__,
                'topics': initial_topics['topic_probabilities'],
                'vocabulary': initial_topics['vocabulary'],
                'weight': weight}
        response = _graphlab.toolkits._main.run("text_topicmodel_set_topics", opts)
        m = TopicModel(response['model'])

    # Train the model on the given data set and retrieve predictions
    opts = {'model': m.__proxy__,
            'data': dataset,
            'verbose': verbose,
            'validation_train': validation_train,
            'validation_test': validation_test}

    response = _graphlab.toolkits._main.run("text_topicmodel_train", opts)
    m = TopicModel(response['model'])

    return m
    def get(self, field):
        """
        Return the value of a given field. The list of all queryable fields is
        detailed below, and can be obtained with the
        :py:func:`~TopicModel.list_fields` method.

        +-----------------------+----------------------------------------------+
        |      Field            | Description                                  |
        +=======================+==============================================+
        | topics                | An SFrame containing a column with the unique|
        |                       | words observed during training, and a column |
        |                       | of arrays containing the probability values  |
        |                       | for each word given each of the topics.      |
        +-----------------------+----------------------------------------------+
        | vocabulary            | An SArray containing the words used. This is |
        |                       | the same as the vocabulary column in the     |
        |                       | topics field above.                          |
        +-----------------------+----------------------------------------------+

        Parameters
        ----------
        field : string
            Name of the field to be retrieved.

        Returns
        -------
        out
            Value of the requested field.

        See Also
        --------
        list_fields

        Examples
        --------
        >>> docs = graphlab.SArray('https://static.turi.com/datasets/nips-text')
        >>> m = graphlab.topic_model.create(docs)
        >>> m.get('topics')
        +--------------------------------+------------+
        |      topic_probabilities       | vocabulary |
        +--------------------------------+------------+
        | array('d', [0.000514752462 ... |  limited   |
        | array('d', [6.120718939647 ... |  consider  |
        | array('d', [0.000337251613 ... | represent  |
        | array('d', [0.000104664293 ... |    lack    |
        | array('d', [6.120718939647 ... | desirable  |
        | array('d', [6.120718939647 ... |   focus    |
        | array('d', [6.120718939647 ... | generaliza |
        | array('d', [6.120718939647 ... | generalize |
        | array('d', [6.120718939647 ... |    row     |
        | array('d', [6.120718939647 ... |   depend   |
        |              ...               |    ...     |
        +--------------------------------+------------+

        You may also use ``m['topics']``.
        """

        opts = {'model': self.__proxy__, 'field': field}
        response = _graphlab.toolkits._main.run("text_topicmodel_get_value", opts)
        if field == 'vocabulary':
            return _SArray(None, _proxy=response['value'])
        elif field == 'topics':
            return _SFrame(None, _proxy=response['value'])
        return response['value']
    def predict(self, dataset, output_type='assignment'):
        """
        Use the model to predict topics for each document. The provided
        `dataset` should be an SArray where each element is a dict
        representing a single document in bag-of-words format: the keys
        are words and the values are their corresponding counts. If `dataset`
        is an SFrame, it must contain a single column of dict type.

        The current implementation infers topics for each document using the
        topic estimates learned when the model was created. This is done via
        Gibbs sampling.

        Parameters
        ----------
        dataset : SArray, SFrame of type dict
            A set of documents to use for making predictions.

        output_type : str, optional
            The type of output desired. This can be either

            - assignment: the returned values are integers in [0, num_topics)
            - probability: each returned prediction is a vector of length
              num_topics, where element k represents the probability that
              the document belongs to topic k.

        Returns
        -------
        out : SArray
            Predictions for each document, formatted according to
            ``output_type``.

        See Also
        --------
        evaluate

        Examples
        --------
        Make predictions about which topic each document belongs to.

        >>> docs = graphlab.SArray('http://s3.amazonaws.com/GraphLab-Datasets/nips-text')
        >>> m = graphlab.topic_model.create(docs)
        >>> pred = m.predict(docs)

        If one is interested in the probability of each topic:

        >>> pred = m.predict(docs, output_type='probability')

        Notes
        -----
        For each unique word w in a document d, we sample an assignment to
        topic k with probability proportional to

        .. math::
            p(z_{dw} = k) \propto (n_{d,k} + \\alpha) * \Phi_{w,k}

        where

        - :math:`n_{d,k}` is the number of other words in document :math:`d`
          that have been assigned to topic :math:`k`,
        - :math:`\Phi_{w,k}` is the probability under the model of choosing
          word :math:`w` given that the word is of topic :math:`k`. This is
          the matrix returned by calling `m['topics']`.

        This represents a collapsed Gibbs sampler for the document assignments
        while keeping the topics learned during training fixed.
        This process is done in parallel across all documents, five times per
        document.

        """
        _mt._get_metric_tracker().track('toolkit.text.topic_model.predict')

        dataset = _check_input(dataset)

        opts = {'model': self.__proxy__,
                'data': dataset}
        response = _graphlab.toolkits._main.run("text_topicmodel_predict", opts)
        preds = _SArray(None, _proxy=response['predictions'])

        # Get most likely topic if probabilities are not requested
        if output_type not in ['probability', 'probabilities', 'prob']:
            # equivalent to numpy.argmax(x)
            preds = preds.apply(lambda x: max(_izip(x, xrange(len(x))))[1])

        return preds
    def get(self, field):
        """
        Return the value of a given field. The list of all queryable fields is
        detailed below, and can be obtained with the
        :py:func:`~TopicModel.list_fields` method.

        +-----------------------+----------------------------------------------+
        |      Field            | Description                                  |
        +=======================+==============================================+
        | topics                | An SFrame containing a column with the unique|
        |                       | words observed during training, and a column |
        |                       | of arrays containing the probability values  |
        |                       | for each word given each of the topics.      |
        +-----------------------+----------------------------------------------+
        | vocabulary            | An SArray containing the words used. This is |
        |                       | the same as the vocabulary column in the     |
        |                       | topics field above.                          |
        +-----------------------+----------------------------------------------+

        Parameters
        ----------
        field : string
            Name of the field to be retrieved.

        Returns
        -------
        out
            Value of the requested field.

        See Also
        --------
        list_fields

        Examples
        --------
        >>> docs = graphlab.SArray('http://s3.amazonaws.com/GraphLab-Datasets/nips-text')
        >>> m = graphlab.topic_model.create(docs)
        >>> m.get('topics')
        +--------------------------------+------------+
        |      topic_probabilities       | vocabulary |
        +--------------------------------+------------+
        | array('d', [0.000514752462 ... |  limited   |
        | array('d', [6.120718939647 ... |  consider  |
        | array('d', [0.000337251613 ... | represent  |
        | array('d', [0.000104664293 ... |    lack    |
        | array('d', [6.120718939647 ... | desirable  |
        | array('d', [6.120718939647 ... |   focus    |
        | array('d', [6.120718939647 ... | generaliza |
        | array('d', [6.120718939647 ... | generalize |
        | array('d', [6.120718939647 ... |    row     |
        | array('d', [6.120718939647 ... |   depend   |
        |              ...               |    ...     |
        +--------------------------------+------------+

        You may also use ``m['topics']``.
        """

        _mt._get_metric_tracker().track('toolkit.text.topic_model.get')
        opts = {'model': self.__proxy__, 'field': field}
        response = _graphlab.toolkits._main.run("text_topicmodel_get_value", opts)
        if field == 'vocabulary':
            return _SArray(None, _proxy=response['value'])
        elif field == 'topics':
            return _SFrame(None, _proxy=response['value'])
        return response['value']
def create(dataset,
           num_topics=10,
           initial_topics=None,
           alpha=None,
           beta=.1,
           num_iterations=10,
           associations=None,
           verbose=False,
           print_interval=10,
           validation_set=None,
           method='auto'):
    """
    Create a topic model from the given data set. A topic model assumes each
    document is a mixture of a set of topics, where for each topic some words
    are more likely than others. This method learns such a model for the
    given document collection.

    Parameters
    ----------
    dataset : SArray of type dict or SFrame with a single column of type dict
        A bag of words representation of a document corpus.
        Each element is a dictionary representing a single document, where
        the keys are words and the values are the number of times that word
        occurs in that document.

    num_topics : int, optional
        The number of topics to learn.

    initial_topics : SFrame, optional
        An SFrame with a column of unique words representing the vocabulary
        and a column of dense vectors representing
        probability of that word given each topic. When provided,
        these values are used to initialize the algorithm.

    num_iterations : int, optional
        The number of iterations to perform.

    alpha : float, optional
        Hyperparameter that controls the diversity of topics in a document.
        Smaller values encourage fewer topics per document.
        Provided value must be positive. Default value is 50/num_topics.

    beta : float, optional
        Hyperparameter that controls the diversity of words in a topic.
        Smaller values encourage fewer words per topic. Provided value
        must be positive.

    verbose : bool, optional
        When True, print most probable words for each topic while printing
        progress.

    print_interval : int, optional
        The number of iterations to wait between progress reports.

    associations : SFrame, optional
        An SFrame with two columns named "word" and "topic" containing words
        and the topic id that the word should be associated with. These words
        are not considered during learning.

    validation_set : SArray of type dict or SFrame with a single column
        A bag of words representation of a document corpus, similar to the
        format required for `dataset`. This will be used to monitor model
        performance during training. Each document in the provided validation
        set is randomly split: the first portion is used to estimate which topic
        each document belongs to, and the second portion is used to estimate
        the model's performance at predicting the unseen words in the test data.

    method : {'cgs', 'alias'}, optional
        The algorithm used for learning the model.

        - *cgs:* Collapsed Gibbs sampling
        - *alias:* AliasLDA method.

    Returns
    -------
    out : TopicModel
        A fitted topic model. This can be used with
        :py:func:`~TopicModel.get_topics()` and
        :py:func:`~TopicModel.predict()`. While fitting is in progress, several
        metrics are shown, including:

        +------------------+---------------------------------------------------+
        |      Field       | Description                                       |
        +==================+===================================================+
        | Elapsed Time     | The number of elapsed seconds.                    |
        +------------------+---------------------------------------------------+
        | Tokens/second    | The number of unique words processed per second.  |
        +------------------+---------------------------------------------------+
        | Est. Perplexity  | An estimate of the model's ability to model the   |
        |                  | training data. See the documentation on evaluate. |
        +------------------+---------------------------------------------------+

    See Also
    --------
    TopicModel, TopicModel.get_topics, TopicModel.predict,
    graphlab.SArray.dict_trim_by_keys

    References
    ----------
    - `Wikipedia - Latent Dirichlet allocation
      <http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation>`_

    - Alias method: Li, A. et al. (2014) `Reducing the Sampling Complexity of
      Topic Models. <http://www.sravi.org/pubs/fastlda-kdd2014.pdf>`_.
      KDD 2014.

    Examples
    --------
    The following example includes an SArray of documents, where
    each element represents a document in "bag of words" representation
    -- a dictionary with word keys and whose values are the number of times
    that word occurred in the document:

    >>> docs = graphlab.SArray('http://s3.amazonaws.com/GraphLab-Datasets/nytimes')

    Once in this form, it is straightforward to learn a topic model.

    >>> m = graphlab.topic_model.create(docs)

    It is also easy to create a new topic model from an old one -- whether
    it was created using GraphLab Create or another package.

    >>> m2 = graphlab.topic_model.create(docs, initial_topics=m['topics'])

    To manually fix several words to always be assigned to a topic, use
    the `associations` argument. The following will ensure that topic 0
    has the most probability for each of the provided words:

    >>> from graphlab import SFrame
    >>> associations = SFrame({'word':['hurricane', 'wind', 'storm'],
                               'topic': [0, 0, 0]})
    >>> m = graphlab.topic_model.create(docs,
                                        associations=associations)

    More advanced usage allows you to control aspects of the model and the
    learning method.

    >>> import graphlab as gl
    >>> m = gl.topic_model.create(docs,
                                  num_topics=20,       # number of topics
                                  num_iterations=10,   # algorithm parameters
                                  alpha=.01, beta=.1)  # hyperparameters
    """
    _mt._get_metric_tracker().track('toolkit.text.topic_model.create')

    dataset = _check_input(dataset)

    _check_categorical_option_type("method", method, ['auto', 'cgs', 'alias'])
    if method == 'cgs' or method == 'auto':
        model_name = 'cgs_topic_model'
    else:
        model_name = 'alias_topic_model'

    # If associations are provided, check they are in the proper format
    if associations is None:
        associations = _graphlab.SFrame({'word': [], 'topic': []})
    if isinstance(associations, _graphlab.SFrame) and \
       associations.num_rows() > 0:
        assert set(associations.column_names()) == set(['word', 'topic']), \
            "Provided associations must be an SFrame containing a word " \
            "column and a topic column."
        assert associations['word'].dtype() == str, \
            "Words must be strings."
        assert associations['topic'].dtype() == int, \
            "Topic ids must be of int type."
    if alpha is None:
        alpha = float(50) / num_topics

    if validation_set is not None:
        _check_input(validation_set)  # Must be a single column
        if isinstance(validation_set, _graphlab.SFrame):
            column_name = validation_set.column_names()[0]
            validation_set = validation_set[column_name]
        (validation_train, validation_test) = _random_split(validation_set)
    else:
        validation_train = _SArray()
        validation_test = _SArray()

    opts = {'model_name': model_name,
            'data': dataset,
            'verbose': verbose,
            'num_topics': num_topics,
            'num_iterations': num_iterations,
            'print_interval': print_interval,
            'alpha': alpha,
            'beta': beta,
            'associations': associations}

    # Initialize the model with basic parameters
    response = _graphlab.toolkits._main.run("text_topicmodel_init", opts)
    m = TopicModel(response['model'])

    # If initial_topics provided, load it into the model
    if isinstance(initial_topics, _graphlab.SFrame):
        assert set(['vocabulary', 'topic_probabilities']) == \
               set(initial_topics.column_names()), \
            "The provided initial_topics does not have the proper format, " \
            "e.g. wrong column names."
        observed_topics = initial_topics['topic_probabilities'].apply(lambda x: len(x))
        assert all(observed_topics == num_topics), \
            "The probability vectors in initial_topics must have length num_topics."

        # Rough estimate of the total number of words in the corpus,
        # assuming on the order of 1000 words per document
        weight = dataset.size() * 1000

        opts = {'model': m.__proxy__,
                'topics': initial_topics['topic_probabilities'],
                'vocabulary': initial_topics['vocabulary'],
                'weight': weight}
        response = _graphlab.toolkits._main.run("text_topicmodel_set_topics", opts)
        m = TopicModel(response['model'])

    # Train the model on the given data set and retrieve predictions
    opts = {'model': m.__proxy__,
            'data': dataset,
            'verbose': verbose,
            'validation_train': validation_train,
            'validation_test': validation_test}

    response = _graphlab.toolkits._main.run("text_topicmodel_train", opts)
    m = TopicModel(response['model'])

    return m
def create(dataset, target,
           loss_function='squared',
           quadratic=[],
           l1_penalty=0.0, l2_penalty=0.0,
           bigram=False,
           step_size=0.5, num_bits=18, verbose=False,
           max_iterations=1,
           command_line_args=''):
    """
    create(dataset, target, loss_function='squared', quadratic=list(), 
    l1_penalty=0.0, l2_penalty=0.0, bigram=False, step_size=0.5, num_bits=18, 
    verbose=False, max_iterations=1, command_line_args='')

    Learn a large linear model using Vowpal Wabbit.

    Parameters
    ----------
    dataset : SFrame
        A data set. Due to the way Vowpal Wabbit creates features from each
        entry, ':' and '|' characters are not allowed in any columns containing
        strings. Each row of the dataset is translated into a string and passed
        to Vowpal Wabbit. Currently, the upper bound on the size of the string
        is 1MB. Based on the type of the SArray column, the values are passed in
        the following ways.

        - *integer* or *float*: the value is passed directly to VW.

        - *str*: the name of the column is used as the namespace, followed by
          the entire string.

        - *dict*: the name of the column is used as the namespace, and each
          key-value pair is a feature. The keys of the dictionary must be string
          or numeric and the values must be numeric (integer or float).

        - *array*: the name of the column is used as the namespace, the index of
          the array element is used as the name of the feature, and only numeric
          elements in the array are passed to VW.

        - *list (recursive type)*: the name of the column is used as the
          namespace, the index of the list element is used as the name of the
          feature, and currently only numeric elements (integer or float) are
          passed to VW.

        See the `VW input format guidelines
        <https://github.com/JohnLangford/vowpal_wabbit/wiki/Input-format>`_ for
        more details.
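
        For illustration only (the exact rendering is internal to the
        toolkit), a row with target value 4.0 and a dict column named
        ``review`` holding ``{'good': 2, 'fun': 1}`` would be translated
        into a VW input line roughly of the form::

            4.0 |review good:2 fun:1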

    target : string
        The name of the column in ``dataset`` that is the prediction target.
        This column must have a numeric type.

    loss_function : {'squared', 'hinge', 'logistic', 'quantile'}, optional
        This defines the `loss function
        <http://en.wikipedia.org/wiki/Loss_function>`_ used during optimization.
        Typical choices:

        - *real-valued target*: `squared error loss
          <http://en.wikipedia.org/wiki/Mean_squared_error>`_.

        - *binary target*: `logistic
          <http://en.wikipedia.org/wiki/Logistic_regression>`_. The target
          column must only contain -1 or 1.

        The `hinge loss <http://en.wikipedia.org/wiki/Hinge_loss>`_ is also used
        for classification, while `quantile loss
        <http://en.wikipedia.org/wiki/Quantile_regression>`_ can be good when
        one aims to predict quantities other than the mean.

    quadratic : list of pairs, optional
        This will add `interaction terms
        <http://en.wikipedia.org/wiki/Interaction_(statistics)>`_ to a linear
        model between a pair of columns. Quadratic terms add a parameter in the
        model for the product of two features, i.e. if we include an interaction
        between :math:`x_1` and :math:`x_2`, we can add a parameter :math:`b_3`.

            .. math:: y_i =  a + b_1 * x_{i1} + b_2 * x_{i2} + b_3 * x_{i1} * x_{i2}

        Multiple quadratic terms can be added by including multiple pairs, e.g.
        ``quadratic = [('a', 'b'), ('b', 'c')]`` would add interaction terms
        between columns named 'a' and 'b' as well as terms for interactions
        between 'b' and 'c'. Including ':' as one of the items in the pairs is a
        shortcut for adding quadratic terms for all pairs of features. Due to
        Vowpal Wabbit's implementation, quadratic terms are determined by the
        first letter of the column name.
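
        For example, ``quadratic=[('user', 'movie')]`` is passed to VW as
        ``-q um``, so two columns whose names begin with the same letter
        cannot be distinguished in quadratic terms.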

    l1_penalty : float, optional
        This controls how strongly parameters are encouraged to be exactly
        zero.

    l2_penalty : float, optional
        This controls how strongly parameters are encouraged to stay near
        zero. Specifically, it adds a penalty of :math:`.5 * \lambda * |w|_2^2`
        on the weight vector :math:`w`, where :math:`\lambda` is the provided
        regularization value.

    bigram : bool, optional
        Add bigram features. For columns containing the text "my name is bob"
        this will add bigram features for "my name", "name is", "is bob".

    step_size : float, optional
        Set the learning rate for online learning.

    verbose : bool, optional
        Print the first 10 rows as they are seen by Vowpal Wabbit.
        This is useful for debugging.

    max_iterations : int, optional
        Number of passes to take over the data set.

    command_line_args : string, optional
        Additional arguments to pass to Vowpal Wabbit, just as one would use
        when using VW via the command line.

    Returns
    -------
    out : VowpalWabbitModel
        A model that can be used for predicting new cases.

    See Also
    --------
    VowpalWabbitModel.predict, VowpalWabbitModel.evaluate

    Notes
    -----
    - Other desired command line arguments can be provided manually through the
      command_line_args keyword argument. See the `VW documentation <http://gith
      ub.com/JohnLangford/vowpal_wabbit/wiki/Command-line-arguments>`_ for more
      details.

    - Several Vowpal Wabbit features are not yet supported, including importance
      weighted learning.

    Examples
    --------
    >>> data = graphlab.SFrame('http://s3.amazonaws.com/GraphLab-Datasets/regression/houses.csv')
    >>> data['price'] = data['price'].apply(lambda x: 1 if x > 30000 else -1)
    >>> m = graphlab.vowpal_wabbit.create(data, 'price')
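
    Since the target column now takes values in {-1, 1}, a logistic loss
    can be used instead:

    >>> m = graphlab.vowpal_wabbit.create(data, 'price', loss_function='logistic')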

    To add quadratic terms between 'user' and 'movie' columns:

    >>> m = graphlab.vowpal_wabbit.create(sf, 'rating', quadratic=[('user', 'movie')])

    If a column contains text, each space-separated word is used as a
    unique feature. It is often useful to also include bigrams as
    features. This can be done easily with the ``bigram`` argument:

    >>> m = graphlab.vowpal_wabbit.create(sf, 'rating', bigram=True)
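
    Additional VW command-line options can be passed through unchanged; for
    example (assuming the installed VW build supports the ``--ngram`` flag):

    >>> m = graphlab.vowpal_wabbit.create(sf, 'rating', command_line_args='--ngram 2')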
    """

    _mt._get_metric_tracker().track('toolkit.vowpal_wabbit.create')

    if not isinstance(dataset, _SFrame):
        raise TypeError("Input 'dataset' must be an SFrame")

    # Normalize SFrame subclasses to a plain SFrame
    if type(dataset) != _SFrame:
        dataset = _SFrame(dataset)

    assert target in dataset.column_names(), \
        "The target column was not found in the dataset."

    quadratic_command = ''
    for (feature_a, feature_b) in quadratic:
        # VW uses first letter to describe namespace
        quadratic_command += ' -q ' + feature_a[0] + feature_b[0]

    opts = {'verbose': verbose,
            'target': target,
            'loss_function': loss_function,
            'quadratic': quadratic_command,
            'step_size': step_size,
            'l1_penalty': l1_penalty,
            'l2_penalty': l2_penalty,
            'num_bits' : num_bits,
            'max_iterations': max_iterations,
            'bigram': bigram,
            'extra_command_line_args': command_line_args}

    # Initialize the model with basic parameters
    response = _graphlab.toolkits._main.run("vw_init", opts)
    m = VowpalWabbitModel(response['model'])

    # Train the model on the given data set and retrieve predictions
    opts = {'model': m.__proxy__,
            'data': dataset}
    response = _graphlab.toolkits._main.run("vw_train", opts)
    m = VowpalWabbitModel(response['model'])

    yhat = _SArray(None, _proxy=response['predictions'])

    # Evaluate model
    start_time = _time.time()
    y = dataset[target]

    if loss_function == 'logistic':
        is_one_or_neg_one = y.apply(lambda x: x == 1 or x == -1)
        if not all(is_one_or_neg_one):
            raise TypeError('When using `logistic` as a loss function, the target column must contain only 1\'s and -1\'s.')
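        # Map labels from {-1, 1} to {0, 1} for the accuracy computation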
        y = y.apply(lambda x: int(x*.5 + .5))
        m = m._set('training_accuracy', _graphlab.evaluation.accuracy(y, yhat))
    else:
        m = m._set('training_rmse', _graphlab.evaluation.rmse(y, yhat))
    return m