def perplexity(test_data, predictions, topics, vocabulary):
    r"""
    Compute the perplexity of a set of test documents given a set of
    predicted topics.

    Let theta be the matrix of document-topic probabilities, where
    theta_ik = p(topic k | document i). Let Phi be the matrix of term-topic
    probabilities, where phi_jk = p(word j | topic k).

    Then for a given word w and document d we compute

    .. math::
        p(w \mid \theta[d,:], \phi[w,:]) = \sum_k \theta[d, k] \, \phi[w, k]

    We compute the log likelihood to be:

    .. math::
        l(D) = \sum_{i \in D} \sum_{j \in D_i} count_{i,j} \log p(w_{i,j} \mid \theta, \phi)

    and the perplexity to be

    .. math::
        \exp \{ - l(D) / \sum_i \sum_j count_{i,j} \}

    Parameters
    ----------
    test_data : SArray of type dict or SFrame with a single column of type dict
        Documents in bag-of-words format.

    predictions : SArray
        An SArray of vector type, where each vector contains estimates of the
        probability that this document belongs to each of the topics.
        This must have the same size as test_data; otherwise an exception
        occurs. This can be the output of
        :py:func:`~graphlab.topic_model.TopicModel.predict`, for example.

    topics : SFrame
        An SFrame containing two columns: 'vocabulary' and
        'topic_probabilities'. The value returned by m['topics'] is a valid
        input for this argument, where m is a trained
        :py:class:`~graphlab.topic_model.TopicModel`.

    vocabulary : SArray
        An SArray of words to use. All words in test_data that are not in
        this vocabulary will be ignored.

    Notes
    -----
    For more details, see equations 13-16 of [PattersonTeh2013].

    References
    ----------
    .. [PERP] `Wikipedia - perplexity
       <http://en.wikipedia.org/wiki/Perplexity>`_

    .. [PattersonTeh2013] Patterson, Teh. `"Stochastic Gradient Riemannian
       Langevin Dynamics on the Probability Simplex"
       <http://www.stats.ox.ac.uk/~teh/research/compstats/PatTeh2013a.pdf>`_
       NIPS, 2013.

    Examples
    --------
    >>> from graphlab import topic_model
    >>> train_data, test_data = graphlab.text_analytics.random_split(docs)
    >>> m = topic_model.create(train_data)
    >>> pred = m.predict(train_data)
    >>> topics = m['topics']
    >>> p = topic_model.perplexity(test_data, pred,
    ...                            topics['topic_probabilities'],
    ...                            topics['vocabulary'])
    >>> p
    1720.7  # lower values are better
    """
    _mt._get_metric_tracker().track('toolkit.text.perplexity')

    test_data = _check_input(test_data)

    assert isinstance(predictions, _SArray), \
        "Predictions must be an SArray of vector type."
    assert predictions.dtype() == _array.array, \
        "Predictions must be probabilities. Try using m.predict() with " + \
        "output_type='probability'."

    opts = {'test_data': test_data,
            'predictions': predictions,
            'topics': topics,
            'vocabulary': vocabulary}
    response = _graphlab.toolkits._main.run("text_topicmodel_get_perplexity",
                                            opts)
    return response['perplexity']
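# A minimal, self-contained sketch (not part of the toolkit API) of the
# perplexity computation described in the docstring above, using plain
# Python objects instead of SArrays. Here `docs` is a list of bag-of-words
# dicts, `theta` is a list of per-document topic-probability vectors, and
# `phi` is a dict mapping each vocabulary word to its per-topic probability
# vector.
def _perplexity_sketch(docs, theta, phi):
    import math
    log_likelihood = 0.0
    num_tokens = 0
    for doc, doc_topics in zip(docs, theta):
        for word, count in doc.items():
            # Words outside the vocabulary are ignored, as documented above.
            if word not in phi:
                continue
            # p(w | theta, phi) = sum_k theta[d, k] * phi[w, k]
            p = sum(t * f for t, f in zip(doc_topics, phi[word]))
            log_likelihood += count * math.log(p)
            num_tokens += count
    # perplexity = exp(-l(D) / total token count); lower is better.
    return math.exp(-log_likelihood / num_tokens)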
def predict(self, dataset, output_type='assignment', num_burnin=None):
    r"""
    Use the model to predict topics for each document. The provided
    `dataset` should be an SArray object where each element is a dict
    representing a single document in bag-of-words format, where keys
    are words and values are their corresponding counts. If `dataset` is
    an SFrame, it must contain a single column of dict type.

    The current implementation will make inferences about each document
    given its estimates of the topics learned when creating the model.
    This is done via Gibbs sampling.

    Parameters
    ----------
    dataset : SArray, SFrame of type dict
        A set of documents to use for making predictions.

    output_type : str, optional
        The type of output desired. This can either be

        - assignment: the returned values are integers in [0, num_topics)
        - probability: each returned prediction is a vector with length
          num_topics, where element k represents the probability that
          the document belongs to topic k.

    num_burnin : int, optional
        The number of iterations of Gibbs sampling to perform when
        inferring the topics for documents at prediction time.
        If provided, this will override the burnin value set during
        training.

    Returns
    -------
    out : SArray

    See Also
    --------
    evaluate

    Examples
    --------
    Make predictions about which topic each document belongs to.

    >>> docs = graphlab.SArray('https://static.turi.com/datasets/nips-text')
    >>> m = graphlab.topic_model.create(docs)
    >>> pred = m.predict(docs)

    If one is interested in the probability of each topic

    >>> pred = m.predict(docs, output_type='probability')

    Notes
    -----
    For each unique word w in a document d, we sample an assignment to
    topic k with probability proportional to

    .. math::
        p(z_{dw} = k) \propto (n_{d,k} + \alpha) \, \Phi_{w,k}

    where

    - :math:`n_{d,k}` is the number of other times we have assigned a word
      in document :math:`d` to topic :math:`k`,
    - :math:`\Phi_{w,k}` is the probability under the model of choosing
      word :math:`w` given that the word is of topic :math:`k`. This is the
      matrix returned by calling `m['topics']`.

    This represents a collapsed Gibbs sampler for the document assignments
    while we keep the topics learned during training fixed. This process
    is done in parallel across all documents, five times per document.
    """
    _mt._get_metric_tracker().track('toolkit.text.topic_model.predict')

    dataset = _check_input(dataset)

    if num_burnin is None:
        num_burnin = self.get('num_burnin')

    opts = {'model': self.__proxy__,
            'data': dataset,
            'num_burnin': num_burnin}
    response = _graphlab.toolkits._main.run("text_topicmodel_predict", opts)
    preds = _SArray(None, _proxy=response['predictions'])

    # Get the most likely topic if probabilities are not requested
    if output_type not in ['probability', 'probabilities', 'prob']:
        # equivalent to numpy.argmax(x)
        preds = preds.apply(lambda x: max(_izip(x, _xrange(len(x))))[1])

    return preds
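# For intuition, a minimal sketch (not the toolkit's parallel implementation)
# of the collapsed Gibbs sampling step described in the Notes above: given
# the topic counts n_dk for the other words in a document, the smoothing
# parameter alpha, and the row phi_w = Phi[w, :] for word w, draw a topic
# with probability proportional to (n_dk + alpha) * phi_wk.
def _sample_topic_sketch(n_dk, alpha, phi_w, rng):
    # Unnormalized sampling weights, one per topic.
    weights = [(n_k + alpha) * p for n_k, p in zip(n_dk, phi_w)]
    # Draw a topic index proportional to the weights (inverse CDF).
    u = rng.random() * sum(weights)
    acc = 0.0
    for k, w in enumerate(weights):
        acc += w
        if u <= acc:
            return k
    return len(weights) - 1

# Example usage:
#   import random
#   k = _sample_topic_sketch([2, 0, 1], 0.1, [0.5, 0.3, 0.2], random.Random(0))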
def evaluate(self, train_data, test_data=None, metric='perplexity'):
    """
    Estimate the model's ability to predict new data.

    Imagine you have a corpus of books. One common approach to evaluating
    topic models is to train on the first half of all of the books and see
    how well the model predicts the second half of each book. This method
    returns a metric called perplexity, which is related to the likelihood
    of observing these words under the given model. See
    :py:func:`~graphlab.topic_model.perplexity` for more details.

    The provided `train_data` and `test_data` must have the same length,
    i.e., both data sets must have the same number of documents; the model
    will use train_data to estimate which topic each document belongs to,
    and this is used to estimate the model's performance at predicting the
    unseen words in the test data. See
    :py:func:`~graphlab.topic_model.TopicModel.predict` for details on how
    these predictions are made, and see
    :py:func:`~graphlab.text_analytics.random_split` for a helper function
    that can be used for making train/test splits.

    Parameters
    ----------
    train_data : SArray or SFrame
        A set of documents to predict topics for.

    test_data : SArray or SFrame, optional
        A set of documents to evaluate performance on. By default this is
        set to be the same as train_data.

    metric : str
        The chosen metric to use for evaluating the topic model.
        Currently only 'perplexity' is supported.

    Returns
    -------
    out : dict
        The set of estimated evaluation metrics.

    See Also
    --------
    predict, graphlab.toolkits.text_analytics.random_split

    Examples
    --------
    >>> docs = graphlab.SArray('https://static.turi.com/datasets/nips-text')
    >>> train_data, test_data = graphlab.text_analytics.random_split(docs)
    >>> m = graphlab.topic_model.create(train_data)
    >>> m.evaluate(train_data, test_data)
    {'perplexity': 2467.530370396021}
    """
    _mt._get_metric_tracker().track('toolkit.text.topic_model.evaluate')

    train_data = _check_input(train_data)

    if test_data is None:
        test_data = train_data
    else:
        test_data = _check_input(test_data)

    predictions = self.predict(train_data, output_type='probability')
    topics = self.get('topics')

    ret = {}
    ret['perplexity'] = perplexity(test_data,
                                   predictions,
                                   topics['topic_probabilities'],
                                   topics['vocabulary'])
    return ret
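# A hedged sketch of the kind of per-document word split that
# graphlab.text_analytics.random_split performs before evaluation. This
# plain-Python illustration assumes each token is assigned to the "train"
# or "test" side independently, which may differ from the toolkit's actual
# implementation.
def _random_split_sketch(docs, prob=0.5, seed=0):
    import random
    rng = random.Random(seed)
    train, test = [], []
    for doc in docs:
        d_train, d_test = {}, {}
        for word, count in doc.items():
            for _ in range(count):
                side = d_train if rng.random() < prob else d_test
                side[word] = side.get(word, 0) + 1
        train.append(d_train)
        test.append(d_test)
    return train, test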
def create(dataset,
           num_topics=10,
           initial_topics=None,
           alpha=None,
           beta=.1,
           num_iterations=10,
           num_burnin=5,
           associations=None,
           verbose=False,
           print_interval=10,
           validation_set=None,
           method='auto'):
    """
    Create a topic model from the given data set. A topic model assumes
    each document is a mixture of a set of topics, where for each topic
    some words are more likely than others. This method learns such a
    model for the given document collection.

    Parameters
    ----------
    dataset : SArray of type dict or SFrame with a single column of type dict
        A bag of words representation of a document corpus.
        Each element is a dictionary representing a single document, where
        the keys are words and the values are the number of times that word
        occurs in that document.

    num_topics : int, optional
        The number of topics to learn.

    initial_topics : SFrame, optional
        An SFrame with a column of unique words representing the vocabulary
        and a column of dense vectors representing the probability of that
        word given each topic. When provided, these values are used to
        initialize the algorithm.

    alpha : float, optional
        Hyperparameter that controls the diversity of topics in a document.
        Smaller values encourage fewer topics per document. Provided value
        must be positive. Default value is 50/num_topics.

    beta : float, optional
        Hyperparameter that controls the diversity of words in a topic.
        Smaller values encourage fewer words per topic. Provided value
        must be positive.

    num_iterations : int, optional
        The number of iterations to perform.

    num_burnin : int, optional
        The number of iterations to perform when inferring the topics for
        documents at prediction time.

    associations : SFrame, optional
        An SFrame with two columns named "word" and "topic" containing words
        and the topic id that the word should be associated with. These
        words are not considered during learning.

    verbose : bool, optional
        When True, print the most probable words for each topic while
        printing progress.

    print_interval : int, optional
        The number of iterations to wait between progress reports.

    validation_set : SArray of type dict or SFrame with a single column
        A bag of words representation of a document corpus, similar to the
        format required for `dataset`. This will be used to monitor model
        performance during training. Each document in the provided
        validation set is randomly split: the first portion is used to
        estimate which topic each document belongs to, and the second
        portion is used to estimate the model's performance at predicting
        the unseen words in the test data.

    method : {'cgs', 'alias'}, optional
        The algorithm used for learning the model.

        - *cgs:* Collapsed Gibbs sampling
        - *alias:* AliasLDA method.

    Returns
    -------
    out : TopicModel
        A fitted topic model. This can be used with
        :py:func:`~TopicModel.get_topics()` and
        :py:func:`~TopicModel.predict()`.
        While fitting is in progress, several metrics are shown, including:

        +------------------+---------------------------------------------------+
        |      Field       | Description                                       |
        +==================+===================================================+
        | Elapsed Time     | The number of elapsed seconds.                    |
        +------------------+---------------------------------------------------+
        | Tokens/second    | The number of unique words processed per second.  |
        +------------------+---------------------------------------------------+
        | Est. Perplexity  | An estimate of the model's ability to model the   |
        |                  | training data. See the documentation on evaluate. |
        +------------------+---------------------------------------------------+

    See Also
    --------
    TopicModel, TopicModel.get_topics, TopicModel.predict,
    graphlab.SArray.dict_trim_by_keys, TopicModel.evaluate

    References
    ----------
    - `Wikipedia - Latent Dirichlet allocation
      <http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation>`_

    - Alias method: Li, A. et al. (2014) `Reducing the Sampling Complexity
      of Topic Models. <http://www.sravi.org/pubs/fastlda-kdd2014.pdf>`_
      KDD 2014.

    Examples
    --------
    The following example includes an SArray of documents, where each
    element represents a document in "bag of words" representation -- a
    dictionary with word keys and whose values are the number of times that
    word occurred in the document:

    >>> docs = graphlab.SArray('https://static.turi.com/datasets/nytimes')

    Once in this form, it is straightforward to learn a topic model.

    >>> m = graphlab.topic_model.create(docs)

    It is also easy to create a new topic model from an old one -- whether
    it was created using GraphLab Create or another package.

    >>> m2 = graphlab.topic_model.create(docs, initial_topics=m['topics'])

    To manually fix several words to always be assigned to a topic, use the
    `associations` argument. The following will ensure that topic 0 has the
    highest probability for each of the provided words:

    >>> from graphlab import SFrame
    >>> associations = SFrame({'word': ['hurricane', 'wind', 'storm'],
    ...                        'topic': [0, 0, 0]})
    >>> m = graphlab.topic_model.create(docs, associations=associations)

    More advanced usage allows you to control aspects of the model and the
    learning method.

    >>> import graphlab as gl
    >>> m = gl.topic_model.create(docs,
    ...                           num_topics=20,       # number of topics
    ...                           num_iterations=10,   # algorithm parameters
    ...                           alpha=.01, beta=.1)  # hyperparameters

    To evaluate the model's ability to generalize, we can create a
    train/test split, where a portion of the words in each document is held
    out from training.

    >>> train, test = gl.text_analytics.random_split(docs, .8)
    >>> m = gl.topic_model.create(train)
    >>> results = m.evaluate(test)
    >>> print results['perplexity']
    """
    _mt._get_metric_tracker().track('toolkit.text.topic_model.create')

    dataset = _check_input(dataset)

    _check_categorical_option_type("method", method, ['auto', 'cgs', 'alias'])
    if method == 'cgs' or method == 'auto':
        model_name = 'cgs_topic_model'
    else:
        model_name = 'alias_topic_model'

    # If associations are provided, check that they are in the proper format
    if associations is None:
        associations = _graphlab.SFrame({'word': [], 'topic': []})
    if isinstance(associations, _graphlab.SFrame) and \
       associations.num_rows() > 0:
        assert set(associations.column_names()) == set(['word', 'topic']), \
            "Provided associations must be an SFrame containing a word " + \
            "column and a topic column."
        assert associations['word'].dtype() == str, \
            "Words must be strings."
        assert associations['topic'].dtype() == int, \
            "Topic ids must be of int type."
    if alpha is None:
        alpha = float(50) / num_topics

    if validation_set is not None:
        _check_input(validation_set)
        # Must be a single column
        if isinstance(validation_set, _graphlab.SFrame):
            column_name = validation_set.column_names()[0]
            validation_set = validation_set[column_name]
        (validation_train, validation_test) = _random_split(validation_set)
    else:
        validation_train = _SArray()
        validation_test = _SArray()

    opts = {'model_name': model_name,
            'data': dataset,
            'verbose': verbose,
            'num_topics': num_topics,
            'num_iterations': num_iterations,
            'print_interval': print_interval,
            'alpha': alpha,
            'beta': beta,
            'num_burnin': num_burnin,
            'associations': associations}

    # Initialize the model with basic parameters
    response = _graphlab.toolkits._main.run("text_topicmodel_init", opts)
    m = TopicModel(response['model'])

    # If initial_topics is provided, load it into the model
    if isinstance(initial_topics, _graphlab.SFrame):
        assert set(['vocabulary', 'topic_probabilities']) == \
            set(initial_topics.column_names()), \
            "The provided initial_topics does not have the proper format, " + \
            "e.g. wrong column names."
        observed_topics = initial_topics['topic_probabilities'].apply(
            lambda x: len(x))
        assert all(observed_topics == num_topics), \
            "Provided num_topics value does not match the number of " + \
            "provided initial_topics."

        # Rough estimate of the total number of words
        weight = dataset.size() * 1000

        opts = {'model': m.__proxy__,
                'topics': initial_topics['topic_probabilities'],
                'vocabulary': initial_topics['vocabulary'],
                'weight': weight}
        response = _graphlab.toolkits._main.run("text_topicmodel_set_topics",
                                                opts)
        m = TopicModel(response['model'])

    # Train the model on the given data set and retrieve predictions
    opts = {'model': m.__proxy__,
            'data': dataset,
            'verbose': verbose,
            'validation_train': validation_train,
            'validation_test': validation_test}
    response = _graphlab.toolkits._main.run("text_topicmodel_train", opts)
    m = TopicModel(response['model'])

    return m
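# A small sketch of how one might build the bag-of-words input that `create`
# expects from raw strings. The toolkit provides
# graphlab.text_analytics.count_words for this; the plain-Python version
# below (whitespace tokenization, lowercasing) is only an illustration of
# the expected {word: count} format.
def _bag_of_words_sketch(texts):
    docs = []
    for text in texts:
        counts = {}
        for word in text.lower().split():
            counts[word] = counts.get(word, 0) + 1
        docs.append(counts)
    return docs

# Example usage:
#   docs = _bag_of_words_sketch(["the quick brown fox", "the lazy dog"])
#   m = graphlab.topic_model.create(graphlab.SArray(docs))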