def test_index_accumulation(self): accumulator = CorpusAccumulator(self.top_ids).accumulate(self.corpus) inverted_index = accumulator.index_to_dict() expected = {10: {0, 2, 3}, 15: {0}, 20: {0}, 21: {1, 2, 3}, 17: {1, 2}} self.assertDictEqual(expected, inverted_index) self.assertEqual(3, accumulator.get_occurrences(10)) self.assertEqual(2, accumulator.get_occurrences(17)) self.assertEqual(2, accumulator.get_co_occurrences(10, 21)) self.assertEqual(1, accumulator.get_co_occurrences(10, 17))
def test_index_accumulation(self): accumulator = CorpusAccumulator(self.top_ids).accumulate(self.corpus) inverted_index = accumulator.index_to_dict() expected = { 10: {0, 2, 3}, 15: {0}, 20: {0}, 21: {1, 2, 3}, 17: {1, 2} } self.assertDictEqual(expected, inverted_index) self.assertEqual(3, accumulator.get_occurrences(10)) self.assertEqual(2, accumulator.get_occurrences(17)) self.assertEqual(2, accumulator.get_co_occurrences(10, 21)) self.assertEqual(1, accumulator.get_co_occurrences(10, 17))
def p_boolean_document(corpus, segmented_topics): """Perform the boolean document probability estimation. Boolean document estimates the probability of a single word as the number of documents in which the word occurs divided by the total number of documents. Parameters ---------- corpus : iterable of list of (int, int) The corpus of documents. segmented_topics: list of (int, int). Each tuple (word_id_set1, word_id_set2) is either a single integer, or a `numpy.ndarray` of integers. Returns ------- :class:`~gensim.topic_coherence.text_analysis.CorpusAccumulator` Word occurrence accumulator instance that can be used to lookup token frequencies and co-occurrence frequencies. Examples --------- .. sourcecode:: pycon >>> from gensim.topic_coherence import probability_estimation >>> from gensim.corpora.hashdictionary import HashDictionary >>> >>> >>> texts = [ ... ['human', 'interface', 'computer'], ... ['eps', 'user', 'interface', 'system'], ... ['system', 'human', 'system', 'eps'], ... ['user', 'response', 'time'], ... ['trees'], ... ['graph', 'trees'] ... ] >>> dictionary = HashDictionary(texts) >>> w2id = dictionary.token2id >>> >>> # create segmented_topics >>> segmented_topics = [ ... [ ... (w2id['system'], w2id['graph']), ... (w2id['computer'], w2id['graph']), ... (w2id['computer'], w2id['system']) ... ], ... [ ... (w2id['computer'], w2id['graph']), ... (w2id['user'], w2id['graph']), ... (w2id['user'], w2id['computer'])] ... ] >>> # create corpus >>> corpus = [dictionary.doc2bow(text) for text in texts] >>> >>> result = probability_estimation.p_boolean_document(corpus, segmented_topics) >>> result.index_to_dict() {10608: set([0]), 12736: set([1, 3]), 18451: set([5]), 5798: set([1, 2])} """ top_ids = unique_ids_from_segments(segmented_topics) return CorpusAccumulator(top_ids).accumulate(corpus)
def p_boolean_document(corpus, segmented_topics): """This function performs the boolean document probability estimation. Boolean document estimates the probability of a single word as the number of documents in which the word occurs divided by the total number of documents. Args: corpus : The corpus of documents. segmented_topics : Output from the segmentation of topics. Could be simply topics too. Returns: accumulator : word occurrence accumulator instance that can be used to lookup token frequencies and co-occurrence frequencies. """ top_ids = unique_ids_from_segments(segmented_topics) return CorpusAccumulator(top_ids).accumulate(corpus)