def test_new_corpus(self):
    """Check that bag-of-words vectors built for new documents are consistent.

    Two batches of documents are converted with ``common_dictionary``.
    The second batch additionally contains 'hardware' and 'administrator',
    which the test treats as unseen words. The first document of each batch
    is expected to map to the same bag-of-words vector, i.e. the extra
    token contributes nothing to the result.
    """
    expected_first_bow = [(0, 1), (6, 1), (10, 1)]

    # Documents named by the original author as containing no unseen word.
    known_only_docs = [
        ["computer", "time", "graph"],
        ["survey", "response", "eps"],
        ["human", "system", "computer"],
    ]
    known_only_bows = [common_dictionary.doc2bow(doc) for doc in known_only_docs]
    self.assertEqual(known_only_bows[0], expected_first_bow)

    # Documents that each carry one extra word on top of the known ones.
    docs_with_extra_word = [
        ["computer", "graph", "hardware", "time"],
        ["survey", "response", "eps", "administrator"],
    ]
    bows_with_extra_word = [common_dictionary.doc2bow(doc) for doc in docs_with_extra_word]
    self.assertEqual(bows_with_extra_word[0], expected_first_bow)
def testEmptyDocument(self):
    """Build the model on a corpus that ends with a one-token document.

    The extra document holds a single token that appears nowhere else in
    the corpus. After ``filter_extremes(no_below=2)`` that document is
    presumably left with an empty bag-of-words (hence the test name).
    The test passes as long as constructing the model does not raise.
    """
    lone_document = ['only_occurs_once_in_corpus_and_alone_in_doc']
    documents = common_texts + [lone_document]

    vocabulary = Dictionary(documents)
    vocabulary.filter_extremes(no_below=2)
    bows = [vocabulary.doc2bow(document) for document in documents]

    # Attribute the appended (last) document to an additional author.
    authors = author2doc.copy()
    authors['joaquin'] = [len(documents) - 1]

    self.class_(bows, author2doc=authors, id2word=vocabulary, num_topics=2)
def setUp(self):
    """Prepare the class under test, a dictionary, a corpus and a term similarity matrix.

    The similarity matrix starts as a 12x12 identity matrix and gets a
    symmetric 0.5 entry linking the terms "user" and "human". It is stored
    in CSC format on ``self.similarity_matrix``.
    """
    self.cls = similarities.SoftCosineSimilarity
    self.dictionary = Dictionary(texts)

    # NOTE: everything below reads the free name ``dictionary`` rather than
    # ``self.dictionary``; this mirrors the original code.
    self.corpus = [dictionary.doc2bow(doc) for doc in texts]

    user_id = dictionary.token2id["user"]
    human_id = dictionary.token2id["human"]

    # LIL format allows cheap item assignment before converting to CSC.
    term_similarities = scipy.sparse.identity(12, format="lil")
    term_similarities[user_id, human_id] = 0.5
    term_similarities[human_id, user_id] = 0.5
    self.similarity_matrix = term_similarities.tocsc()
def testEmptyDocument(self):
    """Smoke-test model construction when one document holds a unique token.

    A document with a single token that occurs nowhere else is appended to
    ``common_texts``. The dictionary is then filtered with ``no_below=2``,
    which presumably empties that document's bag-of-words (hence the test
    name). The test only requires that the constructor call completes.
    """
    singleton_token = 'only_occurs_once_in_corpus_and_alone_in_doc'
    all_texts = common_texts + [[singleton_token]]
    last_index = len(all_texts) - 1

    filtered_dictionary = Dictionary(all_texts)
    filtered_dictionary.filter_extremes(no_below=2)
    bow_corpus = [filtered_dictionary.doc2bow(tokens) for tokens in all_texts]

    # Work on a copy so the shared author mapping is left untouched.
    author_mapping = author2doc.copy()
    author_mapping['joaquin'] = [last_index]

    self.class_(
        bow_corpus,
        author2doc=author_mapping,
        id2word=filtered_dictionary,
        num_topics=2,
    )
def test_lda_update_2(self):
    """Update the model with unseen documents that contain new words.

    'hardware' and 'administrator' are the words added for this test.
    After the in-place update the model's ``id2word.token2id`` mapping
    must equal the mapping captured from a deep copy taken beforehand.
    """
    unseen_docs = [
        ["computer", "graph", "hardware", "time"],
        ["survey", "response", "eps", "administrator"],
    ]
    unseen_bows = [common_dictionary.doc2bow(doc) for doc in unseen_docs]

    # Snapshot the model before updating so the vocabulary can be compared.
    model_before = copy.deepcopy(self.lda)
    self.lda.update(unseen_bows)

    vocabulary_after = self.lda.id2word.token2id
    vocabulary_before = model_before.id2word.token2id
    self.assertEqual(vocabulary_after, vocabulary_before)
def testNonIncreasing(self):
    """Check that similarities are non-increasing when `num_best` is not `None`.

    The index is queried with the first document and ``num_best=5``. The
    similarity scores (second column of the result) must never increase
    from one entry to the next. Equal neighbouring scores are allowed:
    "non-increasing" includes ties, so the step check uses ``<= 0`` rather
    than a strict ``< 0``.
    """
    # NOTE: this could be implemented for other similarities as well (i.e. in _TestSimilarityABC).
    index = self.cls(corpus, self.similarity_matrix, num_best=5)
    query = dictionary.doc2bow(texts[0])
    sims = index[query]
    sims2 = numpy.asarray(sims)[:, 1]  # Just the similarities themselves.

    # Every difference of adjacent elements must be less than or equal to zero.
    self.assertTrue(numpy.all(numpy.diff(sims2) <= 0))
def testNonIncreasing(self):
    """Check that similarities are non-increasing when `num_best` is not `None`.

    Queries the index with the first document using ``num_best=5`` and
    verifies that every step between consecutive similarity scores is
    zero or negative.
    """
    # NOTE: this could be implemented for other similarities as well (i.e. in _TestSimilarityABC).
    best_five_index = self.cls(corpus, self.similarity_matrix, num_best=5)
    first_doc_bow = dictionary.doc2bow(texts[0])
    ranked = best_five_index[first_doc_bow]

    # Column 1 holds just the similarities themselves.
    scores = numpy.asarray(ranked)[:, 1]
    steps = numpy.diff(scores)

    # All len(scores) - 1 adjacent differences must be <= 0.
    non_increasing_steps = numpy.count_nonzero(steps <= 0)
    self.assertTrue(non_increasing_steps == len(scores) - 1)
def test_lda_update_1(self):
    """Update the model with unseen documents that contain no new words.

    After the in-place update the model must compare unequal to a deep
    copy taken beforehand, while its ``id2word.token2id`` mapping must
    stay identical to the copy's.
    """
    unseen_docs = [
        ["computer", "time", "graph"],
        ["survey", "response", "eps"],
        ["human", "system", "computer"],
    ]
    unseen_bows = [common_dictionary.doc2bow(doc) for doc in unseen_docs]

    model_before = copy.deepcopy(self.lda)

    # The update mutates self.lda in place.
    self.lda.update(unseen_bows)

    # NOTE(review): if the model class defines no __eq__, this compares
    # object identity and always passes against a deep copy -- confirm.
    self.assertNotEqual(self.lda, model_before)
    self.assertEqual(self.lda.id2word.token2id, model_before.id2word.token2id)
def testChunking(self):
    """Query the index with a chunk of three documents at once.

    First the dense result is checked: entry ``[i, i]`` (a document
    compared with itself) must be exactly 1.0. Then ``num_best`` is set to
    5 on the same index and the top hit for query ``i`` must be document
    ``i`` with similarity 1.0 (both to two decimal places).

    ``numpy.all`` is used instead of ``numpy.alltrue``, which was removed
    in NumPy 2.0.
    """
    # Override testChunking.
    index = self.cls(corpus, self.similarity_matrix)
    query = [dictionary.doc2bow(document) for document in texts[:3]]
    sims = index[query]

    for i in range(3):
        # Similarity of a document with itself is 1.0.
        self.assertTrue(numpy.all(sims[i, i] == 1.0))

    # test the same thing but with num_best
    index.num_best = 5
    sims = index[query]
    for i, chunk in enumerate(sims):
        # chunk[0] is the best hit; its first item is compared with the
        # query position and its second item with a similarity of 1.0.
        expected = i
        self.assertAlmostEqual(expected, chunk[0][0], places=2)
        expected = 1.0
        self.assertAlmostEqual(expected, chunk[0][1], places=2)
def testFull(self, num_best=None):
    """Check similarity values for a single query and for whole corpora.

    :param num_best: forwarded to the index constructor. When it is not
        ``None`` the results are iterated as pairs whose second item is the
        similarity; otherwise they are handled as dense arrays.

    In the dense case the self-similarity must be 1.0 and every other
    similarity must lie in ``[0.0, 1.0)``. In the ``num_best`` case every
    returned similarity must lie in ``[0.0, 1.0]``.

    ``numpy.all`` is used instead of ``numpy.alltrue``, which was removed
    in NumPy 2.0.
    """
    # Override testFull.

    # Single query
    index = self.cls(corpus, self.similarity_matrix, num_best=num_best)
    query = dictionary.doc2bow(texts[0])
    sims = index[query]
    if num_best is not None:
        # Sparse array.
        for _, sim in sims:
            self.assertTrue(numpy.all(sim <= 1.0))
            self.assertTrue(numpy.all(sim >= 0.0))
    else:
        self.assertAlmostEqual(1.0, sims[0])  # Similarity of a document with itself is 1.0.
        self.assertTrue(numpy.all(sims[1:] >= 0.0))
        self.assertTrue(numpy.all(sims[1:] < 1.0))
        # Regression value for the sum of the dense similarities.
        expected = 2.1889350195476758
        self.assertAlmostEqual(expected, numpy.sum(sims))

    # Corpora
    for query in (
            corpus,  # Basic text corpus.
            self.tfidf[corpus]):  # Transformed corpus without slicing support.
        # The same corpus serves as both the indexed collection and the query.
        index = self.cls(query, self.similarity_matrix, num_best=num_best)
        sims = index[query]
        if num_best is not None:
            # Sparse array.
            for result in sims:
                for _, sim in result:
                    self.assertTrue(numpy.all(sim <= 1.0))
                    self.assertTrue(numpy.all(sim >= 0.0))
        else:
            for i, result in enumerate(sims):
                self.assertAlmostEqual(1.0, result[i])  # Similarity of a document with itself is 1.0.
                # Everything before and after position i must be in [0.0, 1.0).
                self.assertTrue(numpy.all(result[:i] >= 0.0))
                self.assertTrue(numpy.all(result[:i] < 1.0))
                self.assertTrue(numpy.all(result[i + 1:] >= 0.0))
                self.assertTrue(numpy.all(result[i + 1:] < 1.0))
def testFull(self, num_best=None):
    """Check similarity values for a single query and for whole corpora.

    :param num_best: forwarded to the index constructor. When it is not
        ``None`` the results are iterated as pairs whose second item is the
        similarity; otherwise they are handled as dense arrays.

    Dense results must have a self-similarity of 1.0 and all remaining
    similarities in ``[0.0, 1.0)``. ``num_best`` results must have every
    similarity in ``[0.0, 1.0]``.

    ``numpy.all`` replaces ``numpy.alltrue``, which no longer exists in
    NumPy 2.0 and later.
    """
    # Override testFull.

    # Single query
    index = self.cls(corpus, self.similarity_matrix, num_best=num_best)
    query = dictionary.doc2bow(texts[0])
    sims = index[query]
    if num_best is not None:
        # Sparse array.
        for _, sim in sims:
            self.assertTrue(numpy.all(sim <= 1.0))
            self.assertTrue(numpy.all(sim >= 0.0))
    else:
        # Similarity of a document with itself is 1.0.
        self.assertAlmostEqual(1.0, sims[0])
        others = sims[1:]
        self.assertTrue(numpy.all(others >= 0.0))
        self.assertTrue(numpy.all(others < 1.0))
        # Regression value for the sum of the dense similarities.
        expected = 2.1889350195476758
        self.assertAlmostEqual(expected, numpy.sum(sims))

    # Corpora
    for query in (
            corpus,  # Basic text corpus.
            self.tfidf[corpus]):  # Transformed corpus without slicing support.
        # Each corpus is indexed and then queried against itself.
        index = self.cls(query, self.similarity_matrix, num_best=num_best)
        sims = index[query]
        if num_best is not None:
            # Sparse array.
            for result in sims:
                for _, sim in result:
                    self.assertTrue(numpy.all(sim <= 1.0))
                    self.assertTrue(numpy.all(sim >= 0.0))
        else:
            for i, result in enumerate(sims):
                # Similarity of a document with itself is 1.0.
                self.assertAlmostEqual(1.0, result[i])
                before, after = result[:i], result[i + 1:]
                self.assertTrue(numpy.all(before >= 0.0))
                self.assertTrue(numpy.all(before < 1.0))
                self.assertTrue(numpy.all(after >= 0.0))
                self.assertTrue(numpy.all(after < 1.0))