def testAllIndexes(self):
    """Build all index types, then run a tfidf similarity query and
    check the ranked results against known values."""
    # Fix: the tfidf model file must be loaded with TfidfModel, not
    # LsiModel (the original used models.LsiModel.load here; compare
    # testLSI, which loads the same file with TfidfModel).
    tfidf_model = models.TfidfModel.load(
        os.path.join(self.output, "model.tfidf"))
    create_index(self.corpus, self.output, self.output,
                 tfidf=True, lda=True, lsi=True, hdp=True)
    index = similarities.Similarity.load(
        os.path.join(self.output, "index.tfidf"))
    op = os.path.join(self.output, "tfidf")
    p = "(stored under {})".format(str(op))
    expect = "Similarity index with 9 documents in 1 shards {}".format(p)
    self.assertEqual(expect, str(index))
    # Query the index with a fresh document, converted to bag-of-words
    # through the fixture dictionary.
    doc = "Human computer interaction"
    vec_bow = self.dictionary.doc2bow(
        format_paragraph(doc, PorterStemmer()))
    self.log(tfidf_model)
    vec_tfidf = tfidf_model[vec_bow]
    # Removed the leftover debug print(sims); self.log below already
    # records the result.
    sims = sorted(enumerate(index[vec_tfidf]), key=lambda item: -item[1])
    expected = [(0, 0.81649655), (3, 0.34777319), (1, 0.31412902),
                (2, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0)]
    self.log(sims)
    # Distinct loop variable: the original shadowed ``index``.
    for i, t in enumerate(sims):
        self.assertEqual(expected[i][0], t[0])
        self.assertAlmostEqual(expected[i][1], t[1])
def testLSI(self):
    """Create the LSI index, then run a similarity query through the
    tfidf->LSI pipeline and verify the ranked scores."""
    tfidf = models.TfidfModel.load(
        os.path.join(self.output, "model.tfidf"))
    lsi = models.LsiModel.load(os.path.join(self.output, "model.lsi"))
    create_index(self.corpus, self.output, self.output, "test", lsi=True)
    sim_index = similarities.Similarity.load(
        os.path.join(self.output, "index.lsi"))
    storage = os.path.join(self.output, "lsi")
    suffix = "(stored under {})".format(str(storage))
    self.assertEqual(
        "Similarity index with 9 documents in 1 shards {}".format(suffix),
        str(sim_index))
    # search with the index
    query = "Human computer interaction"
    bow = self.dictionary.doc2bow(
        format_paragraph(query, PorterStemmer()))
    self.log(lsi)
    projected = lsi[tfidf[bow]]
    ranked = sorted(enumerate(sim_index[projected]),
                    key=lambda pair: -pair[1])
    expected = [(0, 0.99994081), (2, 0.99990785), (3, 0.99984384),
                (4, 0.9992786), (1, 0.99330217), (8, 0.22248439),
                (7, -0.016480923), (6, -0.0515742), (5, -0.08804217)]
    self.log(ranked)
    for pos, (doc_id, score) in enumerate(ranked):
        self.assertEqual(expected[pos][0], doc_id)
        self.assertAlmostEqual(expected[pos][1], score, delta=0.001)
def testHDP(self):
    """Create an HDP index and verify its string representation."""
    create_index(self.corpus, self.output, self.output, hdp=True)
    hdp_index = similarities.Similarity.load(
        os.path.join(self.output, "index.hdp"))
    location = os.path.join(self.output, "hdp")
    expected = "Similarity index with 9 documents in 1 shards {}".format(
        "(stored under {})".format(str(location)))
    self.assertEqual(expected, str(hdp_index))
def setUp(self):
    """Build fresh model and index directories, train every model type,
    create every index type, and load them all onto the fixture."""
    self.debug = True
    self.corpus = os.path.join(os.getcwd(), "test", "tutorialDocuments")
    self.models = os.path.join(os.getcwd(), "testModels")
    self.index = os.path.join(os.getcwd(), "testIndex")
    # NOTE(review): the test methods reference self.output, which is not
    # assigned here — confirm it comes from a base class or mixin.
    # Start each run from empty directories (the original duplicated the
    # exists/rmtree/makedirs branching for both paths; the "not exists"
    # branch was redundant since makedirs runs either way).
    for directory in (self.models, self.index):
        if os.path.exists(directory):
            shutil.rmtree(directory)
        os.makedirs(directory)
    create_model(self.corpus, self.models, num_topics=2,
                 lda=True, lsi=True, tfidf=True, hdp=True)
    # create the indexes
    create_index(self.corpus, self.index, self.models,
                 lda=True, lsi=True, tfidf=True, hdp=True)
    # load the corpus and dictionary
    self.dictionary = corpora.Dictionary.load(
        os.path.join(self.models, "corpus.dict"))
    self.corp = corpora.MmCorpus(os.path.join(self.models, "corpus.mm"))
    # load the models
    self.tfidf = models.TfidfModel.load(
        os.path.join(self.models, "model.tfidf"))
    self.lda = models.LdaModel.load(
        os.path.join(self.models, "model.lda"))
    self.lsi = models.LsiModel.load(
        os.path.join(self.models, "model.lsi"))
    self.hdp = models.HdpModel.load(
        os.path.join(self.models, "model.hdp"))
    # load the indexes
    self.tfidf_index = similarities.Similarity.load(
        os.path.join(self.index, "index.tfidf"))
    self.lsi_index = similarities.Similarity.load(
        os.path.join(self.index, "index.lsi"))
    self.lda_index = similarities.Similarity.load(
        os.path.join(self.index, "index.lda"))
    self.hdp_index = similarities.Similarity.load(
        os.path.join(self.index, "index.hdp"))