def testAllIndexes(self):
    # Builds the tfidf, lda, lsi and hdp indexes in one create_index call,
    # then verifies the TF-IDF index: its string description and the ranking
    # it returns for a sample query.

    # The file is named "model.tfidf", so it is loaded through TfidfModel
    # (not LsiModel) to match the kind of model stored there.
    tfidf_model = models.TfidfModel.load(
        os.path.join(self.output, "model.tfidf"))
    create_index(self.corpus, self.output, self.output,
                 tfidf=True, lda=True, lsi=True, hdp=True)
    tfidf_index = similarities.Similarity.load(
        os.path.join(self.output, "index.tfidf"))

    # The index describes itself with its document count, shard count and
    # the prefix it is stored under.
    op = os.path.join(self.output, "tfidf")
    p = "(stored under {})".format(str(op))
    expect = "Similarity index with 9 documents in 1 shards {}".format(p)
    self.assertEqual(expect, str(tfidf_index))

    # Query the index with a short document.
    doc = "Human computer interaction"
    vec_bow = self.dictionary.doc2bow(
        format_paragraph(doc, PorterStemmer()))
    self.log(tfidf_model)
    vec_tfidf = tfidf_model[vec_bow]
    sims = tfidf_index[vec_tfidf]

    # Rank (document position, similarity) pairs by descending similarity.
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    expected = [(0, 0.81649655), (3, 0.34777319), (1, 0.31412902),
                (2, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0)]
    self.log(sims)

    # A separate loop variable keeps the loaded index from being shadowed.
    for rank, (doc_id, score) in enumerate(sims):
        self.assertEqual(expected[rank][0], doc_id)
        self.assertAlmostEqual(expected[rank][1], score)
def testLSI(self):
    # Builds the LSI similarity index, checks how the loaded index describes
    # itself, and checks the ranking it gives for a sample query.
    tfidf_path = os.path.join(self.output, "model.tfidf")
    tfidf_model = models.TfidfModel.load(tfidf_path)
    lsi_path = os.path.join(self.output, "model.lsi")
    lsi_model = models.LsiModel.load(lsi_path)

    create_index(self.corpus, self.output, self.output, "test", lsi=True)
    index_path = os.path.join(self.output, "index.lsi")
    lsi_index = similarities.Similarity.load(index_path)

    shard_prefix = os.path.join(self.output, "lsi")
    location = "(stored under {})".format(str(shard_prefix))
    description = "Similarity index with 9 documents in 1 shards {}".format(
        location)
    self.assertEqual(description, str(lsi_index))

    # search with the index
    text = "Human computer interaction"
    tokens = format_paragraph(text, PorterStemmer())
    vec_bow = self.dictionary.doc2bow(tokens)
    self.log(lsi_model)
    # The query goes through the TF-IDF model first and then the LSI model.
    vec_lsi = lsi_model[tfidf_model[vec_bow]]
    scores = lsi_index[vec_lsi]

    # (document position, similarity) pairs, highest similarity first.
    ranked = sorted(enumerate(scores), key=lambda pair: -pair[1])
    expected = [
        (0, 0.99994081),
        (2, 0.99990785),
        (3, 0.99984384),
        (4, 0.9992786),
        (1, 0.99330217),
        (8, 0.22248439),
        (7, -0.016480923),
        (6, -0.0515742),
        (5, -0.08804217),
    ]
    self.log(ranked)
    for rank, (doc_id, score) in enumerate(ranked):
        self.assertEqual(expected[rank][0], doc_id)
        self.assertAlmostEqual(expected[rank][1], score, delta=0.001)
def testLSI(self):
    # Trains a 2-topic LSI model with create_model, reloads it from disk and
    # checks both the projection of sample documents and the printed topics.
    create_model(self.corpus, self.output, num_topics=2, lsi=True)

    # check the model
    path = os.path.join(self.output, "model.lsi")
    lsi_model = models.LsiModel.load(path)
    self.log(lsi_model)
    doc_bow = [(0, 1), (1, 1)]
    answer = lsi_model[doc_bow]
    self.log(answer)
    # delta=1 accepts a projection with 1 to 3 entries.
    self.assertAlmostEqual(len(answer), 2, delta=1)

    # make sure a text document can be converted to bag-of-words and
    # projected by the model
    doc = "Human computer interaction"
    self.dictionary = corpora.Dictionary.load(
        os.path.join(self.output, "corpus.dict"))
    vec_bow = self.dictionary.doc2bow(
        format_paragraph(doc, PorterStemmer()))
    self.log(lsi_model)
    vec_lsi = lsi_model[vec_bow]
    self.assertEqual(len(vec_lsi), 2)

    def acceptable(*orderings):
        # Formatted forms of each ordering followed by the same orderings
        # with every weight negated.
        # NOTE(review): presumably both signs and both orders of the tied
        # "time"/"respons" terms are accepted because the model does not
        # fix them - confirm.
        negated = [[(-weight, term) for weight, term in ordering]
                   for ordering in orderings]
        return [self.format_lsi(variant)
                for variant in list(orderings) + negated]

    first_topic = acceptable(
        [(0.703, "tree"), (0.538, "graph"), (0.402, "minor"),
         (0.187, "survey"), (0.061, "system"), (0.060, "time"),
         (0.060, "respons"), (0.058, "user"), (0.049, "comput"),
         (0.035, "interfac")],
        [(0.703, "tree"), (0.538, "graph"), (0.402, "minor"),
         (0.187, "survey"), (0.061, "system"), (0.060, "respons"),
         (0.060, "time"), (0.058, "user"), (0.049, "comput"),
         (0.035, "interfac")],
    )
    second_topic = acceptable(
        [(0.460, "system"), (0.373, "user"), (0.332, "ep"),
         (0.328, "interfac"), (0.320, "respons"), (0.320, "time"),
         (0.293, "comput"), (0.280, "human"), (0.171, "survey"),
         (-0.161, "tree")],
        [(0.460, "system"), (0.373, "user"), (0.332, "ep"),
         (0.328, "interfac"), (0.320, "time"), (0.320, "respons"),
         (0.293, "comput"), (0.280, "human"), (0.171, "survey"),
         (-0.161, "tree")],
    )
    expect = [first_topic, second_topic]

    # Each printed topic must match one of its accepted variants; assertIn
    # reports the offending topic string on failure.
    for topic_no, values in enumerate(lsi_model.print_topics()):
        self.assertIn(values[1], expect[topic_no])
def __init__(self, topic):
    """Build a query from a parsed topic.

    Each ``keyword`` element's text is run through ``format_paragraph`` and
    the returned tokens are joined with single spaces; each ``formula``
    element is stringified and passed to ``convert_math_expression``.

    Parameters:
        topic: the soup topic object (bs4); must expose ``find_all`` and a
            ``num`` element whose text becomes the query name
    """
    stemmer = PorterStemmer()
    keyword_texts = [
        " ".join(format_paragraph(node.text, stemmer))
        for node in topic.find_all("keyword")
    ]
    formula_exprs = [
        convert_math_expression(str(node))
        for node in topic.find_all("formula")
    ]
    self.formulas = formula_exprs
    self.keywords = keyword_texts
    self.name = topic.num.text
def testSearchTFIDF(self):
    # Checks TF-IDF search twice: directly against the stored similarity
    # index, then through Indexer.search with Query objects.
    indexer = Indexer(self.models, self.index, self.corpus)

    text = "Human computer interaction"
    tokens = format_paragraph(text, PorterStemmer())
    vec_bow = self.dictionary.doc2bow(tokens)
    self.log(self.tfidf)
    vec_tfidf = self.tfidf[vec_bow]
    index_path = os.path.join(self.index, "index.tfidf")
    tfidf_index = similarities.Similarity.load(index_path)
    scores = tfidf_index[vec_tfidf]

    # (document position, similarity) pairs, highest similarity first.
    ranked = sorted(enumerate(scores), key=lambda pair: -pair[1])
    self.log(ranked)
    expected = [
        (0, 0.81649655),
        (3, 0.34777319),
        (1, 0.31412902),
        (2, 0.0),
        (4, 0.0),
        (5, 0.0),
        (6, 0.0),
        (7, 0.0),
        (8, 0.0),
    ]
    for rank, (doc_id, score) in enumerate(ranked):
        self.assertEqual(expected[rank][0], doc_id)
        self.assertAlmostEqual(expected[rank][1], score)

    def check_search(topic_markup, file_names):
        # Run one query through the indexer and compare the returned paths
        # with the named files under the corpus directory.
        query = Query(BeautifulSoup(topic_markup))
        results = indexer.search(query, tfidf=True)
        expect = [os.path.join(self.corpus, name) for name in file_names]
        self.log(results)
        self.assertEqual(results, expect)

    # NOTE(review): both topics open <keyword> a second time instead of
    # closing it - confirm this is intended before changing the fixtures.
    check_search(
        """ <num>test-query</num> <keyword>Human computer interaction<keyword> """,
        ['1.html', '4.html', '2.html'],
    )
    check_search(
        """ <num>test-query</num> <keyword>tree ordering<keyword> """,
        ['6.html', '7.html', '8.html'],
    )
def __init__(self, topic):
    """Query: the NTCIR-MathIR query.

    Collects the topic's keywords and formulas into ``self.result`` as one
    list of strings (keywords first, then formulas) with empty strings
    removed, and stores the text of the topic's ``num`` element as
    ``self.name``.

    Parameters:
        topic: the soup topic object (bs4)
    """
    stemmer = PorterStemmer()
    keywords = [
        " ".join(format_paragraph(node.text, stemmer))
        for node in topic.find_all("keyword")
    ]

    formulas = []
    for node in topic.find_all("formula"):
        encoded = convert_math_expression(str(node),
                                          eol=True,
                                          no_payload=True)
        # Remove the start/end markers, then trim surrounding whitespace.
        for marker in ("#(start)#", "#(end)#"):
            encoded = encoded.replace(marker, "")
        formulas.append(encoded.strip())

    combined = keywords + formulas
    self.result = combined
    self.name = topic.num.text
    # Entries that ended up as empty strings are dropped.
    self.result = [entry for entry in combined if entry != ""]
def testFormatParagraph2(self):
    # format_paragraph on a paragraph with links, emphasis and an unclosed
    # <math> tag must return exactly the token list below.
    markup = """ <p> There are two ways to write the real number 1 as a <a href="recurring_decimal" title="wikilink">recurring decimal</a>: as 1.000..., and as <a class="uri" href="0.999..." title="wikilink">0.999...</a> (<em><a class="uri" href="q.v." title="wikilink">q.v.</a></em>). There is only one way to represent the real number 1 as a <a href="Dedekind_cut" title="wikilink">Dedekind cut</a> <math display="block" id="1_(number):1"> </p> """
    porter = PorterStemmer()
    tokens = format_paragraph(markup, porter)
    wanted = [
        'there', 'two', 'way', 'write',
        'real', 'number', 'recur', 'decim',
        'there', 'one', 'way', 'repres',
        'real', 'number', 'dedekind', 'cut',
    ]
    self.assertEqual(tokens, wanted)
def testFormatParagraph(self):
    # format_paragraph on a small heading-plus-paragraph snippet must return
    # exactly ['hello', 'how'].
    porter = PorterStemmer()
    snippet = "<h1> Hello</h1> <p>How are you</p>"
    tokens = format_paragraph(snippet, porter)
    self.assertEqual(tokens, ['hello', 'how'])