Beispiel #1
0
    def testDistanceAsEditsSearching(self):
        """Search with FuzzyQuery using an explicit maximum edit count.

        Indexes three single-term documents, checks that two misspellings
        of "foobar" each return exactly that document when up to 2 edits
        are allowed, and finally checks that constructing a FuzzyQuery
        with 3 edits raises JavaError.
        """
        index_writer = self._index_words_placeholder = None  # placeholder removed below
        index_writer = self.getWriter()
        for word in ("foobar", "test", "working"):
            self._addDoc(word, index_writer)

        index_reader = index_writer.getReader()
        searcher = self.getSearcher(reader=index_reader)
        index_writer.close()

        # Both misspellings must resolve to the single "foobar" document.
        for misspelling in ("fouba", "foubara"):
            fuzzy = FuzzyQuery(Term("field", misspelling), 2)
            score_docs = searcher.search(fuzzy, 10).scoreDocs
            self.assertEqual(1, len(score_docs))
            self.assertEqual("foobar",
                             searcher.doc(score_docs[0].doc).get("field"))

        # Constructing the query with 3 edits is expected to be rejected.
        try:
            FuzzyQuery(Term("field", "t"), 3)
        except JavaError:
            pass
        else:
            self.fail()
Beispiel #2
0
    def testGiga(self):
        """Check that a zero-edit FuzzyQuery for "giga" hits only "Giga byte".

        The documents are indexed through a writer created with a
        StandardAnalyzer; the query itself allows 0 edits.
        """
        titles = (
            "Lucene in Action",
            "Lucene for Dummies",
            "Giga byte",
            "ManagingGigabytesManagingGigabyte",
            "ManagingGigabytesManagingGigabytes",
            "The Art of Computer Science",
            "J. K. Rowling",
            "JK Rowling",
            "Joanne K Roling",
            "Bruce Willis",
            "Willis bruce",
            "Brute willis",
            "B. willis",
        )

        index_writer = self.getWriter(analyzer=StandardAnalyzer())
        for title in titles:
            self._addDoc(title, index_writer)

        index_reader = index_writer.getReader()
        index_writer.close()

        exact_query = FuzzyQuery(Term("field", "giga"), 0)

        searcher = self.getSearcher(reader=index_reader)
        score_docs = searcher.search(exact_query, 10).scoreDocs

        self.assertEqual(1, len(score_docs))
        matched_title = searcher.doc(score_docs[0].doc).get("field")
        self.assertEqual("Giga byte", matched_title)
Beispiel #3
0
    def test2(self):
        """Expect 8 hits for "WEBER" with 2 edits and a prefix length of 1."""
        surnames = (
            "LANGE", "LUETH", "PIRSING", "RIEGEL", "TRZECZIAK", "WALKER",
            "WBR", "WE", "WEB", "WEBE", "WEBER", "WEBERE", "WEBREE",
            "WEBEREI", "WBRE", "WITTKOPF", "WOJNAROWSKI", "WRICKE",
        )

        index_writer = self.getWriter()
        for surname in surnames:
            self._addDoc(surname, index_writer)

        index_reader = index_writer.getReader()
        searcher = self.getSearcher(reader=index_reader)
        index_writer.close()

        # Arguments after the term: maximum edits = 2, prefix length = 1.
        fuzzy = FuzzyQuery(Term("field", "WEBER"), 2, 1)
        score_docs = searcher.search(fuzzy, 1000).scoreDocs
        self.assertEqual(8, len(score_docs))
Beispiel #4
0
def getSpanNearQuery(analyzer, s, field="title", slop=100, inOrder=True):
    """Build a SpanNearQuery whose clauses are fuzzy matches of each keyword.

    Args:
        analyzer: Passed through to ``tokenize_string`` together with ``s``.
        s: Text to split into keywords (presumably a string; the exact
            contract is that of ``tokenize_string`` -- TODO confirm).
        field: Name of the field each fuzzy term targets.
        slop: Forwarded as the second argument of ``SpanNearQuery``.
        inOrder: Forwarded as the third argument of ``SpanNearQuery``.

    Returns:
        A ``SpanNearQuery`` over one wrapped ``FuzzyQuery`` per keyword.
    """
    clauses = []
    for token in tokenize_string(analyzer, s):
        fuzzy_term = FuzzyQuery(Term(field, token))
        # Wrap the multi-term query so it can be used as a span clause.
        clauses.append(SpanMultiTermQueryWrapper(fuzzy_term))
    return SpanNearQuery(clauses, slop, inOrder)
Beispiel #5
0
def l_searcher(query_string, directory, number_documents):
	"""Run a fuzzy search on the "question" field of an on-disk index.

	Args:
		query_string: Term text matched against the "question" field with a
			FuzzyQuery allowing up to 2 edits.
		directory: Filesystem path of the index directory to open.
		number_documents: Maximum number of hits to request.

	Returns:
		A tuple ``(options, options_answers)`` holding, for each hit in
		result order, the document's "question" and "answer" values, or
		``None`` if an IndexError is raised while searching.
	"""
	lucene.initVM()

	reader = DirectoryReader.open(FSDirectory.open(Paths.get(directory)))
	try:
		searcher = IndexSearcher(reader)

		query = FuzzyQuery(Term("question", query_string), 2)
		print("The query was: {}".format(query))

		hits = searcher.search(query, number_documents)

		print("The hits were: ")

		options = []
		options_answers = []

		for hit in hits.scoreDocs:
			print(hit.doc)
			doc = searcher.doc(hit.doc)
			options_answers.append(doc.get("answer"))
			options.append(doc.get("question"))

		return options, options_answers
	except IndexError:
		return None
	finally:
		# The original never closed the reader, leaking it on every call.
		# All field values are extracted above, so closing here is safe.
		reader.close()
Beispiel #6
0
    def testBoostOnlyRewrite(self):
        """Exercise the TopTermsBoostOnlyBooleanQueryRewrite rewrite method.

        Indexes "Lucene" twice and "Lucenne" once, then checks that a fuzzy
        query for "lucene" returns all three with both "Lucene" documents
        ranked ahead of "Lucenne".
        """
        index_writer = self.getWriter()
        for value in ("Lucene", "Lucene", "Lucenne"):
            self._addDoc(value, index_writer)

        reader = index_writer.getReader()
        searcher = self.getSearcher(reader=reader)
        index_writer.close()

        fuzzy = FuzzyQuery(Term("field", "lucene"))
        rewrite = MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(50)
        fuzzy.setRewriteMethod(rewrite)
        score_docs = searcher.search(fuzzy, 1000).scoreDocs
        self.assertEqual(3, len(score_docs))

        # Normally 'Lucenne' would be the first result, as IDF would skew
        # the score.
        expected_ranking = ("Lucene", "Lucene", "Lucenne")
        for rank, expected in enumerate(expected_ranking):
            stored = reader.document(score_docs[rank].doc).get("field")
            self.assertEqual(expected, stored)
    def testBoostOnlyRewrite(self):
        """Exercise the TopTermsBoostOnlyBooleanQueryRewrite rewrite method.

        Same scenario as above but using the three-argument
        ``search(query, None, 1000)`` call form.
        """
        index_writer = self.getWriter()
        for value in ("Lucene", "Lucene", "Lucenne"):
            self._addDoc(value, index_writer)

        reader = index_writer.getReader()
        searcher = self.getSearcher(reader=reader)
        index_writer.close()

        fuzzy = FuzzyQuery(Term("field", "lucene"))
        rewrite = MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(50)
        fuzzy.setRewriteMethod(rewrite)
        # Second positional argument is passed as None, as in the original.
        score_docs = searcher.search(fuzzy, None, 1000).scoreDocs
        self.assertEqual(3, len(score_docs))

        # Normally 'Lucenne' would be the first result, as IDF would skew
        # the score.
        expected_ranking = ("Lucene", "Lucene", "Lucenne")
        for rank, expected in enumerate(expected_ranking):
            stored = reader.document(score_docs[rank].doc).get("field")
            self.assertEqual(expected, stored)
Beispiel #8
0
    def testTieBreaker(self):
        """Check fuzzy term competition across two indexes in a MultiReader.

        MultiTermQuery provides (via attribute) information about which
        values must be competitive to enter the priority queue.

        FuzzyQuery optimizes itself around this information; if the
        attribute is not implemented correctly, there will be problems!
        """
        first_dir = RAMDirectory()
        first_writer = self.getWriter(directory=first_dir)
        for value in ("a123456", "c123456", "d123456", "e123456"):
            self._addDoc(value, first_writer)

        second_dir = RAMDirectory()
        second_writer = self.getWriter(directory=second_dir)
        for value in ("a123456", "b123456", "b123456", "b123456",
                      "c123456", "f123456"):
            self._addDoc(value, second_writer)

        first_reader = first_writer.getReader()
        second_reader = second_writer.getReader()

        multi_reader = MultiReader([first_reader, second_reader])
        searcher = self.getSearcher(reader=multi_reader)

        fuzzy = FuzzyQuery(Term("field", "z123456"), 1, 0, 2, False)
        top_docs = searcher.search(fuzzy, 2)
        # 5 docs, from the a and b's
        self.assertEqual(5, top_docs.totalHits.value)

        # Release everything in the same order as the original test.
        for resource in (multi_reader, first_reader, second_reader,
                         first_writer, second_writer,
                         first_dir, second_dir):
            resource.close()
Beispiel #9
0
    def testDefaultFuzziness(self):
        """Check FuzzyQuery hit counts and ranking with the default max edits.

        Seven five-letter terms are indexed; the test then runs fuzzy
        queries with increasing prefix lengths, a capped maxExpansions,
        non-matching terms, and a different field, asserting the number of
        hits and (where relevant) the order of the matched terms.
        """
        index_writer = self.getWriter()
        for value in ("aaaaa", "aaaab", "aaabb", "aabbb",
                      "abbbb", "bbbbb", "ddddd"):
            self._addDoc(value, index_writer)
        index_writer.commit()
        index_writer.close()

        searcher = self.getSearcher()
        max_edits = FuzzyQuery.defaultMaxEdits

        def prefixed_hits(text, prefix_length, field_name="field"):
            # Fuzzy search with the default edit budget and a given prefix.
            fuzzy = FuzzyQuery(Term(field_name, text), max_edits,
                               prefix_length)
            return searcher.search(fuzzy, 1000).scoreDocs

        def default_hits(text):
            # Fuzzy search relying on every FuzzyQuery default, top 50.
            return searcher.search(FuzzyQuery(Term("field", text)),
                                   50).scoreDocs

        def stored_term(score_doc):
            return searcher.doc(score_doc.doc).get("field")

        def expect_ranked(score_docs, expected_terms):
            # Hit count first, then each position against its term.
            self.assertEqual(len(expected_terms), len(score_docs))
            for position, expected in enumerate(expected_terms):
                self.assertEqual(stored_term(score_docs[position]), expected)

        # exact term, with growing prefix lengths
        for prefix_length, expected_count in ((0, 3), (1, 3), (2, 3), (3, 3),
                                              (4, 2), (5, 1), (6, 1)):
            score_docs = prefixed_hits("aaaaa", prefix_length)
            self.assertEqual(expected_count, len(score_docs))

        # test scoring
        score_docs = prefixed_hits("bbbbb", 0)
        self.assertEqual(3, len(score_docs), "3 documents should match")
        for score_doc, expected in zip(score_docs,
                                       ("bbbbb", "abbbb", "aabbb")):
            self.assertEqual(expected, stored_term(score_doc))

        # test pq size by supplying maxExpansions=2
        # This query would normally return 3 documents, because 3 terms
        # match (see above):
        capped = FuzzyQuery(Term("field", "bbbbb"), max_edits, 0, 2, False)
        score_docs = searcher.search(capped, 1000).scoreDocs
        self.assertEqual(2, len(score_docs), "only 2 documents should match")
        for score_doc, expected in zip(score_docs, ("bbbbb", "abbbb")):
            self.assertEqual(expected, stored_term(score_doc))

        # not similar enough ("xxxxx"), and edit distance to "aaaaa" = 3
        # ("aaccc"):
        for text in ("xxxxx", "aaccc"):
            top_docs = searcher.search(FuzzyQuery(Term("field", text)), 50)
            self.assertEqual(0, top_docs.totalHits.value)

        closest_three = ("aaaaa", "aaaab", "aaabb")

        # query identical to a word in the index
        # (default allows for up to two edits):
        expect_ranked(default_hits("aaaaa"), closest_three)

        # query similar to a word in the index:
        expect_ranked(default_hits("aaaac"), closest_three)

        # now with prefix
        for prefix_length, expected_terms in ((1, closest_three),
                                              (2, closest_three),
                                              (3, closest_three),
                                              (4, closest_three[:2]),
                                              (5, ())):
            expect_ranked(prefixed_hits("aaaac", prefix_length),
                          expected_terms)

        # "ddddX": without prefix first, then with growing prefix lengths
        for prefix_length, expected_terms in ((0, ("ddddd",)),
                                              (1, ("ddddd",)),
                                              (2, ("ddddd",)),
                                              (3, ("ddddd",)),
                                              (4, ("ddddd",)),
                                              (5, ())):
            expect_ranked(prefixed_hits("ddddX", prefix_length),
                          expected_terms)

        # different field = no match:
        score_docs = prefixed_hits("ddddX", 0, field_name="anotherfield")
        self.assertEqual(0, len(score_docs))
    #print(writer.numDocs())

    # Configure a field type that is stored, tokenized, and indexed with
    # documents, term frequencies, and positions.
    # NOTE(review): `field` is not passed to the addDoc calls visible below;
    # confirm whether addDoc picks it up from an enclosing scope.
    field = FieldType()
    field.setStored(True)
    field.setTokenized(True)
    field.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # Create separate fields for the different parsed fields and add these documents to the index.
    addDoc("name", r['name'], writer)
    addDoc("research", r['research'], writer)
    # Commit the pending changes, then close the writer.
    writer.commit()
    writer.close()

# At this point the search for the terms within the index is performed.
searcher = IndexSearcher(DirectoryReader.open(store))
# Fuzzy match against the "research" field, using FuzzyQuery's defaults.
query = FuzzyQuery(Term("research", "programaçao"))

# Maximum number of hits requested from the searcher.
MAX = 1000
hits = searcher.search(query, MAX)

# Append the "name" value of every matching document to professorList.
for hit in hits.scoreDocs:
    doc = searcher.doc(hit.doc)
    professorList.append(doc.get("name"))

app = Flask(__name__)


@app.route('/')
def index():
    """Handle requests to '/' by rendering the ``home.html`` template."""
    template_name = 'home.html'
    return render_template(template_name)