Ejemplo n.º 1
0
    def testDistanceAsEditsSearching(self):
        """Fuzzy search where the distance is interpreted as raw edit counts."""
        writer = self.getWriter()
        for text in ("foobar", "test", "working"):
            self._addDoc(text, writer)

        reader = writer.getReader()
        searcher = self.getSearcher(reader=reader)
        writer.close()

        # Two edits away from "foobar" (substitution + deletion).
        hits = searcher.search(FuzzyQuery(Term("field", "fouba"), 2), 10).scoreDocs
        self.assertEqual(1, len(hits))
        self.assertEqual("foobar", searcher.doc(hits[0].doc).get("field"))

        # Still within two edits, via an insertion at the end.
        hits = searcher.search(FuzzyQuery(Term("field", "foubara"), 2), 10).scoreDocs
        self.assertEqual(1, len(hits))
        self.assertEqual("foobar", searcher.doc(hits[0].doc).get("field"))

        # A maximum edit distance above 2 must be rejected by the constructor.
        try:
            FuzzyQuery(Term("field", "t"), 3)
            self.fail()
        except JavaError:
            pass  # expected
Ejemplo n.º 2
0
    def test_removeDocument(self):
        """Index a document, delete it by its docid term, then verify that a
        fresh searcher can no longer find it."""

        # Populate the store with the document this test deletes.
        self.test_indexDocument()

        store = self.openStore()
        searcher = None
        writer = None

        try:
            searcher = self.getSearcher(store)
            query = TermQuery(Term("docid", str(1)))
            topDocs = searcher.search(query, 50)
            self.assertEqual(topDocs.totalHits, 1)
            # be careful with ids they are ephemeral
            docid = topDocs.scoreDocs[0].doc

            # Delete by term rather than by the ephemeral doc number.
            writer = self.getWriter(store)
            writer.deleteDocuments(Term("docid", str(1)))
        finally:
            self.closeStore(store, writer)

        # Re-open the store: the deleted document must be gone.
        store = self.openStore()
        searcher = None
        try:
            searcher = self.getSearcher(store)
            query = TermQuery(Term("docid", str(1)))
            topDocs = searcher.search(query, 50)
            self.assertEqual(topDocs.totalHits, 0)
        finally:
            self.closeStore(store)
Ejemplo n.º 3
0
    def testFilteredQuery(self):
        """FilteredQuery restricts several term queries to self.filter."""
        fq = FilteredQuery(self.query, self.filter)
        topDocs = self.searcher.search(fq, 50)
        self.assertEqual(1, topDocs.totalHits)
        self.assertEqual(1, topDocs.scoreDocs[0].doc)

        # Same filtered query, sorted on the "sorter" string field.
        sortBy = Sort(SortField("sorter", SortField.Type.STRING))
        topDocs = self.searcher.search(fq, None, 50, sortBy)
        self.assertEqual(1, topDocs.totalHits)
        self.assertEqual(1, topDocs.scoreDocs[0].doc)

        fq = FilteredQuery(TermQuery(Term("field", "one")), self.filter)
        topDocs = self.searcher.search(fq, 50)
        self.assertEqual(2, topDocs.totalHits)

        fq = FilteredQuery(TermQuery(Term("field", "x")), self.filter)
        topDocs = self.searcher.search(fq, 50)
        self.assertEqual(1, topDocs.totalHits)
        self.assertEqual(3, topDocs.scoreDocs[0].doc)

        # No document passes both the "y" term and the filter.
        fq = FilteredQuery(TermQuery(Term("field", "y")), self.filter)
        topDocs = self.searcher.search(fq, 50)
        self.assertEqual(0, topDocs.totalHits)
Ejemplo n.º 4
0
    def testNotCloseEnough(self):
        """A slop of 2 is not enough distance to match "one" ... "five"."""
        builder = self.builder
        builder.setSlop(2)
        for word in ("one", "five"):
            builder.add(Term("field", word))
        topDocs = self.searcher.search(builder.build(), 50)
        self.assertEqual(0, topDocs.totalHits.value)
Ejemplo n.º 5
0
 def testDiacriticsShouldBeNormalizedNFC(self):
     """Combining diacritics in the input are NFC-normalized before matching."""
     pq = PhraseQuery()
     pq.add(Term("title", "more"))
     pq.add(Term("title", "e"))
     # "More" followed by a combining acute accent, then "e".
     self.assertConversion(pq, 'title=More\xcc\x81e') # Combined `
     from unicodedata import normalize
     # NOTE(review): relies on the Python 2 ``unicode`` builtin.
     self.assertConversion(TermQuery(Term('title', 'moree')), normalize('NFC', unicode('title=More\xcc\x81e')))
Ejemplo n.º 6
0
def lucene_sample_query_parse(sampleq, ftypes):
    """Build a BooleanQuery from a list of raw sample-field query strings.

    Each entry of ``sampleq`` has the form ``<field><op><value>`` (split on
    snapconf.RANGE_QUERY_OPS).  Depending on the field's declared type the
    clause is added as a range query (numeric), an exact phrase query
    (whitespace in the value), or a plain term query.

    :param sampleq: iterable of raw query strings
    :param ftypes: mapping typed-field-name -> (type char, conversion method)
    :return: the assembled BooleanQuery
    """
    # NOTE: the original also built unused ``fields``/``queries``/``booleans``
    # lists; they were dead locals and have been removed.
    bq = BooleanQuery()
    for query_tuple in sampleq:
        (field, op_, value) = re.split(snapconf.RANGE_QUERY_OPS, query_tuple)
        m = snapconf.RANGE_QUERY_FIELD_PATTERN.search(query_tuple)
        if m is None or field is None:
            continue
        op = m.group(1)
        if op not in snapconf.operators:
            sys.stderr.write("bad operator %s in range query,exiting\n" % (str(op)))
            sys.exit(-1)
        field_w_type = snapconf.SAMPLE_HEADER_FIELDS_TYPE_MAP[field]
        (fieldtypechar, ftype_method) = ftypes[field_w_type]
        if fieldtypechar == 'i' or fieldtypechar == 'f':
            # numeric field -> range query
            bq.add(lucene_range_query_parse(field_w_type, op, value, fieldtypechar, ftype_method), BOOLEAN_OCCUR)
        elif ' ' in value or '\t' in value:
            # multi-word value -> phrase query, slop 0 forces exact matching
            pquery = PhraseQuery()
            for v in re.split(r'\s+', value):
                pquery.add(Term(field_w_type, v.lower()))
            pquery.setSlop(0)
            bq.add(pquery, BOOLEAN_OCCUR)
        else:
            # single-word value -> term query
            bq.add(TermQuery(Term(field_w_type, value.lower())), BOOLEAN_OCCUR)
        sys.stderr.write("value + fields: %s %s\n" % (value.lower(), field_w_type))
    return bq
Ejemplo n.º 7
0
    def testNotCloseEnough(self):
        """With slop 2, "one" and "five" are too far apart to match."""
        query = self.query
        query.setSlop(2)
        for word in ("one", "five"):
            query.add(Term("field", word))
        topDocs = self.searcher.search(query, 50)
        self.assertEqual(0, topDocs.totalHits)
Ejemplo n.º 8
0
 def testIndexRelationTermOutput(self):
     """``field=value`` converts to a term query; quoted values to a phrase."""
     self.assertConversion(TermQuery(Term("animal", "cats")), 'animal=cats')
     phrase = PhraseQuery()
     for word in ("cats", "dogs"):
         phrase.add(Term("animal", word))
     self.assertConversion(phrase, 'animal="cats dogs"')
     # Conversion is case-insensitive.
     self.assertConversion(phrase, 'animal="catS Dogs"')
Ejemplo n.º 9
0
    def testBarelyCloseEnough(self):
        """A slop of 3 just bridges the gap between "one" and "five"."""
        builder = self.builder
        builder.setSlop(3)
        for word in ("one", "five"):
            builder.add(Term("field", word))
        topDocs = self.searcher.search(builder.build(), 50)
        self.assertEqual(1, topDocs.totalHits)
Ejemplo n.º 10
0
 def testGet(self):
     """Equal queries share one cached filter; different queries do not."""
     cache = LruCache(keyEqualsFunction=lambda q1, q2: q1.equals(q2),
                      createFunction=lambda q: QueryWrapperFilter(q))
     first = cache.get(TermQuery(Term("field1", "value1")))
     second = cache.get(TermQuery(Term("field1", "value1")))
     # Same (equal) query object -> same cached filter instance.
     self.assertEquals(first, second)
     third = cache.get(TermQuery(Term("field1", "value2")))
     self.assertNotEquals(first, third)
Ejemplo n.º 11
0
    def cascadeSearch(self, searcher, reader, phrase, ratio):
        """Search ``phrase`` through progressively looser matching tiers.

        Tiers, tried in order with the score weight applied to each hit:
        exact (100), case/punctuation-normalized (90), alpha-sorted (80),
        stemmed (70), sorted+stemmed (60).  After the exact tier, later
        tiers only run while no results have been found yet, unless
        self._includeAllResults is set, in which case every tier runs.

        :param searcher: index searcher to query
        :param reader: index reader used to fetch matched documents
        :param phrase: the raw search phrase
        :param ratio: multiplier applied to every tier weight
        :return: list of (weighted_score, document) tuples
        """
        results = []

        normPhrase = self.normalizeCasePunct(phrase)
        sortedPhrase = self.sortWords(normPhrase)
        stemmedPhrase = self.stemWords(normPhrase)
        stemmedSortedPhrase = self.stemWords(sortedPhrase)

        tiers = [
            ("str", phrase, 100),
            ("str_norm", normPhrase, 90),
            ("str_sorted", sortedPhrase, 80),
            ("str_stemmed", stemmedPhrase, 70),
            ("str_stemmedSorted", stemmedSortedPhrase, 60),
        ]

        for index, (fieldName, text, weight) in enumerate(tiers):
            # The exact tier always runs; later tiers are skipped once
            # something matched, unless all results were requested.
            if index > 0 and results and not self._includeAllResults:
                continue
            query = TermQuery(Term(fieldName, text))
            hits = searcher.search(query, self._maxHits).scoreDocs
            # BUGFIX: the original tested ``hits > 0`` — comparing the hit
            # *list* to an int, which is always True on Python 2 and a
            # TypeError on Python 3.  Iterating the (possibly empty) list
            # is the intended behavior.
            for hit in hits:
                results.append((weight * ratio, self.getDocument(reader, hit)))

        return results
Ejemplo n.º 12
0
    def doc_search(self, keywords):
        """Total corpus frequency of ``keywords`` across Title, Body and Tags.

        Sums the per-field totalTermFreq values, caches the total on
        self.term, and returns it.
        """
        term = sum(self.lReader.totalTermFreq(Term(fieldName, keywords))
                   for fieldName in ('Title', 'Body', 'Tags'))
        self.term = term
        return term
Ejemplo n.º 13
0
    def testBraces(self):
        """Parentheses group boolean clauses; a lone braced term is a term query."""
        self.assertConversion(TermQuery(Term('unqualified', 'cats')), '(cats)')

        inner = BooleanQuery()
        for animal in ('cats', 'dogs'):
            inner.add(TermQuery(Term('unqualified', animal)), BooleanClause.Occur.MUST)

        outer = BooleanQuery()
        outer.add(inner, BooleanClause.Occur.SHOULD)
        outer.add(TermQuery(Term('unqualified', 'mice')), BooleanClause.Occur.SHOULD)

        self.assertConversion(outer, '(cats AND dogs) OR mice')
Ejemplo n.º 14
0
def delete(indexDir: str, id: str):
    """Delete the document with the given id, plus any sub-documents whose
    ids have the form ``<id>.<suffix>``, from the index at ``indexDir``.

    :param indexDir: path of the index directory
    :param id: document id (shadows the ``id`` builtin; name kept for
        interface compatibility)
    """
    index_dir = SimpleFSDirectory(Paths.get(indexDir))
    config = IndexWriterConfig(StandardAnalyzer())
    index_writer = IndexWriter(index_dir, config)
    try:
        # NOTE(review): despite the "term" in its name this is a RegexpQuery
        # on the bare id — confirm a TermQuery was not intended.
        delete_term_query = RegexpQuery(Term('id', id))
        # Matches all child ids: "<id>." followed by anything.
        delete_reg_query = RegexpQuery(Term('id', id + r'\..*'))

        index_writer.deleteDocuments(delete_term_query)
        index_writer.deleteDocuments(delete_reg_query)
        index_writer.commit()
    finally:
        # Always release the writer, even if deletion or commit fails.
        index_writer.close()
Ejemplo n.º 15
0
    def __init__(self, *args):
        """Set up the term queries shared by the boolean-OR test methods."""

        super(BooleanOrTestCase, self).__init__(*args)

        # Field names used by the fixture documents.
        self.FIELD_T = "T"
        self.FIELD_C = "C"

        # Term queries the tests combine in various boolean configurations.
        self.t1 = TermQuery(Term(self.FIELD_T, "files"))
        self.t2 = TermQuery(Term(self.FIELD_T, "deleting"))
        self.c1 = TermQuery(Term(self.FIELD_C, "production"))
        self.c2 = TermQuery(Term(self.FIELD_C, "optimize"))

        # Presumably assigned by a setUp not visible here — confirm.
        self.searcher = None
Ejemplo n.º 16
0
    def testEquality(self):
        """Two BooleanQuery trees built identically must compare equal."""

        def buildQuery():
            # Nested two-term SHOULD sub-query.
            nested = BooleanQuery()
            nested.add(TermQuery(Term("field", "nestedvalue1")),
                       BooleanClause.Occur.SHOULD)
            nested.add(TermQuery(Term("field", "nestedvalue2")),
                       BooleanClause.Occur.SHOULD)

            # Top level: two SHOULD terms followed by the nested query.
            query = BooleanQuery()
            query.add(TermQuery(Term("field", "value1")),
                      BooleanClause.Occur.SHOULD)
            query.add(TermQuery(Term("field", "value2")),
                      BooleanClause.Occur.SHOULD)
            query.add(nested, BooleanClause.Occur.SHOULD)
            return query

        self.assert_(buildQuery().equals(buildQuery()))
Ejemplo n.º 17
0
    def testEquality(self):
        """Two BooleanQuery trees built identically via Builders must be equal."""

        b1 = BooleanQuery.Builder()
        b1.add(TermQuery(Term("field", "value1")), BooleanClause.Occur.SHOULD)
        b1.add(TermQuery(Term("field", "value2")), BooleanClause.Occur.SHOULD)

        n1 = BooleanQuery.Builder()
        n1.add(TermQuery(Term("field", "nestedvalue1")),
               BooleanClause.Occur.SHOULD)
        n1.add(TermQuery(Term("field", "nestedvalue2")),
               BooleanClause.Occur.SHOULD)
        nested1 = n1.build()
        b1.add(nested1, BooleanClause.Occur.SHOULD)
        # Build only once all clauses are added (the original also built
        # prematurely before adding the nested clause and discarded the result).
        bq1 = b1.build()

        b2 = BooleanQuery.Builder()
        b2.add(TermQuery(Term("field", "value1")), BooleanClause.Occur.SHOULD)
        b2.add(TermQuery(Term("field", "value2")), BooleanClause.Occur.SHOULD)

        n2 = BooleanQuery.Builder()
        n2.add(TermQuery(Term("field", "nestedvalue1")),
               BooleanClause.Occur.SHOULD)
        n2.add(TermQuery(Term("field", "nestedvalue2")),
               BooleanClause.Occur.SHOULD)
        nested2 = n2.build()
        b2.add(nested2, BooleanClause.Occur.SHOULD)
        bq2 = b2.build()

        self.assert_(bq1.equals(bq2))
Ejemplo n.º 18
0
    def testWildcards(self):
        """A trailing ``*`` converts to a PrefixQuery; every other wildcard
        form degrades to a plain term query."""
        query = PrefixQuery(Term('unqualified', 'prefix'))
        self.assertConversion(query, 'prefix*')
        # Conversion lowercases the input.
        self.assertConversion(query, 'PREfix*')
        query = PrefixQuery(Term('field', 'prefix'))
        self.assertConversion(query, 'field="PREfix*"')
        self.assertConversion(query, 'field=prefix*')
        query = PrefixQuery(Term('field', 'oc-0123'))
        self.assertConversion(query, 'field="oc-0123*"')
        # A one-character prefix collapses to a term query on that character.
        query = TermQuery(Term('field', 'p'))
        self.assertConversion(query, 'field="P*"')
        #only prefix queries for now
        query = TermQuery(Term('field', 'post'))
        self.assertConversion(query, 'field="*post"')

        # A doubled trailing asterisk is treated as a plain term too.
        query = TermQuery(Term('field', 'prefix'))
        self.assertConversion(query, 'field=prefix**')

        result = LuceneQueryComposer(unqualifiedTermFields=[("field0", 0.2), ("field1", 2.0)], luceneSettings=LuceneSettings()).compose(parseCql("prefix*"))

        # An unqualified prefix search expands to one boosted SHOULD clause
        # per configured unqualified term field.
        query = BooleanQuery()
        left = PrefixQuery(Term("field0", "prefix"))
        left.setBoost(0.2)
        query.add(left, BooleanClause.Occur.SHOULD)

        right = PrefixQuery(Term("field1", "prefix"))
        right.setBoost(2.0)
        query.add(right, BooleanClause.Occur.SHOULD)

        # Compared via repr since BooleanQuery equality is not usable here.
        self.assertEquals(type(query), type(result))
        self.assertEquals(repr(query), repr(result))
Ejemplo n.º 19
0
 def getLastStampId(self, prefix='oai_dc', setSpec=None):
     """Return the stamp of the newest record for ``prefix``/``setSpec``.

     Sorts descending by numeric stamp and takes the first hit.  When a
     prefix is given it takes precedence over setSpec; both None selects
     all documents.  Returns None when nothing matches.
     """
     searcher = self._getSearcher()
     sort = Sort(SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG, True))
     if prefix is None and setSpec is None:
         query = MatchAllDocsQuery()
     elif prefix is not None:
         query = TermQuery(Term(PREFIX_FIELD, prefix))
     else:
         query = TermQuery(Term(SETS_FIELD, setSpec))
     results = searcher.search(query, 1, sort)
     if results.totalHits.value < 1:
         return None
     return _stampFromDocument(searcher.doc(results.scoreDocs[0].doc))
Ejemplo n.º 20
0
 def query_doc(self):
     """Collect this document plus all of its sections and sentences.

     Matches the document whose id equals self._id, every section id of the
     form "<id>.<n>" and every sentence id "<id>.<n>.<m>", merges the hits,
     and stores each hit's non-empty fields in self._resDict keyed by id.

     :return: self, for call chaining
     """
     searcher = self._searcher
     document_id = str(self._id)
     query_document = RegexpQuery(Term('id', document_id))
     top_docs_doc = searcher.search(query_document, 1)
     query_section = RegexpQuery(Term('id', document_id + '\.[0-9]+'))
     top_docs_section = searcher.search(query_section, 99999)
     query_paragraph = RegexpQuery(
         Term('id', document_id + '\.[0-9]+\.[0-9]+'))
     top_docs_sentence = searcher.search(query_paragraph, 99999)
     top_docs = top_docs_doc.merge(
         1000000, [top_docs_section, top_docs_doc, top_docs_sentence])
     # Stored fields copied per hit; empty/missing values are skipped.
     # (The original shadowed the ``id`` and ``type`` builtins and repeated
     # one ``if`` block per field — replaced by a loop, same behavior.)
     field_names = ('document', 'section', 'author', 'dynasty', 'type',
                    'text', 'color', 'area', 'zhujie', 'detail')
     res_dict = {}
     for hit in top_docs.scoreDocs:
         doc = searcher.doc(hit.doc)
         entry = {}
         for name in field_names:
             value = doc.get(name)
             if value:
                 entry[name] = value
         res_dict[doc.get('id')] = entry
     self._resDict = res_dict
     return self
Ejemplo n.º 21
0
    def testUnqualifiedTermFields(self):
        """An unqualified term expands to one boosted SHOULD clause per field."""
        composer = LuceneQueryComposer(unqualifiedTermFields=[("field0", 0.2), ("field1", 2.0)], luceneSettings=LuceneSettings())
        result = composer.compose(parseCql("value"))

        expected = BooleanQuery()
        for fieldName, boost in (("field0", 0.2), ("field1", 2.0)):
            clause = TermQuery(Term(fieldName, "value"))
            clause.setBoost(boost)
            expected.add(clause, BooleanClause.Occur.SHOULD)

        # Compared via repr since query equality is not usable here.
        self.assertEquals(type(expected), type(result))
        self.assertEquals(repr(expected), repr(result))
Ejemplo n.º 22
0
 def query_section(self, section):
     """Collect every record under this document belonging to ``section``.

     Matches ids of the form "<self._id>.<anything>" whose 'section' field
     equals ``section``, stores each hit's non-empty fields in
     self._resDict keyed by record id, and adds a synthetic entry mapping
     the document id to the section name.

     :param section: section name to filter on
     :return: self, for call chaining
     """
     searcher = self._searcher
     query_doc = RegexpQuery(Term('id', self._id + '\\..+'))
     query_section = TermQuery(Term('section', section))
     builder = BooleanQuery.Builder()
     builder.add(BooleanClause(query_doc, BooleanClause.Occur.MUST))
     builder.add(BooleanClause(query_section, BooleanClause.Occur.MUST))
     query = builder.build()
     top_docs = searcher.search(query, 1000000)
     # Stored fields copied per hit; empty/missing values are skipped.
     # (The original shadowed the ``id``/``type`` builtins, clobbered the
     # ``section`` parameter inside the loop, and repeated one ``if`` block
     # per field.  Since hits are filtered by an exact term match on
     # 'section', the clobbered value equals the parameter anyway.)
     field_names = ('document', 'section', 'author', 'dynasty', 'type',
                    'text', 'color', 'area', 'zhujie', 'detail')
     res_dict = {}
     for hit in top_docs.scoreDocs:
         doc = searcher.doc(hit.doc)
         entry = {}
         for name in field_names:
             value = doc.get(name)
             if value:
                 entry[name] = value
         res_dict[doc.get('id')] = entry
     # The document itself maps to the section it was queried with.
     res_dict[self._id] = {'document': section}
     self._resDict = res_dict
     return self
Ejemplo n.º 23
0
 def _getDocId(self, identifier):
     """Return the (ephemeral) Lucene doc number for ``identifier``,
     or None when the identifier is not indexed."""
     query = TermQuery(Term(IDENTIFIER_FIELD, identifier))
     results = self._getSearcher(identifier).search(query, 1)
     if results.totalHits.value == 0:
         return None
     return results.scoreDocs[0].doc
Ejemplo n.º 24
0
    def getNrOfRecords(self,
                       prefix='oai_dc',
                       setSpec=None,
                       continueAfter=None,
                       oaiFrom=None,
                       oaiUntil=None,
                       partition=None):
        """Count records matching the given OAI selection criteria.

        Returns a dict with "total" (all matching records) and "deletes"
        (the subset marked with a tombstone).

        :param prefix: metadata prefix to select on
        :param setSpec: optional single set specification
        :param continueAfter: resumption point, passed through to _search
        :param oaiFrom: lower datestamp bound
        :param oaiUntil: upper datestamp bound
        :param partition: optional partition restriction
        """
        queryBuilder = self._luceneQueryBuilder(
            prefix=prefix,
            sets=[setSpec] if setSpec else None,
            partition=partition)
        collector = self._search(queryBuilder.build(),
                                 continueAfter,
                                 oaiFrom,
                                 oaiUntil,
                                 batchSize=1,
                                 shouldCountHits=True)

        # Narrow the same query to tombstoned (deleted) records only.  This
        # mutates queryBuilder, so it must happen after the first search.
        queryBuilder.add(TermQuery(Term(TOMBSTONE_FIELD, TOMBSTONE_VALUE)),
                         BooleanClause.Occur.MUST)

        deleteCollector = self._search(queryBuilder.build(),
                                       continueAfter,
                                       oaiFrom,
                                       oaiUntil,
                                       batchSize=1,
                                       shouldCountHits=True)
        return {
            "total": collector.totalHits(),
            "deletes": deleteCollector.totalHits()
        }
Ejemplo n.º 25
0
    def search(self, q_ids, limit):
        """Look up indexed answers for the given question ids.

        Iterates ``q_ids`` in ranking order (earlier ids score higher),
        fetches at most one indexed answer per id, and stops once ``limit``
        ids have produced results.  Releases the searcher and closes the
        directory before returning.

        :param q_ids: ordered question ids
        :param limit: stop after this many ids yield indexed documents
        :return: list of ResultItem
        """
        docs = []
        c = 0  # number of question ids that produced at least one hit
        for i, q_id in enumerate(q_ids):  # questions that were never indexed cannot be found
            query = TermQuery(Term("question_id", str(q_id)))
            topdocs = self.searcher.search(
                query, 1).scoreDocs  # limited to 1: currently only the accepted answer is indexed
            # The TermQuery goes to the index searcher; scoreDocs holds the
            # (float) scores of the top answers for this question id.
            for hit in topdocs:
                doc = self.searcher.doc(hit.doc)
                docs.append(
                    ResultItem(doc,
                               len(q_ids) - i, doc.get("title"),
                               doc.get("question_id")))

            if len(topdocs) > 0:
                c += 1
                if c >= limit:
                    break

        # self.searchermgr.decRef(self.searcher)
        self.searchermgr.release(self.searcher)
        self.searcher = None
        self.directory.close()
        self.directory = None
        return docs
Ejemplo n.º 26
0
    def testGiga(self):
        """FuzzyQuery with 0 edits behaves as an exact term match."""
        writer = self.getWriter(analyzer=StandardAnalyzer())

        titles = [
            "Lucene in Action",
            "Lucene for Dummies",
            "Giga byte",
            "ManagingGigabytesManagingGigabyte",
            "ManagingGigabytesManagingGigabytes",
            "The Art of Computer Science",
            "J. K. Rowling",
            "JK Rowling",
            "Joanne K Roling",
            "Bruce Willis",
            "Willis bruce",
            "Brute willis",
            "B. willis",
        ]
        for title in titles:
            self._addDoc(title, writer)

        reader = writer.getReader()
        writer.close()

        searcher = self.getSearcher(reader=reader)
        hits = searcher.search(FuzzyQuery(Term("field", "giga"), 0), 10).scoreDocs

        # Only the document containing the exact token "giga" matches.
        self.assertEqual(1, len(hits))
        self.assertEqual("Giga byte", searcher.doc(hits[0].doc).get("field"))
Ejemplo n.º 27
0
    def test2(self):
        """FuzzyQuery("WEBER", maxEdits=2, prefixLength=1) matches 8 names."""
        writer = self.getWriter()

        names = [
            "LANGE", "LUETH", "PIRSING", "RIEGEL", "TRZECZIAK", "WALKER",
            "WBR", "WE", "WEB", "WEBE", "WEBER", "WEBERE", "WEBREE",
            "WEBEREI", "WBRE", "WITTKOPF", "WOJNAROWSKI", "WRICKE",
        ]
        for name in names:
            self._addDoc(name, writer)

        reader = writer.getReader()
        searcher = self.getSearcher(reader=reader)
        writer.close()

        # Prefix length 1 pins the leading "W"; two edits allowed after that.
        query = FuzzyQuery(Term("field", "WEBER"), 2, 1)
        hits = searcher.search(query, 1000).scoreDocs
        self.assertEqual(8, len(hits))
Ejemplo n.º 28
0
    def search(self, query, limit=1):
        """Search Stackoverflow answers for ``query`` via Google site search.

        Google is queried for matching Stackoverflow question ids; each id
        is then looked up in the local index and result items are collected
        (earlier Google hits rank higher) until ``limit`` question ids have
        produced at least one indexed document.

        :param query: free-text search string
            (" java site:stackoverflow.com" is appended)
        :param limit: stop after this many question ids yield results
        :return: list of SOResultItem
        """
        docs = []
        query += " java site:stackoverflow.com"
        g = GoogleSearcher()
        q_ids = g.search(query)  # Stackoverflow question ids

        found = 0  # question ids that produced at least one indexed document
        for i, q_id in enumerate(q_ids):
            query = TermQuery(Term("id", q_id))
            # Up to 10 answers per question id; rank weight decreases with
            # the id's position in Google's result list.
            topdocs = self.searcher.search(query, 10).scoreDocs
            for hit in topdocs:
                doc = self.searcher.doc(hit.doc)
                docs.append(
                    SOResultItem(doc,
                                 len(q_ids) - i, doc.get("title"),
                                 doc.get("id"), doc.get("description")))

                print("%s, Question Id: %s, Answer Id: %s" %
                      (doc.get("title"), doc.get("id"), doc.get("answer_id")))

            if len(topdocs) > 0:
                # BUGFIX: the original reused the enumerate index as the hit
                # counter, so the loop stopped after ``limit`` ids had been
                # *visited* rather than after ``limit`` ids produced results
                # (compare the sibling search() that counts with ``c``).
                found += 1
                if found >= limit:
                    break
            else:
                # Normalized from the Python 2 print statement.
                print("Stackoverflow id %s is not in our index" % q_id)
        return docs
Ejemplo n.º 29
0
def getSpanNearQuery(analyzer, s, field="title", slop=100, inOrder=True):
    """Build a SpanNearQuery of fuzzy span terms for every token of ``s``.

    ``s`` is tokenized with ``analyzer``; each token becomes a fuzzy span
    term on ``field``, combined under the given slop and ordering constraint.
    """
    clauses = []
    for keyword in tokenize_string(analyzer, s):
        clauses.append(SpanMultiTermQueryWrapper(FuzzyQuery(Term(field, keyword))))
    return SpanNearQuery(clauses, slop, inOrder)
Ejemplo n.º 30
0
    def testExact(self):
        """
        Slop 0 (the default) matches the exact phrase but not the
        reversed word order.
        """
        # "four five" occurs verbatim in exactly one document.
        for word in ("four", "five"):
            self.builder.add(Term("field", word))
        topDocs = self.searcher.search(self.builder.build(), 50)
        self.assertEqual(1, topDocs.totalHits, "exact match")

        # The reversed phrase must not match with zero slop.
        self.builder = PhraseQuery.Builder()
        for word in ("two", "one"):
            self.builder.add(Term("field", word))
        topDocs = self.searcher.search(self.builder.build(), 50)
        self.assertEqual(0, topDocs.totalHits, "reverse not exact")