Beispiel #1
0
 def docs(self, name, value, counts=False):
     """Return an iterator of doc ids whose field ``name`` contains term ``value``.

     :param name: field name
     :param value: term text, wrapped in a ``BytesRef`` for the lookup
     :param counts: when true, yield ``(doc, freq)`` pairs instead of bare ids
     """
     postings = index.MultiFields.getTermDocsEnum(
         self.indexReader, name, util.BytesRef(value))
     if postings:
         # Call nextDoc repeatedly until it returns the NO_MORE_DOCS sentinel.
         ids = iter(postings.nextDoc, index.PostingsEnum.NO_MORE_DOCS)
     else:
         # A falsy enum is treated as "no matching documents".
         ids = ()
     if counts:
         # freq() is read lazily, while the enum is positioned on ``doc``.
         return ((doc, postings.freq()) for doc in ids)
     return iter(ids)
Beispiel #2
0
 def items(self, *values: str) -> Iterator[document.Field]:
     """Yield the indexed component fields built from each of ``values``.

     Every value is expanded through ``self.values`` and paired with
     ``self.names``; each pair yields a ``document.Field`` and, when
     ``self.docvalues`` is set, an additional ``self.docValueClass`` field.
     """
     # Use the ``docValueLess`` attribute as the field type if present, else self.
     field_type = getattr(self, 'docValueLess', self)
     for raw in values:
         pairs = zip(self.names, self.values(raw))
         for field_name, text in pairs:
             yield document.Field(field_name, text, field_type)
             if not self.docvalues:
                 continue
             yield self.docValueClass(field_name, util.BytesRef(text))
Beispiel #3
0
def test_indexes(tempdir):
    """Exercise Indexer construction, merging, deletion, snapshots and index checking.

    The steps share state (``temp``, ``indexer``, ``commit``) and depend on
    the order of commits, so they are intentionally kept in one test.
    """
    # Constructor error cases: a searcher needs arguments, and opening
    # ``tempdir`` in 'r' mode is expected to fail at this point
    # (presumably because no index exists there yet -- TODO confirm).
    with pytest.raises(TypeError):
        engine.IndexSearcher()
    with pytest.raises(lucene.JavaError):
        engine.Indexer(tempdir, 'r')
    # Indexer created without a directory argument, with two declared fields.
    indexer = engine.Indexer()
    indexer.set('name', engine.Field.String, stored=True)
    indexer.set('text', engine.Field.Text)
    # Add one empty document to an index located at ``tempdir``.
    with engine.Indexer(tempdir) as temp:
        temp.add()
    # A second empty document is added first; the KeyError is expected from
    # the ``missing`` field (presumably because it was never declared).
    with pytest.raises(KeyError), engine.Indexer(tempdir) as temp:
        temp.add()
        temp.add(missing='')
    # ``+=`` accepts an indexer, a directory object, or a path.
    for other in (temp, temp.directory, tempdir):
        indexer += other
    assert len(indexer) == 3
    # Index a pre-tokenized text value together with a BytesRef-valued name,
    # then look the document up again through its term.
    analyzer = engine.Analyzer.whitespace()
    indexer.add(text=analyzer.tokens('?'), name=util.BytesRef('{}'))
    indexer.commit()
    assert indexer[next(indexer.docs('text', '?'))]['name'] == '{}'
    # After deleting by term, commit(merge=True) must leave no deletions and
    # commit(merge=1) must leave exactly one reader.
    indexer.delete('text', '?')
    indexer.commit(merge=True)
    assert not indexer.hasDeletions()
    indexer.commit(merge=1)
    assert len(list(indexer.readers)) == 1
    # Removing the wrapped reader/searcher attribute makes dependent
    # attribute lookups raise AttributeError.
    reader = engine.indexers.IndexReader(indexer.indexReader)
    del reader.indexReader
    with pytest.raises(AttributeError):
        reader.maxDoc
    del indexer.indexSearcher
    with pytest.raises(AttributeError):
        indexer.search

    # Snapshot handling on the on-disk index.
    indexer = engine.Indexer(tempdir)
    indexer.add()
    indexer.commit()
    files = set(os.listdir(tempdir))  # directory listing before the snapshot
    path = os.path.join(tempdir, 'temp')
    with indexer.snapshot() as commit:
        # A commit made inside the snapshot advances the generation.
        indexer.commit(merge=1)
        assert indexer.indexCommit.generation > commit.generation
        engine.indexers.copy(commit, path)
        # Chained comparison: the copied files equal the snapshot's files,
        # which are a strict subset of the pre-snapshot listing, which is a
        # strict subset of the current listing.
        assert set(os.listdir(path)) == set(commit.fileNames) < files < set(
            os.listdir(tempdir))
        # Replace the copied segments file with an empty one; copying over
        # it again is expected to raise OSError.
        filepath = os.path.join(path, commit.segmentsFileName)
        os.remove(filepath)
        open(filepath, 'w').close()
        with pytest.raises(OSError):
            engine.indexers.copy(commit, path)
    # Checking the directory while ``indexer`` still exists is expected to
    # fail (presumably because the writer still holds the index open).
    with pytest.raises(lucene.JavaError):
        indexer.check(tempdir)
    del indexer
    # Once the indexer is gone the index can be reopened, the snapshot's
    # segments file is no longer in ``tempdir``, and the index checks clean.
    assert engine.Indexer(tempdir)
    assert not os.path.exists(os.path.join(tempdir, commit.segmentsFileName))
    assert engine.IndexWriter.check(tempdir).clean
    assert not engine.IndexWriter.check(tempdir, fix=True).numBadSegments
Beispiel #4
0
 def positions(self, name, value, payloads=False, offsets=False):
     """Generate ``(doc, positions)`` pairs for documents containing the given term.

     :param name: field name
     :param value: term text, wrapped in a ``BytesRef`` for the lookup
     :param payloads: yield ``(position, payload string)`` pairs, only for positions that have a payload
     :param offsets: yield ``(start offset, end offset)`` pairs instead of positions (ignored when payloads is set)
     """
     postings = index.MultiFields.getTermPositionsEnum(
         self.indexReader, name, util.BytesRef(value))
     if not postings:
         # A falsy enum is treated as "no matching documents".
         return
     for doc in iter(postings.nextDoc, index.PostingsEnum.NO_MORE_DOCS):
         found = []
         for _ in range(postings.freq()):
             # The enum must be advanced before payload/offsets are read.
             position = postings.nextPosition()
             if payloads:
                 if postings.payload:
                     found.append((position, postings.payload.utf8ToString()))
             elif offsets:
                 found.append((postings.startOffset(), postings.endOffset()))
             else:
                 found.append(position)
         yield doc, found
Beispiel #5
0
    def terms(self,
              name,
              value='',
              stop='',
              counts=False,
              distance=0,
              prefix=0):
        """Return an iterator over a slice of a field's term values.

        :param name: field name
        :param value: term prefix, start value (given stop), or fuzzy value (given distance)
        :param stop: optional upper bound for range terms
        :param counts: yield ``(term, document frequency)`` pairs instead of bare terms
        :param distance: maximum edit distance for fuzzy terms
        :param prefix: prefix length for fuzzy terms
        """
        field_terms = index.MultiFields.getTerms(self.indexReader, name)
        if not field_terms:
            return iter([])
        target = index.Term(name, value)
        termsenum = field_terms.iterator()
        if distance:
            # The numeric type passed to FuzzyTermsEnum depends on ``lucene6``.
            coerce = float if lucene6 else int
            distance = coerce(distance)
            termsenum = search.FuzzyTermsEnum(field_terms,
                                              util.AttributeSource(), target,
                                              distance, prefix, False)
            raw = termsenum
        else:
            # Position the enum at the first term >= value, then emit that
            # term followed by everything after it.
            termsenum.seekCeil(util.BytesRef(value))
            current = [termsenum.term()]
            raw = itertools.chain(current,
                                  util.BytesRefIterator.cast_(termsenum))
        decoded = map(operator.methodcaller('utf8ToString'), raw)
        # ``distance`` is re-tested after coercion on purpose: a value that
        # coerces to 0 still gets the range/prefix cut-off below.
        if not distance:
            if stop:
                keep = partial(operator.gt, stop)
            else:
                keep = operator.methodcaller('startswith', value)
            decoded = itertools.takewhile(keep, decoded)
        if not counts:
            return decoded
        # docFreq() is read lazily, while the enum is positioned on the term.
        return ((text, termsenum.docFreq()) for text in decoded)
Beispiel #6
0
 def payload(self, data):
     """Wrap ``data`` in a ``BytesRef`` and store it as ``self.Payload.payload``."""
     encoded = util.BytesRef(data)
     self.Payload.payload = encoded
Beispiel #7
0
 def range(cls, name: str, start, stop, lower=True, upper=False) -> 'Query':
     """Return a lucene TermRangeQuery, half-open by default.

     :param name: field name
     :param start: lower bound, or None to leave it unwrapped
     :param stop: upper bound, or None to leave it unwrapped
     :param lower: passed through as the include-lower flag
     :param upper: passed through as the include-upper flag
     """
     # None bounds are passed through as-is; anything else becomes a BytesRef.
     lower_term = None if start is None else util.BytesRef(start)
     upper_term = None if stop is None else util.BytesRef(stop)
     return cls(search.TermRangeQuery, name, lower_term, upper_term, lower,
                upper)