Beispiel #1
0
 def doc_scores(self, searcher, weighting = None, exclude_docs = None):
     """Yield ``(docnum, score)`` pairs from the first subquery, restricted
     to documents that the second subquery also matches.

     ``self.subqueries`` is unpacked as ``(query, filterquery)``. The
     ``exclude_docs`` argument is passed only to the filter query's
     ``docs()`` call; ``weighting`` is passed only to the scored query's
     ``doc_scores()`` call.
     """
     scored_query, filter_query = self.subqueries

     # Record every document number the filter query produces.
     allowed = BitVector(searcher.doc_count_all())
     for allowed_docnum in filter_query.docs(searcher, exclude_docs = exclude_docs):
         allowed.set(allowed_docnum)

     # Pass through only the scored hits that the filter also matched.
     for docnum, score in scored_query.doc_scores(searcher, weighting = weighting):
         if docnum in allowed:
             yield docnum, score
Beispiel #2
0
 def doc_scores(self, searcher, weighting = None, exclude_docs = None):
     """Yield ``(docnum, score)`` pairs from the first subquery, restricted
     to documents that the second subquery also matches.

     ``self.subqueries`` is unpacked as ``(query, filterquery)``. The
     ``exclude_docs`` argument is passed only to the filter query's
     ``docs()`` call; ``weighting`` is passed only to the scored query's
     ``doc_scores()`` call.
     """
     scored_query, filter_query = self.subqueries

     # Record every document number the filter query produces.
     allowed = BitVector(searcher.doc_count_all())
     for allowed_docnum in filter_query.docs(searcher, exclude_docs = exclude_docs):
         allowed.set(allowed_docnum)

     # Pass through only the scored hits that the filter also matched.
     for docnum, score in scored_query.doc_scores(searcher, weighting = weighting):
         if docnum in allowed:
             yield docnum, score
Beispiel #3
0
    def docs(self, searcher, exclude_docs=None):
        """Return an iterator over a bit vector marking the documents whose
        postings contain ``self.text`` in any of ``self.fieldnames``.

        Fields for which ``(fieldnum, text)`` is not in the searcher are
        skipped. ``exclude_docs`` is forwarded to ``searcher.postings()``.
        """
        term_text = self.text
        matched = BitVector(searcher.doc_count_all())

        for name in self.fieldnames:
            fieldnum = searcher.fieldname_to_num(name)

            # Nothing to collect when the term is absent from this field.
            if (fieldnum, term_text) not in searcher:
                continue

            postings = searcher.postings(fieldnum, term_text,
                                         exclude_docs=exclude_docs)
            for docnum, _ in postings:
                matched.set(docnum)

        return iter(matched)
Beispiel #4
0
 def docs(self, searcher, exclude_docs = None):
     """Return an iterator over a bit vector marking the documents whose
     postings contain ``self.text`` in any of ``self.fieldnames``.

     Fields for which ``(fieldnum, text)`` is not in the searcher are
     skipped. ``exclude_docs`` is forwarded to ``searcher.postings()``.
     """
     term_text = self.text
     matched = BitVector(searcher.doc_count_all())

     for name in self.fieldnames:
         fieldnum = searcher.fieldname_to_num(name)

         # Nothing to collect when the term is absent from this field.
         if (fieldnum, term_text) not in searcher:
             continue

         postings = searcher.postings(fieldnum, term_text,
                                      exclude_docs = exclude_docs)
         for docnum, _ in postings:
             matched.set(docnum)

     return iter(matched)
Beispiel #5
0
def _not_vector(searcher, notqueries, sourcevector):
    # Returns a BitVector where the positions are docnums
    # and True means the docnum is banned from the results.
    # 'sourcevector' is the incoming exclude_docs. This
    # function makes a copy of it and adds the documents
    # from notqueries
    
    if sourcevector is None:
        nvector = BitVector(searcher.reader().doc_count_all())
    else:
        nvector = sourcevector.copy()
    
    for nquery in notqueries:
        nvector.set_from(nquery.docs(searcher))
    
    return nvector
Beispiel #6
0
def _not_vector(searcher, notqueries, sourcevector):
    # Returns a BitVector where the positions are docnums
    # and True means the docnum is banned from the results.
    # 'sourcevector' is the incoming exclude_docs. This
    # function makes a copy of it and adds the documents
    # from notqueries

    if sourcevector is None:
        nvector = BitVector(searcher.reader().doc_count_all())
    else:
        nvector = sourcevector.copy()

    for nquery in notqueries:
        nvector.set_from(nquery.docs(searcher))

    return nvector
Beispiel #7
0
    def search(self, query, limit=5000, sortedby=None, reverse=False):
        """Runs the query represented by the query object and returns a Results object.

        See the help for :meth:`~Searcher.find` for information on the parameters.

        :param query: a :class:`whoosh.query.Query` object.
        :param limit: sorted results are truncated to this many entries;
            when scoring, the value is passed to ``TopDocs``.
        :param sortedby: if not None, the results are sorted instead of
            scored. May be a field name (string), a list or tuple of field
            names, or a ``Sorter`` instance.
        :param reverse: passed to the sorter's ``order()`` when ``sortedby``
            is given.
        :raises ValueError: if ``sortedby`` is not None and is not a string,
            list, tuple, or ``Sorter``.
        :rtype: :class:`Results`
        """

        ixreader = self.ixreader

        t = now()
        if sortedby is not None:
            if isinstance(sortedby, basestring):
                sorter = scoring.FieldSorter(sortedby)
            elif isinstance(sortedby, (list, tuple)):
                # Use the same qualified name as the single-field case.
                sorter = scoring.MultiFieldSorter(
                    [scoring.FieldSorter(fn) for fn in sortedby])
            elif isinstance(sortedby, Sorter):
                sorter = sortedby
            else:
                raise ValueError(
                    "sortedby argument must be a string, list, or Sorter (%r)"
                    % sortedby)

            scored_list = sorter.order(self, query.docs(self), reverse=reverse)
            # Sorted results carry no scores.
            scores = None
            # The vector is built from the full sorted list, before the
            # list is truncated to ``limit``.
            docvector = BitVector(ixreader.doc_count_all(), source=scored_list)
            if len(scored_list) > limit:
                scored_list = list(scored_list)[:limit]
        else:
            # Sort by scores
            topdocs = TopDocs(limit, ixreader.doc_count_all())
            final = self.weighting.final
            topdocs.add_all((docnum, final(self, docnum, score))
                            for docnum, score in query.doc_scores(self))

            best = topdocs.best()
            if best:
                # ``best`` is a list like
                # [(docnum, score), (docnum, score), ... ]
                # This unpacks that into two sequences: docnums and scores.
                # Reuse ``best`` rather than calling topdocs.best() again.
                scored_list, scores = zip(*best)
            else:
                scored_list = []
                scores = []

            docvector = topdocs.docs
        t = now() - t

        return Results(self,
                       query,
                       scored_list,
                       docvector,
                       runtime=t,
                       scores=scores)
Beispiel #8
0
 def docs(self, searcher, exclude_docs = None):
     """Yield each document number produced by any of the subqueries,
     emitting every docnum at most once.

     Yields nothing when ``self.subqueries`` is empty. When
     ``self._notqueries`` is non-empty after ``self._split_queries()``,
     the exclusion vector handed to the subqueries is replaced by the
     result of ``_not_vector``.
     """
     if not self.subqueries:
         return

     # Tracks which docnums have already been yielded.
     seen = BitVector(searcher.doc_count_all())

     self._split_queries()
     banned = exclude_docs
     if self._notqueries:
         # NOTE(review): the helper is called as
         # (notqueries, searcher, exclude_docs); confirm this matches the
         # parameter order of the _not_vector defined in this module.
         banned = _not_vector(self._notqueries, searcher, exclude_docs)

     for subquery in self._subqueries:
         for docnum in subquery.docs(searcher, exclude_docs = banned):
             if not seen[docnum]:
                 yield docnum
             seen.set(docnum)
Beispiel #9
0
 def __init__(self, capacity, max_doc, docvector=None):
     """Set up an empty collector.

     :param capacity: stored as ``self.capacity``.
     :param max_doc: size passed to ``BitVector`` when a new vector has to
         be created.
     :param docvector: optional existing vector to use as ``self.docs``.
         It is used whenever it is not None, regardless of its truth
         value, so a caller-supplied vector is never silently replaced.
     """
     self.capacity = capacity
     # Compare against None explicitly: ``docvector or ...`` would discard
     # a supplied vector that happens to evaluate as false.
     if docvector is None:
         docvector = BitVector(max_doc)
     self.docs = docvector
     self.heap = []
     self._total = 0
Beispiel #10
0
    def search(self,
               query,
               limit=5000,
               sortedby=None,
               reverse=False,
               minscore=0.0001):
        """Runs the query represented by the ``query`` object and returns a
        Results object.

        :param query: a :class:`whoosh.query.Query` object.
        :param limit: the maximum number of documents to score. If you're only
            interested in the top N documents, you can set limit=N to limit the
            scoring for a faster search.
        :param sortedby: if this parameter is not None, the results are sorted
            instead of scored. If this value is a string, the results are
            sorted by the field named in the string. If this value is a list or
            tuple, it is assumed to be a sequence of strings and the results
            are sorted by the fieldnames in the sequence. Otherwise 'sortedby'
            should be a scoring.Sorter object.

            The fields you want to sort by must be indexed.

            For example, to sort the results by the 'path' field::

                searcher.search(q, sortedby = "path")

            To sort the results by the 'path' field and then the 'category'
            field::

                searcher.search(q, sortedby = ("path", "category"))

            To use a sorting object::

                searcher.search(q, sortedby = scoring.FieldSorter("path", key=mykeyfn))

            Using a string or tuple simply instantiates a
            :class:`whoosh.scoring.FieldSorter` or
            :class:`whoosh.scoring.MultiFieldSorter` object for you. To get a
            custom sort order, instantiate your own ``FieldSorter`` with a
            ``key`` argument, or write a custom :class:`whoosh.scoring.Sorter`
            class.

            FieldSorter and MultiFieldSorter cache the document order, using 4
            bytes times the number of documents in the index, and taking time
            to cache. To increase performance, instantiate your own sorter and
            re-use it (but remember you need to recreate it if the index
            changes).

        :param reverse: if ``sortedby`` is not None, this reverses the
            direction of the sort.
        :param minscore: the minimum score to include in the results. Only
            used when the results are scored (``sortedby`` is None).
        :raises ValueError: if ``sortedby`` is not None and is not a string,
            list, tuple, or ``Sorter``.
        :rtype: :class:`Results`
        """

        ixreader = self.ixreader

        t = now()
        if sortedby is not None:
            if isinstance(sortedby, basestring):
                sorter = scoring.FieldSorter(sortedby)
            elif isinstance(sortedby, (list, tuple)):
                # Use the same qualified name as the single-field case.
                sorter = scoring.MultiFieldSorter(
                    [scoring.FieldSorter(fn) for fn in sortedby])
            elif isinstance(sortedby, Sorter):
                sorter = sortedby
            else:
                raise ValueError(
                    "sortedby argument must be a string, list, or Sorter (%r)"
                    % sortedby)

            scored_list = sorter.order(self, query.docs(self), reverse=reverse)
            # Sorted results carry no scores.
            scores = None
            # The vector is built from the full sorted list, before the
            # list is truncated to ``limit``.
            docvector = BitVector(ixreader.doc_count_all(), source=scored_list)
            if len(scored_list) > limit:
                scored_list = list(scored_list)[:limit]
        else:
            # Sort by scores
            topdocs = TopDocs(limit, ixreader.doc_count_all())
            final = self.weighting.final
            topdocs.add_all(((docnum, final(self, docnum, score))
                             for docnum, score in query.doc_scores(self)),
                            minscore)

            best = topdocs.best()
            if best:
                # ``best`` is a list like
                # [(docnum, score), (docnum, score), ... ]
                # This unpacks that into two sequences: docnums and scores.
                # Reuse ``best`` rather than calling topdocs.best() again.
                scored_list, scores = zip(*best)
            else:
                scored_list = []
                scores = []

            docvector = topdocs.docs
        t = now() - t

        return Results(self,
                       query,
                       scored_list,
                       docvector,
                       runtime=t,
                       scores=scores)
Beispiel #11
0
    def search(self,
               query,
               limit=5000,
               weighting=None,
               sortedby=None,
               reverse=False):
        """Run ``query`` against this searcher and return a Results object.

        :param query: a query.Query object representing the search query.
            A query string can be turned into a query object with e.g.
            qparser.QueryParser.
        :param limit: the maximum number of documents to score. If only the
            top N documents are of interest, pass limit=N for a faster
            search. Sorted results are truncated to this length.
        :param weighting: if not None, this weighting object is used to
            score the results instead of ``self.weighting``.
        :param sortedby: if not None, the results are sorted instead of
            scored. A string sorts by the field of that name. A list or
            tuple is taken as a sequence of field names to sort by. A
            callable is called with no arguments to obtain the sorter.
            Anything else is used as the sorter object directly.

            The fields you want to sort by must be indexed.

            For example, to sort the results by the 'path' field::

                searcher.search(q, sortedby = "path")

            To sort the results by the 'path' field and then the 'category'
            field::

                searcher.search(q, sortedby = ("path", "category"))

            To use a sorting object::

                searcher.search(q, sortedby = scoring.NullSorter)

        :param reverse: if 'sortedby' is not None, this reverses the
            direction of the sort.
        """

        doc_reader = self.doc_reader
        started = time.time()

        if sortedby is None:
            # No sort requested: rank the hits by score.
            collector = TopDocs(limit, doc_reader.doc_count_all())
            scorer = weighting or self.weighting
            collector.add_all(query.doc_scores(self, weighting=scorer))
            scored_list = collector.best()
            docvector = collector.docs
        else:
            # Resolve the sortedby argument into a sorter object.
            if isinstance(sortedby, basestring):
                sorter = scoring.FieldSorter(sortedby)
            elif isinstance(sortedby, (list, tuple)):
                sorter = scoring.MultiFieldSorter(sortedby)
            elif callable(sortedby):
                sorter = sortedby()
            else:
                sorter = sortedby

            scored_list = sorter.order(self, query.docs(self), reverse=reverse)
            # The vector covers every sorted document, including those cut
            # off by ``limit`` below.
            docvector = BitVector(doc_reader.doc_count_all(),
                                  source=scored_list)
            if len(scored_list) > limit:
                scored_list = list(scored_list)[:limit]

        elapsed = time.time() - started

        return Results(self, query, scored_list, docvector, runtime=elapsed)