Beispiel #1
0
 def stored_fields(self, docnum):
     """Return the stored field values of document ``docnum`` as a dict.

     Only entries whose field name is still contained in ``self.schema``
     are included in the result.

     :param docnum: number of the document to look up; must be >= 0.
     :raises ReaderClosed: if ``self.is_closed`` is true.
     """
     if self.is_closed:
         raise ReaderClosed
     assert docnum >= 0
     known_fields = self.schema
     stored = self._perdoc.stored_fields(docnum)
     # The per-document data can still carry fields that were removed from
     # the schema, so keep only the names the schema currently contains.
     return {
         name: content
         for name, content in iteritems(stored)
         if name in known_fields
     }
Beispiel #2
0
    def __init__(self, codec, dbfile, length, postfile):
        """Set up the reader state and build the number -> field name table.

        :param codec: stored as ``self._codec``.
        :param dbfile: stored and handed to ``filetables.OrderedHashReader``.
        :param length: passed to ``OrderedHashReader`` together with
            ``dbfile``.
        :param postfile: stored as ``self._postfile``.
        """
        self._codec = codec
        self._dbfile = dbfile
        self._postfile = postfile

        self._tindex = filetables.OrderedHashReader(dbfile, length)
        fieldmap = self._fieldmap = self._tindex.extras["fieldmap"]

        # Invert the name -> number mapping into a list indexed by number.
        unmap = self._fieldunmap = [None] * len(fieldmap)
        for name, number in iteritems(fieldmap):
            unmap[number] = name
Beispiel #3
0
    def word_values(self, value, analyzer, **kwargs):
        """Yield one ``(text, count, summed boost, encoded value)`` per term.

        The input is tokenized with positions, character offsets and boosts
        switched on. The ``(pos, startchar, endchar, boost)`` tuples of each
        distinct token text are collected and handed to ``self.encode``,
        which supplies both the encoded value and the summed boost.

        :param value: the content passed on to ``tokens``.
        :param analyzer: the analyzer passed on to ``tokens``.
        :param kwargs: extra options forwarded to ``tokens``; the
            ``positions``, ``chars`` and ``boosts`` keys are forced to True.
        """
        occurrences = defaultdict(list)

        kwargs.update(positions=True, chars=True, boosts=True)
        for token in tokens(value, analyzer, kwargs):
            occurrences[token.text].append(
                (token.pos, token.startchar, token.endchar, token.boost))

        for text, entries in iteritems(occurrences):
            encoded, summedboost = self.encode(entries)
            yield (text, len(entries), summedboost, encoded)
Beispiel #4
0
    def word_values(self, value, analyzer, **kwargs):
        """Yield one ``(text, count, weight, encoded positions)`` per term.

        The input is tokenized with positions and boosts switched on. For
        each distinct token text the positions are collected into a list and
        the boosts are added up; the weight is that boost total multiplied by
        ``self.field_boost``, and the position list is run through
        ``self.encode``.

        :param value: the content passed on to ``tokens``.
        :param analyzer: the analyzer passed on to ``tokens``.
        :param kwargs: extra options forwarded to ``tokens``; the
            ``positions`` and ``boosts`` keys are forced to True.
        """
        field_boost = self.field_boost
        positions = defaultdict(list)
        boost_totals = defaultdict(float)

        kwargs.update(positions=True, boosts=True)
        for token in tokens(value, analyzer, kwargs):
            positions[token.text].append(token.pos)
            boost_totals[token.text] += token.boost

        for text, poslist in iteritems(positions):
            encoded = self.encode(poslist)
            yield (text, len(poslist), boost_totals[text] * field_boost, encoded)
Beispiel #5
0
    def word_values(self, value, analyzer, **kwargs):
        """Yield one ``(text, count, weight, encoded value)`` per term.

        The input is tokenized with positions and boosts switched on. Each
        distinct token text accumulates a list of ``(pos, boost)`` pairs.
        That list is passed to ``self.encode``, and the weight is the sum of
        the pair boosts multiplied by ``self.field_boost``.

        :param value: the content passed on to ``tokens``.
        :param analyzer: the analyzer passed on to ``tokens``.
        :param kwargs: extra options forwarded to ``tokens``; the
            ``positions`` and ``boosts`` keys are forced to True.
        """
        field_boost = self.field_boost
        pairs_by_text = defaultdict(list)

        kwargs.update(positions=True, boosts=True)
        for token in tokens(value, analyzer, kwargs):
            pairs_by_text[token.text].append((token.pos, token.boost))

        for text, pairs in iteritems(pairs_by_text):
            # Encode first, then total the boosts, as in the established order.
            encoded = self.encode(pairs)
            boost_total = sum(pair[1] for pair in pairs)
            yield (text, len(pairs), boost_total * field_boost, encoded)
Beispiel #6
0
    def word_values(self, value, analyzer, **kwargs):
        """Return an iterator of ``(text, freq, weight, packed freq)`` tuples.

        The input is tokenized with boosts switched on. For each distinct
        token text the number of occurrences and the sum of the token boosts
        are recorded. The weight is the boost sum multiplied by
        ``self.field_boost`` and the last element is ``pack_uint(freq)``.

        All tokens are consumed before this method returns; only the
        construction of the result tuples is lazy.

        :param value: the content passed on to ``tokens``.
        :param analyzer: the analyzer passed on to ``tokens``.
        :param kwargs: extra options forwarded to ``tokens``; the ``boosts``
            key is forced to True.
        :returns: a generator over one tuple per distinct token text.
        """
        fb = self.field_boost
        freqs = defaultdict(int)
        weights = defaultdict(float)

        kwargs["boosts"] = True
        for t in tokens(value, analyzer, kwargs):
            freqs[t.text] += 1
            weights[t.text] += t.boost

        # The total token count is not part of the result, so no separate
        # length counter is maintained here.
        return ((w, freq, weights[w] * fb, pack_uint(freq))
                for w, freq in iteritems(freqs))
Beispiel #7
0
    def word_values(self, value, analyzer, **kwargs):
        """Yield one ``(text, count, weight, encoded value)`` per term.

        The input is tokenized with positions, character offsets and boosts
        switched on. Every token contributes a
        ``(pos, startchar, endchar, meta)`` tuple to its text's list, where
        ``meta`` is a tuple of ``"key=value"`` strings built from the token's
        ``meta`` mapping, or an empty tuple when the token has no ``meta``
        attribute. The weight is the sum of the token boosts multiplied by
        ``self.field_boost``, and each list is run through ``self.encode``.

        :param value: the content passed on to ``tokens``.
        :param analyzer: the analyzer passed on to ``tokens``.
        :param kwargs: extra options forwarded to ``tokens``; the
            ``positions``, ``chars`` and ``boosts`` keys are forced to True.
        """
        field_boost = self.field_boost
        entries = defaultdict(list)
        boost_totals = defaultdict(float)

        kwargs.update(positions=True, chars=True, boosts=True)
        for tok in tokens(value, analyzer, kwargs):
            if hasattr(tok, "meta"):
                # NOTE(review): this flag is set after ``tokens`` was already
                # called with ``kwargs``; whether it still has any effect
                # depends on how ``tokens`` uses the dict -- confirm.
                kwargs["meta"] = True
                meta = tuple(f"{k}={v}" for k, v in tok.meta.items())
            else:
                meta = ()
            entries[tok.text].append(
                (tok.pos, tok.startchar, tok.endchar, meta))
            boost_totals[tok.text] += tok.boost

        for text, poslist in iteritems(entries):
            encoded = self.encode(poslist)
            yield (text, len(poslist), boost_totals[text] * field_boost, encoded)
Beispiel #8
0
    def __init__(self, B=0.75, K1=1.2, **kwargs):
        """Store the global ``B`` and ``K1`` values and per-field ``B`` values.

        >>> from whoosh import scoring
        >>> # Set a custom B value for the "content" field
        >>> w = scoring.BM25F(B=0.75, content_B=1.0, K1=1.5)

        :param B: free parameter, see the BM25 literature. A keyword argument
            named ``<fieldname>_B`` (for example, ``body_B``) sets the value
            of B used for that one field.
        :param K1: free parameter, see the BM25 literature.
        """

        self.B = B
        self.K1 = K1

        # Keep only the ``*_B`` keywords, keyed by the field name in front of
        # the two-character suffix; every other keyword is ignored.
        self._field_B = {
            key[:-2]: field_b
            for key, field_b in iteritems(kwargs)
            if key.endswith("_B")
        }
Beispiel #9
0
    def collect(self, sub_docnum):
        """Collect a document in the child and add it to every facet map.

        The child's ``collect`` result is used as the sort key of the
        document inside each facet group and is also returned.

        :param sub_docnum: document number as passed on to the child
            collector; ``self.child.offset`` is added to it to obtain the
            number stored in the facet maps.
        :returns: the value returned by ``self.child.collect(sub_docnum)``.
        """
        child = self.child
        matcher = child.matcher
        global_docnum = sub_docnum + child.offset

        # The child's sort key lets the facet groups be sorted (by default)
        sortkey = child.collect(sub_docnum)

        for name, categorizer in iteritems(self.categorizers):
            add = self.facetmaps[name].add

            # A facet with overlapping groups can yield several keys per
            # document; otherwise there is exactly one.
            if categorizer.allow_overlap:
                keys = categorizer.keys_for(matcher, sub_docnum)
            else:
                keys = (categorizer.key_for(matcher, sub_docnum),)

            for key in keys:
                add(categorizer.key_to_name(key), global_docnum, sortkey)

        return sortkey
Beispiel #10
0
 def __repr__(self):
     """Return ``ClassName(attr=value, ...)`` built from ``self.__dict__``."""
     rendered = ["%s=%r" % (attr, val)
                 for attr, val in iteritems(self.__dict__)]
     return "%s(%s)" % (self.__class__.__name__, ", ".join(rendered))
Beispiel #11
0
 def __repr__(self):
     """Return ``ClassName(attr=value, ...)`` built from ``self.__dict__``.

     An instance without attributes is rendered as ``ClassName()``.
     """
     state = self.__dict__
     attrs = ", ".join(
         "%s=%r" % (attr, val) for attr, val in iteritems(state)
     ) if state else ""
     return self.__class__.__name__ + "(%s)" % attrs