def term_info(self, fieldname, text):
    """Return term statistics for a term, combined over all sub-readers.

    Every reader in ``self.readers`` that contains ``(fieldname, text)`` is
    asked for its own term info; document IDs are shifted by the matching
    entry of ``self.doc_offsets``.

    :param fieldname: name of the field the term belongs to.
    :param text: the term text.
    :returns: when exactly one sub-reader has the term, a shallow copy of
        that reader's term info with ``_minid``/``_maxid`` shifted by the
        reader's offset; otherwise a new ``TermInfo`` holding the summed
        weight and document frequency, the smallest minimum length, the
        largest maximum length and weight, and the offset-adjusted
        minimum and maximum document IDs.
    :raises TermNotFound: if no sub-reader contains the term.
    """
    import copy

    term = (fieldname, text)

    # Get the term infos for the sub-readers containing the term
    tis = [(r.term_info(fieldname, text), offset)
           for r, offset in zip_(self.readers, self.doc_offsets)
           if term in r]

    if not tis:
        raise TermNotFound(term)

    # If only one reader had the term, return its terminfo with the offset
    # added
    if len(tis) == 1:
        ti, offset = tis[0]
        # Shift a copy rather than the sub-reader's own object: if that
        # object is reused between calls, adding the offset in place would
        # accumulate on every lookup.
        ti = copy.copy(ti)
        ti._minid += offset
        ti._maxid += offset
        return ti

    # Combine the various statistics
    w = sum(ti.weight() for ti, _ in tis)
    df = sum(ti.doc_frequency() for ti, _ in tis)
    ml = min(ti.min_length() for ti, _ in tis)
    xl = max(ti.max_length() for ti, _ in tis)
    xw = max(ti.max_weight() for ti, _ in tis)

    # For min and max ID, we need to add the doc offsets
    mid = min(ti.min_id() + offset for ti, offset in tis)
    xid = max(ti.max_id() + offset for ti, offset in tis)

    return TermInfo(w, df, ml, xl, xw, mid, xid)
def vector(self, docnum, fieldname):
    """Return a matcher over the stored vector of one field of a document.

    :param docnum: document number; together with ``fieldname`` it forms
        the key looked up in ``self.vectors``.
    :param fieldname: name of the field whose vector is requested.
    :returns: a ``ListMatcher`` built from the ids, weights and values of
        the stored vector, using the field's vector format.
    :raises TermNotFound: if ``fieldname`` is not in ``self.schema``.
    :raises Exception: if the field's ``vector`` attribute is falsy.
    """
    if fieldname not in self.schema:
        raise TermNotFound("No field %r" % fieldname)

    vformat = self.schema[fieldname].vector
    if not vformat:
        raise Exception("No vectors are stored for field %r" % fieldname)

    # The stored entries are transposed into three parallel sequences.
    ids, weights, values = zip_(*self.vectors[docnum, fieldname])
    return ListMatcher(ids, weights, values, format=vformat)
def term_info(self, fieldname, text):
    """Return the combined term info for a term across the sub-readers.

    :param fieldname: name of the field the term belongs to.
    :param text: the term text.
    :returns: the result of ``combine_terminfos`` applied to the list of
        ``(terminfo, doc_offset)`` pairs gathered from every sub-reader
        that contains the term.
    :raises TermNotFound: if no sub-reader contains the term.
    """
    key = (fieldname, text)

    # Gather (terminfo, offset) pairs from the readers that have the term
    found = []
    for reader, base in zip_(self.readers, self.doc_offsets):
        if key in reader:
            found.append((reader.term_info(fieldname, text), base))

    if found:
        return combine_terminfos(found)
    raise TermNotFound(key)
def postings(self, fieldname, text, scorer=None):
    """Return a matcher over the postings of a term.

    :param fieldname: name of the field the term belongs to; checked with
        ``self._test_field`` first.
    :param text: the term text.
    :param scorer: optional scorer handed through to the matcher.
    :returns: a ``NullMatcher`` if no postings remain after dropping the
        entries whose first item is in ``self.deleted``; otherwise a
        ``ListMatcher`` over the remaining ids, weights and values.
    :raises TermNotFound: if ``self.term_info`` raises ``KeyError`` for the
        term.
    """
    self._test_field(fieldname)
    term = (fieldname, text)

    try:
        terminfo = self.term_info(fieldname, text)
    except KeyError:
        raise TermNotFound(term)

    fmt = self.schema[fieldname].format
    entries = self.invindex[fieldname][text]

    # Drop postings of deleted documents (first item of each entry)
    deleted = self.deleted
    if deleted:
        kept = []
        for entry in entries:
            if entry[0] not in deleted:
                kept.append(entry)
        entries = kept

    if not entries:
        return NullMatcher()

    ids, weights, values = zip_(*entries)
    return ListMatcher(ids, weights, values, format=fmt, scorer=scorer,
                       term=term, terminfo=terminfo)
def leaf_readers(self):
    """Pair each entry of ``self.readers`` with its ``self.doc_offsets`` entry.

    :returns: the result of ``zip_`` over the two sequences, i.e.
        ``(reader, doc_offset)`` pairs in order.
    """
    readers = self.readers
    offsets = self.doc_offsets
    return zip_(readers, offsets)