def postings(self, fieldname, text, exclude_docs=frozenset(), scorer=None):
    """Return a posting reader for the given term, excluding any docnums
    in *exclude_docs* as well as the segment's deleted documents.

    Raises TermNotFound if the term is not in the index.
    """
    self._test_field(fieldname)
    fmt = self.format(fieldname)
    try:
        pointer = self.termsindex[(fieldname, text)][1]
    except KeyError:
        raise TermNotFound("%s:%r" % (fieldname, text))

    # Fold the segment's deleted docs into the exclusion set
    deleted = self.segment.deleted
    if deleted:
        exclude_docs = (deleted | exclude_docs) if exclude_docs else deleted

    if isinstance(pointer, (int, long)):
        # An integer pointer is an offset into the posting file
        reader = FilePostingReader(self.postfile, pointer, fmt,
                                   scorer=scorer, fieldname=fieldname,
                                   text=text)
    else:
        # Otherwise the postings are inlined as (docids, weights, values)
        ids, wts, vals = pointer
        reader = ListMatcher(ids, wts, vals, fmt, scorer)

    if exclude_docs:
        reader = ExcludeMatcher(reader, exclude_docs)
    return reader
def matcher(self, fieldname, text, format_, scorer=None):
    """Return a matcher for the given term.

    Note this does not filter out deleted documents; a higher level is
    expected to wrap this matcher to eliminate deleted docs.
    """
    pf = self.postfile
    term = (fieldname, text)
    try:
        ti = self[term]
    except KeyError:
        raise TermNotFound("No term %s:%r" % (fieldname, text))

    postings = ti.postings
    if isinstance(postings, integer_types):
        # terminfo.postings is an offset into the posting file
        return PostingMatcher(pf, postings, format_, scorer=scorer,
                              term=term)
    # terminfo.postings is an inlined tuple of (ids, weights, values)
    ids, weights, values = postings
    return ListMatcher(ids, weights, values, format_, scorer=scorer,
                       term=term, terminfo=ti)
def postings(self, fieldname, text, scorer=None):
    """Return a posting reader for the given term, wrapped to exclude the
    segment's deleted documents if any.

    Raises TermNotFound if the term is not in the index.
    """
    try:
        ti = self.termsindex[fieldname, text]
    except KeyError:
        raise TermNotFound("%s:%r" % (fieldname, text))

    fmt = self.schema[fieldname].format
    plist = ti.postings
    if isinstance(plist, integer_types):
        # An integer is an offset into the posting file
        m = FilePostingReader(self.postfile, plist, fmt, scorer=scorer,
                              term=(fieldname, text))
    else:
        # Inlined postings: a tuple of (docids, weights, values)
        ids, weights, values = plist
        m = ListMatcher(ids, weights, values, fmt, scorer=scorer,
                        term=(fieldname, text), terminfo=ti)

    deleted = self.segment.deleted
    return FilterMatcher(m, deleted, exclude=True) if deleted else m
def postings(self, fieldname, text, scorer=None):
    """Return a posting reader for the given term, wrapped to exclude the
    segment's deleted documents if any.

    Raises TermNotFound if the term is not in the index.
    """
    try:
        offset = self.termsindex[fieldname, text][1]
    except KeyError:
        raise TermNotFound("%s:%r" % (fieldname, text))

    fmt = self.schema[fieldname].format
    if isinstance(offset, (int, long)):
        # An integer is an offset into the posting file
        m = FilePostingReader(self.postfile, offset, fmt, scorer=scorer,
                              fieldname=fieldname, text=text)
    else:
        # Inlined postings plus per-term statistics
        ids, weights, values, maxwol, minlength = offset
        m = ListMatcher(ids, weights, values, fmt, scorer,
                        maxwol=maxwol, minlength=minlength)

    deleted = self.segment.deleted
    return FilterMatcher(m, deleted, exclude=True) if deleted else m
def terms_from(self, fieldname, prefix):
    """Yield (fieldname, term) pairs for every term in the field that
    sorts at or after *prefix*, in ascending order.

    Raises TermNotFound if the field is unknown.
    """
    if fieldname not in self._invindex:
        raise TermNotFound("Unknown field %r" % (fieldname, ))
    termlist = sorted(self._invindex[fieldname])
    # bisect_left finds the first term >= prefix; slice from there
    first = bisect_left(termlist, prefix)
    for term in termlist[first:]:
        yield (fieldname, term)
def vector(self, docnum, fieldname):
    """Return a matcher over the stored term vector for *docnum* in
    *fieldname*.

    Raises TermNotFound if the field is unknown, or Exception if the
    field does not store vectors.
    """
    schema = self.schema
    if fieldname not in schema:
        raise TermNotFound("No field %r" % fieldname)
    vfmt = schema[fieldname].vector
    if not vfmt:
        raise Exception("No vectors are stored for field %r" % fieldname)
    self._open_vectors()
    return self._vectors.matcher(docnum, fieldname, vfmt)
def postings(self, fieldname, text, scorer=None):
    """Return a matcher for the given term, wrapped to exclude the
    segment's deleted documents if any.

    Raises TermNotFound if the field is unknown.
    """
    if fieldname not in self.schema:
        raise TermNotFound("No field %r" % fieldname)
    fmt = self.schema[fieldname].format
    m = self._terms.matcher(fieldname, text, fmt, scorer=scorer)
    deleted = self.segment.deleted
    if deleted:
        m = FilterMatcher(m, deleted, exclude=True)
    return m
def vector(self, docnum, fieldname):
    """Return a ListMatcher over the stored term vector for *docnum* in
    *fieldname*.

    Raises TermNotFound if the field is unknown, or Exception if the
    field does not store vectors.
    """
    if fieldname not in self.schema:
        raise TermNotFound("No field %r" % fieldname)
    vformat = self.schema[fieldname].vector
    if not vformat:
        raise Exception("No vectors are stored for field %r" % fieldname)
    # FIX: removed a redundant second `vformat = self.schema[fieldname].vector`
    # lookup that recomputed the same value immediately after the check.
    ids, weights, values = zip_(*self.vectors[docnum, fieldname])
    return ListMatcher(ids, weights, values, format=vformat)
def first_id(self, fieldname, text):
    """Return the first non-deleted docnum in the term's posting list, or
    None if every posting is deleted.

    Raises TermNotFound if the term is not in the index.
    """
    try:
        plist = self.invindex[fieldname][text]
    except KeyError:
        raise TermNotFound((fieldname, text))

    deleted = self.deleted
    for posting in plist:
        docid = posting[0]
        if docid not in deleted:
            return docid
def postings(self, fieldname, text, scorer=None):
    """Return a matcher over the term's postings with deleted documents
    filtered out; returns a NullMatcher if nothing survives the filter.

    Raises TermNotFound if the term is not in the index.
    """
    try:
        plist = self.invindex[fieldname][text]
    except KeyError:
        raise TermNotFound((fieldname, text))

    excludeset = self.deleted
    fmt = self.schema[fieldname].format
    if excludeset:
        plist = [p for p in plist if p[0] not in excludeset]
        if not plist:
            return NullMatcher()
    ids, weights, values = zip(*plist)
    return ListMatcher(ids, weights, values, format=fmt)
def first_id(self, fieldname, text):
    # Override to not construct a posting reader, just pull the first
    # non-deleted docnum out of the list directly
    self._test_field(fieldname)
    try:
        plist = self.invindex[fieldname][text]
    except KeyError:
        raise TermNotFound((fieldname, text))

    deleted = self.deleted
    for posting in plist:
        docid = posting[0]
        if docid not in deleted:
            return docid
def vector(self, docnum, fieldname):
    """Return a posting reader over the stored term vector for *docnum*
    in *fieldname*.

    Raises TermNotFound if the field is unknown, or Exception if the
    field does not store vectors or no vector exists for the document.
    """
    schema = self.schema
    if fieldname not in schema:
        raise TermNotFound("No field %r" % fieldname)
    vfmt = schema[fieldname].vector
    if not vfmt:
        raise Exception("No vectors are stored for field %r" % fieldname)

    self._open_vectors()
    offset = self.vectorindex.get((docnum, fieldname))
    if offset is None:
        raise Exception("No vector found for document"
                        " %s field %r" % (docnum, fieldname))
    return FilePostingReader(self.vpostfile, offset, vfmt, stringids=True)
def matcher(self, fieldname, btext, format_, scorer=None):
    """Return a ListMatcher built from the POST lines stored for the
    given term.

    Raises TermNotFound if the term is not in the index.
    """
    if not self._find_term(fieldname, btext):
        raise TermNotFound((fieldname, btext))

    ids, weights, values = [], [], []
    # Consume POST lines at level 3 until none remain
    line = self._find_line(3, "POST")
    while line is not None:
        ids.append(line["dn"])
        weights.append(line["w"])
        values.append(line["v"])
        line = self._find_line(3, "POST")
    return ListMatcher(ids, weights, values, format_, scorer=scorer)
def postings(self, fieldname, text, scorer=None):
    """Return a matcher over the term's postings with deleted documents
    filtered out; returns a NullMatcher if nothing survives the filter.

    Raises TermNotFound if the term is not in the index.
    """
    self._test_field(fieldname)
    try:
        ti = self.term_info(fieldname, text)
    except KeyError:
        raise TermNotFound((fieldname, text))

    fmt = self.schema[fieldname].format
    plist = self.invindex[fieldname][text]
    excludeset = self.deleted
    if excludeset:
        plist = [p for p in plist if p[0] not in excludeset]
        if not plist:
            return NullMatcher()
    ids, weights, values = zip_(*plist)
    return ListMatcher(ids, weights, values, format=fmt, scorer=scorer,
                       term=(fieldname, text), terminfo=ti)
def postings(self, fieldid, text, exclude_docs=None):
    """Return a posting reader for the given term, excluding any docnums
    in *exclude_docs* as well as the segment's deleted documents.

    Raises TermNotFound if the term is not in the index.
    """
    schema = self.schema
    fieldnum = schema.to_number(fieldid)
    fmt = schema[fieldnum].format
    try:
        totalfreq, offset, postcount = self.termtable[(fieldnum, text)]  #@UnusedVariable
    except KeyError:
        raise TermNotFound("%s:%r" % (fieldid, text))

    # Fold the segment's deleted docs into the exclusion set
    deleted = self.segment.deleted
    if deleted:
        exclude_docs = (deleted | exclude_docs) if exclude_docs else deleted

    reader = FilePostingReader(self.postfile, offset, fmt)
    if exclude_docs:
        reader = Exclude(reader, exclude_docs)
    return reader
def postings(self, fieldid, text, exclude_docs=frozenset()):
    """Return a posting reader for the given term, excluding any docnums
    in *exclude_docs* as well as the segment's deleted documents.

    Opens the segment's posting file lazily on first use.

    Raises TermNotFound if the term is not in the index.
    """
    schema = self.schema
    fieldnum = schema.to_number(fieldid)
    fmt = schema[fieldnum].format
    try:
        totalfreq, offset, postcount = self.termtable[(fieldnum, text)]  #@UnusedVariable
    except KeyError:
        raise TermNotFound("%s:%r" % (fieldid, text))

    # Fold the segment's deleted docs into the exclusion set
    deleted = self.segment.deleted
    if deleted:
        exclude_docs = (deleted | exclude_docs) if exclude_docs else deleted

    if not self.postfile:
        self.postfile = self.storage.open_file(self.segment.posts_filename,
                                               mapped=False)
    reader = FilePostingReader(self.postfile, offset, fmt)
    if exclude_docs:
        reader = Exclude(reader, exclude_docs)
    return reader
def postings(self, fieldname, text, exclude_docs=frozenset(), scorer=None):
    """Return a posting reader for the given term, excluding any docnums
    in *exclude_docs* as well as the segment's deleted documents.

    Raises TermNotFound if the term is not in the index.
    """
    self._test_field(fieldname)
    fmt = self.format(fieldname)
    try:
        offset = self.termsindex[(fieldname, text)][1]
    except KeyError:
        raise TermNotFound("%s:%r" % (fieldname, text))

    # Fold the segment's deleted docs into the exclusion set
    deleted = self.segment.deleted
    if deleted:
        exclude_docs = (deleted | exclude_docs) if exclude_docs else deleted

    reader = FilePostingReader(self.postfile, offset, fmt, scorer=scorer,
                               fieldname=fieldname, text=text)
    if exclude_docs:
        reader = ExcludeMatcher(reader, exclude_docs)
    return reader
def term_info(self, fieldname, tbytes):
    """Decode and return the term info stored for (fieldname, tbytes).

    Raises TermNotFound if the term is not in the index.
    """
    dbkey = self._keycoder(fieldname, tbytes)
    try:
        encoded = self._tindex[dbkey]
        return W3TermInfo.from_bytes(encoded)
    except KeyError:
        raise TermNotFound("No term %s:%r" % (fieldname, tbytes))
def _term_info(self, fieldnum, text): try: return self.termtable[(fieldnum, text)] except KeyError: raise TermNotFound("%s:%r" % (fieldnum, text))
def _test_field(self, fieldname): if fieldname not in self.schema: raise TermNotFound("No field %r" % fieldname) if self.schema[fieldname].format is None: raise TermNotFound("Field %r is not indexed" % fieldname)
def term_info(self, fieldname, btext):
    """Return the term info for the given term.

    Raises TermNotFound if the term cannot be located.
    """
    found = self._find_term(fieldname, btext)
    if not found:
        raise TermNotFound((fieldname, btext))
    return self._find_terminfo()
def _find_field(self, fieldname): self._find_root("TERMS") if self._find_line(1, "TERMFIELD", fn=fieldname) is None: raise TermNotFound("No field %r" % fieldname)
def _term_info(self, fieldname, text): self._test_field(fieldname) try: return self.termsindex[fieldname, text] except KeyError: raise TermNotFound("%s:%r" % (fieldname, text))