def fill(self, docnum):
    if docnum > self._count:
        if self._refs is not None:
            self._refs.extend(0 for _ in xrange(docnum - self._count))
        else:
            dbfile = self._dbfile
            for _ in xrange(docnum - self._count):
                dbfile.write_ushort(0)

def fill(self, docinfo):
    docnum, docbase = docinfo
    base = self._offset_base
    if docnum - docbase > self._count:
        self._lengths.extend(
            0 for _ in xrange((docnum - docbase) - self._count))
        self._offsets.extend(
            base for _ in xrange((docnum - docbase) - self._count))

def read_qsafe_array(typecode, size, dbfile):
    if typecode == "q":
        arry = [dbfile.read_long() for _ in xrange(size)]
    elif typecode == "Q":
        arry = [dbfile.read_ulong() for _ in xrange(size)]
    else:
        arry = dbfile.read_array(typecode, size)
    return arry

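# A minimal standalone sketch of why the "q-safe" fallback above exists:
# some historical Python builds lacked array support for the 64-bit "q"/"Q"
# typecodes, so values are read one at a time rather than in bulk. The
# struct-based decoding below only illustrates that per-value path; it is
# not the dbfile API used above.
def _example_qsafe_decode():
    import struct
    data = struct.pack("<3q", 1, -2, 3)  # three little-endian signed int64s
    values = [struct.unpack_from("<q", data, i * 8)[0] for i in range(3)]
    assert values == [1, -2, 3]
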
def __iter__(self):
    i = 0
    for num in self._bitset:
        if num > i:
            for _ in xrange(num - i):
                yield False
        yield True
        i = num + 1
    if self._doccount > i:
        for _ in xrange(self._doccount - i):
            yield False

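# A hedged sketch of the sparse-to-dense expansion performed above, using a
# plain set of document numbers in place of the bitset (illustration only):
def _example_dense_bools(docnums, doccount):
    i = 0
    for num in sorted(docnums):
        for _ in range(num - i):
            yield False
        yield True
        i = num + 1
    for _ in range(doccount - i):
        yield False

assert list(_example_dense_bools({1, 3}, 5)) == [False, True, False, True, False]
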
def _read_weights(self):
    # If we haven't loaded the data from disk yet, load it now
    if self._data is None:
        self._read_data()
    weights = self._data[1]

    # De-minify the weights
    postcount = self._blocklength
    if weights is None:
        self._weights = array("f", (1.0 for _ in xrange(postcount)))
    elif isinstance(weights, float):
        self._weights = array("f", (weights for _ in xrange(postcount)))
    else:
        self._weights = weights

def __init__(self, submatchers, doccount, boost=1.0, scored=True):
    CombinationMatcher.__init__(self, submatchers, boost=boost)
    self._doccount = doccount

    a = array("d")
    active = [subm for subm in self._submatchers if subm.is_active()]
    if active:
        offset = self._docnum = min(m.id() for m in active)
        for m in active:
            while m.is_active():
                if scored:
                    score = m.score() * boost
                else:
                    score = boost
                docnum = m.id()
                place = docnum - offset
                if len(a) <= place:
                    a.extend(0 for _ in xrange(place - len(a) + 1))
                a[place] += score
                m.next()
        self._a = a
        self._offset = offset
    else:
        self._docnum = 0
        self._offset = 0
        self._a = a

def do_wildcards(self, parser, group):
    i = 0
    while i < len(group):
        node = group[i]
        if isinstance(node, self.WildcardNode):
            if i < len(group) - 1 and group[i + 1].is_text():
                nextnode = group.pop(i + 1)
                node.text += nextnode.text
            if i > 0 and group[i - 1].is_text():
                prevnode = group.pop(i - 1)
                node.text = prevnode.text + node.text
            else:
                i += 1
        else:
            if isinstance(node, CylleneusGroupNode):
                self.do_wildcards(parser, node)
            i += 1

    for i in xrange(len(group)):
        node = group[i]
        if isinstance(node, self.WildcardNode):
            text = node.text
            if (len(text) > 1
                    and all(qm not in text for qm in self.qmarks)
                    and text.find("*") == len(text) - 1):
                # The only wildcard is a single trailing star, so replace
                # the node with a more efficient prefix query node
                newnode = PrefixPlugin.PrefixNode(text[:-1])
                newnode.startchar = node.startchar
                newnode.endchar = node.endchar
                group[i] = newnode

    return group

def __iter__(self):
    count = self._count
    default = self._default
    for i in xrange(self._doccount):
        if i < count:
            yield self[i]
        else:
            yield default

def all_doc_ids(self):
    """Returns an iterator of all (undeleted) document IDs in the reader.
    """

    is_deleted = self.is_deleted
    return (docnum for docnum in xrange(self.doc_count_all())
            if not is_deleted(docnum))

def all_stored_fields(self):
    """Yields the stored fields for all non-deleted documents.
    """

    is_deleted = self.is_deleted
    for docnum in xrange(self.doc_count_all()):
        if not is_deleted(docnum):
            yield self.stored_fields(docnum)

def __iter__(self):
    last = -1
    for i, block in enumerate(self._blocks):
        startdoc = block[0]
        enddoc = block[1]
        if startdoc > (last + 1):
            # Fill the gap between the previous block and this one; the
            # missing docs are last + 1 to startdoc - 1 inclusive
            for _ in xrange(startdoc - last - 1):
                yield emptybytes
        values = self._get_block(i)
        for docnum in xrange(startdoc, enddoc + 1):
            if docnum in values:
                yield values[docnum]
            else:
                yield emptybytes
        last = enddoc
    if enddoc < self._doccount - 1:
        # Fill out the tail after the last block
        for _ in xrange(self._doccount - enddoc - 1):
            yield emptybytes

def make_array(typecode, size=0, default=None):
    if typecode.lower() == "q":
        # Python does not support arrays of long long; see Issue 1172711
        arry = [default] * size if default is not None and size else []
    else:
        if default is not None and size:
            arry = array(typecode, (default for _ in xrange(size)))
        else:
            arry = array(typecode)
    return arry

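# Usage sketch for make_array (assumes the function above is in scope): the
# 64-bit typecodes fall back to a plain list, everything else gets a real
# array.array, so callers must treat the two result types interchangeably.
def _example_make_array():
    counts = make_array("i", size=4, default=0)  # array('i', [0, 0, 0, 0])
    longs = make_array("q", size=4, default=0)   # [0, 0, 0, 0] (plain list)
    assert list(counts) == [0, 0, 0, 0]
    assert isinstance(longs, list)
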
def __iter__(self):
    get = self._dbfile.get
    basepos = self._basepos
    uniques = self._uniques
    unpack = self._unpack
    itemsize = self._itemsize
    for i in xrange(self._doccount):
        pos = basepos + i * itemsize
        ref = unpack(get(pos, itemsize))[0]
        yield uniques[ref]

def __getitem__(self, docnum):
    data = self._child[docnum]
    if not data:
        return []
    bio = BytesIO(data)
    count = read_varint(bio.read)
    out = []
    for _ in xrange(count):
        vlen = read_varint(bio.read)
        v = bio.read(vlen)
        out.append(v)
    return out

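# Sketch of the byte layout decoded above: a varint item count followed by
# one (varint length, raw bytes) pair per item. The tiny LSB-first varint
# encoder below is a stand-in for the module's own varint writer, shown
# only to make the layout concrete.
def _example_encode_bytes_list(items):
    def varint(n):
        out = bytearray()
        while n > 0x7F:
            out.append((n & 0x7F) | 0x80)
            n >>= 7
        out.append(n)
        return bytes(out)

    data = varint(len(items))
    for v in items:
        data += varint(len(v)) + v
    return data
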
def __init__(self, ix, procs=None, batchsize=100, subargs=None, **kwargs):
    SegmentWriter.__init__(self, ix, **kwargs)
    self.procs = procs or cpu_count()
    self.batchsize = batchsize
    self.subargs = subargs if subargs else kwargs
    self.tasks = [SegmentWriter(ix, _lk=False, **self.subargs)
                  for _ in xrange(self.procs)]
    self.pointer = 0
    self._added_sub = False

def remove(self, global_docnum):
    """Removes a document from the collector. Note that this method uses
    the global document number, as opposed to :meth:`Collector.collect`,
    which takes a segment-relative docnum.
    """

    items = self.items
    for i in xrange(len(items)):
        if items[i][1] == global_docnum:
            items.pop(i)
            return
    raise KeyError(global_docnum)

def _read_uniques(self):
    dbfile = self._dbfile
    fixedlen = self._fixedlen
    ucount = dbfile.read_varint()

    length = fixedlen
    uniques = []
    for _ in xrange(ucount):
        if not fixedlen:
            # Variable-length values are each prefixed with their own
            # varint length, so read a new length for every value
            length = dbfile.read_varint()
        uniques.append(dbfile.read(length))
    return uniques

def remove(self, global_docnum):
    negated = 0 - global_docnum
    items = self.items

    # Remove the document if it's on the list (it may not be since
    # TopCollector forgets documents that don't make the top N list)
    for i in xrange(len(items)):
        if items[i][1] == negated:
            items.pop(i)
            # Restore the heap invariant
            heapify(items)
            self.minscore = items[0][0] if items else 0
            return

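# Standalone illustration of the invariant repair above: popping an
# arbitrary element out of a heap list breaks the heap property, and a full
# heapify() is the simple O(n) way to restore it (hypothetical scores):
def _example_heap_removal():
    from heapq import heapify
    items = [(0.2, 7), (0.5, 3), (0.9, 1)]
    heapify(items)
    items.pop(1)    # remove a mid-heap entry by position
    heapify(items)  # restore the heap invariant
    assert items[0] == min(items)
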
def deletion_docs(self, searcher):
    bits = searcher._filter_to_comb(self.parents)
    if not bits:
        return

    m = self.child.matcher(searcher, searcher.boolean_context())
    maxdoc = searcher.doc_count_all()
    while m.is_active():
        docnum = m.id()
        parentdoc = bits.before(docnum + 1)
        nextparent = bits.after(docnum) or maxdoc
        for i in xrange(parentdoc, nextparent):
            yield i
        m.skip_to(nextparent)

def _process_file(self, filename, doc_count):
    # This method processes a "job file" written out by the parent task. A
    # job file is a series of pickled (code, arguments) tuples. Currently
    # the only command code is 0=add_document
    writer = self.writer
    tempstorage = writer.temp_storage()

    load = pickle.load
    with tempstorage.open_file(filename).raw_file() as f:
        for _ in xrange(doc_count):
            # Load the next pickled tuple from the file
            code, args = load(f)
            assert code == 0
            writer.add_document(**args)
    # Remove the job file
    tempstorage.delete_file(filename)

def __init__(self, submatchers, doccount, boost=1.0, scored=True,
             partsize=2048):
    CombinationMatcher.__init__(self, submatchers, boost=boost)
    self._scored = scored
    self._doccount = doccount

    if not partsize:
        partsize = doccount
    self._partsize = partsize

    self._a = array("d", (0 for _ in xrange(self._partsize)))
    self._docnum = self._min_id()
    self._read_part()

def _read_values(self):
    # If we haven't loaded the data from disk yet, load it now
    if self._data is None:
        self._read_data()

    # De-minify the values
    fixedsize = self._fixedsize
    vs = self._data[2]
    if fixedsize is None or fixedsize < 0:
        self._values = vs
    elif fixedsize == 0:
        self._values = (None,) * self._blocklength
    else:
        assert isinstance(vs, bytes_type)
        self._values = tuple(vs[i:i + fixedsize]
                             for i in xrange(0, len(vs), fixedsize))

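# A minimal sketch of the de-minification idiom above: a concatenated
# bytestring of fixed-size values is split back into per-document chunks by
# stepped slicing (illustrative values only):
def _example_deminify():
    vs = b"aabbcc"
    fixedsize = 2
    values = tuple(vs[i:i + fixedsize] for i in range(0, len(vs), fixedsize))
    assert values == (b"aa", b"bb", b"cc")
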
def _read_part(self):
    scored = self._scored
    boost = self._boost
    limit = min(self._docnum + self._partsize, self._doccount)
    offset = self._docnum
    a = self._a

    # Clear the array
    for i in xrange(self._partsize):
        a[i] = 0

    # Add the scores from the submatchers into the array
    for m in self._submatchers:
        while m.is_active() and m.id() < limit:
            i = m.id() - offset
            if scored:
                a[i] += m.score() * boost
            else:
                a[i] = 1
            m.next()

    self._offset = offset
    self._limit = limit

def __iter__(self):
    for i in xrange(self._doccount):
        yield self[i]

def random_bytes(size=28):
    gen = (random.randint(0, 255) for _ in xrange(size))
    if sys.version_info[0] >= 3:
        return bytes(gen)
    else:
        return array("B", gen).tostring()

def random_name(size=28):
    return "".join(random.choice(IDCHARS) for _ in xrange(size))

def __iter__(self):
    return (self._default for _ in xrange(self._doccount))

def __iter__(self):
    for docnum in xrange(len(self)):
        yield self[docnum]

def __getitem__(self, docnum):
    fixedlen = self._fixedlen
    v = self._child[docnum]
    if not v:
        return []
    return [v[i:i + fixedlen] for i in xrange(0, len(v), fixedlen)]

def fill(self, docnum):
    write = self._dbfile.write
    default = self._defaultbytes
    if docnum > self._count:
        for _ in xrange(docnum - self._count):
            write(default)