Example #1
0
 def fill(self, docnum):
     if docnum > self._count:
         if self._refs is not None:
             self._refs.extend(0 for _ in xrange(docnum - self._count))
         else:
             dbfile = self._dbfile
             for _ in xrange(docnum - self._count):
                 dbfile.write_ushort(0)
Example #2
0
 def fill(self, docinfo):
     docnum, docbase = docinfo
     base = self._offset_base
     if docnum - docbase > self._count:
         self._lengths.extend(
             0 for _ in xrange((docnum - docbase) - self._count))
         self._offsets.extend(
             base for _ in xrange((docnum - docbase) - self._count))
Example #3
0
def read_qsafe_array(typecode, size, dbfile):
    if typecode == "q":
        arry = [dbfile.read_long() for _ in xrange(size)]
    elif typecode == "Q":
        arry = [dbfile.read_ulong() for _ in xrange(size)]
    else:
        arry = dbfile.read_array(typecode, size)

    return arry
Example #4
0
 def __iter__(self):
     i = 0
     for num in self._bitset:
         if num > i:
             for _ in xrange(num - i):
                 yield False
         yield True
         i = num + 1
     if self._doccount > i:
         for _ in xrange(self._doccount - i):
             yield False
Example #5
0
    def _read_weights(self):
        # If we haven't loaded the data from disk yet, load it now
        if self._data is None:
            self._read_data()
        weights = self._data[1]

        # De-minify the weights
        postcount = self._blocklength
        if weights is None:
            self._weights = array("f", (1.0 for _ in xrange(postcount)))
        elif isinstance(weights, float):
            self._weights = array("f", (weights for _ in xrange(postcount)))
        else:
            self._weights = weights
Example #6
0
    def __init__(self, submatchers, doccount, boost=1.0, scored=True):
        CombinationMatcher.__init__(self, submatchers, boost=boost)

        self._doccount = doccount

        a = array("d")
        active = [subm for subm in self._submatchers if subm.is_active()]
        if active:
            offset = self._docnum = min(m.id() for m in active)
            for m in active:
                while m.is_active():
                    if scored:
                        score = m.score() * boost
                    else:
                        score = boost

                    docnum = m.id()
                    place = docnum - offset
                    if len(a) <= place:
                        a.extend(0 for _ in xrange(place - len(a) + 1))
                    a[place] += score
                    m.next()
            self._offset = offset
        else:
            self._docnum = 0
            self._offset = 0
        self._a = a
Example #7
0
    def do_wildcards(self, parser, group):
        i = 0
        while i < len(group):
            node = group[i]
            if isinstance(node, self.WildcardNode):
                if i < len(group) - 1 and group[i + 1].is_text():
                    nextnode = group.pop(i + 1)
                    node.text += nextnode.text
                if i > 0 and group[i - 1].is_text():
                    prevnode = group.pop(i - 1)
                    node.text = prevnode.text + node.text
                else:
                    i += 1
            else:
                if isinstance(node, CylleneusGroupNode):
                    self.do_wildcards(parser, node)
                i += 1

        for i in xrange(len(group)):
            node = group[i]
            if isinstance(node, self.WildcardNode):
                text = node.text
                if (len(text) > 1 and all(qm not in text for qm in self.qmarks)
                        and text.find("*") == len(text) - 1):
                    newnode = PrefixPlugin.PrefixNode(text[:-1])
                    newnode.startchar = node.startchar
                    newnode.endchar = node.endchar
                    group[i] = newnode
        return group
Example #8
0
 def __iter__(self):
     count = self._count
     default = self._default
     for i in xrange(self._doccount):
         if i < count:
             yield self[i]
         else:
             yield default
Example #9
0
    def all_doc_ids(self):
        """
        Returns an iterator of all (undeleted) document IDs in the reader.
        """

        is_deleted = self.is_deleted
        return (docnum for docnum in xrange(self.doc_count_all())
                if not is_deleted(docnum))
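For context, a usage sketch of this reader method (the index directory name "indexdir" and the surrounding setup are illustrative assumptions, not part of the snippet):

from whoosh import index

ix = index.open_dir("indexdir")  # assumed existing index directory
reader = ix.reader()
try:
    # Iterate only the undeleted document numbers
    for docnum in reader.all_doc_ids():
        print(docnum)
finally:
    reader.close()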
Example #10
0
    def all_stored_fields(self):
        """Yields the stored fields for all non-deleted documents.
        """

        is_deleted = self.is_deleted
        for docnum in xrange(self.doc_count_all()):
            if not is_deleted(docnum):
                yield self.stored_fields(docnum)
Example #11
0
 def __iter__(self):
     last = -1
     for i, block in enumerate(self._blocks):
         startdoc = block[0]
         enddoc = block[1]
         if startdoc > (last + 1):
             for _ in xrange(startdoc - last - 1):
                 yield emptybytes
         values = self._get_block(i)
         for docnum in xrange(startdoc, enddoc + 1):
             if docnum in values:
                 yield values[docnum]
             else:
                 yield emptybytes
         last = enddoc
     if last < self._doccount - 1:
         for _ in xrange(self._doccount - last - 1):
             yield emptybytes
Example #12
0
def make_array(typecode, size=0, default=None):
    if typecode.lower() == "q":
    # Python does not support arrays of long long; see Issue 1172711
        arry = [default] * size if default is not None and size else []
    else:
        if default is not None and size:
            arry = array(typecode, (default for _ in xrange(size)))
        else:
            arry = array(typecode)
    return arry
Example #13
0
        def __iter__(self):
            get = self._dbfile.get
            basepos = self._basepos
            uniques = self._uniques
            unpack = self._unpack
            itemsize = self._itemsize

            for i in xrange(self._doccount):
                pos = basepos + i * itemsize
                ref = unpack(get(pos, itemsize))[0]
                yield uniques[ref]
Example #14
0
 def __getitem__(self, docnum):
     data = self._child[docnum]
     if not data:
         return []
     bio = BytesIO(data)
     count = read_varint(bio.read)
     out = []
     for _ in xrange(count):
         vlen = read_varint(bio.read)
         v = bio.read(vlen)
         out.append(v)
     return out
Example #15
0
    def __init__(self, ix, procs=None, batchsize=100, subargs=None, **kwargs):
        SegmentWriter.__init__(self, ix, **kwargs)

        self.procs = procs or cpu_count()
        self.batchsize = batchsize
        self.subargs = subargs if subargs else kwargs
        self.tasks = [
            SegmentWriter(ix, _lk=False, **self.subargs)
            for _ in xrange(self.procs)
        ]
        self.pointer = 0
        self._added_sub = False
Example #16
0
    def remove(self, global_docnum):
        """Removes a document from the collector. Not that this method uses the
        global document number as opposed to :meth:`Collector.collect` which
        takes a segment-relative docnum.
        """

        items = self.items
        for i in xrange(len(items)):
            if items[i][1] == global_docnum:
                items.pop(i)
                return
        raise KeyError(global_docnum)
Example #17
0
        def _read_uniques(self):
            dbfile = self._dbfile
            fixedlen = self._fixedlen

            ucount = dbfile.read_varint()
            length = fixedlen
            uniques = []
            for _ in xrange(ucount):
                if not fixedlen:
                    length = dbfile.read_varint()
                uniques.append(dbfile.read(length))
            return uniques
Example #18
0
    def remove(self, global_docnum):
        negated = 0 - global_docnum
        items = self.items

        # Remove the document if it's on the list (it may not be since
        # TopCollector forgets documents that don't make the top N list)
        for i in xrange(len(items)):
            if items[i][1] == negated:
                items.pop(i)
                # Restore the heap invariant
                heapify(items)
                self.minscore = items[0][0] if items else 0
                return
Example #19
0
    def deletion_docs(self, searcher):
        bits = searcher._filter_to_comb(self.parents)
        if not bits:
            return

        m = self.child.matcher(searcher, searcher.boolean_context())
        maxdoc = searcher.doc_count_all()
        while m.is_active():
            docnum = m.id()
            parentdoc = bits.before(docnum + 1)
            nextparent = bits.after(docnum) or maxdoc
            for i in xrange(parentdoc, nextparent):
                yield i
            m.skip_to(nextparent)
Example #20
0
    def _process_file(self, filename, doc_count):
        # This method processes a "job file" written out by the parent task. A
        # job file is a series of pickled (code, arguments) tuples. Currently
        # the only command code is 0=add_document.

        writer = self.writer
        tempstorage = writer.temp_storage()

        load = pickle.load
        with tempstorage.open_file(filename).raw_file() as f:
            for _ in xrange(doc_count):
                # Load the next pickled tuple from the file
                code, args = load(f)
                assert code == 0
                writer.add_document(**args)
        # Remove the job file
        tempstorage.delete_file(filename)
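The comment above describes the job-file format the parent task writes. As a hedged sketch of the writing side (the helper below, write_job_file, is hypothetical and not part of the original code; it only assumes the pickled (code, arguments) layout described in that comment), the producer could look like this:

import pickle

def write_job_file(path, docs):
    # Hypothetical illustration: write one pickled (code, arguments) tuple
    # per document, where code 0 means add_document, so a consumer like
    # _process_file above can load the records back one at a time.
    with open(path, "wb") as f:
        for fields in docs:
            pickle.dump((0, fields), f)
    return len(docs)  # the doc_count a consumer would be given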
Example #21
0
    def __init__(self,
                 submatchers,
                 doccount,
                 boost=1.0,
                 scored=True,
                 partsize=2048):
        CombinationMatcher.__init__(self, submatchers, boost=boost)
        self._scored = scored
        self._doccount = doccount

        if not partsize:
            partsize = doccount
        self._partsize = partsize

        self._a = array("d", (0 for _ in xrange(self._partsize)))
        self._docnum = self._min_id()
        self._read_part()
Example #22
0
    def _read_values(self):
        # If we haven't loaded the data from disk yet, load it now
        if self._data is None:
            self._read_data()

        # De-minify the values
        fixedsize = self._fixedsize
        vs = self._data[2]
        if fixedsize is None or fixedsize < 0:
            self._values = vs
        elif fixedsize == 0:
            self._values = (None,) * self._blocklength
        else:
            assert isinstance(vs, bytes_type)
            self._values = tuple(
                vs[i: i + fixedsize] for i in xrange(0, len(vs), fixedsize)
            )
Example #23
0
    def _read_part(self):
        scored = self._scored
        boost = self._boost
        limit = min(self._docnum + self._partsize, self._doccount)
        offset = self._docnum
        a = self._a

        # Clear the array
        for i in xrange(self._partsize):
            a[i] = 0

        # Add the scores from the submatchers into the array
        for m in self._submatchers:
            while m.is_active() and m.id() < limit:
                i = m.id() - offset
                if scored:
                    a[i] += m.score() * boost
                else:
                    a[i] = 1
                m.next()

        self._offset = offset
        self._limit = limit
Example #24
0
 def __iter__(self):
     for i in xrange(self._doccount):
         yield self[i]
Example #25
0
def random_bytes(size=28):
    gen = (random.randint(0, 255) for _ in xrange(size))
    if sys.version_info[0] >= 3:
        return bytes(gen)
    else:
        return array("B", gen).tostring()
Example #26
0
def random_name(size=28):
    return "".join(random.choice(IDCHARS) for _ in xrange(size))
Example #27
0
 def __iter__(self):
     return (self._default for _ in xrange(self._doccount))
Example #28
0
 def __iter__(self):
     for docnum in xrange(len(self)):
         yield self[docnum]
Example #29
0
 def __getitem__(self, docnum):
     fixedlen = self._fixedlen
     v = self._child[docnum]
     if not v:
         return []
     return [v[i:i + fixedlen] for i in xrange(0, len(v), fixedlen)]
Example #30
0
 def fill(self, docnum):
     write = self._dbfile.write
     default = self._defaultbytes
     if docnum > self._count:
         for _ in xrange(docnum - self._count):
             write(default)