Ejemplo n.º 1
0
def test_dawg():
    """Builds a small word graph under the "test" field root, writes it to
    temporary storage, and checks the words found under that field.
    """
    from whoosh.support.dawg import DawgBuilder

    with TempStorage() as storage:
        dawgfile = storage.create_file("test.dawg")

        builder = DawgBuilder(field_root=True)
        for word in ("special", "specials"):
            # The first key element is the field name, the rest are the
            # characters of the word
            builder.insert(["test"] + list(word))
        builder.write(dawgfile)

        field_node = builder.root.edge("test")
        found = list(dawg.flatten(field_node))
        assert_equal(found, ["special", "specials"])
Ejemplo n.º 2
0
def test_dawg():
    """Inserts "special" and "specials" under the "test" field root, writes
    the graph to temporary storage, and checks what the field flattens to.
    """
    from whoosh.support.dawg import DawgBuilder

    expected = ["special", "specials"]
    with TempStorage() as storage:
        outfile = storage.create_file("test.dawg")

        builder = DawgBuilder(field_root=True)
        for word in expected:
            # Key = field name followed by the word's characters
            builder.insert(["test"] + list(word))
        builder.write(outfile)

        test_edge = builder.root.edge("test")
        assert_equal(list(dawg.flatten(test_edge)), expected)
Ejemplo n.º 3
0
def add_spelling(ix, fieldnames, commit=True):
    """Adds spelling files to an existing index that was created without
    them, and modifies the schema so the given fields have the ``spelling``
    attribute. Only works on filedb indexes.

    >>> ix = index.open_dir("testindex")
    >>> add_spelling(ix, ["content", "tags"])

    :param ix: a :class:`whoosh.filedb.fileindex.FileIndex` object.
    :param fieldnames: a list of field names to create word graphs for.
    :param commit: if True (the default), the writer opened by this function
        is committed (without merging segments) after the word graph files
        have been written. If False, the writer is left uncommitted.
    """

    from whoosh.filedb.filereading import SegmentReader

    writer = ix.writer()
    storage = writer.storage
    schema = writer.schema

    # Write one word graph file per existing segment
    for segment in writer.segments:
        r = SegmentReader(storage, schema, segment)
        try:
            f = storage.create_file(segment.dawg_filename)
            dawg = DawgBuilder(field_root=True)
            for fieldname in fieldnames:
                # Each key is the field name followed by the word's characters
                ft = (fieldname,)
                for word in r.lexicon(fieldname):
                    dawg.insert(ft + tuple(word))
            dawg.write(f)
        finally:
            # Don't leave the segment's files open once its graph is built
            r.close()

    for fieldname in fieldnames:
        schema[fieldname].spelling = True

    if commit:
        writer.commit(merge=False)
Ejemplo n.º 4
0
def add_spelling(ix, fieldnames, commit=True):
    """Adds spelling files to an existing index that was created without
    them, and modifies the schema so the given fields have the ``spelling``
    attribute. Only works on filedb indexes.

    >>> ix = index.open_dir("testindex")
    >>> add_spelling(ix, ["content", "tags"])

    :param ix: a :class:`whoosh.filedb.fileindex.FileIndex` object.
    :param fieldnames: a list of field names to create word graphs for.
    :param commit: if True (the default), the writer opened by this function
        is committed (without merging segments) after the word graph files
        have been written. If False, the writer is left uncommitted.
    """

    from whoosh.filedb.filereading import SegmentReader

    writer = ix.writer()
    storage = writer.storage
    schema = writer.schema

    # Write one word graph file per existing segment
    for segment in writer.segments:
        r = SegmentReader(storage, schema, segment)
        try:
            f = storage.create_file(segment.dawg_filename)
            dawg = DawgBuilder(field_root=True)
            for fieldname in fieldnames:
                # Each key is the field name followed by the word's characters
                ft = (fieldname, )
                for word in r.lexicon(fieldname):
                    dawg.insert(ft + tuple(word))
            dawg.write(f)
        finally:
            # Don't leave the segment's files open once its graph is built
            r.close()

    for fieldname in fieldnames:
        schema[fieldname].spelling = True

    if commit:
        writer.commit(merge=False)
Ejemplo n.º 5
0
    def __init__(self,
                 ix,
                 poolclass=None,
                 procs=0,
                 blocklimit=128,
                 timeout=0.0,
                 delay=0.1,
                 name=None,
                 _lk=True,
                 **poolargs):
        """Sets up a writer for a new segment of the index ``ix``.

        :param ix: the index to write to; its ``lock()``, ``_read_toc()``,
            ``storage`` and ``indexname`` members are used here.
        :param poolclass: the class used to create the posting pool. If None,
            ``MultiPool`` is used when ``procs`` is greater than 1, otherwise
            ``TempfilePool``.
        :param procs: passed to the pool constructor; also selects the
            default pool class.
        :param blocklimit: stored on the writer and passed to the term
            postings writer.
        :param timeout: passed to ``try_for`` when acquiring the write lock.
        :param delay: passed to ``try_for`` when acquiring the write lock.
        :param name: the name for the new segment. If not given, the name is
            built from the index name and the new segment number.
        :param _lk: if true, acquire the index's "WRITELOCK" before reading
            the TOC.
        :param poolargs: extra keyword arguments passed to the pool
            constructor.
        :raises LockError: if ``_lk`` is true and the lock was not acquired.
        """

        # Take the write lock first (unless the caller opted out)
        self.writelock = None
        if _lk:
            lock = ix.lock("WRITELOCK")
            self.writelock = lock
            acquired = try_for(lock.acquire, timeout=timeout, delay=delay)
            if not acquired:
                raise LockError

        # Snapshot the current state of the index from its TOC
        toc = ix._read_toc()
        self.schema = toc.schema
        self.segments = toc.segments
        storage = ix.storage
        self.storage = storage
        self.indexname = ix.indexname
        self.is_closed = False

        self.blocklimit = blocklimit
        self.segment_number = toc.segment_counter + 1
        self.generation = toc.generation + 1

        # Record the first document number of each existing segment
        self._doc_offsets = []
        running_total = 0
        for existing in self.segments:
            self._doc_offsets.append(running_total)
            running_total += existing.doc_count_all()

        if not name:
            name = Segment.basename(self.indexname, self.segment_number)
        self.name = name
        self.docnum = 0
        self.fieldlength_totals = defaultdict(int)
        self._added = False
        self._unique_cache = {}

        # Throwaway segment object, used only for its .*_filename attributes
        segment = Segment(self.name, self.generation, 0, None, None, None)

        # Spelling: only create the word graph file and builder if at least
        # one field in the schema is spelled
        self.wordsets = {}
        self.dawg = None
        needs_dawg = any(field.spelling for field in self.schema)
        if needs_dawg:
            self.dawgfile = storage.create_file(segment.dawg_filename)
            self.dawg = DawgBuilder(field_root=True)

        # Terms index and term postings files, combined into a terms writer
        terms_file = storage.create_file(segment.termsindex_filename)
        terms_index = TermIndexWriter(terms_file)
        posts_file = storage.create_file(segment.termposts_filename)
        posts_writer = FilePostingWriter(posts_file, blocklimit=blocklimit)
        self.termswriter = TermsWriter(self.schema, terms_index, posts_writer,
                                       self.dawg)

        # Vector index and vector postings, only if the schema needs them
        if not self.schema.has_vectored_fields():
            self.vectorindex = None
            self.vpostwriter = None
        else:
            vector_file = storage.create_file(segment.vectorindex_filename)
            self.vectorindex = TermVectorWriter(vector_file)

            vector_posts = storage.create_file(segment.vectorposts_filename)
            self.vpostwriter = FilePostingWriter(vector_posts, stringids=True)

        # Stored fields file
        stored_file = storage.create_file(segment.storedfields_filename)
        self.storedfields = StoredFieldWriter(stored_file,
                                              self.schema.stored_names())

        # Field lengths file
        self.lengthfile = storage.create_file(segment.fieldlengths_filename)

        # Pick a default pool implementation if none was given
        if poolclass is None:
            if procs > 1:
                from whoosh.filedb.multiproc import MultiPool
                poolclass = MultiPool
            else:
                poolclass = TempfilePool
        self.pool = poolclass(self.schema, procs=procs, **poolargs)
Ejemplo n.º 6
0
class SegmentWriter(IndexWriter):
    """Index writer that accumulates added documents into a single new
    segment and, on commit, writes a new TOC listing that segment together
    with whatever existing segments the merge policy keeps.
    """

    def __init__(self,
                 ix,
                 poolclass=None,
                 procs=0,
                 blocklimit=128,
                 timeout=0.0,
                 delay=0.1,
                 name=None,
                 _lk=True,
                 **poolargs):
        """
        :param ix: the index to write to; its ``lock()``, ``_read_toc()``,
            ``storage`` and ``indexname`` members are used here.
        :param poolclass: the class used to create the posting pool. If None,
            ``MultiPool`` is used when ``procs`` is greater than 1, otherwise
            ``TempfilePool``.
        :param procs: passed to the pool constructor; also selects the
            default pool class.
        :param blocklimit: stored on the writer and passed to the term
            postings writer.
        :param timeout: passed to ``try_for`` when acquiring the write lock.
        :param delay: passed to ``try_for`` when acquiring the write lock.
        :param name: the name for the new segment. If not given, the name is
            built from the index name and the new segment number.
        :param _lk: if true, acquire the index's "WRITELOCK" before reading
            the TOC.
        :param poolargs: extra keyword arguments passed to the pool
            constructor.
        :raises LockError: if ``_lk`` is true and the lock was not acquired.
        """

        self.writelock = None
        if _lk:
            self.writelock = ix.lock("WRITELOCK")
            if not try_for(
                    self.writelock.acquire, timeout=timeout, delay=delay):
                raise LockError

        info = ix._read_toc()
        self.schema = info.schema
        self.segments = info.segments
        self.storage = storage = ix.storage
        self.indexname = ix.indexname
        self.is_closed = False

        self.blocklimit = blocklimit
        self.segment_number = info.segment_counter + 1
        self.generation = info.generation + 1

        # First document number of each existing segment, in segment order
        self._doc_offsets = []
        base = 0
        for s in self.segments:
            self._doc_offsets.append(base)
            base += s.doc_count_all()

        self.name = name or Segment.basename(self.indexname,
                                             self.segment_number)
        self.docnum = 0
        self.fieldlength_totals = defaultdict(int)
        self._added = False
        self._unique_cache = {}

        # Create a temporary segment to use its .*_filename attributes
        segment = Segment(self.name, self.generation, 0, None, None, None)

        # Spelling
        self.wordsets = {}
        self.dawg = None
        if any(field.spelling for field in self.schema):
            self.dawgfile = storage.create_file(segment.dawg_filename)
            self.dawg = DawgBuilder(field_root=True)

        # Terms index
        tf = storage.create_file(segment.termsindex_filename)
        ti = TermIndexWriter(tf)
        # Term postings file
        pf = storage.create_file(segment.termposts_filename)
        pw = FilePostingWriter(pf, blocklimit=blocklimit)
        # Terms writer
        self.termswriter = TermsWriter(self.schema, ti, pw, self.dawg)

        if self.schema.has_vectored_fields():
            # Vector index
            vf = storage.create_file(segment.vectorindex_filename)
            self.vectorindex = TermVectorWriter(vf)

            # Vector posting file
            vpf = storage.create_file(segment.vectorposts_filename)
            self.vpostwriter = FilePostingWriter(vpf, stringids=True)
        else:
            self.vectorindex = None
            self.vpostwriter = None

        # Stored fields file
        sf = storage.create_file(segment.storedfields_filename)
        self.storedfields = StoredFieldWriter(sf, self.schema.stored_names())

        # Field lengths file
        self.lengthfile = storage.create_file(segment.fieldlengths_filename)

        # Create the pool
        if poolclass is None:
            if procs > 1:
                from whoosh.filedb.multiproc import MultiPool
                poolclass = MultiPool
            else:
                poolclass = TempfilePool
        self.pool = poolclass(self.schema, procs=procs, **poolargs)

    def _check_state(self):
        """Raises :class:`IndexingError` if this writer has been closed."""

        if self.is_closed:
            raise IndexingError("This writer is closed")

    def add_field(self, fieldname, fieldspec, **kwargs):
        """Adds a field to the schema. Raises an exception if data has
        already been added to this writer.
        """

        self._check_state()
        if self._added:
            raise Exception("Can't modify schema after adding data to writer")
        super(SegmentWriter, self).add_field(fieldname, fieldspec, **kwargs)

    def remove_field(self, fieldname):
        """Removes a field from the schema. Raises an exception if data has
        already been added to this writer.
        """

        self._check_state()
        if self._added:
            raise Exception("Can't modify schema after adding data to writer")
        super(SegmentWriter, self).remove_field(fieldname)

    def _document_segment(self, docnum):
        # Returns the position in self.segments (not the Segment object) of
        # the segment containing the given document number.

        offsets = self._doc_offsets
        if len(offsets) == 1:
            return 0
        return bisect_right(offsets, docnum) - 1

    def _segment_and_docnum(self, docnum):
        # Returns an (index.Segment, segment_docnum) pair for the segment
        # containing the given document number.

        segmentnum = self._document_segment(docnum)
        offset = self._doc_offsets[segmentnum]
        segment = self.segments[segmentnum]
        return segment, docnum - offset

    def has_deletions(self):
        """
        :returns: True if this index has documents that are marked deleted but
            haven't been optimized out of the index yet.
        """

        return any(s.has_deletions() for s in self.segments)

    def delete_document(self, docnum, delete=True):
        """Marks (or, with ``delete=False``, passes the flag through to the
        segment for) the document with the given index-wide number.

        :param docnum: the index-wide document number.
        :param delete: passed through to the segment's ``delete_document``.
        :raises IndexingError: if ``docnum`` is not less than the total
            ``doccount`` of the existing segments.
        """

        self._check_state()
        if docnum >= sum(seg.doccount for seg in self.segments):
            raise IndexingError("No document ID %r in this index" % docnum)
        segment, segdocnum = self._segment_and_docnum(docnum)
        segment.delete_document(segdocnum, delete=delete)

    def deleted_count(self):
        """
        :returns: the total number of deleted documents in the index.
        """

        return sum(s.deleted_count() for s in self.segments)

    def is_deleted(self, docnum):
        """
        :returns: whether the segment containing the given index-wide
            document number reports the document as deleted.
        """

        segment, segdocnum = self._segment_and_docnum(docnum)
        return segment.is_deleted(segdocnum)

    def reader(self, reuse=None):
        """Returns a reader for the existing segments as seen by this writer.

        :param reuse: passed through to ``FileIndex._reader``.
        """

        self._check_state()
        from whoosh.filedb.fileindex import FileIndex

        return FileIndex._reader(self.storage,
                                 self.schema,
                                 self.segments,
                                 self.generation,
                                 reuse=reuse)

    def add_reader(self, reader):
        """Copies the non-deleted documents of ``reader`` (stored fields,
        field lengths, vectors, separately-handled spelling words, and
        postings) into this writer, restricted to fields that exist in this
        writer's schema.
        """

        self._check_state()
        startdoc = self.docnum

        # When the reader has deletions, document numbers are compacted, so
        # we need a map from old numbers to new ones for the postings pass
        has_deletions = reader.has_deletions()
        if has_deletions:
            docmap = {}

        fieldnames = set(self.schema.names())

        # Add stored documents, vectors, and field lengths
        for docnum in reader.all_doc_ids():
            if (not has_deletions) or (not reader.is_deleted(docnum)):
                d = dict(item
                         for item in iteritems(reader.stored_fields(docnum))
                         if item[0] in fieldnames)
                # We have to append a dictionary for every document, even if
                # it's empty.
                self.storedfields.append(d)

                if has_deletions:
                    docmap[docnum] = self.docnum

                for fieldname in reader.schema.scorable_names():
                    length = reader.doc_field_length(docnum, fieldname)
                    if length and fieldname in fieldnames:
                        self.pool.add_field_length(self.docnum, fieldname,
                                                   length)

                for fieldname in reader.schema.vector_names():
                    if (fieldname in fieldnames
                            and reader.has_vector(docnum, fieldname)):
                        vpostreader = reader.vector(docnum, fieldname)
                        self._add_vector_reader(self.docnum, fieldname,
                                                vpostreader)

                self.docnum += 1

        # Add dawg contents to word sets for fields that require separate
        # handling
        for fieldname in self.schema.separate_spelling_names():
            if reader.has_word_graph(fieldname):
                graph = reader.word_graph(fieldname)
                self.add_spell_words(fieldname, flatten(graph))

        # Add postings
        for fieldname, text in reader.all_terms():
            if fieldname in fieldnames:
                postreader = reader.postings(fieldname, text)
                while postreader.is_active():
                    docnum = postreader.id()
                    valuestring = postreader.value()
                    if has_deletions:
                        newdoc = docmap[docnum]
                    else:
                        newdoc = startdoc + docnum

                    self.pool.add_posting(fieldname, text, newdoc,
                                          postreader.weight(), valuestring)
                    postreader.next()

        self._added = True

    def add_document(self, **fields):
        """Adds a document built from the given keyword arguments.

        Keys that start with an underscore are not treated as field names.
        A key ``_stored_<fieldname>`` overrides the value stored for
        ``<fieldname>``. Fields whose value is None are skipped.

        :raises UnknownFieldError: if a non-underscore key is not in the
            schema.
        """

        self._check_state()
        schema = self.schema
        docboost = self._doc_boost(fields)

        # Sort the keys
        fieldnames = sorted(
            [name for name in fields.keys() if not name.startswith("_")])

        # Check if the caller gave us a bogus field
        for name in fieldnames:
            if name not in schema:
                raise UnknownFieldError("No field named %r in %s" %
                                        (name, schema))

        storedvalues = {}

        docnum = self.docnum
        for fieldname in fieldnames:
            value = fields.get(fieldname)
            if value is None:
                continue
            field = schema[fieldname]

            if field.indexed:
                fieldboost = self._field_boost(fields, fieldname, docboost)
                self.pool.add_content(docnum, fieldname, field, value,
                                      fieldboost)

            if field.separate_spelling():
                # This field requires spelling words to be added in a separate
                # step, instead of as part of indexing
                self.add_spell_words(fieldname, field.spellable_words(value))

            vformat = field.vector
            if vformat:
                wvs = vformat.word_values(value, field.analyzer, mode="index")
                vlist = sorted((w, weight, valuestring)
                               for w, _, weight, valuestring in wvs)
                self._add_vector(docnum, fieldname, vlist)

            if field.stored:
                # Caller can override the stored value by including a key
                # _stored_<fieldname>
                storedvalue = value
                storedname = "_stored_" + fieldname
                if storedname in fields:
                    storedvalue = fields[storedname]
                storedvalues[fieldname] = storedvalue

        self._added = True
        self.storedfields.append(storedvalues)
        self.docnum += 1

    def add_spell_words(self, fieldname, words):
        """Adds the given words to the set of spelling words accumulated for
        the field. The words are inserted into the word graph at commit time.
        """

        # Get or make a set for the words in this field
        if fieldname not in self.wordsets:
            self.wordsets[fieldname] = set()
        wordset = self.wordsets[fieldname]

        # If the in-memory set is getting big, replace it with an
        # on-disk set
        if has_sqlite and isinstance(wordset, set) and len(wordset) > 4096:
            diskset = DiskSet(wordset)
            self.wordsets[fieldname] = wordset = diskset

        for word in words:
            wordset.add(word)

        self._added = True

    def _add_wordsets(self):
        # Inserts every accumulated spelling word into the word graph, keyed
        # by (fieldname, char, char, ...), then destroys any on-disk sets.

        dawg = self.dawg
        for fieldname in self.wordsets:
            ws = self.wordsets[fieldname]
            ft = (fieldname, )

            # In-memory sets are sorted here; other sets are iterated as-is
            words = sorted(ws) if isinstance(ws, set) else iter(ws)
            for text in words:
                dawg.insert(ft + tuple(text))

            if isinstance(ws, DiskSet):
                ws.destroy()

    def _add_vector(self, docnum, fieldname, vlist):
        # Writes a term vector from a list of (text, weight, valuestring)
        # tuples and records its offset under (docnum, fieldname).

        vpostwriter = self.vpostwriter
        offset = vpostwriter.start(self.schema[fieldname].vector)
        for text, weight, valuestring in vlist:
            #assert isinstance(text, text_type), "%r is not unicode" % text
            vpostwriter.write(text, weight, valuestring, 0)
        vpostwriter.finish(inlinelimit=0)
        self.vectorindex.add((docnum, fieldname), offset)

    def _add_vector_reader(self, docnum, fieldname, vreader):
        # Copies a term vector from an existing vector reader and records its
        # offset under (docnum, fieldname).

        vpostwriter = self.vpostwriter
        offset = vpostwriter.start(self.schema[fieldname].vector)
        while vreader.is_active():
            # text, weight, valuestring, fieldlen
            vpostwriter.write(vreader.id(), vreader.weight(), vreader.value(),
                              0)
            vreader.next()
        vpostwriter.finish(inlinelimit=0)
        self.vectorindex.add((docnum, fieldname), offset)

    def _close_all(self):
        # Marks the writer closed and closes the files it opened.

        self.is_closed = True

        self.termswriter.close()
        self.storedfields.close()
        if not self.lengthfile.is_closed:
            self.lengthfile.close()
        if self.vectorindex:
            self.vectorindex.close()
        if self.vpostwriter:
            self.vpostwriter.close()

        # The word graph file is created in __init__ whenever the schema has
        # a spelled field, but it is only handed to the graph builder at
        # commit time when something was added. Close it here so that
        # cancel() and empty commits don't leave the handle open.
        dawgfile = getattr(self, "dawgfile", None)
        if dawgfile is not None and not dawgfile.is_closed:
            dawgfile.close()

    def _getsegment(self):
        # Builds the Segment object describing what this writer has written.

        return Segment(self.name, self.generation, self.docnum,
                       self.pool.fieldlength_totals(),
                       self.pool.fieldlength_mins(),
                       self.pool.fieldlength_maxes())

    def commit(self, mergetype=None, optimize=False, merge=True):
        """Finishes writing and saves all additions and changes to disk.

        There are four possible ways to use this method::

            # Merge small segments but leave large segments, trying to
            # balance fast commits with fast searching:
            writer.commit()

            # Merge all segments into a single segment:
            writer.commit(optimize=True)

            # Don't merge any existing segments:
            writer.commit(merge=False)

            # Use a custom merge function
            writer.commit(mergetype=my_merge_function)

        :param mergetype: a custom merge function taking a Writer object and
            segment list as arguments, and returning a new segment list. If you
            supply a ``mergetype`` function, the values of the ``optimize`` and
            ``merge`` arguments are ignored.
        :param optimize: if True, all existing segments are merged with the
            documents you've added to this writer (and the value of the
            ``merge`` argument is ignored).
        :param merge: if False, do not merge small segments.
        """

        self._check_state()
        try:
            if mergetype:
                pass
            elif optimize:
                mergetype = OPTIMIZE
            elif not merge:
                mergetype = NO_MERGE
            else:
                mergetype = MERGE_SMALL

            # Call the merge policy function. The policy may choose to merge
            # other segments into this writer's pool
            new_segments = mergetype(self, self.segments)

            if self._added:
                # Create a Segment object for the segment created by this
                # writer
                thissegment = self._getsegment()

                # Tell the pool we're finished adding information, it should
                # add its accumulated data to the lengths, terms index, and
                # posting files.
                self.pool.finish(self.termswriter, self.docnum,
                                 self.lengthfile)

                # Write out spelling files
                if self.dawg:
                    # Insert any wordsets we've accumulated into the word graph
                    self._add_wordsets()
                    # Write out the word graph
                    self.dawg.write(self.dawgfile)

                # Add new segment to the list of remaining segments returned by
                # the merge policy function
                new_segments.append(thissegment)
            else:
                self.pool.cleanup()

            # Close all files, write a new TOC with the new segment list, and
            # release the lock.
            self._close_all()

            from whoosh.filedb.fileindex import _write_toc, _clean_files

            _write_toc(self.storage, self.schema, self.indexname,
                       self.generation, self.segment_number, new_segments)

            # Delete leftover files
            _clean_files(self.storage, self.indexname, self.generation,
                         new_segments)

        finally:
            if self.writelock:
                self.writelock.release()

    def cancel(self):
        """Cancels the pool, closes this writer's files, and releases the
        write lock (if one was taken) without writing a new TOC.
        """

        self._check_state()
        try:
            self.pool.cancel()
            self._close_all()
        finally:
            if self.writelock:
                self.writelock.release()
Ejemplo n.º 7
0
 def _make_dawg_files(self):
     """Creates the segment's DAWG file and a builder that writes to it,
     storing the builder as ``self.dawg``.
     """
     graph_file = self.segment.create_file(self.storage, StdCodec.DAWG_EXT)
     builder = DawgBuilder(graph_file, field_root=True)
     self.dawg = builder
Ejemplo n.º 8
0
class StdFieldWriter(base.FieldWriter):
    """Field writer that records terms in a terms index file and their
    postings, in blocks, in a postings file. Words of spelled fields are
    also inserted into a DAWG builder, which is created on first use.
    """

    def __init__(self,
                 storage,
                 segment,
                 blocklimit=128,
                 compression=3,
                 inlinelimit=1):
        """
        :param storage: passed to ``segment.create_file`` to create files.
        :param segment: the segment object used to create this writer's
            files.
        :param blocklimit: a block is written out once it holds more than
            this many postings.
        :param compression: passed to the block's ``to_file`` method.
        :param inlinelimit: a term's single block is only stored inline in
            the terms index if it has fewer postings than this.
        """
        self.storage = storage
        self.segment = segment

        # Nothing is being written yet: no current field, term, or block
        self.fieldname = None
        self.text = None
        self.field = None
        self.format = None
        self.spelling = False
        self.block = None
        self.terminfo = None

        self.blocklimit = blocklimit
        self.compression = compression
        self.inlinelimit = inlinelimit

        # We'll wait to create the DAWG builder until someone actually adds
        # a spelled field
        self.dawg = None

        terms_file = segment.create_file(storage, StdCodec.TERMS_EXT)
        self.termsindex = TermIndexWriter(terms_file)
        self.postfile = segment.create_file(storage, StdCodec.POSTS_EXT)

    def _make_dawg_files(self):
        # Creates the DAWG file and the builder that writes to it
        graph_file = self.segment.create_file(self.storage,
                                              StdCodec.DAWG_EXT)
        self.dawg = DawgBuilder(graph_file, field_root=True)

    def _reset_block(self):
        # Starts a fresh, empty block for the current field's format
        posting_size = self.format.posting_size
        self.block = StdBlock(posting_size)

    def _write_block(self):
        # Registers the current block with the term info, writes it to the
        # postings file, and starts a new block
        current = self.block
        self.terminfo.add_block(current)
        current.to_file(self.postfile, compression=self.compression)
        self._reset_block()
        self.blockcount += 1

    def _start_blocklist(self):
        # Begins a new list of posting blocks in the postings file
        self._reset_block()
        out = self.postfile

        # Remember where this list starts, then write the magic number
        self.startoffset = out.tell()
        out.write(StdBlock.magic)
        # Placeholder for block count; filled in by _finish_blocklist
        self.blockcount = 0
        out.write_uint(0)

    def _finish_blocklist(self):
        # Flush whatever is left in the current block
        if self.block:
            self._write_block()

        # Seek back to the start of this list of posting blocks and write the
        # number of blocks, then return to where we were
        out = self.postfile
        out.flush()
        resume_at = out.tell()
        out.seek(self.startoffset + 4)
        out.write_uint(self.blockcount)
        out.seek(resume_at)

        self.block = None

    def start_field(self, fieldname, fieldobj):
        """Makes the given field the current field for subsequent terms."""
        self.fieldname = fieldname
        self.field = fieldobj
        self.format = fieldobj.format
        # Fields with separate spelling get their words through
        # add_spell_word() instead of start_term()
        self.spelling = fieldobj.spelling and not fieldobj.separate_spelling()

    def start_term(self, text):
        """Begins a new term in the current field. Raises an exception if
        the previous term was not finished.
        """
        if self.block is not None:
            raise Exception("Called start_term in a block")
        self.text = text
        self.terminfo = base.FileTermInfo()
        if self.spelling:
            if self.dawg is None:
                self._make_dawg_files()
            key = (self.fieldname, ) + tuple(text)
            self.dawg.insert(key)
        self._start_blocklist()

    def add(self, docnum, weight, valuestring, length):
        """Adds a posting to the current block, writing the block out once
        it holds more than ``blocklimit`` postings.
        """
        current = self.block
        current.add(docnum, weight, valuestring, length)
        if len(current) > self.blocklimit:
            self._write_block()

    def add_spell_word(self, fieldname, text):
        """Inserts a word for the given field directly into the DAWG,
        creating the DAWG file and builder first if necessary.
        """
        if self.dawg is None:
            self._make_dawg_files()
        key = (fieldname, ) + tuple(text)
        self.dawg.insert(key)

    def finish_term(self):
        """Finishes the current term and adds its info to the terms index.
        Raises an exception if no term was started.
        """
        block = self.block
        if block is None:
            raise Exception("Called finish_term when not in a block")
        terminfo = self.terminfo

        inline = (self.blockcount < 1 and block
                  and len(block) < self.inlinelimit)
        if inline:
            # Inline the single block
            terminfo.add_block(block)
            if block.values:
                vals = tuple(block.values)
            else:
                vals = None
            postings = (tuple(block.ids), tuple(block.weights), vals)
        else:
            self._finish_blocklist()
            postings = self.startoffset

        self.block = None
        terminfo.postings = postings
        self.termsindex.add((self.fieldname, self.text), terminfo)

    def close(self):
        """Closes the terms index, the postings file, and the DAWG builder
        (if one was created).
        """
        self.termsindex.close()
        self.postfile.close()
        builder = self.dawg
        if builder is not None:
            builder.close()
Ejemplo n.º 9
0
 def _make_dawg_files(self):
     """Creates the segment's DAWG file and a builder that writes to it,
     storing the builder as ``self.dawg``.
     """
     graph_file = self.segment.create_file(self.storage, StdCodec.DAWG_EXT)
     builder = DawgBuilder(graph_file, field_root=True)
     self.dawg = builder
Ejemplo n.º 10
0
class StdFieldWriter(base.FieldWriter):
    """Field writer that records terms in a terms index file and their
    postings, in blocks, in a postings file. Words of spelled fields are
    also inserted into a DAWG builder, which is created on first use.
    """

    def __init__(self, storage, segment, blocklimit=128, compression=3,
                 inlinelimit=1):
        """
        :param storage: passed to ``segment.create_file`` to create files.
        :param segment: the segment object used to create this writer's
            files.
        :param blocklimit: a block is written out once it holds more than
            this many postings.
        :param compression: passed to the block's ``to_file`` method.
        :param inlinelimit: a term's single block is only stored inline in
            the terms index if it has fewer postings than this.
        """
        self.storage = storage
        self.segment = segment

        # Nothing is being written yet: no current field, term, or block
        self.fieldname = None
        self.text = None
        self.field = None
        self.format = None
        self.spelling = False
        self.block = None
        self.terminfo = None

        self.blocklimit = blocklimit
        self.compression = compression
        self.inlinelimit = inlinelimit

        # We'll wait to create the DAWG builder until someone actually adds
        # a spelled field
        self.dawg = None

        terms_file = segment.create_file(storage, StdCodec.TERMS_EXT)
        self.termsindex = TermIndexWriter(terms_file)
        self.postfile = segment.create_file(storage, StdCodec.POSTS_EXT)

    def _make_dawg_files(self):
        # Creates the DAWG file and the builder that writes to it
        graph_file = self.segment.create_file(self.storage,
                                              StdCodec.DAWG_EXT)
        self.dawg = DawgBuilder(graph_file, field_root=True)

    def _reset_block(self):
        # Starts a fresh, empty block for the current field's format
        posting_size = self.format.posting_size
        self.block = StdBlock(posting_size)

    def _write_block(self):
        # Registers the current block with the term info, writes it to the
        # postings file, and starts a new block
        current = self.block
        self.terminfo.add_block(current)
        current.to_file(self.postfile, compression=self.compression)
        self._reset_block()
        self.blockcount += 1

    def _start_blocklist(self):
        # Begins a new list of posting blocks in the postings file
        self._reset_block()
        out = self.postfile

        # Remember where this list starts, then write the magic number
        self.startoffset = out.tell()
        out.write(StdBlock.magic)
        # Placeholder for block count; filled in by _finish_blocklist
        self.blockcount = 0
        out.write_uint(0)

    def _finish_blocklist(self):
        # Flush whatever is left in the current block
        if self.block:
            self._write_block()

        # Seek back to the start of this list of posting blocks and write the
        # number of blocks, then return to where we were
        out = self.postfile
        out.flush()
        resume_at = out.tell()
        out.seek(self.startoffset + 4)
        out.write_uint(self.blockcount)
        out.seek(resume_at)

        self.block = None

    def start_field(self, fieldname, fieldobj):
        """Makes the given field the current field for subsequent terms."""
        self.fieldname = fieldname
        self.field = fieldobj
        self.format = fieldobj.format
        # Fields with separate spelling get their words through
        # add_spell_word() instead of start_term()
        self.spelling = fieldobj.spelling and not fieldobj.separate_spelling()

    def start_term(self, text):
        """Begins a new term in the current field. Raises an exception if
        the previous term was not finished.
        """
        if self.block is not None:
            raise Exception("Called start_term in a block")
        self.text = text
        self.terminfo = base.FileTermInfo()
        if self.spelling:
            if self.dawg is None:
                self._make_dawg_files()
            key = (self.fieldname,) + tuple(text)
            self.dawg.insert(key)
        self._start_blocklist()

    def add(self, docnum, weight, valuestring, length):
        """Adds a posting to the current block, writing the block out once
        it holds more than ``blocklimit`` postings.
        """
        current = self.block
        current.add(docnum, weight, valuestring, length)
        if len(current) > self.blocklimit:
            self._write_block()

    def add_spell_word(self, fieldname, text):
        """Inserts a word for the given field directly into the DAWG,
        creating the DAWG file and builder first if necessary.
        """
        if self.dawg is None:
            self._make_dawg_files()
        key = (fieldname,) + tuple(text)
        self.dawg.insert(key)

    def finish_term(self):
        """Finishes the current term and adds its info to the terms index.
        Raises an exception if no term was started.
        """
        block = self.block
        if block is None:
            raise Exception("Called finish_term when not in a block")
        terminfo = self.terminfo

        inline = (self.blockcount < 1 and block
                  and len(block) < self.inlinelimit)
        if inline:
            # Inline the single block
            terminfo.add_block(block)
            if block.values:
                vals = tuple(block.values)
            else:
                vals = None
            postings = (tuple(block.ids), tuple(block.weights), vals)
        else:
            self._finish_blocklist()
            postings = self.startoffset

        self.block = None
        terminfo.postings = postings
        self.termsindex.add((self.fieldname, self.text), terminfo)

    def close(self):
        """Closes the terms index, the postings file, and the DAWG builder
        (if one was created).
        """
        self.termsindex.close()
        self.postfile.close()
        builder = self.dawg
        if builder is not None:
            builder.close()
Ejemplo n.º 11
0
    def __init__(self, ix, poolclass=None, procs=0, blocklimit=128,
                 timeout=0.0, delay=0.1, name=None, _lk=True, **poolargs):
        """Sets up a writer for a new segment of the index ``ix``.

        :param ix: the index to write to; its ``lock()``, ``_read_toc()``,
            ``storage`` and ``indexname`` members are used here.
        :param poolclass: the class used to create the posting pool. If None,
            ``MultiPool`` is used when ``procs`` is greater than 1, otherwise
            ``TempfilePool``.
        :param procs: passed to the pool constructor; also selects the
            default pool class.
        :param blocklimit: stored on the writer and passed to the term
            postings writer.
        :param timeout: passed to ``try_for`` when acquiring the write lock.
        :param delay: passed to ``try_for`` when acquiring the write lock.
        :param name: the name for the new segment. If not given, the name is
            built from the index name and the new segment number.
        :param _lk: if true, acquire the index's "WRITELOCK" before reading
            the TOC.
        :param poolargs: extra keyword arguments passed to the pool
            constructor.
        :raises LockError: if ``_lk`` is true and the lock was not acquired.
        """

        # Take the write lock first (unless the caller opted out)
        self.writelock = None
        if _lk:
            lock = ix.lock("WRITELOCK")
            self.writelock = lock
            acquired = try_for(lock.acquire, timeout=timeout, delay=delay)
            if not acquired:
                raise LockError

        # Snapshot the current state of the index from its TOC
        toc = ix._read_toc()
        self.schema = toc.schema
        self.segments = toc.segments
        storage = ix.storage
        self.storage = storage
        self.indexname = ix.indexname
        self.is_closed = False

        self.blocklimit = blocklimit
        self.segment_number = toc.segment_counter + 1
        self.generation = toc.generation + 1

        # Record the first document number of each existing segment
        self._doc_offsets = []
        running_total = 0
        for existing in self.segments:
            self._doc_offsets.append(running_total)
            running_total += existing.doc_count_all()

        if not name:
            name = Segment.basename(self.indexname, self.segment_number)
        self.name = name
        self.docnum = 0
        self.fieldlength_totals = defaultdict(int)
        self._added = False
        self._unique_cache = {}

        # Throwaway segment object, used only for its .*_filename attributes
        segment = Segment(self.name, self.generation, 0, None, None, None)

        # Spelling: only create the word graph file and builder if at least
        # one field in the schema is spelled
        self.wordsets = {}
        self.dawg = None
        needs_dawg = any(field.spelling for field in self.schema)
        if needs_dawg:
            self.dawgfile = storage.create_file(segment.dawg_filename)
            self.dawg = DawgBuilder(field_root=True)

        # Terms index and term postings files, combined into a terms writer
        terms_file = storage.create_file(segment.termsindex_filename)
        terms_index = TermIndexWriter(terms_file)
        posts_file = storage.create_file(segment.termposts_filename)
        posts_writer = FilePostingWriter(posts_file, blocklimit=blocklimit)
        self.termswriter = TermsWriter(self.schema, terms_index, posts_writer,
                                       self.dawg)

        # Vector index and vector postings, only if the schema needs them
        if not self.schema.has_vectored_fields():
            self.vectorindex = None
            self.vpostwriter = None
        else:
            vector_file = storage.create_file(segment.vectorindex_filename)
            self.vectorindex = TermVectorWriter(vector_file)

            vector_posts = storage.create_file(segment.vectorposts_filename)
            self.vpostwriter = FilePostingWriter(vector_posts, stringids=True)

        # Stored fields file
        stored_file = storage.create_file(segment.storedfields_filename)
        self.storedfields = StoredFieldWriter(stored_file,
                                              self.schema.stored_names())

        # Field lengths file
        self.lengthfile = storage.create_file(segment.fieldlengths_filename)

        # Pick a default pool implementation if none was given
        if poolclass is None:
            if procs > 1:
                from whoosh.filedb.multiproc import MultiPool
                poolclass = MultiPool
            else:
                poolclass = TempfilePool
        self.pool = poolclass(self.schema, procs=procs, **poolargs)
Ejemplo n.º 12
0
class SegmentWriter(IndexWriter):
    def __init__(self, ix, poolclass=None, procs=0, blocklimit=128,
                 timeout=0.0, delay=0.1, name=None, _lk=True, **poolargs):
        """Opens a writer that builds a new segment for the index ``ix``.

        If the write lock was acquired but a later initialization step fails,
        the lock is released before the exception propagates, so a failed
        constructor does not leave the index locked.

        :param ix: the index to write to. Its ``lock()``, ``_read_toc()``,
            ``storage`` and ``indexname`` members are used.
        :param poolclass: the class used to create the posting pool. If None,
            ``MultiPool`` is used when ``procs`` is greater than 1 and
            ``TempfilePool`` otherwise.
        :param procs: passed to the pool class as ``procs``.
        :param blocklimit: passed to the term postings writer.
        :param timeout: passed to ``try_for`` when acquiring the write lock.
        :param delay: passed to ``try_for`` when acquiring the write lock.
        :param name: the name of the new segment. If not given, a name is
            generated from the index name and the new segment number.
        :param _lk: if false, the writer does not take the index's
            ``"WRITELOCK"`` lock.
        :param poolargs: extra keyword arguments passed to the pool class.
        :raises LockError: if the write lock could not be acquired.
        """

        self.writelock = None
        if _lk:
            self.writelock = ix.lock("WRITELOCK")
            if not try_for(self.writelock.acquire, timeout=timeout,
                           delay=delay):
                raise LockError

        # From here on the lock (if any) is held, so any failure must release
        # it: the caller never receives the object and therefore cannot call
        # commit() or cancel() to do so.
        try:
            info = ix._read_toc()
            self.schema = info.schema
            self.segments = info.segments
            self.storage = storage = ix.storage
            self.indexname = ix.indexname
            self.is_closed = False

            self.blocklimit = blocklimit
            self.segment_number = info.segment_counter + 1
            self.generation = info.generation + 1

            # Starting (index-wide) document number of each existing segment
            self._doc_offsets = []
            base = 0
            for s in self.segments:
                self._doc_offsets.append(base)
                base += s.doc_count_all()

            self.name = name or Segment.basename(self.indexname,
                                                 self.segment_number)
            self.docnum = 0
            self.fieldlength_totals = defaultdict(int)
            self._added = False
            self._unique_cache = {}

            # Create a temporary segment to use its .*_filename attributes
            segment = Segment(self.name, self.generation, 0, None, None, None)

            # Spelling
            self.wordsets = {}
            self.dawg = None
            if any(field.spelling for field in self.schema):
                self.dawgfile = storage.create_file(segment.dawg_filename)
                self.dawg = DawgBuilder(field_root=True)

            # Terms index
            tf = storage.create_file(segment.termsindex_filename)
            ti = TermIndexWriter(tf)
            # Term postings file
            pf = storage.create_file(segment.termposts_filename)
            pw = FilePostingWriter(pf, blocklimit=blocklimit)
            # Terms writer
            self.termswriter = TermsWriter(self.schema, ti, pw, self.dawg)

            if self.schema.has_vectored_fields():
                # Vector index
                vf = storage.create_file(segment.vectorindex_filename)
                self.vectorindex = TermVectorWriter(vf)

                # Vector posting file
                vpf = storage.create_file(segment.vectorposts_filename)
                self.vpostwriter = FilePostingWriter(vpf, stringids=True)
            else:
                self.vectorindex = None
                self.vpostwriter = None

            # Stored fields file
            sf = storage.create_file(segment.storedfields_filename)
            self.storedfields = StoredFieldWriter(sf,
                                                  self.schema.stored_names())

            # Field lengths file
            self.lengthfile = storage.create_file(
                segment.fieldlengths_filename)

            # Create the pool
            if poolclass is None:
                if procs > 1:
                    from whoosh.filedb.multiproc import MultiPool
                    poolclass = MultiPool
                else:
                    poolclass = TempfilePool
            self.pool = poolclass(self.schema, procs=procs, **poolargs)
        except BaseException:
            if self.writelock:
                self.writelock.release()
            raise

    def _check_state(self):
        if self.is_closed:
            raise IndexingError("This writer is closed")

    def add_field(self, fieldname, fieldspec, **kwargs):
        self._check_state()
        if self._added:
            raise Exception("Can't modify schema after adding data to writer")
        super(SegmentWriter, self).add_field(fieldname, fieldspec, **kwargs)

    def remove_field(self, fieldname):
        self._check_state()
        if self._added:
            raise Exception("Can't modify schema after adding data to writer")
        super(SegmentWriter, self).remove_field(fieldname)

    def _document_segment(self, docnum):
        #Returns the index.Segment object containing the given document
        #number.

        offsets = self._doc_offsets
        if len(offsets) == 1:
            return 0
        return bisect_right(offsets, docnum) - 1

    def _segment_and_docnum(self, docnum):
        #Returns an (index.Segment, segment_docnum) pair for the segment
        #containing the given document number.

        segmentnum = self._document_segment(docnum)
        offset = self._doc_offsets[segmentnum]
        segment = self.segments[segmentnum]
        return segment, docnum - offset

    def has_deletions(self):
        """
        :returns: True if this index has documents that are marked deleted but
            haven't been optimized out of the index yet.
        """

        return any(s.has_deletions() for s in self.segments)

    def delete_document(self, docnum, delete=True):
        self._check_state()
        if docnum >= sum(seg.doccount for seg in self.segments):
            raise IndexingError("No document ID %r in this index" % docnum)
        segment, segdocnum = self._segment_and_docnum(docnum)
        segment.delete_document(segdocnum, delete=delete)

    def deleted_count(self):
        """
        :returns: the total number of deleted documents in the index.
        """

        return sum(s.deleted_count() for s in self.segments)

    def is_deleted(self, docnum):
        segment, segdocnum = self._segment_and_docnum(docnum)
        return segment.is_deleted(segdocnum)

    def reader(self, reuse=None):
        """Returns a reader built by ``FileIndex._reader`` from this writer's
        storage, schema, segment list and generation.

        :param reuse: passed through to ``FileIndex._reader``.
        """

        self._check_state()
        from whoosh.filedb.fileindex import FileIndex

        make_reader = FileIndex._reader
        return make_reader(self.storage, self.schema, self.segments,
                           self.generation, reuse=reuse)

    def add_reader(self, reader):
        self._check_state()
        startdoc = self.docnum

        has_deletions = reader.has_deletions()
        if has_deletions:
            docmap = {}

        fieldnames = set(self.schema.names())

        # Add stored documents, vectors, and field lengths
        for docnum in reader.all_doc_ids():
            if (not has_deletions) or (not reader.is_deleted(docnum)):
                d = dict(item for item
                         in iteritems(reader.stored_fields(docnum))
                         if item[0] in fieldnames)
                # We have to append a dictionary for every document, even if
                # it's empty.
                self.storedfields.append(d)

                if has_deletions:
                    docmap[docnum] = self.docnum

                for fieldname in reader.schema.scorable_names():
                    length = reader.doc_field_length(docnum, fieldname)
                    if length and fieldname in fieldnames:
                        self.pool.add_field_length(self.docnum, fieldname,
                                                   length)

                for fieldname in reader.schema.vector_names():
                    if (fieldname in fieldnames
                        and reader.has_vector(docnum, fieldname)):
                        vpostreader = reader.vector(docnum, fieldname)
                        self._add_vector_reader(self.docnum, fieldname,
                                                vpostreader)

                self.docnum += 1

        # Add dawg contents to word sets for fields that require separate
        # handling
        for fieldname in self.schema.separate_spelling_names():
            if reader.has_word_graph(fieldname):
                graph = reader.word_graph(fieldname)
                self.add_spell_words(fieldname, flatten(graph))

        # Add postings
        for fieldname, text in reader.all_terms():
            if fieldname in fieldnames:
                postreader = reader.postings(fieldname, text)
                while postreader.is_active():
                    docnum = postreader.id()
                    valuestring = postreader.value()
                    if has_deletions:
                        newdoc = docmap[docnum]
                    else:
                        newdoc = startdoc + docnum

                    self.pool.add_posting(fieldname, text, newdoc,
                                          postreader.weight(), valuestring)
                    postreader.next()

        self._added = True

    def add_document(self, **fields):
        self._check_state()
        schema = self.schema
        docboost = self._doc_boost(fields)

        # Sort the keys
        fieldnames = sorted([name for name in fields.keys()
                             if not name.startswith("_")])

        # Check if the caller gave us a bogus field
        for name in fieldnames:
            if name not in schema:
                raise UnknownFieldError("No field named %r in %s"
                                        % (name, schema))

        storedvalues = {}

        docnum = self.docnum
        for fieldname in fieldnames:
            value = fields.get(fieldname)
            if value is None:
                continue
            field = schema[fieldname]

            if field.indexed:
                fieldboost = self._field_boost(fields, fieldname, docboost)
                self.pool.add_content(docnum, fieldname, field, value,
                                      fieldboost)

            if field.separate_spelling():
                # This field requires spelling words to be added in a separate
                # step, instead of as part of indexing
                self.add_spell_words(fieldname, field.spellable_words(value))

            vformat = field.vector
            if vformat:
                wvs = vformat.word_values(value, field.analyzer, mode="index")
                vlist = sorted((w, weight, valuestring)
                               for w, _, weight, valuestring in wvs)
                self._add_vector(docnum, fieldname, vlist)

            if field.stored:
                # Caller can override the stored value by including a key
                # _stored_<fieldname>
                storedvalue = value
                storedname = "_stored_" + fieldname
                if storedname in fields:
                    storedvalue = fields[storedname]
                storedvalues[fieldname] = storedvalue

        self._added = True
        self.storedfields.append(storedvalues)
        self.docnum += 1

    def add_spell_words(self, fieldname, words):
        """Adds ``words`` to the set of spelling words kept for ``fieldname``
        in ``self.wordsets``, creating the set on first use.

        If ``has_sqlite`` is true and the field's in-memory set already holds
        more than 4096 words when this method is called, the set is replaced
        by a ``DiskSet`` built from it before the new words are added.
        """

        wordsets = self.wordsets
        try:
            wordset = wordsets[fieldname]
        except KeyError:
            wordset = wordsets[fieldname] = set()

        # Swap a large in-memory set for an on-disk one
        if has_sqlite and isinstance(wordset, set) and len(wordset) > 4096:
            wordset = DiskSet(wordset)
            wordsets[fieldname] = wordset

        for word in words:
            wordset.add(word)

        self._added = True

    def _add_wordsets(self):
        """Inserts every word accumulated in ``self.wordsets`` into
        ``self.dawg``, each keyed as the field name followed by the elements
        of the word.

        In-memory sets are inserted in sorted order; any other word set is
        iterated as-is. ``DiskSet`` instances are destroyed after use.
        """

        builder = self.dawg
        for fieldname in self.wordsets:
            wordset = self.wordsets[fieldname]
            prefix = (fieldname,)

            if isinstance(wordset, set):
                ordered = sorted(wordset)
            else:
                ordered = iter(wordset)

            for word in ordered:
                builder.insert(prefix + tuple(word))

            if isinstance(wordset, DiskSet):
                wordset.destroy()

    def _add_vector(self, docnum, fieldname, vlist):
        vpostwriter = self.vpostwriter
        offset = vpostwriter.start(self.schema[fieldname].vector)
        for text, weight, valuestring in vlist:
            #assert isinstance(text, text_type), "%r is not unicode" % text
            vpostwriter.write(text, weight, valuestring, 0)
        vpostwriter.finish(inlinelimit=0)
        self.vectorindex.add((docnum, fieldname), offset)

    def _add_vector_reader(self, docnum, fieldname, vreader):
        vpostwriter = self.vpostwriter
        offset = vpostwriter.start(self.schema[fieldname].vector)
        while vreader.is_active():
            # text, weight, valuestring, fieldlen
            vpostwriter.write(vreader.id(), vreader.weight(), vreader.value(),
                              0)
            vreader.next()
        vpostwriter.finish(inlinelimit=0)
        self.vectorindex.add((docnum, fieldname), offset)

    def _close_all(self):
        self.is_closed = True

        self.termswriter.close()
        self.storedfields.close()
        if not self.lengthfile.is_closed:
            self.lengthfile.close()
        if self.vectorindex:
            self.vectorindex.close()
        if self.vpostwriter:
            self.vpostwriter.close()

    def _getsegment(self):
        """Returns a new ``Segment`` built from this writer's name,
        generation and current document count, together with the field
        length totals, minimums and maximums reported by the pool.
        """

        pool = self.pool
        totals = pool.fieldlength_totals()
        mins = pool.fieldlength_mins()
        maxes = pool.fieldlength_maxes()
        return Segment(self.name, self.generation, self.docnum, totals, mins,
                       maxes)

    def commit(self, mergetype=None, optimize=False, merge=True):
        """Finishes writing, saves all additions and changes to disk, and
        releases the write lock (if this writer holds one).

        The merge behaviour can be selected in four ways::

            # Default policy (MERGE_SMALL):
            writer.commit()

            # OPTIMIZE policy:
            writer.commit(optimize=True)

            # NO_MERGE policy:
            writer.commit(merge=False)

            # A custom merge function:
            writer.commit(mergetype=my_merge_function)

        :param mergetype: a custom merge function, called with this writer
            and its segment list, that returns the list of segments to keep.
            When given, ``optimize`` and ``merge`` are ignored.
        :param optimize: if True, the ``OPTIMIZE`` merge policy is used (and
            ``merge`` is ignored).
        :param merge: if False, the ``NO_MERGE`` merge policy is used.
        """

        self._check_state()
        try:
            # Pick a built-in policy unless the caller supplied one
            if not mergetype:
                if optimize:
                    mergetype = OPTIMIZE
                elif merge:
                    mergetype = MERGE_SMALL
                else:
                    mergetype = NO_MERGE

            # The policy function may merge other segments into this
            # writer's pool; it returns the segments that remain.
            remaining = mergetype(self, self.segments)

            if not self._added:
                self.pool.cleanup()
            else:
                # Segment object describing what this writer produced
                newsegment = self._getsegment()

                # Have the pool flush its accumulated data into the lengths,
                # terms index and posting files.
                self.pool.finish(self.termswriter, self.docnum,
                                 self.lengthfile)

                # Spelling: fold the accumulated word sets into the word
                # graph, then write the graph out.
                if self.dawg:
                    self._add_wordsets()
                    self.dawg.write(self.dawgfile)

                remaining.append(newsegment)

            # Close all files, then write a new TOC with the new segment
            # list.
            self._close_all()

            from whoosh.filedb.fileindex import _write_toc, _clean_files

            _write_toc(self.storage, self.schema, self.indexname,
                       self.generation, self.segment_number, remaining)

            # Delete leftover files
            _clean_files(self.storage, self.indexname, self.generation,
                         remaining)

        finally:
            lock = self.writelock
            if lock:
                lock.release()

    def cancel(self):
        self._check_state()
        try:
            self.pool.cancel()
            self._close_all()
        finally:
            if self.writelock:
                self.writelock.release()