Example #1
0
def test_dawg():
    from whoosh.support.dawg import DawgBuilder

    with TempStorage() as st:
        df = st.create_file("test.dawg")

        dw = DawgBuilder(field_root=True)
        dw.insert(["test"] + list("special"))
        dw.insert(["test"] + list("specials"))
        dw.write(df)

        assert_equal(list(dawg.flatten(dw.root.edge("test"))),
                     ["special", "specials"])
Example #2
0
def add_spelling(ix, fieldnames, commit=True):
    """Adds spelling files to an existing index that was created without
    them, and modifies the schema so the given fields have the ``spelling``
    attribute. Only works on filedb indexes.
    
    >>> ix = index.open_dir("testindex")
    >>> add_spelling(ix, ["content", "tags"])
    
    :param ix: a :class:`whoosh.filedb.fileindex.FileIndex` object.
    :param fieldnames: a list of field names to create word graphs for.
    :param force: if True, overwrites existing word graph files. This is only
        useful for debugging.
    """

    from whoosh.filedb.filereading import SegmentReader

    writer = ix.writer()
    storage = writer.storage
    schema = writer.schema
    segments = writer.segments

    for segment in segments:
        filename = segment.dawg_filename
        r = SegmentReader(storage, schema, segment)
        f = storage.create_file(filename)
        dawg = DawgBuilder(field_root=True)
        for fieldname in fieldnames:
            ft = (fieldname, )
            for word in r.lexicon(fieldname):
                dawg.insert(ft + tuple(word))
        dawg.write(f)

    for fieldname in fieldnames:
        schema[fieldname].spelling = True

    if commit:
        writer.commit(merge=False)
Example #3
0
    def __init__(self,
                 ix,
                 poolclass=None,
                 procs=0,
                 blocklimit=128,
                 timeout=0.0,
                 delay=0.1,
                 name=None,
                 _lk=True,
                 **poolargs):

        self.writelock = None
        if _lk:
            self.writelock = ix.lock("WRITELOCK")
            if not try_for(
                    self.writelock.acquire, timeout=timeout, delay=delay):
                raise LockError

        info = ix._read_toc()
        self.schema = info.schema
        self.segments = info.segments
        self.storage = storage = ix.storage
        self.indexname = ix.indexname
        self.is_closed = False

        self.blocklimit = blocklimit
        self.segment_number = info.segment_counter + 1
        self.generation = info.generation + 1

        self._doc_offsets = []
        base = 0
        for s in self.segments:
            self._doc_offsets.append(base)
            base += s.doc_count_all()

        self.name = name or Segment.basename(self.indexname,
                                             self.segment_number)
        self.docnum = 0
        self.fieldlength_totals = defaultdict(int)
        self._added = False
        self._unique_cache = {}

        # Create a temporary segment to use its .*_filename attributes
        segment = Segment(self.name, self.generation, 0, None, None, None)

        # Spelling
        self.wordsets = {}
        self.dawg = None
        if any(field.spelling for field in self.schema):
            self.dawgfile = storage.create_file(segment.dawg_filename)
            self.dawg = DawgBuilder(field_root=True)

        # Terms index
        tf = storage.create_file(segment.termsindex_filename)
        ti = TermIndexWriter(tf)
        # Term postings file
        pf = storage.create_file(segment.termposts_filename)
        pw = FilePostingWriter(pf, blocklimit=blocklimit)
        # Terms writer
        self.termswriter = TermsWriter(self.schema, ti, pw, self.dawg)

        if self.schema.has_vectored_fields():
            # Vector index
            vf = storage.create_file(segment.vectorindex_filename)
            self.vectorindex = TermVectorWriter(vf)

            # Vector posting file
            vpf = storage.create_file(segment.vectorposts_filename)
            self.vpostwriter = FilePostingWriter(vpf, stringids=True)
        else:
            self.vectorindex = None
            self.vpostwriter = None

        # Stored fields file
        sf = storage.create_file(segment.storedfields_filename)
        self.storedfields = StoredFieldWriter(sf, self.schema.stored_names())

        # Field lengths file
        self.lengthfile = storage.create_file(segment.fieldlengths_filename)

        # Create the pool
        if poolclass is None:
            if procs > 1:
                from whoosh.filedb.multiproc import MultiPool
                poolclass = MultiPool
            else:
                poolclass = TempfilePool
        self.pool = poolclass(self.schema, procs=procs, **poolargs)
Example #4
0
 def _make_dawg_files(self):
     dawgfile = self.segment.create_file(self.storage, StdCodec.DAWG_EXT)
     self.dawg = DawgBuilder(dawgfile, field_root=True)