Beispiel #1
0
    def __init__(self, storage, schema, segment, generation=None, codec=None):
        """Set up a reader over a single segment.

        :param storage: storage object kept on the reader; also handed to
            the default codec when ``codec`` is not given.
        :param schema: schema kept on the reader.
        :param segment: segment to read. Its deletion flag and document
            counts are copied onto the reader at construction time.
        :param generation: stored as ``self._gen``.
        :param codec: object providing the terms, lengths and stored-fields
            readers. When ``None``, a ``StdCodec`` is built on ``storage``.
        """
        self.storage, self.schema, self.segment = storage, schema, segment
        self._gen = generation
        self.is_closed = False

        # Snapshot the segment's bookkeeping values on the reader.
        self._has_deletions = segment.has_deletions()
        self._dc = segment.doc_count()
        self._dc_all = segment.doc_count_all()

        # Use the segment's own ID (as a string) when it offers one;
        # otherwise fall back to a random ID.
        segment_has_id = hasattr(self.segment, "segment_id")
        self.segid = (str(self.segment.segment_id()) if segment_has_id
                      else Segment._random_id())

        # The default codec is imported lazily, only when the caller did
        # not supply one.
        if codec is None:
            from whoosh.codec.standard import StdCodec

            codec = StdCodec(self.storage)
        self._codec = codec

        # Per-segment readers obtained from the codec.
        target = self.segment
        self._terms = codec.terms_reader(target)
        self._lengths = codec.lengths_reader(target)
        self._stored = codec.stored_fields_reader(target)

        # These two are opened lazily, by self._open_vectors() and
        # self._open_dawg() respectively.
        self._vectors = None
        self._dawg = None

        self.set_caching_policy()
Beispiel #2
0
 def _new_task(self, firstjob):
     """Create, register and start a new segment-writing task.

     The segment counter is incremented first, and the new value is used
     to derive the task's segment name. The started task is appended to
     ``self.tasks`` and returned.

     :param firstjob: passed through to ``SegmentWritingTask`` as its last
         argument.
     """
     index = self.index

     # Bump the counter before naming, so the name uses the new number.
     self.segment_number += 1
     new_name = Segment.basename(index.indexname, self.segment_number)

     worker = SegmentWritingTask(
         index.storage,
         index.indexname,
         new_name,
         self.kwargs,
         self.jobqueue,
         firstjob,
     )
     self.tasks.append(worker)
     worker.start()
     return worker
Beispiel #3
0
    def __init__(self, storage, schema, segment, generation=None, codec=None):
        """Set up a reader over a single, possibly compound, segment.

        :param storage: general storage object kept on the reader; also the
            source of segment files when the segment is not compound.
        :param schema: schema kept on the reader.
        :param segment: segment to read. Its deletion flag and document
            counts are copied onto the reader at construction time.
        :param generation: stored as ``self._gen``.
        :param codec: object providing the terms, lengths and stored-fields
            readers. When ``None``, ``default_codec()`` is used.
        """
        self.storage, self.schema, self.segment = storage, schema, segment
        self._gen = generation
        self.is_closed = False

        # Snapshot the segment's bookkeeping values on the reader.
        self._has_deletions = segment.has_deletions()
        self._dc = segment.doc_count()
        self._dc_all = segment.doc_count_all()

        # Use the segment's own ID when it offers one; otherwise fall back
        # to a random ID.
        segment_has_id = hasattr(self.segment, "segment_id")
        self.segid = (self.segment.segment_id() if segment_has_id
                      else Segment._random_id())

        # self.files is the storage the segment's files are loaded from.
        # For a compound segment it differs from the general storage
        # (which is still used for caches).
        if not segment.is_compound():
            self.files = storage
        else:
            # Overlay the general storage behind the compound file rather
            # than using the compound storage alone: in rare circumstances
            # a segment file may be added after the segment is written.
            compound = segment.open_compound_file(storage)
            self.files = OverlayStorage(compound, self.storage)

        # The default codec is imported lazily, only when the caller did
        # not supply one.
        if codec is None:
            from whoosh.codec import default_codec
            codec = default_codec()
        self._codec = codec

        # Per-segment readers obtained from the codec.
        files, target = self.files, self.segment
        self._terms = codec.terms_reader(files, target)
        self._lengths = codec.lengths_reader(files, target)
        self._stored = codec.stored_fields_reader(files, target)

        # These two are opened lazily, by self._open_vectors() and
        # self._open_dawg() respectively.
        self._vectors = None
        self._graph = None

        self.set_caching_policy()
Beispiel #4
0
 def _getsegment(self):
     """Build a ``Segment`` from this object's current state.

     The segment is constructed from the name, generation and document
     number held on ``self``, plus the field-length totals and maxes
     reported by ``self.pool``.
     """
     seg_name, seg_generation, doc_total = (self.name, self.generation,
                                            self.docnum)
     length_totals = self.pool.fieldlength_totals()
     length_maxes = self.pool.fieldlength_maxes()
     return Segment(seg_name, seg_generation, doc_total,
                    length_totals, length_maxes)
Beispiel #5
0
    def __init__(self, ix, poolclass=None, procs=0, blocklimit=128,
                 timeout=0.0, delay=0.1, name=None, _l=True, **poolargs):
        """Open a writer for a new segment of the index ``ix``.

        :param ix: index to write to; supplies the locks, the TOC, the
            storage and the index name.
        :param poolclass: class used to build the pool. When ``None``,
            ``MultiPool`` is used if ``procs > 1``, otherwise
            ``TempfilePool``.
        :param procs: forwarded to the pool; also selects the default pool
            class.
        :param blocklimit: forwarded to the term posting writer and kept as
            ``self.blocklimit``.
        :param timeout: forwarded to ``try_for`` when acquiring the write
            lock.
        :param delay: forwarded to ``try_for`` when acquiring the write
            lock.
        :param name: segment name. Defaults to
            ``Segment.basename(indexname, segment_number)``.
        :param _l: when false, no write lock is taken.
        :param poolargs: extra keyword arguments forwarded to the pool
            class.
        :raises LockError: if the write lock could not be acquired.
        """

        self.writelock = None
        if _l:
            self.writelock = ix.lock("WRITELOCK")
            if not try_for(self.writelock.acquire, timeout=timeout, delay=delay):
                raise LockError

        # Everything below runs while the write lock is held. If setup
        # fails, the half-built writer never reaches the caller, so nobody
        # could release the lock afterwards; release it here and re-raise.
        try:
            self.readlock = ix.lock("READLOCK")

            info = ix._read_toc()
            self.schema = info.schema
            self.segments = info.segments
            self.storage = ix.storage
            self.indexname = ix.indexname
            self.is_closed = False

            self.blocklimit = blocklimit
            self.segment_number = info.segment_counter + 1
            self.generation = info.generation + 1

            # Starting document offset of each existing segment, computed
            # as the running sum of the segments' total document counts.
            self._doc_offsets = []
            base = 0
            for s in self.segments:
                self._doc_offsets.append(base)
                base += s.doc_count_all()

            self.name = name or Segment.basename(self.indexname,
                                                 self.segment_number)
            self.docnum = 0
            self.fieldlength_totals = defaultdict(int)
            self._added = False
            self._unique_cache = {}

            # Create a temporary segment to use its .*_filename attributes
            segment = Segment(self.name, self.generation, 0, None, None)

            # Terms index
            tf = self.storage.create_file(segment.termsindex_filename)
            ti = TermIndexWriter(tf)
            # Term postings file
            pf = self.storage.create_file(segment.termposts_filename)
            pw = FilePostingWriter(pf, blocklimit=blocklimit)
            # Terms writer
            self.termswriter = TermsWriter(self.schema, ti, pw)

            if self.schema.has_vectored_fields():
                # Vector index
                vf = self.storage.create_file(segment.vectorindex_filename)
                self.vectorindex = TermVectorWriter(vf)

                # Vector posting file
                vpf = self.storage.create_file(segment.vectorposts_filename)
                self.vpostwriter = FilePostingWriter(vpf, stringids=True)
            else:
                self.vectorindex = None
                self.vpostwriter = None

            # Stored fields file
            sf = self.storage.create_file(segment.storedfields_filename)
            self.storedfields = StoredFieldWriter(sf,
                                                  self.schema.stored_names())

            # Field lengths file
            self.lengthfile = self.storage.create_file(
                segment.fieldlengths_filename)

            # Create the pool
            if poolclass is None:
                if procs > 1:
                    from whoosh.filedb.multiproc import MultiPool
                    poolclass = MultiPool
                else:
                    poolclass = TempfilePool
            self.pool = poolclass(self.schema, procs=procs, **poolargs)
        except BaseException:
            # BaseException (not just Exception) so the lock is also
            # released on KeyboardInterrupt/SystemExit; always re-raised.
            if self.writelock is not None:
                self.writelock.release()
            raise
Beispiel #6
0
 def segment(self):
     """Return an ``index.Segment`` object for the segment being written.

     The segment is built from ``self.name``, ``self.max_doc`` and a new
     ``dict`` made from ``self.field_length_totals``.
     """
     seg_name = self.name
     doc_limit = self.max_doc
     totals_copy = dict(self.field_length_totals)
     return Segment(seg_name, doc_limit, totals_copy)