def __init__(self, storage, schema, segment, generation=None, codec=None):
    """Set up a reader over a single segment.

    :param storage: storage object; kept on the reader and handed to the
        default codec when no codec is supplied.
    :param schema: schema object; kept on the reader.
    :param segment: the segment to read. Its deletion flag and document
        counts are copied onto the reader at construction time.
    :param generation: stored as ``self._gen``.
    :param codec: object providing ``terms_reader``, ``lengths_reader`` and
        ``stored_fields_reader``. When ``None``, a ``StdCodec`` built on
        ``storage`` is used.
    """
    self.storage = storage
    self.schema = schema
    self.segment = segment
    self._gen = generation
    self.is_closed = False

    # Snapshot the statistics reported by the underlying segment.
    self._has_deletions = segment.has_deletions()
    self._dc = segment.doc_count()
    self._dc_all = segment.doc_count_all()

    # Use the segment's own ID when it offers one; otherwise generate one.
    if not hasattr(segment, "segment_id"):
        self.segid = Segment._random_id()
    else:
        self.segid = str(segment.segment_id())

    # Fall back to the standard codec if the caller did not pass one.
    if codec is None:
        from whoosh.codec.standard import StdCodec

        codec = StdCodec(storage)
    self._codec = codec

    # Per-segment "microreaders" supplied by the codec.
    self._terms = codec.terms_reader(segment)
    self._lengths = codec.lengths_reader(segment)
    self._stored = codec.stored_fields_reader(segment)

    # Opened lazily by self._open_vectors() and self._open_dawg().
    self._vectors = None
    self._dawg = None

    self.set_caching_policy()
def _new_task(self, firstjob):
    """Create, register and start a writing task for the next segment.

    Increments ``self.segment_number``, derives the new segment's name from
    it, appends the new task to ``self.tasks`` and starts it.

    :param firstjob: passed through to ``SegmentWritingTask`` as its last
        argument.
    :returns: the started ``SegmentWritingTask``.
    """
    index = self.index
    self.segment_number += 1
    name = Segment.basename(index.indexname, self.segment_number)

    worker = SegmentWritingTask(
        index.storage,
        index.indexname,
        name,
        self.kwargs,
        self.jobqueue,
        firstjob,
    )
    self.tasks.append(worker)
    worker.start()
    return worker
def __init__(self, storage, schema, segment, generation=None, codec=None):
    """Set up a reader over a single segment, which may be a compound file.

    :param storage: general storage object; kept on the reader and used as
        the source of segment files when the segment is not compound.
    :param schema: schema object; kept on the reader.
    :param segment: the segment to read. Its deletion flag and document
        counts are copied onto the reader at construction time.
    :param generation: stored as ``self._gen``.
    :param codec: object providing ``terms_reader``, ``lengths_reader`` and
        ``stored_fields_reader``. When ``None``, the result of
        ``whoosh.codec.default_codec()`` is used.
    """
    self.storage = storage
    self.schema = schema
    self.segment = segment
    self._gen = generation
    self.is_closed = False

    # Snapshot the statistics reported by the underlying segment.
    self._has_deletions = segment.has_deletions()
    self._dc = segment.doc_count()
    self._dc_all = segment.doc_count_all()

    # Use the segment's own ID when it offers one; otherwise generate one.
    if not hasattr(segment, "segment_id"):
        self.segid = Segment._random_id()
    else:
        self.segid = segment.segment_id()

    # self.files is the storage the segment files are loaded from. It
    # differs from the general storage (which will be used for caches) when
    # the segment lives in a compound file. An overlay is used rather than
    # the bare compound storage because, in rare circumstances, a segment
    # file may be added after the segment is written.
    self.files = (
        OverlayStorage(segment.open_compound_file(storage), self.storage)
        if segment.is_compound()
        else storage
    )

    # Fall back to the default codec if the caller did not pass one.
    if codec is None:
        from whoosh.codec import default_codec

        codec = default_codec()
    self._codec = codec

    # Per-segment "microreaders" supplied by the codec.
    files = self.files
    self._terms = codec.terms_reader(files, segment)
    self._lengths = codec.lengths_reader(files, segment)
    self._stored = codec.stored_fields_reader(files, segment)

    # Opened lazily by self._open_vectors() and self._open_dawg().
    self._vectors = None
    self._graph = None

    self.set_caching_policy()
def _getsegment(self):
    """Build a ``Segment`` from this writer's current state.

    :returns: a ``Segment`` constructed from the writer's name, generation
        and document number, plus the pool's field length totals and maxes.
    """
    pool = self.pool
    length_totals = pool.fieldlength_totals()
    length_maxes = pool.fieldlength_maxes()
    return Segment(
        self.name,
        self.generation,
        self.docnum,
        length_totals,
        length_maxes,
    )
def __init__(self, ix, poolclass=None, procs=0, blocklimit=128, timeout=0.0,
             delay=0.1, name=None, _l=True, **poolargs):
    """Open a writer for a new segment of the index ``ix``.

    Unless ``_l`` is false, the index's write lock is acquired first. If
    anything fails after the lock has been acquired, the lock is released
    before the exception propagates, so a failed construction does not leave
    the index write-locked.

    :param ix: the index to write to. Must provide ``lock()``,
        ``_read_toc()``, ``storage`` and ``indexname``.
    :param poolclass: class used to build the posting pool. When ``None``,
        ``MultiPool`` is used if ``procs > 1``, otherwise ``TempfilePool``.
    :param procs: forwarded to the pool constructor; also selects the
        default pool class.
    :param blocklimit: forwarded to the term ``FilePostingWriter`` and kept
        as ``self.blocklimit``.
    :param timeout: forwarded to ``try_for`` when acquiring the write lock.
    :param delay: forwarded to ``try_for`` when acquiring the write lock.
    :param name: name of the new segment. Defaults to
        ``Segment.basename(indexname, segment_number)``.
    :param _l: whether to acquire the ``"WRITELOCK"`` lock.
    :param poolargs: extra keyword arguments forwarded to the pool
        constructor.
    :raises LockError: if the write lock could not be acquired.
    """
    self.writelock = None
    if _l:
        self.writelock = ix.lock("WRITELOCK")
        if not try_for(self.writelock.acquire, timeout=timeout, delay=delay):
            raise LockError

    try:
        self.readlock = ix.lock("READLOCK")

        info = ix._read_toc()
        self.schema = info.schema
        self.segments = info.segments
        self.storage = ix.storage
        self.indexname = ix.indexname
        self.is_closed = False

        self.blocklimit = blocklimit
        self.segment_number = info.segment_counter + 1
        self.generation = info.generation + 1

        # Starting document number of each existing segment.
        self._doc_offsets = []
        base = 0
        for s in self.segments:
            self._doc_offsets.append(base)
            base += s.doc_count_all()

        self.name = name or Segment.basename(self.indexname,
                                             self.segment_number)
        self.docnum = 0
        self.fieldlength_totals = defaultdict(int)
        self._added = False
        self._unique_cache = {}

        # Create a temporary segment to use its .*_filename attributes
        segment = Segment(self.name, self.generation, 0, None, None)

        # Terms index
        tf = self.storage.create_file(segment.termsindex_filename)
        ti = TermIndexWriter(tf)
        # Term postings file
        pf = self.storage.create_file(segment.termposts_filename)
        pw = FilePostingWriter(pf, blocklimit=blocklimit)
        # Terms writer
        self.termswriter = TermsWriter(self.schema, ti, pw)

        if self.schema.has_vectored_fields():
            # Vector index
            vf = self.storage.create_file(segment.vectorindex_filename)
            self.vectorindex = TermVectorWriter(vf)

            # Vector posting file
            vpf = self.storage.create_file(segment.vectorposts_filename)
            self.vpostwriter = FilePostingWriter(vpf, stringids=True)
        else:
            self.vectorindex = None
            self.vpostwriter = None

        # Stored fields file
        sf = self.storage.create_file(segment.storedfields_filename)
        self.storedfields = StoredFieldWriter(sf, self.schema.stored_names())

        # Field lengths file
        self.lengthfile = self.storage.create_file(
            segment.fieldlengths_filename)

        # Create the pool
        if poolclass is None:
            if procs > 1:
                from whoosh.filedb.multiproc import MultiPool
                poolclass = MultiPool
            else:
                poolclass = TempfilePool
        self.pool = poolclass(self.schema, procs=procs, **poolargs)
    except BaseException:
        # Construction failed part-way: nobody will ever call this writer's
        # commit/cancel, so give the write lock back here before re-raising.
        if self.writelock is not None:
            self.writelock.release()
        raise
def segment(self):
    """Return an index.Segment object describing the segment being written.

    The segment is built from the writer's name, its ``max_doc`` value and a
    plain-``dict`` copy of its field length totals.
    """
    totals = dict(self.field_length_totals)
    return Segment(self.name, self.max_doc, totals)