def __init__(self, ix, poolclass=None, timeout=0.0, delay=0.1, _lk=True, limitmb=128, docbase=0, codec=None, **kwargs):
    """Create a writer for the given index.

    :param ix: the index object to write to.
    :param poolclass: accepted but unused here — NOTE(review): presumably
        consumed elsewhere or kept for signature compatibility; confirm.
    :param timeout: how long (seconds) to keep retrying the write lock
        before raising LockError.
    :param delay: pause (seconds) between write-lock acquisition attempts.
    :param _lk: private flag; when False the index write lock is not taken
        (caller is assumed to already hold it).
    :param limitmb: memory limit (MB) passed to the PostingPool.
    :param docbase: starting document number for this writer.
    :param codec: codec object used to create the per-document and field
        writers; defaults to StdCodec over the index's storage.
    """
    # Lock the index
    self.writelock = None
    if _lk:
        self.writelock = ix.lock("WRITELOCK")
        # try_for retries acquire() until timeout; on failure we never
        # held the lock, so raising here leaks nothing.
        if not try_for(self.writelock.acquire, timeout=timeout, delay=delay):
            raise LockError

    # Get info from the index
    self.storage = ix.storage
    self.indexname = ix.indexname
    info = ix._read_toc()
    # This writer will produce the next generation of the index.
    self.generation = info.generation + 1
    self.schema = info.schema
    self.segments = info.segments
    # Both the current doc number and the base start at docbase.
    self.docnum = self.docbase = docbase
    self._setup_doc_offsets()

    # Internals
    # Prefix temp files with the index name so concurrent indexes on the
    # same storage don't collide.
    poolprefix = "whoosh_%s_" % self.indexname
    self.pool = PostingPool(limitmb=limitmb, prefix=poolprefix)
    self.newsegment = Segment(self.indexname, 0)
    self.is_closed = False
    self._added = False

    # Set up writers
    if codec is None:
        from whoosh.codec.standard import StdCodec
        codec = StdCodec(self.storage)
    self.codec = codec
    self.perdocwriter = codec.per_document_writer(self.newsegment)
    self.fieldwriter = codec.field_writer(self.newsegment)
def __init__(self, ix, postlimit, blocklimit, name=None):
    """
    :param ix: the Index object in which to write the new segment.
    :param postlimit: the maximum size for a run in the posting pool.
    :param blocklimit: the maximum number of postings in a posting block.
    :param name: the name of the segment.
    """
    self.index = ix
    self.schema = ix.schema
    # Bind storage locally too; it is used repeatedly below.
    self.storage = storage = ix.storage
    # Let the index pick the next segment name unless one was given.
    self.name = name or ix._next_segment_name()
    self.max_doc = 0
    self.pool = postpool.PostingPool(postlimit)

    # Create mappings of field numbers to the position of that field in the
    # lists of scorable and stored fields. For example, consider a schema
    # with fields (A, B, C, D, E, F). If B, D, and E are scorable, then the
    # list of scorable fields is (B, D, E). The _scorable_to_pos dictionary
    # would then map B -> 0, D -> 1, and E -> 2.
    self._scorable_to_pos = dict(
        (fnum, i) for i, fnum in enumerate(self.schema.scorable_fields()))
    self._stored_to_pos = dict(
        (fnum, i) for i, fnum in enumerate(self.schema.stored_fields()))

    # Create a temporary segment object just so we can access its
    # *_filename attributes (so if we want to change the naming convention,
    # we only have to do it in one place).
    tempseg = Segment(self.name, 0, 0, None)
    self.termtable = create_terms(storage, tempseg)
    self.docslist = create_storedfields(storage, tempseg)
    # Only create a doc-lengths table if there is something scorable.
    self.doclengths = None
    if self.schema.scorable_fields():
        self.doclengths = create_doclengths(storage, tempseg,
                                            len(self._scorable_to_pos))

    postfile = storage.create_file(tempseg.posts_filename)
    self.postwriter = FilePostingWriter(postfile, blocklimit=blocklimit)

    self.vectortable = None
    if self.schema.has_vectored_fields():
        # Table associating document fields with (postoffset, postcount)
        self.vectortable = create_vectors(storage, tempseg)
        vpostfile = storage.create_file(tempseg.vectorposts_filename)
        # Vector postings are keyed by string ids rather than doc numbers.
        self.vpostwriter = FilePostingWriter(vpostfile, stringids=True)

    # Keep track of the total number of tokens (across all docs)
    # in each field
    self.field_length_totals = defaultdict(int)
def __init__(self, ix, postlimit, name=None):
    """
    :param ix: the Index object in which to write the new segment.
    :param postlimit: the maximum size for a run in the posting pool.
    :param name: the name of the segment.
    """
    self.index = ix
    self.schema = ix.schema
    self.storage = ix.storage
    # Let the index pick the next segment name unless one was given.
    self.name = name or ix._next_segment_name()
    self.max_doc = 0
    self.pool = postpool.PostingPool(limit=postlimit)

    # Map field numbers to their position in the list of scorable fields
    # (e.g. if fields B, D, E are scorable: B -> 0, D -> 1, E -> 2).
    self._scorable_to_pos = dict(
        (fnum, i) for i, fnum in enumerate(self.schema.scorable_fields()))
    # Same mapping, but over the stored fields.
    self._stored_to_pos = dict(
        (fnum, i) for i, fnum in enumerate(self.schema.stored_fields()))

    # Create a temporary segment object just so we can access
    # its *_filename attributes (so if we want to change the
    # naming convention, we only have to do it in one place).
    tempseg = Segment(self.name, 0, 0, None)

    # Open files for writing
    self.term_table = create_term_table(self.storage, tempseg)
    self.docs_table = create_docs_table(self.storage, tempseg)
    # One fixed-size little-endian record per document, with one
    # DOCLENGTH_TYPE slot per scorable field.
    recordformat = "<" + DOCLENGTH_TYPE * len(self._scorable_to_pos)
    self.doclength_table = self.storage.create_records(
        tempseg.doclen_filename, recordformat)
    self.vector_table = None
    if self.schema.has_vectored_fields():
        self.vector_table = create_vector_table(self.storage, tempseg)

    # Keep track of the total number of tokens (across all docs)
    # in each field
    self.field_length_totals = defaultdict(int)
def _getsegment(self):
    """Snapshot the writer's current state as a Segment object.

    Captures the segment name, generation, number of documents written,
    and the pool's per-field length totals and maxima.
    """
    length_totals = self.pool.fieldlength_totals()
    length_maxes = self.pool.fieldlength_maxes()
    return Segment(self.name, self.generation, self.docnum,
                   length_totals, length_maxes)
def __init__(self, ix, poolclass=None, procs=0, blocklimit=128, timeout=0.0, delay=0.1, name=None, _l=True, **poolargs):
    """Create a writer over the given index.

    :param ix: the index object to write to.
    :param poolclass: class used for the posting pool; defaults to
        MultiPool when procs > 1, else TempfilePool.
    :param procs: number of processes passed through to the pool.
    :param blocklimit: maximum postings per posting block.
    :param timeout: how long (seconds) to keep retrying the write lock
        before raising LockError.
    :param delay: pause (seconds) between write-lock acquisition attempts.
    :param name: name for the new segment; derived from the index name and
        segment number when not given.
    :param _l: private flag; when False the index write lock is not taken
        (caller is assumed to already hold it).
    :param poolargs: extra keyword arguments forwarded to the pool class.
    """
    self.writelock = None
    if _l:
        self.writelock = ix.lock("WRITELOCK")
        # try_for retries acquire() until timeout; failure means we never
        # held the lock, so raising here leaks nothing.
        if not try_for(self.writelock.acquire, timeout=timeout, delay=delay):
            raise LockError
    # Read lock is created here but not acquired in this method.
    self.readlock = ix.lock("READLOCK")

    info = ix._read_toc()
    self.schema = info.schema
    self.segments = info.segments
    self.storage = ix.storage
    self.indexname = ix.indexname
    self.is_closed = False

    self.blocklimit = blocklimit
    # This writer produces the next segment and the next generation.
    self.segment_number = info.segment_counter + 1
    self.generation = info.generation + 1

    # Cumulative document offsets: _doc_offsets[i] is the total number of
    # documents in all segments before segment i.
    self._doc_offsets = []
    base = 0
    for s in self.segments:
        self._doc_offsets.append(base)
        base += s.doc_count_all()

    self.name = name or Segment.basename(self.indexname, self.segment_number)
    self.docnum = 0
    self.fieldlength_totals = defaultdict(int)
    self._added = False
    self._unique_cache = {}

    # Create a temporary segment to use its .*_filename attributes
    segment = Segment(self.name, self.generation, 0, None, None)

    # Terms index
    tf = self.storage.create_file(segment.termsindex_filename)
    ti = TermIndexWriter(tf)
    # Term postings file
    pf = self.storage.create_file(segment.termposts_filename)
    pw = FilePostingWriter(pf, blocklimit=blocklimit)
    # Terms writer
    self.termswriter = TermsWriter(self.schema, ti, pw)

    if self.schema.has_vectored_fields():
        # Vector index
        vf = self.storage.create_file(segment.vectorindex_filename)
        self.vectorindex = TermVectorWriter(vf)

        # Vector posting file; keyed by string ids rather than doc numbers.
        vpf = self.storage.create_file(segment.vectorposts_filename)
        self.vpostwriter = FilePostingWriter(vpf, stringids=True)
    else:
        self.vectorindex = None
        self.vpostwriter = None

    # Stored fields file
    sf = self.storage.create_file(segment.storedfields_filename)
    self.storedfields = StoredFieldWriter(sf, self.schema.stored_names())

    # Field lengths file
    self.lengthfile = self.storage.create_file(segment.fieldlengths_filename)

    # Create the pool
    if poolclass is None:
        if procs > 1:
            # Import lazily so multiprocessing support is only loaded
            # when actually requested.
            from whoosh.filedb.multiproc import MultiPool
            poolclass = MultiPool
        else:
            poolclass = TempfilePool
    self.pool = poolclass(self.schema, procs=procs, **poolargs)
def segment(self):
    """Returns an index.Segment object for the segment being written."""
    # Copy the running per-field token totals into a plain dict so the
    # Segment gets a snapshot rather than the live defaultdict.
    totals_snapshot = dict(self.field_length_totals)
    return Segment(self.name, self.max_doc, totals_snapshot)