def test_huge_postfile():
    with TempStorage("hugeindex") as st:
        pf = st.create_file("test.pst")

        gb5 = 5 * 1024 * 1024 * 1024
        pf.seek(gb5)
        pf.write("\x00\x00\x00\x00")
        assert_equal(pf.tell(), gb5 + 4)

        fpw = FilePostingWriter(pf)
        format = formats.Frequency(None)
        offset = fpw.start(format)
        for i in xrange(10):
            fpw.write(i, float(i), struct.pack("!I", i), 10)
        posttotal = fpw.finish()
        assert_equal(posttotal, 10)
        fpw.close()

        pf = st.open_file("test.pst")
        pfr = FilePostingReader(pf, offset, format)
        i = 0
        while pfr.is_active():
            assert_equal(pfr.id(), i)
            assert_equal(pfr.weight(), float(i))
            assert_equal(pfr.value(), struct.pack("!I", i))
            pfr.next()
            i += 1
        pf.close()
def roundtrip(postings, format, astype):
    with TempStorage("roundtrip") as st:
        postfile = st.create_file(astype)
        getweight = format.decoder("weight")
        fpw = FilePostingWriter(postfile, blocklimit=8)
        fpw.start(format)
        for id, value in postings:
            v = format.encode(value)
            fpw.write(id, getweight(v), v, 0)
        fpw.finish()
        fpw.close()

        postfile = st.open_file(astype)
        fpr = FilePostingReader(postfile, 0, format)
        readback = list(fpr.items_as(astype))
        postfile.close()
        return readback
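# A minimal sketch of how the roundtrip() helper above might be invoked;
# the sample postings are illustrative, but the Frequency format and the
# "frequency" item type mirror test_readwrite below.

def example_roundtrip_usage():
    postings = [(1, 2), (5, 1), (12, 7)]  # (docnum, frequency) pairs
    # Writing the postings out and reading them back as "frequency" items
    # should reproduce the input list exactly.
    assert roundtrip(postings, Frequency(), "frequency") == postings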
def test_readwrite():
    with TempStorage("readwrite") as st:
        format = Frequency()
        postings = make_postings()

        postfile = st.create_file("readwrite")
        fpw = FilePostingWriter(postfile, blocklimit=8)
        fpw.start(format)
        for id, freq in postings:
            fpw.write(id, float(freq), format.encode(freq), 0)
        fpw.finish()
        fpw.close()

        postfile = st.open_file("readwrite")
        fpr = FilePostingReader(postfile, 0, format)
        assert_equal(postings, list(fpr.items_as("frequency")))
        postfile.close()
def roundtrip(self, postings, format, astype):
    postfile = self.make_file(astype)
    readback = None
    try:
        fpw = FilePostingWriter(postfile, blocklimit=8)
        fpw.start(format)
        for id, value in postings:
            fpw.write(id, format.encode(value))
        fpw.close()

        postfile = self.open_file(astype)
        fpr = FilePostingReader(postfile, 0, format)
        readback = list(fpr.all_as(astype))
        fpr.close()
    finally:
        self.delete_file(astype)
    return readback
def test_readwrite(self):
    format = Frequency(None)
    postings = self.make_postings()

    postfile = self.make_file("readwrite")
    try:
        fpw = FilePostingWriter(postfile, blocklimit=8)
        fpw.start(format)
        for id, freq in postings:
            fpw.write(id, format.encode(freq))
        fpw.close()

        postfile = self.open_file("readwrite")
        fpr = FilePostingReader(postfile, 0, format)
        #self.assertEqual(postings, list(fpr.items_as("frequency")))
        fpr.close()
    finally:
        self.delete_file("readwrite")
class SegmentWriter(IndexWriter):
    def __init__(self, ix, poolclass=None, procs=0, blocklimit=128,
                 timeout=0.0, delay=0.1, name=None, _l=True, **poolargs):
        self.writelock = None
        if _l:
            self.writelock = ix.lock("WRITELOCK")
            if not try_for(self.writelock.acquire, timeout=timeout,
                           delay=delay):
                raise LockError
        self.readlock = ix.lock("READLOCK")

        info = ix._read_toc()
        self.schema = info.schema
        self.segments = info.segments
        self.storage = ix.storage
        self.indexname = ix.indexname
        self.is_closed = False

        self.blocklimit = blocklimit
        self.segment_number = info.segment_counter + 1
        self.generation = info.generation + 1

        self._doc_offsets = []
        base = 0
        for s in self.segments:
            self._doc_offsets.append(base)
            base += s.doc_count_all()

        self.name = name or Segment.basename(self.indexname,
                                             self.segment_number)
        self.docnum = 0
        self.fieldlength_totals = defaultdict(int)
        self._added = False
        self._unique_cache = {}

        # Create a temporary segment to use its .*_filename attributes
        segment = Segment(self.name, self.generation, 0, None, None)

        # Terms index
        tf = self.storage.create_file(segment.termsindex_filename)
        ti = TermIndexWriter(tf)
        # Term postings file
        pf = self.storage.create_file(segment.termposts_filename)
        pw = FilePostingWriter(pf, blocklimit=blocklimit)
        # Terms writer
        self.termswriter = TermsWriter(self.schema, ti, pw)

        if self.schema.has_vectored_fields():
            # Vector index
            vf = self.storage.create_file(segment.vectorindex_filename)
            self.vectorindex = TermVectorWriter(vf)

            # Vector posting file
            vpf = self.storage.create_file(segment.vectorposts_filename)
            self.vpostwriter = FilePostingWriter(vpf, stringids=True)
        else:
            self.vectorindex = None
            self.vpostwriter = None

        # Stored fields file
        sf = self.storage.create_file(segment.storedfields_filename)
        self.storedfields = StoredFieldWriter(sf, self.schema.stored_names())

        # Field lengths file
        self.lengthfile = self.storage.create_file(segment.fieldlengths_filename)

        # Create the pool
        if poolclass is None:
            if procs > 1:
                from whoosh.filedb.multiproc import MultiPool
                poolclass = MultiPool
            else:
                poolclass = TempfilePool
        self.pool = poolclass(self.schema, procs=procs, **poolargs)

    def _check_state(self):
        if self.is_closed:
            raise IndexingError("This writer is closed")

    def add_field(self, fieldname, fieldspec):
        self._check_state()
        if self._added:
            raise Exception("Can't modify schema after adding data to writer")
        super(SegmentWriter, self).add_field(fieldname, fieldspec)

    def remove_field(self, fieldname):
        self._check_state()
        if self._added:
            raise Exception("Can't modify schema after adding data to writer")
        super(SegmentWriter, self).remove_field(fieldname)

    def _document_segment(self, docnum):
        # Returns the index.Segment object containing the given document
        # number.
        offsets = self._doc_offsets
        if len(offsets) == 1:
            return 0
        return bisect_right(offsets, docnum) - 1

    def _segment_and_docnum(self, docnum):
        # Returns an (index.Segment, segment_docnum) pair for the segment
        # containing the given document number.
        segmentnum = self._document_segment(docnum)
        offset = self._doc_offsets[segmentnum]
        segment = self.segments[segmentnum]
        return segment, docnum - offset

    def has_deletions(self):
        """
        :returns: True if this index has documents that are marked deleted
            but haven't been optimized out of the index yet.
        """
        return any(s.has_deletions() for s in self.segments)

    def delete_document(self, docnum, delete=True):
        self._check_state()
        if docnum >= sum(seg.doccount for seg in self.segments):
            raise IndexingError("No document ID %r in this index" % docnum)
        segment, segdocnum = self._segment_and_docnum(docnum)
        segment.delete_document(segdocnum, delete=delete)

    def deleted_count(self):
        """
        :returns: the total number of deleted documents in the index.
        """
        return sum(s.deleted_count() for s in self.segments)

    def is_deleted(self, docnum):
        segment, segdocnum = self._segment_and_docnum(docnum)
        return segment.is_deleted(segdocnum)

    def reader(self, reuse=None):
        self._check_state()
        from whoosh.filedb.fileindex import FileIndex
        return FileIndex._reader(self.storage, self.schema, self.segments,
                                 self.generation, reuse=reuse)

    def add_reader(self, reader):
        self._check_state()
        startdoc = self.docnum

        has_deletions = reader.has_deletions()
        if has_deletions:
            docmap = {}

        fieldnames = set(self.schema.names())

        # Add stored documents, vectors, and field lengths
        for docnum in reader.all_doc_ids():
            if (not has_deletions) or (not reader.is_deleted(docnum)):
                d = dict(item for item
                         in reader.stored_fields(docnum).iteritems()
                         if item[0] in fieldnames)
                # We have to append a dictionary for every document, even if
                # it's empty.
                self.storedfields.append(d)

                if has_deletions:
                    docmap[docnum] = self.docnum

                for fieldname, length in reader.doc_field_lengths(docnum):
                    if fieldname in fieldnames:
                        self.pool.add_field_length(self.docnum, fieldname,
                                                   length)

                for fieldname in reader.schema.vector_names():
                    if (fieldname in fieldnames
                        and reader.has_vector(docnum, fieldname)):
                        vpostreader = reader.vector(docnum, fieldname)
                        self._add_vector_reader(self.docnum, fieldname,
                                                vpostreader)

                self.docnum += 1

        for fieldname, text, _, _ in reader:
            if fieldname in fieldnames:
                postreader = reader.postings(fieldname, text)
                while postreader.is_active():
                    docnum = postreader.id()
                    valuestring = postreader.value()
                    if has_deletions:
                        newdoc = docmap[docnum]
                    else:
                        newdoc = startdoc + docnum

                    self.pool.add_posting(fieldname, text, newdoc,
                                          postreader.weight(), valuestring)
                    postreader.next()

        self._added = True

    def add_document(self, **fields):
        #from whoosh.util import now
        #t = now()
        self._check_state()
        schema = self.schema

        # Sort the keys
        fieldnames = sorted([name for name in fields.keys()
                             if not name.startswith("_")])

        # Check if the caller gave us a bogus field
        for name in fieldnames:
            if name not in schema:
                raise UnknownFieldError("No field named %r in %s"
                                        % (name, schema))

        storedvalues = {}

        docnum = self.docnum
        for fieldname in fieldnames:
            value = fields.get(fieldname)
            if value is not None:
                field = schema[fieldname]

                if field.indexed:
                    self.pool.add_content(docnum, fieldname, field, value)

                vformat = field.vector
                if vformat:
                    vlist = sorted((w, weight, valuestring)
                                   for w, freq, weight, valuestring
                                   in vformat.word_values(value, mode="index"))
                    self._add_vector(docnum, fieldname, vlist)

                if field.stored:
                    # Caller can override the stored value by including a key
                    # _stored_<fieldname>
                    storedvalue = value
                    storedname = "_stored_" + fieldname
                    if storedname in fields:
                        storedvalue = fields[storedname]
                    storedvalues[fieldname] = storedvalue

        self._added = True
        self.storedfields.append(storedvalues)
        self.docnum += 1
        #print "%f" % (now() - t)

    #def update_document(self, **fields):

    def _add_vector(self, docnum, fieldname, vlist):
        vpostwriter = self.vpostwriter
        offset = vpostwriter.start(self.schema[fieldname].vector)
        for text, weight, valuestring in vlist:
            assert isinstance(text, unicode), "%r is not unicode" % text
            vpostwriter.write(text, weight, valuestring, 0)
        vpostwriter.finish()

        self.vectorindex.add((docnum, fieldname), offset)

    def _add_vector_reader(self, docnum, fieldname, vreader):
        vpostwriter = self.vpostwriter
        offset = vpostwriter.start(self.schema[fieldname].vector)
        while vreader.is_active():
            # text, weight, valuestring, fieldlen
            vpostwriter.write(vreader.id(), vreader.weight(),
                              vreader.value(), 0)
            vreader.next()
        vpostwriter.finish()

        self.vectorindex.add((docnum, fieldname), offset)

    def _close_all(self):
        self.is_closed = True
        self.termswriter.close()
        self.storedfields.close()
        if not self.lengthfile.is_closed:
            self.lengthfile.close()
        if self.vectorindex:
            self.vectorindex.close()
        if self.vpostwriter:
            self.vpostwriter.close()

    def _getsegment(self):
        return Segment(self.name, self.generation, self.docnum,
                       self.pool.fieldlength_totals(),
                       self.pool.fieldlength_maxes())

    def commit(self, mergetype=None, optimize=False, merge=True):
        """Finishes writing and saves all additions and changes to disk.

        There are four possible ways to use this method::

            # Merge small segments but leave large segments, trying to
            # balance fast commits with fast searching:
            writer.commit()

            # Merge all segments into a single segment:
            writer.commit(optimize=True)

            # Don't merge any existing segments:
            writer.commit(merge=False)

            # Use a custom merge function
            writer.commit(mergetype=my_merge_function)

        :param mergetype: a custom merge function taking a Writer object and
            segment list as arguments, and returning a new segment list. If
            you supply a ``mergetype`` function, the values of the
            ``optimize`` and ``merge`` arguments are ignored.
        :param optimize: if True, all existing segments are merged with the
            documents you've added to this writer (and the value of the
            ``merge`` argument is ignored).
        :param merge: if False, do not merge small segments.
        """
        self._check_state()
        try:
            if mergetype:
                pass
            elif optimize:
                mergetype = OPTIMIZE
            elif not merge:
                mergetype = NO_MERGE
            else:
                mergetype = MERGE_SMALL

            # Call the merge policy function. The policy may choose to merge
            # other segments into this writer's pool
            new_segments = mergetype(self, self.segments)

            # Tell the pool we're finished adding information, it should add
            # its accumulated data to the lengths, terms index, and posting
            # files.
            if self._added:
                self.pool.finish(self.termswriter, self.docnum,
                                 self.lengthfile)

                # Create a Segment object for the segment created by this
                # writer and add it to the list of remaining segments
                # returned by the merge policy function
                new_segments.append(self._getsegment())
            else:
                self.pool.cleanup()

            # Close all files, write a new TOC with the new segment list,
            # and release the lock.
            self._close_all()

            from whoosh.filedb.fileindex import _write_toc, _clean_files
            _write_toc(self.storage, self.schema, self.indexname,
                       self.generation, self.segment_number, new_segments)

            self.readlock.acquire(True)
            try:
                _clean_files(self.storage, self.indexname, self.generation,
                             new_segments)
            finally:
                self.readlock.release()
        finally:
            if self.writelock:
                self.writelock.release()

    def cancel(self):
        self._check_state()
        try:
            self.pool.cancel()
            self._close_all()
        finally:
            if self.writelock:
                self.writelock.release()
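# SegmentWriter is not instantiated directly; ix.writer() constructs one for
# a filedb index. A minimal end-to-end sketch of the write path implemented
# above (the schema, field names, and "indexdir" path are illustrative):

import os

from whoosh import index
from whoosh.fields import ID, TEXT, Schema


def example_segment_writer_usage():
    schema = Schema(path=ID(stored=True), content=TEXT)
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
    ix = index.create_in("indexdir", schema)

    writer = ix.writer()  # a SegmentWriter; acquires the WRITELOCK
    writer.add_document(path=u"/a", content=u"alfa bravo charlie")
    writer.commit()  # MERGE_SMALL policy by default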
class SegmentWriter(object):
    """Do not instantiate this object directly; it is created by the
    IndexWriter object.

    Handles the actual writing of new documents to the index: writes stored
    fields, handles the posting pool, and writes out the term index.
    """

    def __init__(self, ix, postlimit, blocklimit, name=None):
        """
        :param ix: the Index object in which to write the new segment.
        :param postlimit: the maximum size for a run in the posting pool.
        :param blocklimit: the maximum number of postings in a posting block.
        :param name: the name of the segment.
        """

        self.index = ix
        self.schema = ix.schema
        self.storage = storage = ix.storage
        self.name = name or ix._next_segment_name()

        self.max_doc = 0

        self.pool = postpool.PostingPool(postlimit)

        # Create mappings of field numbers to the position of that field in
        # the lists of scorable and stored fields. For example, consider a
        # schema with fields (A, B, C, D, E, F). If B, D, and E are scorable,
        # then the list of scorable fields is (B, D, E). The _scorable_to_pos
        # dictionary would then map B -> 0, D -> 1, and E -> 2.
        self._scorable_to_pos = dict((fnum, i) for i, fnum
                                     in enumerate(self.schema.scorable_fields()))
        self._stored_to_pos = dict((fnum, i) for i, fnum
                                   in enumerate(self.schema.stored_fields()))

        # Create a temporary segment object just so we can access its
        # *_filename attributes (so if we want to change the naming
        # convention, we only have to do it in one place).
        tempseg = Segment(self.name, 0, 0, None)

        self.termtable = create_terms(storage, tempseg)
        self.docslist = create_storedfields(storage, tempseg)

        self.doclengths = None
        if self.schema.scorable_fields():
            self.doclengths = create_doclengths(storage, tempseg,
                                                len(self._scorable_to_pos))

        postfile = storage.create_file(tempseg.posts_filename)
        self.postwriter = FilePostingWriter(postfile, blocklimit=blocklimit)

        self.vectortable = None
        if self.schema.has_vectored_fields():
            # Table associating document fields with (postoffset, postcount)
            self.vectortable = create_vectors(storage, tempseg)
            vpostfile = storage.create_file(tempseg.vectorposts_filename)
            self.vpostwriter = FilePostingWriter(vpostfile, stringids=True)

        # Keep track of the total number of tokens (across all docs)
        # in each field
        self.field_length_totals = defaultdict(int)

    def segment(self):
        """Returns an index.Segment object for the segment being written."""
        return Segment(self.name, self.max_doc,
                       dict(self.field_length_totals))

    def _close_all(self):
        self.termtable.close()
        self.postwriter.close()
        self.docslist.close()
        if self.doclengths:
            self.doclengths.close()
        if self.vectortable:
            self.vectortable.close()
            self.vpostwriter.close()

    def close(self):
        """Finishes writing the segment (flushes the posting pool out to
        disk) and closes all open files.
        """
        self._flush_pool()
        self._close_all()

    def add_reader(self, reader):
        """Adds the contents of another segment to this one. This is used to
        merge existing segments into the new one before deleting them.

        :param ix: The index.Index object containing the segment to merge.
        :param segment: The index.Segment object to merge into this one.
        """

        start_doc = self.max_doc
        has_deletions = reader.has_deletions()

        if has_deletions:
            doc_map = {}

        schema = self.schema
        name2num = schema.name_to_number
        stored_to_pos = self._stored_to_pos

        def storedkeyhelper(item):
            return stored_to_pos[name2num(item[0])]

        # Merge document info
        docnum = 0
        vectored_fieldnums = schema.vectored_fields()
        for docnum in xrange(reader.doc_count_all()):
            if not reader.is_deleted(docnum):
                # Copy the stored fields and field lengths from the reader
                # into this segment
                storeditems = reader.stored_fields(docnum).items()
                storedvalues = [v for k, v
                                in sorted(storeditems, key=storedkeyhelper)]
                self._add_doc_data(storedvalues,
                                   reader.doc_field_lengths(docnum))

                if has_deletions:
                    doc_map[docnum] = self.max_doc

                # Copy term vectors
                for fieldnum in vectored_fieldnums:
                    if reader.has_vector(docnum, fieldnum):
                        self._add_vector(fieldnum,
                                         reader.vector(docnum,
                                                       fieldnum).items())

                self.max_doc += 1

        # Add field length totals
        for fieldnum in schema.scorable_fields():
            self.field_length_totals[fieldnum] += reader.field_length(fieldnum)

        # Merge terms
        current_fieldnum = None
        decoder = None
        for fieldnum, text, _, _ in reader:
            if fieldnum != current_fieldnum:
                current_fieldnum = fieldnum
                decoder = schema[fieldnum].format.decode_frequency

            postreader = reader.postings(fieldnum, text)
            for docnum, valuestring in postreader.all_items():
                if has_deletions:
                    newdoc = doc_map[docnum]
                else:
                    newdoc = start_doc + docnum

                # TODO: Is there a faster way to do this?
                freq = decoder(valuestring)
                self.pool.add_posting(fieldnum, text, newdoc, freq,
                                      valuestring)

    def add_document(self, fields):
        scorable_to_pos = self._scorable_to_pos
        stored_to_pos = self._stored_to_pos
        schema = self.schema

        # Sort the keys by their order in the schema
        fieldnames = [name for name in fields.keys()
                      if not name.startswith("_")]
        fieldnames.sort(key=schema.name_to_number)

        # Check if the caller gave us a bogus field
        for name in fieldnames:
            if name not in schema:
                raise UnknownFieldError("There is no field named %r" % name)

        # Create an array of counters to record the length of each field
        fieldlengths = array(DOCLENGTH_TYPE, [0] * len(scorable_to_pos))

        # Create a list (initially a list of Nones) in which we will put
        # stored field values as we get them. Why isn't this an empty list
        # that we append to? Because if the caller doesn't supply a value
        # for a stored field, we don't want to have a list in the wrong
        # order/of the wrong length.
        storedvalues = [None] * len(stored_to_pos)

        for name in fieldnames:
            value = fields.get(name)
            if value:
                fieldnum = schema.name_to_number(name)
                field = schema.field_by_number(fieldnum)

                # If the field is indexed, add the words in the value to the
                # index
                if field.indexed:
                    # Count of all terms in the value
                    count = 0
                    # Count of UNIQUE terms in the value
                    unique = 0

                    # TODO: Method for adding progressive field values, ie
                    # setting start_pos/start_char?
                    for w, freq, valuestring in field.index(value):
                        #assert w != ""
                        self.pool.add_posting(fieldnum, w, self.max_doc,
                                              freq, valuestring)
                        count += freq
                        unique += 1

                    if field.scorable:
                        # Add the term count to the total for this field
                        self.field_length_totals[fieldnum] += count
                        # Set the term count to the per-document field length
                        pos = scorable_to_pos[fieldnum]
                        fieldlengths[pos] = min(count, DOCLENGTH_LIMIT)

                # If the field is vectored, add the words in the value to
                # the vector table
                vector = field.vector
                if vector:
                    # TODO: Method for adding progressive field values, ie
                    # setting start_pos/start_char?
                    vlist = sorted((w, valuestring)
                                   for w, freq, valuestring
                                   in vector.word_values(value, mode="index"))
                    self._add_vector(fieldnum, vlist)

                # If the field is stored, put the value in storedvalues
                if field.stored:
                    # Caller can override the stored value by including a
                    # key _stored_<fieldname>
                    storedname = "_stored_" + name
                    if storedname in fields:
                        stored_value = fields[storedname]
                    else:
                        stored_value = value

                    storedvalues[stored_to_pos[fieldnum]] = stored_value

        self._add_doc_data(storedvalues, fieldlengths)
        self.max_doc += 1

    def _add_terms(self):
        pass

    def _add_doc_data(self, storedvalues, fieldlengths):
        self.docslist.append(storedvalues)
        if self.doclengths:
            self.doclengths.append(fieldlengths)

    def _add_vector(self, fieldnum, vlist):
        vpostwriter = self.vpostwriter
        vformat = self.schema[fieldnum].vector

        offset = vpostwriter.start(vformat)
        for text, valuestring in vlist:
            assert isinstance(text, unicode), "%r is not unicode" % text
            vpostwriter.write(text, valuestring)
        vpostwriter.finish()

        self.vectortable.add((self.max_doc, fieldnum), offset)

    def _flush_pool(self):
        # This method pulls postings out of the posting pool (built up as
        # documents are added) and writes them to the posting file. Each
        # time it encounters a posting for a new term, it writes the
        # previous term to the term index (by waiting to write the term
        # entry, we can easily count the document frequency and sum the
        # terms by looking at the postings).

        termtable = self.termtable
        postwriter = self.postwriter
        schema = self.schema

        current_fieldnum = None  # Field number of the current term
        current_text = None  # Text of the current term
        first = True
        current_freq = 0
        offset = None

        # Loop through the postings in the pool. Postings always come out
        # of the pool in (field number, lexical) order.
        for fieldnum, text, docnum, freq, valuestring in self.pool:
            # Is this the first time through, or is this a new term?
            if first or fieldnum > current_fieldnum or text > current_text:
                if first:
                    first = False
                else:
                    # This is a new term, so finish the postings and add the
                    # term to the term table
                    postcount = postwriter.finish()
                    termtable.add((current_fieldnum, current_text),
                                  (current_freq, offset, postcount))

                # Reset the post writer and the term variables
                current_fieldnum = fieldnum
                current_text = text
                current_freq = 0
                offset = postwriter.start(schema[fieldnum].format)

            elif (fieldnum < current_fieldnum
                  or (fieldnum == current_fieldnum
                      and text < current_text)):
                # This should never happen!
                raise Exception("Postings are out of order: %s:%s .. %s:%s"
                                % (current_fieldnum, current_text,
                                   fieldnum, text))

            # Write a posting for this occurrence of the current term
            current_freq += freq
            postwriter.write(docnum, valuestring)

        # If there are still "uncommitted" postings at the end, finish them
        # off
        if not first:
            postcount = postwriter.finish()
            termtable.add((current_fieldnum, current_text),
                          (current_freq, offset, postcount))
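# The "flush on term change" loop in _flush_pool() above is equivalent to
# grouping the sorted pool by (fieldnum, text). An illustrative standalone
# sketch of the same pattern; write_posting and write_term are hypothetical
# callbacks, not part of this module:

from itertools import groupby


def flush_sorted_postings(pool, write_posting, write_term):
    # The pool yields (fieldnum, text, docnum, freq, valuestring) tuples in
    # (field number, lexical) order, as PostingPool guarantees.
    for (fieldnum, text), group in groupby(pool, key=lambda p: p[:2]):
        term_freq = 0
        for _, _, docnum, freq, valuestring in group:
            write_posting(docnum, valuestring)
            term_freq += freq
        # The term entry is written only after all its postings, so the
        # summed frequency is known -- the same reason _flush_pool() defers
        # termtable.add() until the term changes.
        write_term(fieldnum, text, term_freq)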
def test_lowlevel_block_writing():
    st = RamStorage()
    f = st.create_file("postfile")
    fpw = FilePostingWriter(f, blocklimit=4)
    fmt = formats.Frequency()
    fpw.start(fmt)
    fpw.write(0, 1.0, fmt.encode(1.0), 1)
    fpw.write(1, 2.0, fmt.encode(2.0), 2)
    fpw.write(2, 12.0, fmt.encode(12.0), 6)
    fpw.write(5, 6.5, fmt.encode(6.5), 420)
    fpw.write(11, 1.5, fmt.encode(1.5), 1)
    fpw.write(12, 2.5, fmt.encode(2.5), 2)
    fpw.write(26, 100.5, fmt.encode(100.5), 21)
    fpw.write(50, 8.0, fmt.encode(8.0), 1020)

    ti = fpw.finish()
    assert_equal(ti.weight(), 134.0)
    assert_equal(ti.doc_frequency(), 8)
    assert_equal(ti.min_length(), 1)
    assert_equal(ti.max_length(), byte_to_length(length_to_byte(1020)))
    assert_equal(ti.max_weight(), 100.5)
    assert_equal(ti.max_wol(), 100.5 / byte_to_length(length_to_byte(21)))
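# The byte_to_length(length_to_byte(...)) round trips asserted above exist
# because per-posting field lengths are stored in a single byte, so the
# encoding is lossy for large values and the test compares against the
# re-decoded value rather than the raw length. A hedged sketch using the
# same helpers the test imports (exact rounding depends on the encoding):

def example_length_byte_roundtrip():
    # Small lengths survive exactly, which is why ti.min_length() above can
    # be compared directly to 1.
    assert byte_to_length(length_to_byte(1)) == 1
    # Large lengths come back as the nearest representable value, which is
    # why the test wraps 1020 in the same round trip before comparing.
    return byte_to_length(length_to_byte(1020))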