def __init__(self, ix, procs=None, batchsize=100, subargs=None,
             multisegment=False, **kwargs):
    # This is the "main" writer that will aggregate the results created by
    # the sub-tasks
    SegmentWriter.__init__(self, ix, **kwargs)

    self.procs = procs or cpu_count()
    # The maximum number of documents in each job file submitted to the
    # sub-tasks
    self.batchsize = batchsize
    # You can use keyword arguments or the "subargs" argument to pass
    # keyword arguments to the sub-writers
    self.subargs = subargs if subargs else kwargs
    # If multisegment is True, don't merge the segments created by the
    # sub-writers, just add them directly to the TOC
    self.multisegment = multisegment

    # A list to hold the sub-task Process objects
    self.tasks = []
    # A queue to pass the filenames of job files to the sub-tasks
    self.jobqueue = Queue(self.procs * 4)
    # A queue to get back the final results of the sub-tasks
    self.resultqueue = Queue()
    # A buffer for documents before they are flushed to a job file
    self.docbuffer = []

    self._grouping = 0
    self._added_sub = False
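# Sketch (assumption, not the original source): the attributes above imply
# that add_document() buffers field dicts in docbuffer and, once batchsize
# documents accumulate, dumps them to a temporary job file whose name goes on
# jobqueue as a (filename, doc_count) tuple. The _enqueue helper and the
# pickle format are assumptions; the create_temp() call mirrors the one seen
# in the run() variant further below.
import pickle

def add_document(self, **fields):
    # Buffer the document's field dict; flush a job file when full (sketch)
    self.docbuffer.append(fields)
    self._added_sub = True
    if len(self.docbuffer) >= self.batchsize:
        self._enqueue()

def _enqueue(self):
    # Dump the buffered documents to a temporary job file and hand
    # (filename, doc_count) to a waiting sub-task (sketch)
    docbuffer, self.docbuffer = self.docbuffer, []
    jobname, jobfile = self.storage.create_temp()
    pickle.dump(docbuffer, jobfile, -1)
    jobfile.close()
    self.jobqueue.put((jobname, len(docbuffer)))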
def commit(self, mergetype=None, optimize=False, merge=True):
    if self._added_sub:
        # If documents have been added to sub-writers, use the parallel
        # merge commit code
        self._commit(mergetype, optimize, merge)
    else:
        # Otherwise, just do a regular-old commit
        SegmentWriter.commit(self, mergetype=mergetype, optimize=optimize,
                             merge=merge)
def __init__(self, ix, procs=None, batchsize=100, subargs=None, **kwargs):
    SegmentWriter.__init__(self, ix, **kwargs)

    self.procs = procs or cpu_count()
    self.batchsize = batchsize
    self.subargs = subargs if subargs else kwargs
    self.tasks = [SegmentWriter(ix, _lk=False, **self.subargs)
                  for _ in xrange(self.procs)]
    self.pointer = 0
    self._added_sub = False
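# Sketch (assumption, not the original method): the pointer attribute above
# suggests documents are handed out round-robin across the in-process
# sub-writers, roughly like this:
def add_document(self, **fields):
    # Rotate through the sub-writers so each gets an even share (sketch)
    self.tasks[self.pointer].add_document(**fields)
    self.pointer = (self.pointer + 1) % len(self.tasks)
    self._added_sub = True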
def writer(self, procs=1, **kwargs):
    if procs > 1:
        from whoosh.filedb.multiproc2 import MpWriter
        # Pass procs through so the MpWriter starts the requested number of
        # sub-writer processes instead of falling back to cpu_count()
        return MpWriter(self, procs=procs, **kwargs)
    else:
        from whoosh.filedb.filewriting import SegmentWriter
        return SegmentWriter(self, **kwargs)
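# Usage sketch: the procs argument selects the multiprocess writer; batchsize
# and multisegment are simply forwarded to MpWriter through **kwargs. The
# schema, directory name, and field values below are placeholders.
import os
from whoosh import index
from whoosh.fields import Schema, ID, TEXT

schema = Schema(path=ID(stored=True), body=TEXT)
if not os.path.exists("indexdir"):
    os.mkdir("indexdir")
ix = index.create_in("indexdir", schema)

# Four sub-writer processes, fed job files of up to 256 documents each;
# multisegment=True keeps each sub-writer's segment instead of merging
writer = ix.writer(procs=4, batchsize=256, multisegment=True)
writer.add_document(path=u"/a", body=u"alpha bravo charlie")
writer.add_document(path=u"/b", body=u"delta echo foxtrot")
writer.commit()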
def run(self):
    # This is the main loop of the process. OK, so the way this works is
    # kind of brittle and stupid, but I had to figure out how to use the
    # multiprocessing module, work around bugs, and address performance
    # issues, so there is at least some reasoning behind some of this

    # The "parent" task farms individual documents out to the subtasks for
    # indexing. You could pickle the actual documents and put them in the
    # queue, but that is not very performant. Instead, we assume the tasks
    # share a filesystem and use that to pass the information around. The
    # parent task writes a certain number of documents to a file, then puts
    # the filename on the "job queue". A subtask gets the filename off the
    # queue and reads through the file processing the documents.

    jobqueue = self.jobqueue
    resultqueue = self.resultqueue
    multisegment = self.multisegment

    # Open a placeholder object representing the index
    ix = self.storage.open_index(self.indexname)
    # Open a writer for the index. The _lk=False parameter means to not try
    # to lock the index (the parent object that started me takes care of
    # locking the index)
    writer = self.writer = SegmentWriter(ix, _lk=False, **self.kwargs)

    # If the parent task calls cancel() on me, it will set self.running to
    # False, so I'll notice the next time through the loop
    while self.running:
        # Take an object off the job queue
        jobinfo = jobqueue.get()
        # If the object is None, it means the parent task wants me to
        # finish up
        if jobinfo is None:
            break
        # The object from the queue is a tuple of (filename,
        # number_of_docs_in_file). Pass those two pieces of information as
        # arguments to _process_file().
        self._process_file(*jobinfo)

    if not self.running:
        # I was cancelled, so I'll cancel my underlying writer
        writer.cancel()
    else:
        if multisegment:
            # Actually finish the segment and return it with no run
            runname = None
            writer._flush_segment()
            writer._close_segment()
            writer._assemble_segment()
            segment = writer.get_segment()
        else:
            # Merge all runs in the writer's pool into one run, close the
            # segment, and return the run name and the segment
            k = self.kwargs.get("k", 64)
            runname, segment = finish_subsegment(writer, k)

        # Put the results (the run filename and the segment object) on the
        # result queue
        resultqueue.put((runname, segment), timeout=5)
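# Sketch (assumption, not the original method): _process_file(), called from
# the loop above with the (filename, doc_count) tuple taken off the job
# queue, would read the pickled batch back and feed it to the process-local
# writer, roughly like this:
import pickle

def _process_file(self, filename, doc_count):
    # Load the batch of field dicts from the job file, index each document
    # with this process's SegmentWriter, then remove the job file (sketch;
    # doc_count is unused here)
    writer = self.writer
    f = self.storage.open_file(filename)
    docs = pickle.load(f)
    f.close()
    for fields in docs:
        writer.add_document(**fields)
    self.storage.delete_file(filename)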
def run(self):
    pqueue = self.postingqueue
    index = self.storage.open_index(self.indexname)
    writer = SegmentWriter(index, name=self.segmentname, lock=False,
                           **self.kwargs)

    while self.running:
        args = pqueue.get()
        if args is None:
            break
        writer.add_document(**args)

    if not self.running:
        writer.cancel()
        self.terminate()
    else:
        writer.pool.finish(writer.docnum, writer.lengthfile,
                           writer.termsindex, writer.postwriter)
        self._segment = writer._getsegment()
def run(self):
    jobqueue = self.jobqueue
    resultqueue = self.resultqueue
    ix = self.storage.open_index(self.indexname)
    writer = self.writer = SegmentWriter(ix, _lk=False, **self.kwargs)

    while self.running:
        jobinfo = jobqueue.get()
        if jobinfo is None:
            break
        self._process_file(*jobinfo)

    if not self.running:
        writer.cancel()
    else:
        writer.pool.save()
        writer.pool.reduce_to(1, self.kwargs.get("k", 64))
        runname = writer.pool.runs[0]
        doccount = writer.doc_count()
        lenname, lenfile = self.storage.create_temp()
        writer.lengths.to_file(lenfile, doccount)
        resultqueue.put((runname, doccount, lenname), timeout=5)
def run(self):
    jobqueue = self.jobqueue
    ix = self.storage.open_index(self.indexname)
    writer = self.writer = SegmentWriter(ix, _lk=False, name=self.segname,
                                         **self.kwargs)

    if self.firstjob:
        self._add_file(self.firstjob)

    while self.running:
        args = jobqueue.get()
        if args is None:
            break
        self._add_file(args)

    if not self.running:
        writer.cancel()
    else:
        writer.pool.finish(writer.termswriter, writer.docnum,
                           writer.lengthfile)
        writer._close_all()
        self.resultqueue.put(writer._getsegment())
def cancel(self):
    try:
        for task in self.tasks:
            task.cancel()
    finally:
        SegmentWriter.cancel(self)