def __init__(self, ix, procs=None, batchsize=100, subargs=None,
             multisegment=False, **kwargs):
    """Set up the "main" writer that aggregates the sub-tasks' results.

    :param ix: the index object, forwarded to ``SegmentWriter.__init__``.
    :param procs: number of sub-tasks; any falsy value (``None``, ``0``)
        falls back to ``cpu_count()``.
    :param batchsize: the maximum number of documents in each job file
        submitted to the sub-tasks.
    :param subargs: keyword arguments for the sub-writers. When this is
        falsy, ``kwargs`` is used for the sub-writers instead.
    :param multisegment: if True, don't merge the segments created by the
        sub-writers, just add them directly to the TOC.
    :param kwargs: always forwarded to ``SegmentWriter.__init__``; also
        doubles as the sub-writer arguments when ``subargs`` is not given.
    """
    # The base writer is initialized first, before any of this class's
    # own state is attached.
    SegmentWriter.__init__(self, ix, **kwargs)

    worker_count = procs
    if not worker_count:
        worker_count = cpu_count()
    self.procs = worker_count

    self.batchsize = batchsize
    # Either the explicit "subargs" dict or the plain keyword arguments
    # may be used to configure the sub-writers.
    self.subargs = subargs or kwargs
    self.multisegment = multisegment

    # Holds the sub-task Process objects.
    self.tasks = []
    # Carries the filenames of job files to the sub-tasks; constructed
    # with a size argument of four entries per sub-task.
    self.jobqueue = Queue(worker_count * 4)
    # Carries the final results of the sub-tasks back to this writer.
    self.resultqueue = Queue()
    # Documents accumulate here before they are flushed to a job file.
    self.docbuffer = []

    self._grouping = 0
    self._added_sub = False
def __init__(self, ix, procs=None, batchsize=100, subargs=None, **kwargs):
    """Set up the writer and eagerly build one sub-writer per "proc".

    :param ix: the index object, forwarded to ``SegmentWriter.__init__``
        and to every sub-writer.
    :param procs: number of sub-writers to create; any falsy value
        (``None``, ``0``) falls back to ``cpu_count()``.
    :param batchsize: stored on the instance as ``self.batchsize``; it is
        not otherwise used in this method.
    :param subargs: keyword arguments for the sub-writers. When this is
        falsy, ``kwargs`` is used for the sub-writers instead.
    :param kwargs: always forwarded to ``SegmentWriter.__init__``; also
        doubles as the sub-writer arguments when ``subargs`` is not given.
    """
    SegmentWriter.__init__(self, ix, **kwargs)

    worker_count = procs or cpu_count()
    writer_args = subargs or kwargs

    self.procs = worker_count
    self.batchsize = batchsize
    self.subargs = writer_args

    # The sub-writers are plain SegmentWriter objects created right here.
    # NOTE(review): ``_lk=False`` presumably stops each sub-writer from
    # taking the index lock -- confirm against SegmentWriter.
    sub_writers = []
    for _ in xrange(worker_count):
        sub_writers.append(SegmentWriter(ix, _lk=False, **writer_args))
    self.tasks = sub_writers

    # NOTE(review): presumably the index into ``self.tasks`` of the
    # sub-writer to use next -- confirm against the methods that read it.
    self.pointer = 0
    self._added_sub = False