Beispiel #1
0
class DynamicForEachSource(object):
    """A source that for each given argument creates a new source that
    will be iterated by this source.

    For example, useful for directories where a CSVSource should be created
    for each file.

    The user must provide a function that, when called with a single
    argument, returns a new source to iterate. A DynamicForEachSource
    instance can be given to several ProcessSource instances.
    """

    def __init__(self, seq, callee):
        """Arguments:
        - seq: A sequence with the elements for each of which a unique source
          must be created. The elements are given (one by one) to callee.
        - callee: A function f(e) that must accept elements as those in the seq
          argument. The function should return a source which then will be
          iterated by this source. The function is called once for every
          element in seq.

        Raises TypeError if callee is not callable.
        """
        # Validate first so that a bad argument fails before a queue is
        # allocated.
        if not callable(callee):
            raise TypeError('callee must be callable')
        self.__callee = callee
        self.__queue = Queue()  # a multiprocessing.Queue
        for e in seq:
            # put them in a safe queue such that this object can be used from
            # different fork'ed processes
            self.__queue.put(e)

    def __iter__(self):
        """Yield every row of the sources created for the queued elements.

        Elements are taken from the queue one at a time without blocking;
        for each of them callee is invoked and the rows of the source it
        returns are yielded. Iteration ends when the queue reports that it
        is empty.
        """
        while True:
            # Only the queue access is guarded: an Empty raised by callee or
            # by the source it returns is a real error and must not be
            # mistaken for "no more elements".
            try:
                arg = self.__queue.get(False)
            except Empty:
                # A plain return ends the generator. Raising StopIteration
                # here would become a RuntimeError under PEP 479.
                return
            for row in self.__callee(arg):
                yield row
Beispiel #2
0
 def __init__(self, source, batchsize=500, queuesize=20):
     """Arguments:
        - source: the source to iterate
        - batchsize: the number of rows passed from the worker process each
          time it passes on a batch of rows. Must be positive. Default: 500
        - queuesize: the maximum number of batches that can wait in a queue
          between the processes. 0 means unlimited. Default: 20

        Raises ValueError if batchsize is not a positive integer.
     """
     # bool is a subclass of int but is not a meaningful batch size, so it
     # is rejected explicitly.
     if (not isinstance(batchsize, int) or isinstance(batchsize, bool)
             or batchsize < 1):
         raise ValueError('batchsize must be a positive integer')
     self.__source = source
     self.__batchsize = batchsize
     self.__queue = Queue(queuesize)
     # The worker process is started right away and runs self.__worker.
     p = Process(target=self.__worker)
     p.name = "Process for ProcessSource"
     p.start()
Beispiel #3
0
 def __init__(self, seq, callee):
     """Arguments:
     - seq: A sequence with the elements for each of which a unique source
       must be created. The elements are given (one by one) to callee.
     - callee: A function f(e) that must accept elements as those in the seq
       argument. The function should return a source which then will be
       iterated by this source. The function is called once for every
       element in seq.

     Raises TypeError if callee is not callable.
     """
     # Validate first so that a bad argument fails before a queue is
     # allocated.
     if not callable(callee):
         raise TypeError('callee must be callable')
     self.__callee = callee
     self.__queue = Queue()  # a multiprocessing.Queue
     for e in seq:
         # put them in a safe queue such that this object can be used from
         # different fork'ed processes
         self.__queue.put(e)
Beispiel #4
0
class ProcessSource(object):
    """A class for iterating another source in a separate process"""

    def __init__(self, source, batchsize=500, queuesize=20):
        """Arguments:
           - source: the source to iterate
           - batchsize: the number of rows passed from the worker process each
             time it passes on a batch of rows. Must be positive. Default: 500
           - queuesize: the maximum number of batches that can wait in a queue
             between the processes. 0 means unlimited. Default: 20

           Raises ValueError if batchsize is not a positive integer.
        """
        # bool is a subclass of int but is not a meaningful batch size, so it
        # is rejected explicitly.
        if (not isinstance(batchsize, int) or isinstance(batchsize, bool)
                or batchsize < 1):
            raise ValueError('batchsize must be a positive integer')
        self.__source = source
        self.__batchsize = batchsize
        self.__queue = Queue(queuesize)
        # The worker process is started right away and runs self.__worker.
        p = Process(target=self.__worker)
        p.name = "Process for ProcessSource"
        p.start()

    def __worker(self):
        """Iterate the source and put its rows on the queue in batches.

        Every item put on the queue is one of:
        - a list with up to batchsize rows,
        - the string 'STOP' once the source is exhausted,
        - the string 'EXCEPTION' followed by the exception instance when
          iterating the source raised an exception.
        """
        batch = []
        try:
            for row in self.__source:
                batch.append(row)
                if len(batch) == self.__batchsize:
                    self.__queue.put(batch)
                    batch = []
            # We're done. Send the batch if it has any data and a signal
            if batch:
                self.__queue.put(batch)
            self.__queue.put('STOP')
        except Exception as e:
            # Pass on the rows read before the failure, then the error
            # marker and the exception itself.
            if batch:
                self.__queue.put(batch)
            self.__queue.put('EXCEPTION')
            self.__queue.put(e)