コード例 #1
0
ファイル: datasources.py プロジェクト: harryprince/pygrametl
 def __init__(self, source, batchsize=500, queuesize=20):
     """Validate the arguments, create the queue and start the worker.

        Arguments:
        - source: the source to iterate
        - batchsize: the number of rows the worker process hands over each
          time it passes on a batch of rows. Must be positive. Default: 500
        - queuesize: the maximum number of batches that can wait in the
          queue between the processes. 0 means unlimited. Default: 20

        Raises ValueError if batchsize is not a positive integer.
     """
     acceptable = isinstance(batchsize, int) and batchsize >= 1
     if not acceptable:
         raise ValueError('batchsize must be a positive integer')
     self.__source = source
     self.__batchsize = batchsize
     self.__queue = Queue(queuesize)
     # The worker is started right away, so it can begin filling the queue
     # before the first row is requested.
     worker = Process(target=self.__worker)
     worker.name = "Process for ProcessSource"
     worker.start()
コード例 #2
0
ファイル: datasources.py プロジェクト: harryprince/pygrametl
 def __init__(self, seq, callee):
     """Store callee and enqueue every element of seq.

        Arguments:
        - seq: a sequence with the elements for each of which a unique source
          must be created. The elements are given (one by one) to callee.
        - callee: a function f(e) that must accept elements as those in the
          seq argument. It should return a source which will then be
          iterated by this source. It is called once for every element in
          seq.

        Raises TypeError if callee is not callable.
     """
     self.__queue = Queue()  # a multiprocessing.Queue
     if not callable(callee):
         raise TypeError('callee must be callable')
     self.__callee = callee
     # The elements live in a process-safe queue so that this object can be
     # used from different fork'ed processes.
     enqueue = self.__queue.put
     for element in seq:
         enqueue(element)
コード例 #3
0
ファイル: datasources.py プロジェクト: zbingwen/pygrametl
class DynamicForEachSource(object):

    """A source that for each given argument creates a new source that
    will be iterated by this source.

    For example, useful for directories where a CSVSource should be created
    for each file.

    The user must provide a function that when called with a single argument,
    returns a new source to iterate. A DynamicForEachSource instance can be
    given to several ProcessSource instances.
    """

    def __init__(self, seq, callee):
        """Arguments:

           - seq: a sequence with the elements for each of which a unique
             source must be created. the elements are given (one by one) to
             callee.
           - callee: a function f(e) that must accept elements as those in the
             seq argument. the function should return a source which then will
             be iterated by this source. the function is called once for every
             element in seq.

           Raises TypeError if callee is not callable.
        """
        # Validate before allocating the queue so a bad argument fails fast.
        if not callable(callee):
            raise TypeError('callee must be callable')
        self.__callee = callee
        self.__queue = Queue()  # a multiprocessing.Queue
        for e in seq:
            # put them in a safe queue such that this object can be used from
            # different fork'ed processes
            self.__queue.put(e)

    def __iter__(self):
        """Yield the rows of the sources created for the queued elements.

        Elements are fetched from the queue without blocking. For each one,
        callee is invoked and every row of the returned source is yielded.
        Iteration ends as soon as the queue reports that it is empty.
        """
        while True:
            try:
                arg = self.__queue.get(False)
            except Empty:
                # End the generator with a plain return. Raising
                # StopIteration inside a generator is turned into a
                # RuntimeError by PEP 479 (Python 3.7+).
                return
            # Kept outside the try block so that an Empty raised by callee or
            # by the nested source is not mistaken for "no more elements".
            for row in self.__callee(arg):
                yield row
コード例 #4
0
ファイル: datasources.py プロジェクト: DSTKES/pygrametl
class DynamicForEachSource(object):

    """A source that for each given argument creates a new source that
    will be iterated by this source.

    For example, useful for directories where a CSVSource should be created
    for each file.

    The user must provide a function that when called with a single argument,
    returns a new source to iterate. A DynamicForEachSource instance can be
    given to several ProcessSource instances.
    """

    def __init__(self, seq, callee):
        """Arguments:
           - seq: a sequence with the elements for each of which a unique
             source must be created. the elements are given (one by one) to
             callee.
           - callee: a function f(e) that must accept elements as those in the
             seq argument. the function should return a source which then will
             be iterated by this source. the function is called once for every
             element in seq.

           Raises TypeError if callee is not callable.
        """
        # Check the argument first so no queue is created for a bad call.
        if not callable(callee):
            raise TypeError('callee must be callable')
        self.__callee = callee
        self.__queue = Queue()  # a multiprocessing.Queue
        for e in seq:
            # put them in a safe queue such that this object can be used from
            # different fork'ed processes
            self.__queue.put(e)

    def __iter__(self):
        """Iterate all rows of all sources built from the queued elements.

        Each element is taken from the queue with a non-blocking get, handed
        to callee, and the rows of the resulting source are yielded. The
        generator finishes when the queue raises Empty.
        """
        while True:
            try:
                arg = self.__queue.get(False)
            except Empty:
                # A generator must finish with return: since PEP 479
                # (Python 3.7+) a StopIteration raised inside it surfaces as
                # RuntimeError instead of ending the iteration.
                return
            # Only the queue access is guarded; an Empty coming from callee
            # or from the nested source propagates instead of silently
            # cutting the iteration short.
            for row in self.__callee(arg):
                yield row
コード例 #5
0
ファイル: datasources.py プロジェクト: harryprince/pygrametl
class ProcessSource(object):

    """Iterates another source from within a separate worker process."""

    def __init__(self, source, batchsize=500, queuesize=20):
        """Validate the arguments, create the queue and start the worker.

           Arguments:
           - source: the source to iterate
           - batchsize: the number of rows the worker process hands over each
             time it passes on a batch of rows. Must be positive.
             Default: 500
           - queuesize: the maximum number of batches that can wait in the
             queue between the processes. 0 means unlimited. Default: 20

           Raises ValueError if batchsize is not a positive integer.
        """
        acceptable = isinstance(batchsize, int) and batchsize >= 1
        if not acceptable:
            raise ValueError('batchsize must be a positive integer')
        self.__source = source
        self.__batchsize = batchsize
        self.__queue = Queue(queuesize)
        worker = Process(target=self.__worker)
        worker.name = "Process for ProcessSource"
        worker.start()

    def __worker(self):
        """Read the source and push its rows onto the queue in batches.

        A full batch is sent whenever batchsize rows have been collected.
        When the source is exhausted, any remaining rows are sent followed
        by the marker 'STOP'. If an exception is raised, the rows collected
        so far are sent, followed by the marker 'EXCEPTION' and then the
        exception object itself.
        """
        rows = []
        try:
            for row in self.__source:
                rows.append(row)
                if len(rows) != self.__batchsize:
                    continue
                self.__queue.put(rows)
                rows = []
            # The source is exhausted: flush what is left, then signal the end
            if rows:
                self.__queue.put(rows)
            self.__queue.put('STOP')
        except Exception:
            # Jython 2.5.X does not support the as syntax required by Python 3
            error = sys.exc_info()[1]

            if rows:
                self.__queue.put(rows)
            for message in ('EXCEPTION', error):
                self.__queue.put(message)

    def __iter__(self):
        """Yield the rows received from the worker, one at a time.

        Stops when the 'STOP' marker arrives. When the 'EXCEPTION' marker
        arrives, the next queue item is fetched and raised.
        """
        # iter() with a sentinel keeps calling get() until it returns 'STOP'
        for message in iter(self.__queue.get, 'STOP'):
            if message == 'EXCEPTION':
                raise self.__queue.get()
            # otherwise the message is a list of rows from the other process
            for row in message:
                yield row
コード例 #6
0
class ProcessSource(object):
    """A class for iterating another source in a separate process"""

    def __init__(self, source, batchsize=500, queuesize=20):
        """Arguments:
           - source: the source to iterate
           - batchsize: the number of rows passed from the worker process each
             time it passes on a batch of rows. Must be positive. Default: 500
           - queuesize: the maximum number of batches that can wait in a queue
             between the processes. 0 means unlimited. Default: 20

           Raises ValueError if batchsize is not a positive integer.
        """
        if not isinstance(batchsize, int) or batchsize < 1:
            # Call syntax: "raise ValueError, msg" is a syntax error on
            # Python 3.
            raise ValueError('batchsize must be a positive integer')
        self.__source = source
        self.__batchsize = batchsize
        self.__queue = Queue(queuesize)
        # The worker process is started immediately.
        p = Process(target=self.__worker)
        p.name = "Process for ProcessSource"
        p.start()

    def __worker(self):
        """Read the source and put its rows on the queue in batches.

        A batch is sent each time batchsize rows have been collected. After
        the last row, any remaining rows are sent followed by the marker
        'STOP'. If an exception is raised, the rows collected so far are
        sent, followed by the marker 'EXCEPTION' and the exception object.
        """
        batch = []
        try:
            for row in self.__source:
                batch.append(row)
                if len(batch) == self.__batchsize:
                    self.__queue.put(batch)
                    batch = []
            # We're done. Send the batch if it has any data and a signal
            if batch:
                self.__queue.put(batch)
            self.__queue.put('STOP')
        except Exception as e:
            # "as" works on Python 2.6+ and Python 3; the older
            # "except Exception, e" form is a syntax error on Python 3.
            if batch:
                self.__queue.put(batch)
            self.__queue.put('EXCEPTION')
            self.__queue.put(e)