Example #1
    def __init__(self,
                 obj,
                 returnvalues=True,
                 consumes=(),
                 directupdatepositions=(),
                 batchsize=500,
                 queuesize=200,
                 autowrap=True):
        self.__instancenumber = len(Decoupled.__instances)
        self.__futurecnt = 0
        Decoupled.__instances.append(self)
        self._obj = obj
        if hasattr(obj, '_decoupling') and callable(obj._decoupling):
            obj._decoupling()
        self.batchsize = batchsize
        self.__batch = []
        self.__results = {}
        self.autowrap = autowrap
        self.__toworker = multiprocessing.JoinableQueue(queuesize)
        if returnvalues:
            self.__fromworker = multiprocessing.JoinableQueue(queuesize)
        else:
            self.__fromworker = None
        self.__otherqueues = dict([(dcpld.__instancenumber, dcpld.__fromworker)\
                                       for dcpld in consumes])
        self.__otherresults = {}  # Will store dicts - see also __decoupledworker
        self.__directupdates = directupdatepositions

        self.__worker = multiprocessing.Process(target=self.__decoupledworker)
        self.__worker.daemon = True
        self.__worker.name = 'Process for %s object for %s' % \
            (self.__class__.__name__, getattr(obj, 'name', 'an unnamed object'))
        self.__worker.start()
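
The constructor above essentially just wires up a JoinableQueue and a daemonized worker process around the wrapped object. Below is a minimal, self-contained sketch of that pattern; the names (MiniDecoupled, call, join) are illustrative assumptions and not pygrametl's actual API, and, like the original, it assumes the wrapped object can be handed over to (or inherited by) the worker process.

import multiprocessing

def _workerloop(obj, queue):
    # Runs in the worker process: execute method calls taken from the queue.
    while True:
        item = queue.get()
        try:
            if item is None:  # sentinel telling the worker to stop
                return
            name, args, kwargs = item
            getattr(obj, name)(*args, **kwargs)
        finally:
            queue.task_done()

class MiniDecoupled:
    def __init__(self, obj, queuesize=200):
        self.__toworker = multiprocessing.JoinableQueue(queuesize)
        self.__worker = multiprocessing.Process(target=_workerloop,
                                                args=(obj, self.__toworker))
        self.__worker.daemon = True
        self.__worker.start()

    def call(self, name, *args, **kwargs):
        # Only enqueue the call; the worker process executes it later.
        self.__toworker.put((name, args, kwargs))

    def join(self):
        # Ask the worker to stop and wait until all queued calls are done.
        self.__toworker.put(None)
        self.__toworker.join()
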
Example #2
    def decorator(func):
        global _splitpointqueues
        if instances < 1:
            # A special case where there is no process so
            # we just call func directly
            def sillywrapper(*args, **kw):
                res = func(*args, **kw)
                if output is not None:
                    output.put(res)

            return sillywrapper
        # Else set up processes
        input = multiprocessing.JoinableQueue(queuesize)
        for n in range(instances):
            p = multiprocessing.Process(target=_splitprocess,\
                                            args=(func, input, output))
            p.name = 'Process-%d for %s' % (n, func.__name__)
            p.daemon = True
            p.start()
        _splitpointqueues.append(input)

        def wrapper(*args, **kw):
            input.put((args, kw))

        return wrapper
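
For contrast with the no-process special case above, the following self-contained sketch shows the same fan-out idea: a decorator factory whose wrapper only enqueues the call arguments while worker processes drain the queue and invoke the original function. The names (fanout, _drain) are assumptions, return values are dropped in this simplified version, and the 'fork' start method is assumed so the decorated function can be handed to the workers.

import multiprocessing

def _drain(func, queue):
    # Runs in each worker process: take (args, kwargs) off the queue, call func.
    while True:
        args, kw = queue.get()
        try:
            func(*args, **kw)
        finally:
            queue.task_done()

def fanout(instances=2, queuesize=100):
    def decorator(func):
        queue = multiprocessing.JoinableQueue(queuesize)
        for n in range(instances):
            p = multiprocessing.Process(target=_drain, args=(func, queue))
            p.name = 'Process-%d for %s' % (n, func.__name__)
            p.daemon = True
            p.start()

        def wrapper(*args, **kw):
            queue.put((args, kw))  # enqueue only; a worker does the actual work

        wrapper.join = queue.join  # let callers wait for all queued calls
        return wrapper
    return decorator
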
Example #3
def getsharedsequencefactory(startvalue, intervallen=5000):
    """ Creates a factory for parallel readers of a sequence.

        Returns a callable f. When f() is called, it returns a callable g.
        Whenever g(*args) is called, it returns a unique int from a sequence
        (if several g's are created, the interleaving of their calls may mean
        that the returned ints are not ordered, but they will be unique). The
        arguments to g are ignored, but accepted. Thus g can be used as
        idfinder for [Decoupled]Dimensions.

        The different g's can be used safely from different processes and
        threads.

        Arguments:
            
        - startvalue: The first value to return. If None, 0 is assumed.
        - intervallen: The number of values that a single g from above
          can return before synchronization is needed to get a new interval.
          Default: 5000.
    """
    if startvalue is None:
        startvalue = 0

    # We use a Queue to ensure that intervals are only given to one deliverer
    values = multiprocessing.Queue(10)

    # A worker that fills the queue
    def valuegenerator(nextval):
        sys.excepthook = _getexcepthook()
        while True:
            values.put((nextval, nextval + intervallen))
            nextval += intervallen

    p = multiprocessing.Process(target=valuegenerator, args=(startvalue, ))
    p.daemon = True
    p.start()

    # A generator that repeatedly gets an interval from the queue and returns
    # all numbers in that interval before it gets a new interval and goes on
    # ...
    def valuedeliverer():
        while True:
            interval = values.get()
            for i in range(*interval):
                yield i

    # A factory method for the object the end-consumer calls
    def factory():
        generator = valuedeliverer()  # get a unique generator

        # The method called (i.e., the g) by the end-consumer

        def getnextseqval(*ignored):
            return next(generator)

        return getnextseqval

    return factory
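
A short usage sketch of the factory above (variable names are only for illustration): each process or Decoupled dimension obtains its own g by calling the factory, and every g draws unique ids from the shared sequence.

idfactory = getsharedsequencefactory(1000)  # first value handed out is 1000

getnextid = idfactory()       # one g per process/thread
getnextid()                   # e.g., 1000
getnextid('ignored', 'args')  # arguments are accepted but ignored; e.g., 1001

# Another g, e.g. used as idfinder for a Dimension in a different process,
# draws from its own interval, so its ids never collide with the first g's.
anotherid = idfactory()
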
Example #4
def _getexitfunction():
    """Return a function that halts the execution of pygrametl.

       pygrametl uses the function as excepthook in spawned processes such that
       an uncaught exception halts the entire execution.
    """
    # On Java, System.exit will do as there are no separate processes
    if sys.platform.startswith('java'):
        def javaexitfunction():
            import java.lang.System
            java.lang.System.exit(1)
        return javaexitfunction

    # else see if the os module provides functions to kill process groups;
    # this should be the case on UNIX.
    import signal
    if hasattr(os, 'getpgrp') and hasattr(os, 'killpg'):
        def unixexitfunction():
            procgrp = os.getpgrp()
            os.killpg(procgrp, signal.SIGTERM)
        return unixexitfunction

    # else, we are on a platform that does not allow us to kill a process
    # group. We make a special process that is told the pid of every process
    # that calls this function. The function we return informs that special
    # process to kill all the processes it knows about.

    # set up the terminator
    global _toterminator
    if _toterminator is None:
        _toterminator = multiprocessing.Queue()

        def terminatorfunction():
            pids = set([_masterpid])
            while True:
                item = _toterminator.get()
                if isinstance(item, int):
                    pids.add(item)
                else:
                    # We take it as a signal to kill all
                    for p in pids:
                        # we don't know which signals exist; use 9
                        os.kill(p, 9)
                    return

        terminatorprocess = multiprocessing.Process(target=terminatorfunction)
        terminatorprocess.daemon = True
        terminatorprocess.start()

    # tell the terminator about this process
    _toterminator.put(os.getpid())

    # return a function that tells the terminator to kill all known processes
    def exitfunction():
        _toterminator.put('TERMINATE')

    return exitfunction
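
The returned exit function is intended to be wrapped in an excepthook for spawned processes (cf. the sys.excepthook assignment in getsharedsequencefactory above). A hedged sketch of what such a wrapper could look like is shown below; pygrametl's actual _getexcepthook may differ in detail.

import sys

def _getexcepthook():
    # Build an excepthook that first reports the exception and then halts
    # the whole execution via the platform-specific exit function.
    exitfunction = _getexitfunction()

    def excepthook(exctype, value, tb):
        sys.__excepthook__(exctype, value, tb)  # print the traceback as usual
        exitfunction()                          # then terminate all processes

    return excepthook

# In a spawned worker process:
#     sys.excepthook = _getexcepthook()
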
Example #5
def shareconnectionwrapper(targetconnection, maxclients=10, userfuncs=()):
    """Share a ConnectionWrapper between several processes/threads.

    When Decoupled objects are used, they can try to update the DW at the same 
    time. They can use several ConnectionWrappers to avoid race conditions, but
    this is not transactionally safe. Instead, they can use a "shared" 
    ConnectionWrapper obtained through this function.

    When a ConnectionWrapper is shared, it executes in a separate process
    (or thread, in case Jython is used) and ensures that only one operation
    takes place at a time. This is hidden from the users of the shared
    ConnectionWrapper. They see an interface similar to the normal
    ConnectionWrapper.

    When this method is called, it returns a SharedConnectionWrapperClient
    which can be used as a normal ConnectionWrapper. Each process 
    (i.e., each Decoupled object) should, however, get a unique
    SharedConnectionWrapperClient by calling copy() on the returned
    SharedConnectionWrapperClient.

    Note that a shared ConnectionWrapper needs to hold the complete result of 
    each query in memory until it is fetched by the process that executed the
    query. Again, this is hidden from the users.

    It is also possible to add methods to a shared ConnectionWrapper when it 
    is created. When this is done and the method is invoked, no other 
    operation will modify the DW at the same time. If, for example,
    the functions foo and bar are added to a shared ConnectionWrapper (by
    passing the argument userfuncs=(foo, bar) to shareconnectionwrapper),
    the returned SharedConnectionWrapperClient will offer the methods
    foo and bar, which, when called, run in the separate process
    for the shared ConnectionWrapper. This is particularly useful for
    user-defined bulk loaders as used by BulkFactTable:

    def bulkload():
        # DBMS-specific code here.
        # No other DW operation should take place concurrently

    scw = shareconnectionwrapper(ConnectionWrapper(...), userfuncs=(bulkload,))
    facttbl = BulkFactTable(..., bulkloader=scw.copy().bulkload) # Note the .copy().

    Arguments:
    - targetconnection: a pygrametl ConnectionWrapper
    - maxclients: the maximum number of concurrent clients. Default: 10
    - userfuncs: a sequence of functions to add to the shared 
    ConnectionWrapper. Default: ()
    """
    toserver = multiprocessing.JoinableQueue(5000)
    toclients = [multiprocessing.Queue() for i in range(maxclients)]
    freelines = multiprocessing.Queue()
    for i in range(maxclients):
        freelines.put(i)
    serverCW = SharedConnectionWrapperServer(targetconnection, toserver,
                                             toclients)
    userfuncnames = []
    for func in userfuncs:
        if not (callable(func) and hasattr(func, '__name__') and
                func.__name__ != '<lambda>'):
            raise ValueError("Elements in userfuncs must be callable and named")
        if hasattr(SharedConnectionWrapperClient, func.__name__):
            raise ValueError("Illegal function name: " + func.__name__)
        setattr(serverCW, '_userfunc_' + func.__name__, func)
        userfuncnames.append(func.__name__)
    serverprocess = multiprocessing.Process(target=serverCW.worker)
    serverprocess.name = 'Process for shared connection wrapper'
    serverprocess.daemon = True
    serverprocess.start()
    module = targetconnection.getunderlyingmodule()
    clientCW = SharedConnectionWrapperClient(toserver, toclients, freelines,
                                             module, userfuncnames)
    return clientCW
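
A brief usage sketch based on the docstring above (the connection setup is a placeholder, and ConnectionWrapper is assumed to be imported from pygrametl): each Decoupled consumer should work on its own copy of the shared wrapper.

conn = ...  # a PEP 249 connection; details omitted
scw = shareconnectionwrapper(ConnectionWrapper(conn), maxclients=4)

# Each Decoupled object/process gets its own client by copying:
dim1cw = scw.copy()
dim2cw = scw.copy()
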
Example #6
def createflow(*functions, **options):
    """Create a flow of functions running in different processes.

       A Flow object ready for use is returned.

       A flow consists of several functions running in several processes.
       A flow created by 
           flow = createflow(f1, f2, f3) 
       uses three processes. Data can be inserted into the flow by calling it
       as in flow(data). The argument data is then first processed by f1(data),
       then f2(data), and finally f3(data). Return values from f1, f2, and f3 
       are *not* preserved, but their side-effects are. The functions in a flow
       should all accept the same number of arguments (*args are also okay).

       Internally, a Flow object groups calls together in batches to reduce
       communication costs (see also the description of arguments below).
       In the example above, f1 could thus work on one batch, while f2 works
       on another batch and so on. Flows are thus good to use even if there
       are many calls of relatively fast functions.

       When no more data is to be inserted into a flow, it should be closed
       by calling its close method.

       Data processed by a flow can be fetched by calling get/getall or by
       simply iterating the flow. This can be done both by the process that
       inserted the data into the flow and by another (possibly concurrent)
       process. All data put into a flow should also be fetched again, as it
       will otherwise remain in memory.

       Arguments:
       - *functions: A sequence of functions or sequences of functions.
         Each element in the sequence will be executed in a separate process.
         For example, the argument (f1, (f2, f3), f4) means that
         f1 executes in process-1, f2 and f3 execute in process-2, and f4
         executes in process-3.
         The functions in the sequence should all accept the same number of
         arguments.
       - **options: keyword arguments configuring details. The considered
         options are:
         - batchsize: an integer deciding how many function calls are "grouped
           together" before they are passed on between processes. The default
           is 500.
         - queuesize: an integer deciding the maximum number of batches
           that can wait in a JoinableQueue between two different processes. 
           0 means that there is no limit.
           The default is 25.
       
    """
    # A special case
    if not functions:
        return Flow([multiprocessing.JoinableQueue()],\
                        [multiprocessing.Value('b', 0)], 1)

    # Create functions that invoke a group of functions if needed
    resultfuncs = []
    for item in functions:
        if callable(item):
            resultfuncs.append(item)
        else:
            # Check the arguments
            if not hasattr(item, '__iter__'):
                raise ValueError(
                    'An element is neither iterable nor callable')
            for f in item:
                if not callable(f):
                    raise ValueError(
                        'An element in a sequence is not callable')
            # We can - finally - create the function
            groupfunc = _buildgroupfunction(item)
            resultfuncs.append(groupfunc)

    # resultfuncs are now the functions we need to deal with.
    # Each function in resultfuncs should run in a separate process
    queuesize = ('queuesize' in options and options['queuesize']) or 25
    batchsize = ('batchsize' in options and options['batchsize']) or 500
    if batchsize < 1:
        batchsize = 500
    queues = [multiprocessing.JoinableQueue(queuesize) for f in resultfuncs]
    queues.append(multiprocessing.JoinableQueue(queuesize))  # for the results
    closed = [multiprocessing.Value('b', 0) for q in queues]  # in shared mem
    for i in range(len(resultfuncs)):
        p = multiprocessing.Process(target=_flowprocess, \
                                        args=(resultfuncs[i], \
                                                  queues[i], queues[i+1], \
                                                  closed[i], closed[i+1]))
        p.start()

    # Now create and return the object which allows data to enter the flow
    return Flow(queues, closed, batchsize)
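
A short usage sketch of createflow based on the docstring (the row-processing functions and the input are placeholders for illustration):

def addname(row):
    row['name'] = row['firstname'] + ' ' + row['lastname']

def addlength(row):
    row['namelength'] = len(row['name'])

flow = createflow(addname, addlength, batchsize=100)

for row in inputrows:  # inputrows: some iterable of dicts (assumed)
    flow(row)          # each row is processed by addname, then addlength
flow.close()           # no more data will be inserted

for row in flow:       # fetch the processed rows again
    handle(row)        # placeholder for downstream handling
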