def __init__(self, obj, returnvalues=True, consumes=(),
             directupdatepositions=(), batchsize=500, queuesize=200,
             autowrap=True):
    self.__instancenumber = len(Decoupled.__instances)
    self.__futurecnt = 0
    Decoupled.__instances.append(self)
    self._obj = obj
    if hasattr(obj, '_decoupling') and callable(obj._decoupling):
        obj._decoupling()
    self.batchsize = batchsize
    self.__batch = []
    self.__results = {}
    self.autowrap = autowrap
    self.__toworker = multiprocessing.JoinableQueue(queuesize)
    if returnvalues:
        self.__fromworker = multiprocessing.JoinableQueue(queuesize)
    else:
        self.__fromworker = None
    self.__otherqueues = dict([(dcpld.__instancenumber, dcpld.__fromworker)
                               for dcpld in consumes])
    self.__otherresults = {}  # Will store dicts - see also __decoupledworker
    self.__directupdates = directupdatepositions
    self.__worker = multiprocessing.Process(target=self.__decoupledworker)
    self.__worker.daemon = True
    self.__worker.name = 'Process for %s object for %s' % \
        (self.__class__.__name__, getattr(obj, 'name', 'an unnamed object'))
    self.__worker.start()
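
# --- Usage sketch (illustrative, not part of the module) ---------------------
# The constructor above is normally reached through subclasses such as
# DecoupledDimension and DecoupledFactTable in pygrametl.tables. The sketch
# below shows the intended wiring under the assumption that those subclasses
# forward the keyword arguments seen above (returnvalues, consumes,
# batchsize); the table and attribute names are made up, and a default
# ConnectionWrapper is assumed to exist.
def _example_decoupled_usage():
    from pygrametl.tables import (Dimension, FactTable,
                                  DecoupledDimension, DecoupledFactTable)
    # A decoupled dimension runs its wrapped Dimension in a worker process
    # and returns FutureResults instead of blocking on lookups/ensures.
    proddim = DecoupledDimension(
        Dimension(name='product', key='productid', attributes=['name']),
        returnvalues=True, batchsize=500)
    # A decoupled fact table that 'consumes' proddim: its worker resolves
    # FutureResults by reading proddim's result queue (cf. __otherqueues).
    facttbl = DecoupledFactTable(
        FactTable(name='sales', keyrefs=['productid', 'dateid'],
                  measures=['amount']),
        returnvalues=False, consumes=[proddim])
    return proddim, facttbl
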
def decorator(func):
    global _splitpointqueues
    if instances < 1:
        # A special case where there is no process so
        # we just call func directly
        def sillywrapper(*args, **kw):
            res = func(*args, **kw)
            if output is not None:
                output.put(res)
        return sillywrapper
    # Else set up processes
    input = multiprocessing.JoinableQueue(queuesize)
    for n in range(instances):
        p = multiprocessing.Process(target=_splitprocess,
                                    args=(func, input, output))
        p.name = 'Process-%d for %s' % (n, func.__name__)
        p.daemon = True
        p.start()
    _splitpointqueues.append(input)

    def wrapper(*args, **kw):
        input.put((args, kw))
    return wrapper
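
# --- Usage sketch (illustrative, not part of the module) ---------------------
# The inner decorator above backs the public splitpoint decorator. Under the
# assumption that splitpoint and endsplits() are this module's public
# wrappers, a decorated function is called asynchronously in 'instances'
# worker processes, and endsplits() should be called before the program ends
# so the queued calls are drained. The function and data names are made up.
def _example_splitpoint_usage(source):
    @splitpoint(instances=2, queuesize=100)
    def transformrow(row):
        row['name'] = row['name'].strip().title()

    for row in source:
        transformrow(row)  # returns immediately; a worker does the work
    endsplits()            # block until all splitpoint queues are drained
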
def getsharedsequencefactory(startvalue, intervallen=5000):
    """Create a factory for parallel readers of a sequence.

    Returns a callable f. When f() is called, it returns a callable g.
    Whenever g(*args) is called, it returns a unique int from a sequence
    (if several g's are created, the order of the calls may mean that the
    returned ints are not ordered, but they will be unique). The arguments
    to g are ignored, but accepted. Thus g can be used as idfinder for
    [Decoupled]Dimensions.

    The different g's can be used safely from different processes and
    threads.

    Arguments:

    - startvalue: The first value to return. If None, 0 is assumed.
    - intervallen: The number of values a single g can return before it
      must synchronize to get a new interval. Default: 5000.
    """
    if startvalue is None:
        startvalue = 0
    # We use a Queue to ensure that intervals are only given to one deliverer
    values = multiprocessing.Queue(10)

    # A worker that fills the queue
    def valuegenerator(nextval):
        sys.excepthook = _getexcepthook()
        while True:
            values.put((nextval, nextval + intervallen))
            nextval += intervallen

    p = multiprocessing.Process(target=valuegenerator, args=(startvalue,))
    p.daemon = True
    p.start()

    # A generator that repeatedly gets an interval from the queue and returns
    # all numbers in that interval before it gets a new interval and goes on
    def valuedeliverer():
        while True:
            interval = values.get()
            for i in range(*interval):
                yield i

    # A factory method for the object the end-consumer calls
    def factory():
        generator = valuedeliverer()  # get a unique generator

        # The method called (i.e., the g) by the end-consumer
        def getnextseqval(*ignored):
            return next(generator)
        return getnextseqval

    return factory
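
# --- Usage sketch (illustrative, not part of the module) ---------------------
# How the factory is meant to be used, per the docstring above: call it once
# per process/thread to get a private g, and hand g to a Dimension as
# idfinder. The dimension definition is a made-up example and assumes a
# default ConnectionWrapper exists.
def _example_sharedsequence_usage():
    from pygrametl.tables import Dimension  # assumed import for illustration
    idfactory = getsharedsequencefactory(1000)

    datedim = Dimension(name='date', key='dateid',
                        attributes=['day', 'month', 'year'],
                        idfinder=idfactory())  # a fresh g for this process

    nextid = idfactory()       # another independent g
    a, b = nextid(), nextid()  # unique values; not globally ordered across
                               # different g's
    return datedim, a, b
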
def _getexitfunction():
    """Return a function that halts the execution of pygrametl.

    pygrametl uses the function as excepthook in spawned processes such that
    an uncaught exception halts the entire execution.
    """
    # On Java, System.exit will do as there are no separate processes
    if sys.platform.startswith('java'):
        def javaexitfunction():
            import java.lang.System
            java.lang.System.exit(1)
        return javaexitfunction

    # else see if the os module provides functions to kill process groups;
    # this should be the case on UNIX.
    import signal
    if hasattr(os, 'getpgrp') and hasattr(os, 'killpg'):
        def unixexitfunction():
            procgrp = os.getpgrp()
            os.killpg(procgrp, signal.SIGTERM)
        return unixexitfunction

    # else, we are on a platform that does not allow us to kill a group.
    # We make a special process that gets the pids of all calls to
    # this procedure. The function we return informs this process to kill
    # all processes it knows about.

    # set up the terminator
    global _toterminator
    if _toterminator is None:
        _toterminator = multiprocessing.Queue()

        def terminatorfunction():
            pids = set([_masterpid])
            while True:
                item = _toterminator.get()
                if isinstance(item, int):
                    pids.add(item)
                else:
                    # We take it as a signal to kill all
                    for p in pids:
                        # we don't know which signals exist; use 9
                        os.kill(p, 9)
                    return

        terminatorprocess = multiprocessing.Process(target=terminatorfunction)
        terminatorprocess.daemon = True
        terminatorprocess.start()

    # tell the terminator about this process
    _toterminator.put(os.getpid())

    # return a function that tells the terminator to kill all known processes
    def exitfunction():
        _toterminator.put('TERMINATE')
    return exitfunction
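
# --- Sketch (illustrative, not part of the module) ---------------------------
# How the returned exit function is intended to be used: wrapped in an
# excepthook so that an uncaught exception in a worker process prints the
# traceback and then halts every pygrametl process. This is only a sketch of
# such a hook; the module's own _getexcepthook may differ.
def _example_excepthook():
    import traceback
    exitfunction = _getexitfunction()

    def hook(exctype, excvalue, exctraceback):
        traceback.print_exception(exctype, excvalue, exctraceback)
        exitfunction()  # terminate the whole set of pygrametl processes
    return hook
# A worker would then do: sys.excepthook = _example_excepthook()
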
def shareconnectionwrapper(targetconnection, maxclients=10, userfuncs=()):
    """Share a ConnectionWrapper between several processes/threads.

    When Decoupled objects are used, they can try to update the DW at the
    same time. They can use several ConnectionWrappers to avoid race
    conditions, but this is not transactionally safe. Instead, they can use
    a "shared" ConnectionWrapper obtained through this function.

    When a ConnectionWrapper is shared, it is executing in a separate
    process (or thread, in case Jython is used) and ensuring that only one
    operation takes place at a time. This is hidden from the users of the
    shared ConnectionWrapper. They see an interface similar to the normal
    ConnectionWrapper.

    When this method is called, it returns a SharedConnectionWrapperClient
    which can be used as a normal ConnectionWrapper. Each process (i.e.,
    each Decoupled object) should, however, get a unique
    SharedConnectionWrapperClient by calling copy() on the returned
    SharedConnectionWrapperClient.

    Note that a shared ConnectionWrapper needs to hold the complete result
    of each query in memory until it is fetched by the process that
    executed the query. Again, this is hidden from the users.

    It is also possible to add methods to a shared ConnectionWrapper when
    it is created. When this is done and the method is invoked, no other
    operation will modify the DW at the same time. If, for example, the
    functions foo and bar are added to a shared ConnectionWrapper (by
    passing the argument userfuncs=(foo, bar) to shareconnectionwrapper),
    the returned SharedConnectionWrapperClient will offer the methods foo
    and bar which, when called, will run in the separate process for the
    shared ConnectionWrapper. This is particularly useful for user-defined
    bulk loaders as used by BulkFactTable:

    def bulkload():
        # DBMS-specific code here.
        # No other DW operation should take place concurrently

    scw = shareconnectionwrapper(ConnectionWrapper(...),
                                 userfuncs=(bulkload,))

    facttbl = BulkFactTable(..., bulkloader=scw.copy().bulkload)
    # Note the .copy().

    Arguments:

    - targetconnection: a pygrametl ConnectionWrapper
    - maxclients: the maximum number of concurrent clients. Default: 10
    - userfuncs: a sequence of functions to add to the shared
      ConnectionWrapper. Default: ()
    """
    toserver = multiprocessing.JoinableQueue(5000)
    toclients = [multiprocessing.Queue() for i in range(maxclients)]
    freelines = multiprocessing.Queue()
    for i in range(maxclients):
        freelines.put(i)
    serverCW = SharedConnectionWrapperServer(targetconnection, toserver,
                                             toclients)
    userfuncnames = []
    for func in userfuncs:
        if not (callable(func) and hasattr(func, '__name__') and
                not func.__name__ == '<lambda>'):
            raise ValueError(
                'Elements in userfuncs must be callable and named')
        if hasattr(SharedConnectionWrapperClient, func.__name__):
            raise ValueError('Illegal function name: ' + func.__name__)
        setattr(serverCW, '_userfunc_' + func.__name__, func)
        userfuncnames.append(func.__name__)
    serverprocess = multiprocessing.Process(target=serverCW.worker)
    serverprocess.name = 'Process for shared connection wrapper'
    serverprocess.daemon = True
    serverprocess.start()
    module = targetconnection.getunderlyingmodule()
    clientCW = SharedConnectionWrapperClient(toserver, toclients, freelines,
                                             module, userfuncnames)
    return clientCW
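
# --- Usage sketch (illustrative, not part of the module) ---------------------
# Beyond the bulkloader example in the docstring, the typical pattern is to
# create the shared wrapper once and hand each Decoupled object its own
# copy(). The PEP 249 connection and the table definition are made up.
def _example_sharedconnection_usage(dbapiconnection):
    import pygrametl
    from pygrametl.tables import Dimension, DecoupledDimension

    scw = shareconnectionwrapper(pygrametl.ConnectionWrapper(dbapiconnection))

    # Each decoupled object (i.e., each extra process) gets its own client
    # copy so its requests travel on a private line to the server process.
    proddim = DecoupledDimension(
        Dimension(name='product', key='productid', attributes=['name'],
                  targetconnection=scw.copy()))
    return scw, proddim
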
def createflow(*functions, **options):
    """Create a flow of functions running in different processes.

    A Flow object ready for use is returned.

    A flow consists of several functions running in several processes.
    A flow created by

        flow = createflow(f1, f2, f3)

    uses three processes. Data can be inserted into the flow by calling it
    as in flow(data). The argument data is then first processed by
    f1(data), then f2(data), and finally f3(data). Return values from f1,
    f2, and f3 are *not* preserved, but their side-effects are. The
    functions in a flow should all accept the same number of arguments
    (*args are also okay).

    Internally, a Flow object groups calls together in batches to reduce
    communication costs (see also the description of the arguments below).
    In the example above, f1 could thus work on one batch while f2 works on
    another batch, and so on. Flows are thus worthwhile even when there are
    many calls of relatively fast functions.

    When no more data is to be inserted into a flow, it should be closed
    by calling its close method.

    Data processed by a flow can be fetched by calling get/getall or simply
    by iterating the flow. This can be done either by the process that
    inserted data into the flow or by another (possibly concurrent)
    process. All data inserted into a flow should also be fetched again, as
    it will otherwise remain in memory.

    Arguments:

    - *functions: A sequence of functions or sequences of functions.
      Each element in the sequence will be executed in a separate process.
      For example, the argument (f1, (f2, f3), f4) means that f1 executes
      in process-1, f2 and f3 execute in process-2, and f4 executes in
      process-3. The functions in the sequence should all accept the same
      number of arguments.
    - **options: keyword arguments configuring details. The considered
      options are:

      - batchsize: an integer deciding how many function calls are
        "grouped together" before they are passed on between processes.
        The default is 500.
      - queuesize: an integer deciding the maximum number of batches that
        can wait in a JoinableQueue between two different processes. 0
        means that there is no limit. The default is 25.
    """
    # A special case
    if not functions:
        return Flow([multiprocessing.JoinableQueue()],
                    [multiprocessing.Value('b', 0)], 1)

    # Create functions that invoke a group of functions if needed
    resultfuncs = []
    for item in functions:
        if callable(item):
            resultfuncs.append(item)
        else:
            # Check the arguments
            if not hasattr(item, '__iter__'):
                raise ValueError(
                    'An element is neither iterable nor callable')
            for f in item:
                if not callable(f):
                    raise ValueError(
                        'An element in a sequence is not callable')
            # We can - finally - create the function
            groupfunc = _buildgroupfunction(item)
            resultfuncs.append(groupfunc)

    # resultfuncs are now the functions we need to deal with.
    # Each function in resultfuncs should run in a separate process.
    # Find the queuesize and batchsize (defaults as stated in the docstring)
    queuesize = ('queuesize' in options and options['queuesize']) or 25
    batchsize = ('batchsize' in options and options['batchsize']) or 500
    if batchsize < 1:
        batchsize = 500
    queues = [multiprocessing.JoinableQueue(queuesize) for f in resultfuncs]
    queues.append(multiprocessing.JoinableQueue(queuesize))  # for the results
    closed = [multiprocessing.Value('b', 0) for q in queues]  # in shared mem
    for i in range(len(resultfuncs)):
        p = multiprocessing.Process(target=_flowprocess,
                                    args=(resultfuncs[i],
                                          queues[i], queues[i + 1],
                                          closed[i], closed[i + 1]))
        p.start()

    # Now create and return the object which allows data to enter the flow
    return Flow(queues, closed, batchsize)
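
# --- Usage sketch (illustrative, not part of the module) ---------------------
# How a flow is intended to be used, per the docstring above: insert data by
# calling the flow, close it, and fetch the processed rows by iterating it.
# The transformation functions below are made-up examples and are defined at
# module level so they can be handed to the worker processes.
def _example_cleanname(row):
    row['name'] = row['name'].strip()

def _example_addvat(row):
    row['price'] = row['price'] * 1.25

def _example_flow_usage(rows):
    flow = createflow(_example_cleanname, _example_addvat, batchsize=100)
    for row in rows:
        flow(row)      # processed by _example_cleanname, then _example_addvat
    flow.close()       # signal that no more data will be inserted
    return list(flow)  # iterate to fetch the processed rows back
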