import multiprocessing

# Parallel and delayed come from joblib; adjust the import path if your
# music21 bundles joblib under music21.ext.  cpus() and runNonParallel
# are defined elsewhere in this module.
from joblib import Parallel, delayed


def runParallel(iterable, parallelFunction, *,
                updateFunction=None, updateMultiply=3,
                unpackIterable=False, updateSendsIterable=False):
    '''
    runs parallelFunction over iterable in parallel, optionally calling
    updateFunction after each common.cpus * updateMultiply calls.

    Setting updateMultiply too small can make it so that cores wait around
    when they could be working if one CPU has a particularly hard task.
    Setting it too high can make it seem like the job has hung.

    updateFunction should take three arguments: the current position, the
    total to run, and the most recent results.  It does not need to be
    pickleable, and in fact, a bound method might be very useful here
    (a small sketch follows at the end of this module).
    Or updateFunction can be "True" which just prints a generic message.

    If unpackIterable is True then each element in iterable is considered
    a list or tuple of different arguments to parallelFunction.

    If updateSendsIterable is True then the update function will get the
    iterable content, after the output.

    As of Python 3, partial functions are pickleable, so if you need to
    pass the same arguments to parallelFunction each time, make it a
    partial function before passing it to runParallel (a short sketch of
    this pattern appears after this function).

    Note that parallelFunction, iterable's contents, and the results of
    calling parallelFunction must all be pickleable, and that if pickling
    the contents or unpickling the results takes a lot of time, you won't
    get nearly the speedup from this function that you might expect.
    The big culprit here is definitely music21 streams.

    >>> files = ['bach/bwv66.6', 'schoenberg/opus19', 'AcaciaReel']
    >>> def countNotes(fn):
    ...     c = corpus.parse(fn)  # this is the slow call that is good to parallelize
    ...     return len(c.recurse().notes)
    >>> #_DOCS_SHOW outputs = common.runParallel(files, countNotes)
    >>> outputs = common.runNonParallel(files, countNotes)  #_DOCS_HIDE can't pickle doctest funcs.
    >>> outputs
    [165, 50, 131]

    Set updateFunction=True to get an update every 3 * numCpus (-1 if > 2)

    >>> #_DOCS_SHOW outputs = common.runParallel(files, countNotes, updateFunction=True)
    >>> outputs = common.runNonParallel(files, countNotes, updateFunction=True)  #_DOCS_HIDE
    Done 0 tasks of 3
    Done 3 tasks of 3

    With a custom updateFunction that gets each output:

    >>> def yak(position, length, output):
    ...     print("%d:%d %d is a lot of notes!" % (position, length, output))
    >>> #_DOCS_SHOW outputs = common.runParallel(files, countNotes, updateFunction=yak)
    >>> outputs = common.runNonParallel(files, countNotes, updateFunction=yak)  #_DOCS_HIDE
    0:3 165 is a lot of notes!
    1:3 50 is a lot of notes!
    2:3 131 is a lot of notes!

    Or with updateSendsIterable, we can get the original files data as well:

    >>> def yik(position, length, output, fn):
    ...     print("%d:%d (%s) %d is a lot of notes!" % (position, length, fn, output))
    >>> #_DOCS_SHOW outputs = common.runParallel(files, countNotes, updateFunction=yik,
    >>> outputs = common.runNonParallel(files, countNotes, updateFunction=yik,  #_DOCS_HIDE
    ...     updateSendsIterable=True)
    0:3 (bach/bwv66.6) 165 is a lot of notes!
    1:3 (schoenberg/opus19) 50 is a lot of notes!
    2:3 (AcaciaReel) 131 is a lot of notes!

    unpackIterable is useful when you need to send multiple values to your
    function call as separate arguments.  For instance, something like:

    >>> def pitchesAbove(fn, minPitch):  # a two-argument function
    ...     c = corpus.parse(fn)  # again, the slow call goes in the function
    ...     return len([p for p in c.pitches if p.ps > minPitch])

    >>> inputs = [('bach/bwv66.6', 60),
    ...           ('schoenberg/opus19', 72),
    ...           ('AcaciaReel', 66)]
    >>> #_DOCS_SHOW outputs = common.runParallel(inputs, pitchesAbove, unpackIterable=True)
    >>> outputs = common.runNonParallel(inputs, pitchesAbove, unpackIterable=True)  #_DOCS_HIDE
    >>> outputs
    [99, 11, 123]
    '''
    # multiprocessing has trouble with introspection
    # pylint: disable=not-callable
    numCpus = cpus()
    if numCpus == 1 or multiprocessing.current_process().daemon:  # @UndefinedVariable
        return runNonParallel(iterable, parallelFunction,
                              updateFunction=updateFunction,
                              updateMultiply=updateMultiply,
                              unpackIterable=unpackIterable,
                              updateSendsIterable=updateSendsIterable)

    iterLength = len(iterable)
    totalRun = 0
    if updateFunction is None:
        updateMultiply = iterLength  # if there is no need for updates, run at max speed
        # -- do the whole list at once.
    resultsList = []

    def callUpdate(ii):
        if updateFunction is True:
            print("Done {} tasks of {}".format(min([ii, iterLength]), iterLength))
        elif updateFunction not in (False, None):
            for thisPosition in range(ii - (updateMultiply * numCpus), ii):
                if thisPosition < 0:
                    continue
                if thisPosition >= len(resultsList):
                    thisResult = None
                else:
                    thisResult = resultsList[thisPosition]
                if updateSendsIterable is False:
                    updateFunction(thisPosition, iterLength, thisResult)
                else:
                    updateFunction(thisPosition, iterLength, thisResult,
                                   iterable[thisPosition])

    callUpdate(0)
    with Parallel(n_jobs=numCpus) as para:
        delayFunction = delayed(parallelFunction)
        while totalRun < iterLength:
            endPosition = min(totalRun + numCpus * updateMultiply, iterLength)
            rangeGen = range(totalRun, endPosition)
            if unpackIterable:
                _r = para(delayFunction(*iterable[i]) for i in rangeGen)
            else:
                _r = para(delayFunction(iterable[i]) for i in rangeGen)
            totalRun = endPosition
            resultsList.extend(_r)
            callUpdate(totalRun)

    return resultsList
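
# A minimal, hypothetical sketch (not part of music21) of the
# functools.partial pattern recommended in the docstring above: freeze the
# arguments that never change into the worker before calling runParallel,
# so only the varying element is pickled per task.  The names
# scaleAndOffset and worker are illustrative assumptions.
if __name__ == '__main__':
    from functools import partial

    def scaleAndOffset(x, factor, offset):
        # stand-in for an expensive but picklable worker function
        return x * factor + offset

    # factor and offset are the same for every call, so bake them in first;
    # partials of module-level functions pickle cleanly on Python 3.
    worker = partial(scaleAndOffset, factor=10, offset=3)
    print(runParallel(list(range(8)), worker))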
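
# Likewise hypothetical: the docstring notes that updateFunction need not
# be pickleable and that a bound method can be handy.  This sketch keeps
# running state in the parent process while the workers run in children;
# ProgressTracker is an assumed name, not music21 API.
class ProgressTracker:
    def __init__(self):
        self.best = None

    def update(self, position, length, result):
        # runs in the parent process, so mutating self is safe
        if result is not None and (self.best is None or result > self.best):
            self.best = result
        print('%d/%d done; best so far: %r' % (position + 1, length, self.best))

# usage sketch:
#     tracker = ProgressTracker()
#     runParallel(myIterable, myFunction, updateFunction=tracker.update)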