def pool_map(
      processes=None,
      initializer=None,
      initargs=(),
      maxtasksperchild=Auto,
      func=None,
      fixed_func=None,
      iterable=None,
      args=None,
      chunksize=Auto,
      func_wrapper="simple",
      index_args=True,
      log=None,
      call_back_for_serial_run=None):
  """
  Parallelized map() using a subclassed multiprocessing.Pool.  If func is not
  None, this function essentially calls the Pool's own map method; this means
  that both func and iterable/args must be pickle-able.  If fixed_func is not
  None, it will not be pickled but instead saved as an attribute of the Pool,
  which will be preserved after the fork() call.  Additional features include
  optional redirection of output and automatic process number determination.

  Note that because of the reliance on fork(), this function will run in
  serial on Windows, regardless of how many processors are available.

  :param processes: number of processes to spawn; if None or Auto, the
    get_processes() function will be used
  :param func: target function (will be pickled)
  :param fixed_func: "fixed" target function, which will be propagated to
    the child process when forked (instead of being pickled)
  :param iterable: argument list
  :param args: same as iterable (alternate keyword)
  :param chunksize: number of arguments to process at once

  Examples
  --------
  >>> def f (x) :
  ...   return some_long_running_method(x)
  ...
  >>> args = range(1000)
  >>> result = easy_mp.pool_map(
  ...   func=f,
  ...   args=args)
  >>> print len(result)
  1000

  >>> class f_caller (object) :
  ...   def __init__ (self, non_pickleable_object) :
  ...     self._obj = non_pickleable_object
  ...   def __call__ (self, x) :
  ...     return some_long_running_method(x, self._obj)
  ...
  >>> args = range(1000)
  >>> f = f_caller(processed_pdb_file)
  >>> result = easy_mp.pool_map(
  ...   fixed_func=f,
  ...   args=args)
  """
  assert [func, fixed_func].count(None) == 1
  assert [iterable, args].count(None) == 1
  assert ((call_back_for_serial_run is None) or
          hasattr(call_back_for_serial_run, "__call__"))
  # resolve string shortcuts for the func_wrapper argument
  if (isinstance(func_wrapper, str)):
    if (func_wrapper == "simple"):
      func_wrapper = func_wrapper_simple()
    elif (func_wrapper == "buffer_stdout_stderr"):
      func_wrapper = func_wrapper_simple(buffer_stdout_stderr=True)
    elif (func_wrapper == "sub_directories"):
      func_wrapper = func_wrapper_sub_directories()
    elif (func_wrapper.startswith("sub_directories:")):
      func_wrapper = func_wrapper_sub_directories(
        sub_name_format=func_wrapper[16:])
    else:
      raise RuntimeError("Unknown func_wrapper keyword: %s" % func_wrapper)
  if (maxtasksperchild is Auto and _have_maxtasksperchild):
    maxtasksperchild = 1
  if (chunksize is Auto):
    chunksize = 1
  if (func_wrapper is not None):
    wrap = getattr(func_wrapper, "wrap", None)
    if (wrap is None):
      raise RuntimeError("func_wrapper must have a .wrap() method.")
    if (func is not None):
      func = wrap(func)
    else:
      fixed_func = wrap(fixed_func)
  processes = get_processes(processes)
  # XXX since we want to be able to call this function on Windows too, reset
  # processes to 1
  if (os.name == "nt") or (sys.version_info < (2,6)):
    processes = 1
  if (args is not None):
    iterable = args
    if (processes is not None):
      processes = min(processes, len(args))
    if (index_args):
      iterable = enumerate(iterable)
  if (log is not None):
    print >> log, "multiprocessing pool size:", processes
    flush = getattr(log, "flush", None)
    if (flush is not None):
      flush()
    import time
    time_start = time.time()
  result = None
  # XXX this allows the function to be used even when parallelization is
  # not enabled or supported, which should keep calling code simpler.
  if (processes == 1) or (os.name == "nt"):
    result = []
    for args in iterable:
      if (func is not None):
        result.append(func(args))
      else:
        result.append(fixed_func(args))
      if (call_back_for_serial_run is not None):
        call_back_for_serial_run(result[-1])
  else:
    pool = Pool(
      processes=processes,
      initializer=initializer,
      initargs=initargs,
      maxtasksperchild=maxtasksperchild,
      fixed_func=fixed_func)
    if (chunksize is Auto):
      chunksize = None
    try:
      if (func is not None):
        result = pool.map(func=func, iterable=iterable, chunksize=chunksize)
      else:
        result = pool.map_fixed_func(iterable=iterable, chunksize=chunksize)
    finally:
      pool.close()
      pool.join()
  if (log is not None):
    from libtbx.utils import show_wall_clock_time
    show_wall_clock_time(seconds=time.time()-time_start, out=log)
  return result
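# --- usage sketch (illustrative only; the _demo_* names below are
# hypothetical and not part of the library).  It shows the default
# index_args=True convention of pool_map: each element of args is wrapped by
# enumerate(), so the target receives (index, value) tuples.  When func= is
# used (rather than fixed_func=), the target must be picklable, i.e. defined
# at module level.

def _demo_square(indexed_arg):
  i, x = indexed_arg # index supplied because index_args=True by default
  return x * x

def _demo_pool_map(out=None):
  import sys
  from libtbx import easy_mp
  if (out is None): out = sys.stdout
  results = easy_mp.pool_map(
    func=_demo_square,
    args=range(10),
    processes=2,
    log=out)
  assert results == [x*x for x in range(10)]
  return results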
def build_image_cluster(work_params, reindexing_assistant, image_mdls, usables):
  n_imgs = len(usables)
  # start with one single-image cluster per usable image
  clusters = []
  for i_img,miis_perms in enumerate(usables):
    clusters.append(cluster_info(
      i_perm_and_scale_by_i_img={i_img: i_perm_and_scale(0, 1)},
      miis_perms=[_ for _,__ in miis_perms],
      esti_perms=[_ for __,_ in miis_perms]))
  remaining = range(n_imgs)
  cluster_pairs = [{} for _ in xrange(n_imgs)]
  def process_cp(i_rem, j_rem):
    i_clu = remaining[i_rem]
    j_clu = remaining[j_rem]
    cp = clusters[i_clu].build_cluster_pair_info(
      other=clusters[j_clu],
      work_params=work_params,
      reindexing_assistant=reindexing_assistant)
    if (cp is not None):
      cluster_pairs[i_clu][j_clu] = cp
  # greedily merge the best-scoring pair of clusters until one cluster is left
  while (len(remaining) != 1):
    if (len(remaining) == n_imgs):
      # first pass: build pair info for all image pairs, in parallel chunks
      # if multiprocessing is enabled and the problem is large enough
      chunk_size = 3000 # ad-hoc
      if (not work_params.multiprocessing or n_imgs*(n_imgs-1) <= chunk_size):
        import time
        time_start = time.time()
        for i_rem in xrange(n_imgs):
          for j_rem in xrange(i_rem+1, n_imgs):
            process_cp(i_rem, j_rem)
        from libtbx.utils import show_wall_clock_time
        show_wall_clock_time(seconds=time.time()-time_start)
      else:
        def mp():
          ij_list = []
          for i_rem in xrange(n_imgs):
            for j_rem in xrange(i_rem+1, n_imgs):
              ij_list.append((i_rem,j_rem))
          # ceiling division so that a trailing partial chunk is not dropped
          n_chunks = (len(ij_list) + chunk_size - 1) // chunk_size
          print "Number of chunks for computing cluster pairs:", n_chunks
          print
          def process_chunk(i_chunk):
            for j_chunk in xrange(chunk_size):
              i = i_chunk * chunk_size + j_chunk
              if (i == len(ij_list)):
                break
              i_rem, j_rem = ij_list[i]
              process_cp(i_rem, j_rem)
            return cluster_pairs
          # process_chunk is a closure and cannot be pickled; passing it as
          # fixed_func lets it survive the fork() instead
          from libtbx import easy_mp
          mp_results = easy_mp.pool_map(
            fixed_func=process_chunk,
            args=range(n_chunks),
            chunksize=1,
            log=sys.stdout)
          for cps in mp_results:
            for main,sub in zip(cluster_pairs,cps):
              main.update(sub)
        mp()
    else:
      # update only the pair info affected by the merge performed in the
      # previous iteration (the max_* variables carry over from it)
      for i_rem in xrange(max_j_rem):
        i_clu = remaining[i_rem]
        cps_i = cluster_pairs[i_clu]
        if (max_j_clu in cps_i):
          del cps_i[max_j_clu]
        if (i_rem < max_i_rem):
          if (max_i_clu in cps_i):
            del cps_i[max_i_clu]
          process_cp(i_rem, max_i_rem)
      for j_rem in xrange(max_i_rem+1, len(remaining)):
        process_cp(max_i_rem, j_rem)
    # find the best-scoring remaining pair of clusters
    max_score = 0
    max_i_rem = None
    max_j_clu = None
    for i_rem,i_clu in enumerate(remaining):
      cps_i = cluster_pairs[i_clu]
      for j_clu,cp in cps_i.items():
        if (max_score < cp.score):
          max_score = cp.score
          max_i_rem = i_rem
          max_j_clu = j_clu
    if (max_i_rem is None):
      raise RuntimeError("Insufficient connectivity between images.")
    max_i_clu = remaining[max_i_rem]
    max_j_rem = remaining.index(max_j_clu)
    print "max_score:", max_score, (max_i_rem, max_j_rem)
    # merge cluster max_j_clu into cluster max_i_clu and retire the former
    cp = cluster_pairs[max_i_clu][max_j_clu]
    clusters[max_i_clu].merge(
      other=clusters[max_j_clu],
      pair_info=cp,
      reindexing_assistant=reindexing_assistant,
      image_mdls=image_mdls)
    cluster_pairs[max_j_clu] = None
    clusters[max_j_clu] = None
    del remaining[max_j_rem]
  return clusters[remaining[0]]
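# --- sketch (hypothetical helper, not used above) of the pair-chunking
# scheme employed in mp(): the flat list of (i, j) index pairs is cut into
# fixed-size slices so that each slice becomes one pool_map task; ceiling
# division keeps a trailing partial chunk instead of dropping it.

def _demo_chunk_pairs(n_imgs, chunk_size):
  ij_list = []
  for i in xrange(n_imgs):
    for j in xrange(i+1, n_imgs):
      ij_list.append((i, j))
  n_chunks = (len(ij_list) + chunk_size - 1) // chunk_size
  chunks = [ij_list[k*chunk_size:(k+1)*chunk_size] for k in xrange(n_chunks)]
  # every pair appears in exactly one chunk
  assert sum([len(c) for c in chunks]) == len(ij_list)
  return chunks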