Example 1: libtbx.easy_mp.pool_map
def pool_map(
      processes=None,
      initializer=None,
      initargs=(),
      maxtasksperchild=Auto,
      func=None,
      fixed_func=None,
      iterable=None,
      args=None,
      chunksize=Auto,
      func_wrapper="simple",
      index_args=True,
      log=None,
      call_back_for_serial_run=None):
  """
  Parallelized map() using subclassed multiprocessing.Pool.  If func is not
  None, this function essentially calls the Pool's own map method; this means
  that both func and iterable/args must be pickle-able.  If fixed_func is not
  None, it will not be pickled but instead saved as an attribute of the Pool,
  which will be preserved after the fork() call.  Additional features include
  optional redirection of output and automatic process number determination.

  Note that because of the reliance on fork(), this function will run in serial
  on Windows, regardless of how many processors are available.

  :param processes: number of processes to spawn; if None or Auto, the
    get_processes() function will be used.
  :param func: target function (will be pickled)
  :param fixed_func: "fixed" target function, which will be propagated to
    the child process when forked (instead of pickling)
  :param iterable: argument list
  :param args: same as iterable (alternate keyword)
  :param chunksize: number of arguments to process at once

  Examples
  --------
  >>> def f (x) :
  ...   return some_long_running_method(x)
  ...
  >>> args = range(1000)
  >>> result = easy_mp.pool_map(
  ...   func=f,
  ...   args=args)
  >>> print len(result)
  1000

  >>> class f_caller (object) :
  ...   def __init__ (self, non_pickleable_object) :
  ...     self._obj = non_pickleable_object
  ...   def __call__ (self, x) :
  ...     return some_long_running_method(x, self._obj)
  ...
  >>> args = range(1000)
  >>> f = f_caller(processed_pdb_file)
  >>> result = easy_mp.pool_map(
  ...   fixed_func=f,
  ...   args=args)
  """
  assert [func, fixed_func].count(None) == 1
  assert [iterable, args].count(None) == 1
  assert ((call_back_for_serial_run is None) or
          hasattr(call_back_for_serial_run, "__call__"))
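  # Resolve string keywords to wrapper objects.  For the non-"simple"
  # wrappers, maxtasksperchild and chunksize default to 1 (one task per
  # child process, dispatched one at a time).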
  if (isinstance(func_wrapper, str)):
    if (func_wrapper == "simple"):
      func_wrapper = func_wrapper_simple()
    else:
      if (func_wrapper == "buffer_stdout_stderr"):
        func_wrapper = func_wrapper_simple(buffer_stdout_stderr=True)
      elif (func_wrapper == "sub_directories"):
        func_wrapper = func_wrapper_sub_directories()
      elif (func_wrapper.startswith("sub_directories:")):
        func_wrapper = func_wrapper_sub_directories(
          sub_name_format=func_wrapper[16:])
      else:
        raise RuntimeError("Unknown func_wrapper keyword: %s" % func_wrapper)
      if (maxtasksperchild is Auto and _have_maxtasksperchild):
        maxtasksperchild = 1
      if (chunksize is Auto):
        chunksize = 1
  if (func_wrapper is not None):
    wrap = getattr(func_wrapper, "wrap", None)
    if (wrap is None):
      raise RuntimeError("func_wrapper must have a .wrap() method.")
    if (func is not None):
      func = wrap(func)
    else:
      fixed_func = wrap(fixed_func)
  processes = get_processes(processes)
  # XXX since we want to be able to call this function on Windows too, reset
  # processes to 1
  if (os.name == "nt") or (sys.version_info < (2,6)) :
    processes = 1
  if (args is not None):
    iterable = args
    if (processes is not None):
      processes = min(processes, len(args))
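  # index_args wraps the arguments in enumerate(), so the (wrapped) target
  # function receives (index, argument) pairs.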
  if (index_args):
    iterable = enumerate(iterable)
  if (log is not None):
    print >> log, "multiprocessing pool size:", processes
    flush = getattr(log, "flush", None)
    if (flush is not None):
      flush()
    import time
    time_start = time.time()
  result = None
  # XXX this allows the function to be used even when parallelization is
  # not enabled or supported, which should keep calling code simpler.
  if (processes == 1) or (os.name == "nt") :
    result = []
    for args in iterable :
      if (func is not None) :
        result.append(func(args))
      else :
        result.append(fixed_func(args))
      if (call_back_for_serial_run is not None) :
        call_back_for_serial_run(result[-1])
  else :
    pool = Pool(
      processes=processes,
      initializer=initializer,
      initargs=initargs,
      maxtasksperchild=maxtasksperchild,
      fixed_func=fixed_func)
    if (chunksize is Auto):
      chunksize = None
    try:
      if (func is not None):
        result = pool.map(func=func, iterable=iterable, chunksize=chunksize)
      else:
        result = pool.map_fixed_func(iterable=iterable, chunksize=chunksize)
    finally:
      pool.close()
      pool.join()
  if (log is not None):
    from libtbx.utils import show_wall_clock_time
    show_wall_clock_time(seconds=time.time()-time_start, out=log)
  return result
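The fixed_func mechanism described in the docstring relies on fork() semantics: anything created in the parent before the pool forks is inherited by the child processes as-is, with no pickling. Below is a minimal stand-alone sketch of that idea using only the standard multiprocessing module; the names _fixed_func, _call_fixed and run_fixed are hypothetical and not part of libtbx, and the sketch assumes a Unix fork start method (on Windows it would fail, which is exactly why pool_map falls back to serial there).

import multiprocessing

_fixed_func = None  # set in the parent before the pool forks; children inherit it

def _call_fixed(arg):
  # Runs in a child process; uses the inherited, never-pickled callable.
  return _fixed_func(arg)

def run_fixed(func, args, processes=2):
  global _fixed_func
  _fixed_func = func  # may be unpicklable; fork() preserves it anyway
  pool = multiprocessing.Pool(processes=processes)
  try:
    return pool.map(_call_fixed, args)
  finally:
    pool.close()
    pool.join()

if __name__ == "__main__":
  # A lambda is not picklable, but it works here because it is
  # inherited across the fork rather than sent to the workers.
  print(run_fixed(lambda x: x * x, range(5)))  # -> [0, 1, 4, 9, 16]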
Example 2: build_image_cluster (a caller of easy_mp.pool_map)
def build_image_cluster(work_params, reindexing_assistant, image_mdls, usables):
  n_imgs = len(usables)
  clusters = []
  for i_img,miis_perms in enumerate(usables):
    clusters.append(cluster_info(
      i_perm_and_scale_by_i_img={i_img: i_perm_and_scale(0, 1)},
      miis_perms=[_ for _,__ in miis_perms],
      esti_perms=[_ for __,_ in miis_perms]))
  remaining = range(n_imgs)
  cluster_pairs = [{} for _ in xrange(n_imgs)]
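  # cluster_pairs[i_clu] maps j_clu -> pair info for clusters (i_clu, j_clu);
  # stale entries are deleted as clusters merge.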
  def process_cp(i_rem, j_rem):
    i_clu = remaining[i_rem]
    j_clu = remaining[j_rem]
    cp = clusters[i_clu].build_cluster_pair_info(
      other=clusters[j_clu],
      work_params=work_params,
      reindexing_assistant=reindexing_assistant)
    if (cp is not None):
      cluster_pairs[i_clu][j_clu] = cp
  while (len(remaining) != 1):
    if (len(remaining) == n_imgs):
      chunk_size = 3000 # ad-hoc
      if (not work_params.multiprocessing or n_imgs*(n_imgs-1) <= chunk_size):
        import time
        time_start = time.time()
        for i_rem in xrange(n_imgs):
          for j_rem in xrange(i_rem+1, n_imgs):
            process_cp(i_rem, j_rem)
        from libtbx.utils import show_wall_clock_time
        show_wall_clock_time(seconds=time.time()-time_start)
      else:
        def mp():
          ij_list = []
          for i_rem in xrange(n_imgs):
            for j_rem in xrange(i_rem+1, n_imgs):
              ij_list.append((i_rem,j_rem))
          # Round up so the final, possibly partial, chunk is not dropped
          # (process_chunk's break guard handles the short last chunk).
          n_chunks = (len(ij_list) + chunk_size - 1) // chunk_size
          print "Number of chunks for computing cluster pairs:", n_chunks
          print
          def process_chunk(i_chunk):
            for j_chunk in xrange(chunk_size):
              i = i_chunk * chunk_size + j_chunk
              if (i == len(ij_list)):
                break
              i_rem, j_rem = ij_list[i]
              process_cp(i_rem, j_rem)
            return cluster_pairs
          from libtbx import easy_mp
          mp_results = easy_mp.pool_map(
            fixed_func=process_chunk,
            args=range(n_chunks),
            chunksize=1,
            log=sys.stdout)
          for cps in mp_results:
            for main,sub in zip(cluster_pairs,cps):
              main.update(sub)
        mp()
    else:
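      # Later iterations: the max_* variables survive from the previous pass
      # and identify the clusters just merged.  Drop stale pair infos that
      # mention them and recompute only the pairs involving the merged cluster.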
      for i_rem in xrange(max_j_rem):
        i_clu = remaining[i_rem]
        cps_i = cluster_pairs[i_clu]
        if (max_j_clu in cps_i):
          del cps_i[max_j_clu]
        if (i_rem < max_i_rem):
          if (max_i_clu in cps_i):
            del cps_i[max_i_clu]
          process_cp(i_rem, max_i_rem)
      for j_rem in xrange(max_i_rem+1, len(remaining)):
        process_cp(max_i_rem, j_rem)
    max_score = 0
    max_i_rem = None
    max_j_clu = None
    for i_rem,i_clu in enumerate(remaining):
      cps_i = cluster_pairs[i_clu]
      for j_clu,cp in cps_i.items():
        if (max_score < cp.score):
          max_score = cp.score
          max_i_rem = i_rem
          max_j_clu = j_clu
    if (max_i_rem is None):
      raise RuntimeError("Insufficient connectivity between images.")
    max_i_clu = remaining[max_i_rem]
    max_j_rem = remaining.index(max_j_clu)
    print "max_score:", max_score, (max_i_rem, max_j_rem)
    cp = cluster_pairs[max_i_clu][max_j_clu]
    clusters[max_i_clu].merge(
      other=clusters[max_j_clu],
      pair_info=cp,
      reindexing_assistant=reindexing_assistant,
      image_mdls=image_mdls)
    cluster_pairs[max_j_clu] = None
    clusters[max_j_clu] = None
    del remaining[max_j_rem]
  return clusters[remaining[0]]
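The while loop above is a greedy agglomerative clustering: score every pair of clusters, merge the highest-scoring pair, invalidate the cached pair infos that mention the merged clusters, and rescan. A condensed sketch of the same control flow, with a hypothetical pair_score() standing in for build_cluster_pair_info() and the pair-info caching omitted for brevity:

def greedy_cluster(n_items, pair_score):
  # One singleton cluster per item, exactly as in the loop above.
  clusters = [[i] for i in range(n_items)]
  remaining = list(range(n_items))
  while len(remaining) > 1:
    # Scan all surviving pairs for the highest score (the real code caches
    # these and rescans only pairs touched by the previous merge).
    best = None
    for a in range(len(remaining)):
      for b in range(a + 1, len(remaining)):
        s = pair_score(clusters[remaining[a]], clusters[remaining[b]])
        if s is not None and (best is None or s > best[0]):
          best = (s, a, b)
    if best is None:
      raise RuntimeError("Insufficient connectivity between items.")
    _, a, b = best
    clusters[remaining[a]].extend(clusters[remaining[b]])
    clusters[remaining[b]] = None  # mirrors clusters[max_j_clu] = None
    del remaining[b]
  return clusters[remaining[0]]

# e.g. always preferring the two smallest clusters:
# greedy_cluster(4, lambda u, v: -(len(u) + len(v)))  -> [0, 1, 2, 3]

The real code avoids a full O(n^2) rescan per merge by caching pair infos in cluster_pairs and recomputing only the entries that involve the cluster produced by the previous merge.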