def test_become_dask(self):
    executor = self.client.become_dask()
    reprs = self.client[:].apply_sync(repr, Reference('distributed_worker'))
    for r in reprs:
        self.assertIn("Worker", r)
    squares = executor.map(lambda x: x * x, range(10))
    tot = executor.submit(sum, squares)
    self.assertEqual(tot.result(), 285)
    # cleanup
    self.client.stop_distributed()
    ar = self.client[:].apply_async(lambda x: x, Reference('distributed_worker'))
    self.assertRaisesRemote(NameError, ar.get)
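
# A minimal standalone sketch of the same become_dask() workflow outside the
# test harness, assuming an IPython cluster is already running under the
# default profile; the variable names here are illustrative, not part of the
# test above.
from ipyparallel import Client

rc = Client()
executor = rc.become_dask()            # start dask.distributed workers on the engines
futures = executor.map(lambda x: x * x, range(10))
total = executor.submit(sum, futures)
print(total.result())                  # 285
rc.stop_distributed()                  # tear the dask cluster back down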
def phistogram(view, a, bins=10, rng=None, normed=False):
    """Compute the histogram of a remote array a.

    Parameters
    ----------
    view
        IPython DirectView instance
    a : str
        String name of the remote array
    bins : int
        Number of histogram bins
    rng : (float, float)
        Tuple of min, max of the range to histogram
    normed : boolean
        Should the histogram counts be normalized to 1
    """
    nengines = len(view.targets)

    # view.push(dict(bins=bins, rng=rng))
    with view.sync_imports():
        import numpy
    rets = view.apply_sync(lambda a, b, rng: numpy.histogram(a, b, rng),
                           Reference(a), bins, rng)
    hists = [r[0] for r in rets]
    # all engines use the same bins and range, so take the bin edges from the first result
    lower_edges = rets[0][1]
    hist_array = numpy.array(hists).reshape(nengines, -1)
    total_hist = numpy.sum(hist_array, 0)
    if normed:
        total_hist = total_hist / numpy.sum(total_hist, dtype=float)
    return total_hist, lower_edges
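
# A hedged usage sketch for phistogram, assuming an IPython cluster is already
# running; the client, view, and array names below are illustrative only.
from ipyparallel import Client
import numpy as np

rc = Client()
dview = rc[:]
# scatter one million samples across the engines under the remote name 'data'
dview.scatter('data', np.random.normal(size=1_000_000))
counts, edges = phistogram(dview, 'data', bins=50, rng=(-4.0, 4.0))
print(counts.sum())   # total number of samples histogrammed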
def remote_iterator(view, name):
    """Return an iterator on an object living on a remote engine."""
    view.execute('it%s = iter(%s)' % (name, name), block=True)
    while True:
        try:
            result = view.apply_sync(lambda x: next(x), Reference('it' + name))
        # This causes the StopIteration exception to be raised.
        except RemoteError as e:
            if e.ename == 'StopIteration':
                # end the local generator cleanly (PEP 479: don't re-raise StopIteration)
                return
            else:
                raise e
        else:
            yield result
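
# A minimal usage sketch, assuming a running cluster: push a list to a single
# engine, then iterate over it from the client. The names 'rc', 'e0', and
# 'remote_list' are illustrative.
from ipyparallel import Client

rc = Client()
e0 = rc[0]                        # a DirectView on one engine
e0['remote_list'] = [1, 2, 3]     # create the remote object to iterate over
for item in remote_iterator(e0, 'remote_list'):
    print(item)                   # 1, 2, 3, fetched one element at a time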
def pwordfreq(view, fnames):
    """Parallel word frequency counter.

    view - An IPython DirectView
    fnames - The filenames containing the split data.
    """
    assert len(fnames) == len(view.targets)
    view.scatter('fname', fnames, flatten=True)
    ar = view.apply(wordfreq, Reference('fname'))
    freqs_list = ar.get()
    word_set = set()
    for f in freqs_list:
        word_set.update(f.keys())
    freqs = dict(zip(word_set, repeat(0)))
    for f in freqs_list:
        for word, count in f.items():
            freqs[word] += count
    return freqs
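
# pwordfreq assumes a serial `wordfreq(fname)` helper and `from itertools import
# repeat` at module scope. A minimal sketch of such a helper follows; the driver
# shown after it is illustrative and needs a running cluster with one data file
# per engine.
from itertools import repeat

def wordfreq(fname):
    """Count word frequencies in a single file."""
    freqs = {}
    with open(fname) as f:
        for word in f.read().split():
            freqs[word] = freqs.get(word, 0) + 1
    return freqs

# from ipyparallel import Client, Reference
# rc = Client()
# totals = pwordfreq(rc[:], ['part0.txt', 'part1.txt', 'part2.txt', 'part3.txt'])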
pub_url = root.apply_sync(lambda: com.pub_url)

# gather the connection information into a dict
ar = view.apply_async(lambda: com.info)
peers = ar.get_dict()
# this is a dict, keyed by engine ID, of the connection info for the EngineCommunicators

# connect the engines to each other:
def connect(com, peers, tree, pub_url, root_id):
    """this function will be called on the engines"""
    com.connect(peers, tree, pub_url, root_id)

view.apply_sync(connect, Reference('com'), peers, btree, pub_url, root_id)

# functions that can be used for reductions
# max and min builtins can be used as well
def add(a, b):
    """cumulative sum reduction"""
    return a + b

def mul(a, b):
    """cumulative product reduction"""
    return a * b

view['add'] = add
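
# The reducers above are ordinary two-argument callables that the tree combines
# pairwise, so any associative function (including the max/min builtins) fits
# the same contract. A quick local sketch of that contract, no cluster required:
from functools import reduce

values = [1, 2, 3, 4, 5]
print(reduce(add, values))   # 15  - same answer however the pairs are grouped
print(reduce(mul, values))   # 120
print(reduce(max, values))   # 5   - builtins work as reducers too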
# scatter engine IDs
view.scatter('my_id', range(num_procs), flatten=True)

# create the engine connectors
view.execute('com = EngineCommunicator()')

# gather the connection information into a single dict
ar = view.apply_async(lambda: com.info)
peers = ar.get_dict()
# print peers
# this is a dict, keyed by engine ID, of the connection info for the EngineCommunicators

# setup remote partitioner
# note that Reference means that the argument passed to setup_partitioner will be the
# object named 'com' in the engine's namespace
view.apply_sync(setup_partitioner, Reference('com'), peers, Reference('my_id'),
                num_procs, grid, partition)
time.sleep(1)
# convenience lambda to call solver.solve:
_solve = lambda *args, **kwargs: solver.solve(*args, **kwargs)
if ns.scalar:
    impl['inner'] = 'scalar'
    # setup remote solvers
    view.apply_sync(setup_solver, I, f, c, bc, Lx, Ly,
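
# A minimal sketch of the Reference semantics noted above, assuming a running
# cluster: Reference('x') is resolved to whatever object is bound to the name
# 'x' in each engine's namespace at call time, rather than being shipped from
# the client. The names 'rc', 'v', and 'x' are illustrative.
from ipyparallel import Client, Reference

rc = Client()
v = rc[:]
v.scatter('x', range(len(rc)), flatten=True)              # engine i gets x = i
print(v.apply_sync(lambda x: x * 10, Reference('x')))     # [0, 10, 20, ...]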
def gpu_job_runner(job_fnc, job_args, ipp_profile='ssh_gpu_py2', log_name=None, log_dir='~/logs/default',
                   status_interval=600, allow_engine_overlap=True, devices_assigned=False):
    """ Distribute a set of jobs across an IPyParallel 'GPU cluster'

    Requires that the cluster has already been started with
    `ipcluster start --profile={}`.format(ipp_profile)

    Checks on the jobs every status_interval seconds, logging status.

    Args:
        job_fnc: the function to distribute
            must accept `device` as a kwarg, as this function is wrapped so that device is bound
            within the engine namespace
            returned values are ignored
        job_args: list of args passed to job_fnc - list
        ipp_profile: profile of GPU IPyParallel profile - str
        log_name: (optional) name for log
        log_dir: (optional), default is ~/logs/default which is created if it doesn't exist
        status_interval: (optional) the amount of time, in seconds, to wait before querying the
            AsyncResult object for the status of the jobs
        allow_engine_overlap: (optional) if False, halt when a host has more engines than GPUs
        devices_assigned: (optional) set this to True if devices have already been assigned to the
            engines on this cluster
    """
    from ipyparallel import Client, RemoteError, Reference
    import inspect

    # setup logging
    log_path = os.path.expanduser(log_dir)
    log_name = log_name or 'job_runner'
    logger = setup_logging(log_name, log_path)

    # TODO: this isn't strictly necessary
    try:
        # check that job_fnc accepts a device kwarg
        args = inspect.getargspec(job_fnc)[0]
        assert 'device' in args
    except AssertionError:
        logger.critical("job_fnc does not accept device kwarg. Halting.")

    client = Client(profile=ipp_profile)
    logger.info("Successfully initialized client on %s with %s engines", ipp_profile, len(client))

    if not devices_assigned:
        # assign each engine to a GPU
        engines_per_host = {}
        device_assignments = []
        engine_hosts = client[:].apply(socket.gethostname).get()

        for host in engine_hosts:
            if host in engines_per_host:
                device_assignments.append('/gpu:{}'.format(engines_per_host[host]))
                engines_per_host[host] += 1
            else:
                device_assignments.append('/gpu:0')
                engines_per_host[host] = 1

        logger.info("Engines per host: \n")
        if not allow_engine_overlap:
            try:
                # check that we haven't over-provisioned GPUs
                for host, n_engines in six.iteritems(engines_per_host):
                    logger.info("%s: %s", host, n_engines)
                    assert n_engines <= WS_N_GPUS[host]
            except AssertionError:
                logger.critical("Host has more engines than GPUs. Halting.")

        while True:
            try:
                # NOTE: could also be accomplished with process environment variables
                # broadcast device assignments and job_fnc
                for engine_id, engine_device in enumerate(device_assignments):
                    print("Pushing to engine {}: device: {}".format(engine_id, engine_device))
                    client[engine_id].push({'device': engine_device, 'job_fnc': job_fnc})
                for engine_id, (host, assigned_device) in enumerate(zip(engine_hosts, device_assignments)):
                    remote_device = client[engine_id].pull('device').get()
                    logger.info("Engine %s: host = %s; device = %s, remote device = %s",
                                engine_id, host, assigned_device, remote_device)
                break
            except RemoteError as remote_err:
                logger.warning("Caught remote error: %s. Sleeping for 10s before retry", remote_err)
                time.sleep(10)
    else:
        try:
            device_assignments = client[:].pull('device').get()
        except RemoteError as remote_err:
            logger.warning('Caught remote error when checking device assignments: %s. '
                           'You may want to initialize device assignments', remote_err)

    logger.info("Dispatching jobs: %s", job_args)
    # dispatch jobs
    async_result = client[:].map(job_fnc, job_args, [Reference('device')] * len(job_args))
    start_time = time.time()
    while not async_result.ready():
        time.sleep(status_interval)
        n_finished = async_result.progress
        n_jobs = len(job_args)
        wall_time = time.time() - start_time
        logger.info("%s seconds elapsed. %s of %s jobs finished", wall_time, n_finished, n_jobs)
    logger.info("All jobs finished in %s seconds!", async_result.wall_time)
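
# A hedged usage sketch for gpu_job_runner, assuming a GPU cluster is already
# running (`ipcluster start --profile=ssh_gpu_py2`); the job function and the
# argument list below are illustrative placeholders.
def train_model(config, device=None):
    """Toy job: the runner maps each config alongside the engine-bound device."""
    print("training with config {} on {}".format(config, device))

configs = [{'lr': 0.1}, {'lr': 0.01}, {'lr': 0.001}]
gpu_job_runner(train_model, configs, ipp_profile='ssh_gpu_py2',
               log_name='toy_sweep', status_interval=60)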