def add_engines(n=1, profile='default', total=False):
    """Start engines for ``profile`` and wait until they have registered.

    Parameters
    ----------
    n : int
        Number of engines to add. When ``total`` is true this is the desired
        total instead, and only the missing engines (if any) are started.
    profile : str
        Name of the IPython profile the engines are started with.
    total : bool
        Count engines that are already connected against ``n``.

    Returns
    -------
    list
        The launchers started by this call.

    Raises
    ------
    RuntimeError
        If one of the new engine processes exits before all engines have
        connected, or if they have not all connected after 15 seconds.
    """
    rc = parallel.Client(profile=profile)
    # The client is only used to watch registrations; close it on every exit
    # path so a failed start does not leave the connection open.
    try:
        base = len(rc)
        if total:
            n = max(n - base, 0)

        eps = []
        for _ in range(n):
            ep = TestProcessLauncher()
            ep.cmd_and_args = ipengine_cmd_argv + [
                '--profile=%s' % profile,
                '--log-level=50',
                '--InteractiveShell.colors=nocolor',
            ]
            ep.start()
            # Also recorded in the module-level ``launchers`` list
            # (presumably so a later teardown can stop them -- confirm).
            launchers.append(ep)
            eps.append(ep)

        tic = time.time()
        while len(rc) < base + n:
            if any(ep.poll() is not None for ep in eps):
                raise RuntimeError("A test engine failed to start.")
            elif time.time() - tic > 15:
                raise RuntimeError("Timeout waiting for engines to connect.")
            time.sleep(.1)
            rc.spin()
    finally:
        rc.close()
    return eps
def get_dview():
    """Return a non-blocking DirectView over all engines, with dill enabled."""
    view = parallel.Client()[:]
    view.block = False
    # dill is switched on to serialize messy things
    view.use_dill()
    return view
def __init__(self, n_clusters, feat_patches, client, cache_dir='/tmp',
             algo_name='KMeans', sparse_result=True, random_state=0):
    """Store the clustering configuration and seed the ``random`` module.

    n_clusters = number of clusters used by the clustering algorithm
    feat_patches = patches of feature indices to build clusters on, e.g.,
        [[feat_idx_i1, ..., feat_idx_j1], [feat_idx_i2, ..., feat_idx_j2]];
        they can be produced by sequence generators such as strided_seqs
        or bootstrap_seqs in the package
    client = parallel client to use; a falsy value creates a new
        parallel.Client()
    cache_dir = the cache dir for the shared memory object used in
        parallel computing
    algo_name = the clustering algorithm, one of
        {'KMeans', 'MiniBatchKMeans'}
    sparse_result = whether the transformed result should be a sparse
        coo_matrix or a normal matrix
    random_state = seed stored on the instance and passed to random.seed
    """
    self.n_clusters = n_clusters
    self.feat_patches = feat_patches
    if client:
        self.client = client
    else:
        self.client = parallel.Client()
    self.cache_dir = cache_dir

    assert algo_name in ('KMeans', 'MiniBatchKMeans')
    self.algo_name = algo_name
    self.sparse_result = sparse_result
    self.random_state = random_state
    # no model has been fitted yet
    self.feat_to_kmeans_ = None

    random.seed(random_state)
def get_lview():
    """Return a non-blocking LoadBalancedView, with dill enabled on a direct view."""
    rc = parallel.Client()
    # Testing, stackoverflow.com/a/24316222
    rc.direct_view().use_dill()
    balanced = rc.load_balanced_view()
    balanced.block = False
    return balanced
def start_parallel(imports_str=None): """ This function starts a parallel computing environment Parameters ---------- imports_str: a string with the imports that are required to run on your engines, so that they can do their job. For example: 'import osmosis.model as ozm \n import osmosis.utils as ozu' """ try: # Get parallel computing stuff from IPython: from IPython import parallel rc = parallel.Client() except ImportError: warnings.warn("Could not import IPython.parallel") return None except AssertionError: # If you get here, that probably means that you didn't turn on your # cluster... e_s = "Could not get an IPython connection file." e_s += "Did you remember to start your cluster?" warnings.warn(e_s) return None if imports_str is not None: rc[:].execute(imports_str) print("Parallelizing on %s engines"%len(rc)) dview = rc[:] # Now you can do: # out = dview.apply_async(para_func, args) return dview
def main():
    """Template: distribute a K-fold grid search over an ipcluster.

    ``X``, ``y`` and ``estimator`` are placeholders to be filled in. One task
    is built per (grid cell, fold) pair and the tasks are mapped onto a
    load-balanced view with ``_parallel_grid_search``.
    """
    X = ...
    y = ...
    estimator = ...  # concrete BaseGradientBoosting object
    K = 5

    param_grid = {
        'n_estimators': [10000],
        'min_samples_leaf': [7, 9, 13],
        'max_depth': [4, 5, 6, 7],
        'max_features': [100, 150, 250],
        'learn_rate': [0.05, 0.02, 0.01],
    }
    grid = IterGrid(param_grid)

    # count the grid cells by walking the grid once
    grid_size = 0
    for _ in grid:
        grid_size += 1

    print("_" * 80)
    print("GridSearch")
    print("grid size: %d" % grid_size)
    print("num tasks: %d" % (K * grid_size))

    cv = KFold(X.shape[0], K, shuffle=True, random_state=0)

    # instantiate the tasks - K times the number of grid cells
    # FIXME use generator to limit memory consumption or do fancy
    # indexing in _parallel_grid_search.
    tasks = []
    for i, params in enumerate(grid):
        for k, (train, test) in enumerate(cv):
            tasks.append((i, k, estimator, params,
                          X[train], y[train], X[test], y[test]))

    # distribute tasks on ipcluster
    balanced = parallel.Client().load_balanced_view()
    results = balanced.map(_parallel_grid_search, tasks)
def reseed_project():
    """Run the reseeding pipeline: extract, featurize, (tICA), cluster, pull seeds.

    ``args`` is not created here (the ``parse_commandline()`` call is
    commented out), so it has to exist already in the enclosing scope.
    Its ``d`` and ``t`` attributes are rewritten as absolute paths.
    """
    # args = parse_commandline()

    # use full paths to avoid annoying path issues
    args.d = os.path.abspath(args.d)
    args.t = (os.path.join(args.d, "topologies/") if args.t is None
              else os.path.abspath(args.t))

    rc = parallel.Client(profile=args.p)
    rc[:].execute("from fah_reseeder import *")
    print("Running on:", len(rc.ids))
    balanced = rc.load_balanced_view()
    balanced.block = True

    # extract
    extract_project_wrapper(args.d, args.t, balanced)

    # featurize
    feature_dict = featurize_project(args.d, args.t, args.f, args.s, balanced)

    # ticafy
    if args.i == True:
        feature_dict = tica_wrapper(args.d, feature_dict, args.l)

    # assignment
    cluster_mdl, assignments = cluster_project_wrapper(
        args.d, feature_dict, args.n)

    # cluster and pull frames
    pull_new_seeds(args.d, args.t, cluster_mdl, assignments,
                   args.r, args.c, args.s, balanced)
def run(self, loop, mapPlugin):
    """Map ``loop`` over the plugin's workload on an IPython cluster.

    ``loop`` is wrapped in ``LoopWrapper`` and applied to every item of
    ``mapPlugin.getWorkload()`` through a load-balanced view.

    Returns whatever ``map_sync`` returns for the workload.
    """
    from IPython import parallel

    client = parallel.Client()
    try:
        view = client.load_balanced_view()
        return view.map_sync(LoopWrapper(loop), mapPlugin.getWorkload())
    finally:
        # A fresh client is created on every call; release it once the
        # synchronous map has finished or failed.
        client.close()
def async_avail():
    """Return True when at least one engine is reachable for PARALLEL_PROFILE.

    Any error while creating the client or counting its engines is reported
    as False. The import of IPython.parallel happens outside the ``try``,
    so a missing IPython still raises ImportError.
    """
    from IPython import parallel
    try:
        # Pass the profile by keyword: the first positional parameter of
        # Client is the connection file, not the profile name.
        client = parallel.Client(profile=PARALLEL_PROFILE)
        return len(client) > 0
    except Exception:
        # Includes IOError as well as every other failure.
        return False
def get_client():
    """Return a Client for PARALLEL_PROFILE, or None if unusable.

    None is returned when the client has no engines or when creating or
    querying it raises any exception.
    """
    from IPython import parallel
    try:
        rc = parallel.Client(profile=PARALLEL_PROFILE)
        if len(rc) > 0:
            return rc
        return None
    except Exception:
        # IOError is covered here too
        return None
def setUpClass(cls):
    """Install a mocked logger, then create the client and the log watcher."""
    # replace the ema_logging logger with a mock that has no handlers
    logger_mock = mock.Mock(spec=ema_logging.get_logger())
    logger_mock.handlers = []
    ema_logging._logger = logger_mock

    cls.client = parallel.Client(profile='default')
    cls.url = 'tcp://{}:20202'.format(localhost())
    cls.watcher = ema.start_logwatcher(cls.url)
def setup_client():
    """Return the module-wide Client, creating and initializing it on first use.

    On creation, every engine changes its working directory to the directory
    containing this file. This assumes that all nodes see a shared filesystem.
    """
    global _client
    if _client is not None:
        return _client

    def change_dir(path):
        # executed through apply_sync, hence the import inside the function
        import os
        os.chdir(path)

    _client = parallel.Client()
    here = os.path.dirname(os.path.abspath(__file__))
    _client[:].apply_sync(change_dir, here)
    return _client
def time_throughput(nmessages, t=0, f=wait):
    """Time the submission and completion of ``nmessages`` tasks.

    Parameters
    ----------
    nmessages : int
        Number of ``f`` tasks to submit.
    t : number
        Argument passed to ``f``. When ``f`` is ``echo`` it is replaced by a
        random array of ``t // 8`` elements (presumably ``t`` is a size in
        bytes at 8 bytes per element -- confirm).
    f : callable
        Task function; ``wait`` by default.

    Returns
    -------
    tuple
        ``(submit_time, total_time)`` in seconds: time until all tasks were
        submitted, and time until ``client.wait()`` returned.
    """
    client = parallel.Client()
    view = client.load_balanced_view()
    if f is echo:
        # Floor division keeps the size an integer on Python 3 as well.
        t = np.random.random(int(t // 8))
    # do one ping before starting timing
    view.apply_sync(echo, '')
    client.spin()
    tic = time.time()
    for _ in range(nmessages):
        view.apply(f, t)
    lap = time.time()
    client.wait()
    toc = time.time()
    return lap - tic, toc - tic
def _wait_for_cluster(self, timeout):
    """Wait up to ``timeout`` seconds for the controller and all engines.

    Returns True once the client reports ``self.nengines`` engine ids, and
    False if the time budget runs out first. Both phases share one budget.
    """
    started = time.time()

    def time_left():
        return time.time() - started < timeout

    # Phase 1: wait to connect to the controller
    while time_left():
        try:
            rc = parallel.Client(profile=self.profile_name)
        except IOError:
            time.sleep(2)
        else:
            break

    # Phase 2: wait for all engines to come online
    while time_left():
        if len(rc.ids) == self.nengines:
            return True
        time.sleep(2)
    return False
def wait_for_controller(self):
    """Loop until the controller is ready.

    A client for ``self.profile`` is created every two seconds. Returns True
    as soon as one can be created. If more than 30 seconds pass, the loop
    ends and None (falsy) is returned.
    """
    tic = time.time()
    while True:
        if time.time() - tic > 30:
            break
        self.logger.debug('waiting for controller ' + str(time.time() - tic))
        try:
            rc = parallel.Client(profile=self.profile)
        except (ValueError, IOError) as e:
            # "as" form is valid on Python 2.6+ and Python 3
            self.logger.debug(e)
            time.sleep(2)
        else:
            # The client was only a readiness probe; do not keep it open.
            rc.close()
            return True
    return None
def __init__(self, Y, in_parallel=False):
    """
    An illustration of quadrature for use with var_EP.

    Y is flattened before being stored. With in_parallel set, a
    parallel.Client and a direct view on it are created as well.
    """
    Tilted.__init__(self, Y)

    # we're only doing 1D at the moment
    self.Y = Y.flatten()
    self.num_data = self.Y.size

    # hard coded right now. Incorporate into GPy when the code is ready.
    self.lik = student_t()
    self.num_params = 2
    self._has_params = True

    self.parallel = in_parallel
    if not self.parallel:
        return
    self.client = parallel.Client()
    self.dv = self.client.direct_view()
def __init__(self, dsetname='dataset'):
    """Connect to the cluster and create a dataset and an engine on each node.

    Parameters
    ----------
    dsetname : str
        Used as the ``_prefix`` of every remote dataset and, when the
        ``pforest`` package is not found, as the name of the local module
        that ``dataset`` is imported from.

    Every engine changes into the current working directory, imports
    ``dataset`` and ``engine``, builds ``dset`` and ``eng``, and the
    gathered ``dset.path`` values are stored in ``self.engines_path``.
    """
    n_proposal = 100
    self.dsetname = dsetname
    print("master>>init() dsetname: {}".format(dsetname))

    # create dview
    print("master>> create dview")
    # init cluster client
    self.clients = parallel.Client(packer='pickle')
    self.clients.block = True
    # slice start 0: use master as engine; 1: do not use master as engine
    self.dview = self.clients.direct_view(self.clients.ids[0:])
    self.dview.block = True

    # engine init
    self.dview.execute("""import os; os.chdir(r'%s')""" % os.getcwd())
    print("master>> init engine")
    try:
        # NOTE(review): ``imp`` is no longer available on recent Python
        # versions; there this lookup raises ImportError and the local
        # modules are used.
        __import__('imp').find_module('pforest')
        print("Found pforest")
        self.dview.execute('from pforest.dataset import dataset')
        self.dview.execute('from pforest.engine import engine')
    except ImportError:
        print("Not found pforest. Importing local modules")
        self.dview.execute('from %s import dataset' % (dsetname))
        self.dview.execute('from engine import engine')

    # each engine gets its own index and an equal share of the proposals
    for i, dv in enumerate(self.clients):
        dv.execute('dset=dataset(%d,%d,_prefix="%s")'
                   % (i, n_proposal // len(self.clients.ids), dsetname))
    self.dview.execute('eng=engine(dset)')
    self.engines_path = self.dview.gather('dset.path')
    print("debug:master:__init__: %s" % self.engines_path)

    # init local variables
    print("master>> init local variables")
    self.minbagsize = 2
    self.maxdepth = 20
    self.queue = None
    self.root = None
    self.node = None
def test_get_result(self):
    """test getting results from the Hub.

    A task is submitted through a second client and fetched through
    ``self.client``: the first fetch must be an AsyncHubResult with the
    same value, and a second fetch of the same message must not be one.
    """
    c = pmod.Client(profile='iptest')
    # Close the extra client even when an assertion fails.
    try:
        # self.add_engines(1)
        t = c.ids[-1]
        v = c[t]
        v2 = self.client[t]
        ar = v.apply_async(wait, 1)
        # give the monitor time to notice the message
        time.sleep(.25)
        ahr = v2.get_result(ar.msg_ids[0])
        self.assertTrue(isinstance(ahr, AsyncHubResult))
        self.assertEqual(ahr.get(), ar.get())
        ar2 = v2.get_result(ar.msg_ids[0])
        self.assertFalse(isinstance(ar2, AsyncHubResult))
        c.spin()
    finally:
        c.close()
def main(nodes, edges):
    """Generate a random graph, submit jobs, then validate that the
    dependency order was enforced.

    Finally, plot the graph, with time on the x-axis, and in-degree on the
    y (just for spread). All arrows must point at least slightly to the
    right if the graph is valid.

    Returns the graph and the per-node results as ``(G, results)``.
    """
    from matplotlib import pyplot as plt
    from matplotlib.dates import date2num
    from matplotlib.cm import gist_rainbow

    print("building DAG")
    G = random_dag(nodes, edges)
    pos = {}
    colors = {}
    jobs = {}
    for node in G:
        jobs[node] = randomwait

    client = parallel.Client()
    view = client.load_balanced_view()
    print("submitting %i tasks with %i dependencies" % (nodes, edges))
    results = submit_jobs(view, G, jobs)
    print("waiting for results")
    view.wait()
    print("done")

    for node in G:
        md = results[node].metadata
        start = date2num(md.started)
        runtime = date2num(md.completed) - start
        pos[node] = (start, runtime)
        colors[node] = md.engine_id
    validate_tree(G, results)

    # The drawing keyword is ``nodelist``. Build both sequences from the
    # same node order so every node gets the colour of its own engine.
    drawn = list(colors)
    nx.draw(G, pos,
            nodelist=drawn,
            node_color=[colors[node] for node in drawn],
            cmap=gist_rainbow,
            with_labels=False)

    # pad the axes by 10% of the data range on each side
    x, y = zip(*pos.values())
    xmin, ymin = map(min, (x, y))
    xmax, ymax = map(max, (x, y))
    xscale = xmax - xmin
    yscale = ymax - ymin
    plt.xlim(xmin - xscale * .1, xmax + xscale * .1)
    plt.ylim(ymin - yscale * .1, ymax + yscale * .1)
    return G, results
def wait_for_engines(self):
    """Loop until the engines have started.

    Every two seconds a client for ``self.profile`` is created and its
    engine ids are counted. Returns True once the count equals
    ``len(self.engines)``. If 120 seconds pass first, the loop ends and
    None (falsy) is returned.
    """
    tic = time.time()
    while time.time() - tic < 120:
        try:
            rc = parallel.Client(profile=self.profile)
            try:
                n_online = len(rc.ids)
            finally:
                # A new client is created on every poll; close each one so
                # they do not pile up while waiting.
                rc.close()
            if n_online == len(self.engines):
                self.logger.debug('Engines started ' + str(n_online))
                return True
            self.logger.debug('waiting for engines ' +
                              str(time.time() - tic) + ' ' + str(n_online))
            time.sleep(2)
        except (ValueError, IOError) as e:
            # "as" form is valid on Python 2.6+ and Python 3
            self.logger.debug(e)
            time.sleep(2)
    return None
def run_commands(self, commands):
    """Map the commands to the execute_command function, in parallel.

    Progress is written to the debug log as each result arrives; every
    result is expected to provide a 'host' entry.
    """
    log = self.logger.debug
    log('running')

    balanced = parallel.Client(profile=self.profile).load_balanced_view()
    balanced.retries = 10

    total = len(commands)
    log(total)

    started = time.time()
    pending = balanced.map(execute_command, commands)
    for done, outcome in enumerate(pending, 1):
        log("task: %i finished on %s, %.3f percent finished at time %.3f " % (
            done - 1,
            outcome['host'],
            100 * (done / float(total)),
            time.time() - started))
    log('done')
def __init__(self, ensemble_path, scorefn, votefn, random_seed=0,
             client=None):
    """
    ensemble_path = path stored on the instance for the ensemble
    scorefn = function used to score a model (in greedy search)
        sig = scorefn(y, yhat) RETURNS score
    votefn = function used to combine different model outputs
        sig = votefn(yhats) RETURNS combined_yhat
    random_seed = seed stored on the instance
    client = an IPython.parallel.Client; if None (or otherwise falsy),
        a new one is created
    """
    self.ensemble_path = ensemble_path
    self.scorefn = scorefn
    self.votefn = votefn
    self.random_seed = random_seed
    self.client = client if client else parallel.Client()
    # starts out without any members
    self.ensemble_ = []
def add_images(self, image_urls, image_ids=None):
    """
    Add all images in a list of URLs.
    If ipcluster is running, load images in parallel.

    Parameters
    ----------
    image_urls : list
    image_ids : list, optional
        If given, images are stored with the given ids.
        If None, each job is created with an id of None.
    """
    # NOTE(review): ``collection`` is not defined in this method; it is
    # expected to exist in the enclosing scope.
    collection.ensure_index('id')

    # Construct the arguments list due to IPython.parallel's pickling
    if image_ids is None:
        jobs = [(url, None, self.palette) for url in image_urls]
    else:
        jobs = [(url, _id, self.palette)
                for url, _id in zip(image_urls, image_ids)]

    print("Loading images...")
    tt = TicToc()

    try:
        rc = parallel.Client()
        lview = rc.load_balanced_view()
    except Exception:
        # Best effort: without a cluster, fall back to serial loading.
        # ``Exception`` keeps KeyboardInterrupt/SystemExit propagating.
        lview = None
        warn(Warning("Launch an IPython cluster to parallelize "
                     "ImageCollection loading."))

    if lview is not None:
        results = lview.map(process_image, jobs)
        results.wait_interactive()
    else:
        # An explicit loop: a lazy ``map`` would never run on Python 3.
        for job in jobs:
            process_image(job)

    collection.ensure_index('id')
    print("Finished inserting {} images in {:.3f} s".format(
        len(image_urls), tt.qtoc()))
def start_validation(setup_code):
    """
    Perform the validation with IPython parallel processing.

    The setup file is run on all engines. It has to define the variables
    ``jobs`` and ``save_path``; if either is missing, a message is printed
    and no jobs are run. Each result is handed to
    ``netcdf_results_manager`` together with ``save_path``. The engine
    namespaces are cleared at the end in every case.

    Parameters
    ----------
    setup_code : string
        Path to .py file containing the setup for the validation.
    """
    c = parallel.Client()
    dv = c[:]
    lview = c.load_balanced_view()

    dv.run(setup_code, block=True)

    jobs = None
    try:
        jobs = dv['jobs'][0]
    except parallel.CompositeError:
        print("Variable 'jobs' is not defined!")

    save_path = None
    try:
        save_path = dv['save_path'][0]
    except parallel.CompositeError:
        print("Variable 'save_path' is not defined!")

    if (jobs is not None) and (save_path is not None):
        # Count only after the guard: ``jobs`` may be None here otherwise.
        to_write = len(jobs)
        with lview.temp_flags(retries=2):
            amr = lview.map_async(func, jobs)
            results = zip(amr, jobs)
            for result, job in results:
                netcdf_results_manager(result, save_path)
                to_write -= 1
                print('job = ' + str(job),
                      'remaining jobs = ' + str(to_write))

    c[:].clear()
from IPython import parallel from datetime import datetime from DataMining.code.com import log,parallels import os rc= parallel.Client() lview = rc.load_balanced_view() lview.block = True from DataMining.code.com.BigData import BigData input_files = BigData.GetInputFiles('./DataMining/data/') @lview.parallel() def processFile(filep): from DataMining.code.com import log, parallels import os from ujson import loads, dumps import gzip outfilep = './DataMining/uncompressed/sel_cities/'+ os.path.basename(filep) + '.json' f = gzip.open(filep) logger = log.logger('Parallel/'+os.path.basename(filep)) logger.log( 'finding all records with location for: ' + f.name) locs = {} tot_lines =0 loc_lines =0 line = f.readline() while line:
def optimize(self, method, quantiles=(.1, .3, .5, .7, .9), n_runs=3,
             n_bootstraps=0, parallel_profile=None):
    """
    Optimize model using ML, chi^2 or G^2.

    :Input:
        method : str
            Optimization method ('ML', 'chisquare' or 'gsquare').

        quantiles : tuple
            A sequence of quantiles to be used for chi^2 and G^2.
            Default values are the ones used by Ratcliff (.1, .3, .5, .7, .9).

        n_runs : int <default=3>
            Number of attempts to optimize.

        n_bootstraps : int <default=0>
            Number of bootstrap iterations.

        parallel_profile : str <default=None>
            IPython profile for parallelization. If None, the bootstrap
            iterations run serially in this process.

    :Output:
        results <dict> - a results dictionary of the parameters values.

    :Note:
        The values of the nodes in single subject model is updated
        according to the results. The nodes of group models are not updated.
        When n_bootstraps > 0, summary statistics of the bootstrap
        estimates (including the 2.5% and 97.5% quantiles) are stored in
        self.bootstrap_stats.
    """
    results = self._run_optimization(method=method, quantiles=quantiles,
                                     n_runs=n_runs)

    # bootstrap if requested
    if n_bootstraps == 0:
        return results

    # init DataFrame to save results
    res = pd.DataFrame(np.zeros((n_bootstraps, len(self.values))),
                       columns=list(self.values.keys()))

    # prepare view for parallelization
    if parallel_profile is not None:
        client = parallel.Client(profile=parallel_profile)
        view = client.load_balanced_view()
        runs_list = [None] * n_bootstraps
    else:
        view = None

    # Single bootstrap iteration. Everything it needs is bound through
    # default arguments, so it does not refer to ``self`` when it runs.
    def single_bootstrap(data, accumulator_class=self.__class__,
                         class_kwargs=self._kwargs, method=method,
                         quantiles=quantiles, n_runs=n_runs):
        # resample data with replacement
        new_data = data.iloc[np.random.randint(0, len(data), len(data))]
        new_data = new_data.set_index(pd.Index(list(range(len(data)))))
        h = accumulator_class(new_data, **class_kwargs)

        # run optimization
        h._run_optimization(method=method, quantiles=quantiles,
                            n_runs=n_runs)

        # builtin float is what the removed ``np.float`` alias stood for
        return pd.Series(h.values, dtype=float)

    # bootstrap iterations
    for i_strap in range(n_bootstraps):
        if view is None:
            res.iloc[i_strap] = single_bootstrap(self.data)
        else:
            # append to job queue
            runs_list[i_strap] = view.apply_async(single_bootstrap,
                                                  self.data)

    # get parallel results
    if view is not None:
        view.wait(runs_list)
        for i_strap in range(n_bootstraps):
            res.iloc[i_strap] = runs_list[i_strap].get()

    # get statistics
    stats = res.describe()
    for q in [2.5, 97.5]:
        # One row per quantile, labelled e.g. '2.5%'. ``to_frame(label)``
        # names the row directly; ``pd.concat`` replaces the removed
        # ``DataFrame.append``.
        label = repr(q) + '%'
        stats = pd.concat([stats, res.quantile(q / 100.).to_frame(label).T])

    self.bootstrap_stats = stats.sort_index()
    return results
from IPython import parallel with drctview.sync_imports(): import numpy clients = parallel.Client(profile=’testprofile’) drctview = clients[:] drctview.activate() drctview.block=True %px dummymatrix = numpy.random.rand(4,4) %px eigenvalue = numpy.linalg.eigvals(dummymatrix) drctview['eigenvalue'] %pxconfig --noblock %autopx maximum_egnvals = [] for idx in range(50): arr = numpy.random.rand(10,10) egnvals = numpy.linalg.eigvals(arr) maximum_egnvals.append(egnvals[0].real) %autopx %pxconfig --block %px answer= "The average maximum eigenvalue is: %f"%(sum(maximum_egnvals)/len(maximum_egnvals)) dv['answer'] %%px --block --group-outputs=engine import numpy as np arr = np.random.random (4,4) egnvals = numpy.linalg.eigvals(arr) print egnvals egnvals.max() egnvals.min()
def _get_engines(self):
    """Return a direct view on all engines of a newly created Client."""
    return parallel.Client()[:]
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
import netCDF4
import numpy.ma as ma
from pylab import *
# ``parallel`` is used below but was never imported. It is imported after
# the star import so that nothing from pylab can shadow it.
from IPython import parallel

# Local modules live outside the default path, so extend sys.path first.
sys.path.append('/noc/users/hb1g13/Python/python_functions/')
import SG as SG
import layers_calc_numba
sys.path.append('/noc/users/hb1g13/Python/python_functions/MITgcmUtils/')
import utils

# Set up processors
rc = parallel.Client(
    '/noc/users/hb1g13/.ipython/profile_maelstrom/security/ipcontroller-client.json'
)
dv = rc[:]
rc.ids

# Now each processor needs to know where my modules are:
dv.execute('import sys')
dv.execute('sys.path.append("/noc/users/hb1g13/Python/python_functions/")')
dv.execute('import layers_calc_numba')
dv.execute(
    'sys.path.append("/noc/users/hb1g13/Python/python_functions/MITgcmUtils/")'
)
dv.execute('import utils')

# Some parameters to ensure the right files are picked up:
Full = 'N'  # 9 panels isn't ideal for presentations; the N option gives 4 plots
# coding: utf-8 # In[87]: from IPython import parallel c = parallel.Client(profile='sge', sshserver='[email protected]') view = c[:] c.ids # In[88]: get_ipython().magic(u"px print('Hello World!')") # In[89]: get_ipython().run_cell_magic( u'px', u'', u'import os\nimport socket\nprint os.getpid()\nprint socket.gethostname()') # In[90]: A = 'Shared var ' get_ipython().magic(u"px A = 'My var'") # In[91]: def myfunc(x): import os import socket