def cloud_pooler(func, in_q, chunk_size=1000, _env='python-web', _type='c2',
                 _max_runtime=60, get_results=True, **kwargs):
    import cloud
    if chunk_size > 1:
        if isinstance(in_q, collections.Iterable):
            in_q = WebQueue(in_q)
        chunks = []
        chunk = []
        while not in_q.empty():
            chunk.append(in_q.get())
            if len(chunk) == chunk_size:
                chunks.append(chunk)
                chunk = []
        if len(chunk):
            chunks.append(chunk)
    else:
        chunks = in_q
    partial_func = functools.partial(func, **kwargs)
    jids = cloud.map(partial_func, chunks, _env=_env, _type=_type,
                     _max_runtime=_max_runtime)
    if get_results:
        print jids
        for result in cloud.iresult(jids, ignore_errors=True):
            if result:
                yield result
    else:
        for jid in jids:
            yield jid
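# Usage sketch for cloud_pooler (illustration only; 'sum_chunk' is a
# hypothetical worker, not part of the snippet above). Note that each job
# receives a whole chunk -- a list of chunk_size items -- not a single item,
# because cloud_pooler groups the queue before calling cloud.map.
def sum_chunk(items):
    return sum(items)

for partial_sum in cloud_pooler(sum_chunk, range(10000), chunk_size=1000):
    # results stream back as jobs finish, since cloud_pooler is a generator
    print partial_sum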
def picloud():
    t1 = time.time()
    jids = cloud.map(testfunc, np.arange(K), _type="f2", _vol="my-vol",
                     _env="base/precise")
    # get the results
    cloud.result(jids)
    t2 = time.time()
    return t2 - t1
def start_inference(infilename, outfilename):
    ITERS = SAMPLER_ITERS
    indata = pickle.load(open(infilename, 'r'))
    filenames = indata['filenames']
    data_filename = filenames['data']
    latent_filenames = []
    config_filenames = []
    for chain, v in filenames['chains'].iteritems():
        latent_filenames.append(v['latent'])
        config_filenames.append(v['config'])
    jids = cloud.map(inference_run_ld, latent_filenames,
                     [data_filename] * CHAINS_TO_RUN,
                     config_filenames,
                     [ITERS] * CHAINS_TO_RUN,
                     range(CHAINS_TO_RUN),
                     _env='connectivitymotif', _type='f2')
    # fixme save all inputs
    pickle.dump({'infile': indata['infile'],
                 'filenames': filenames,
                 'jids': jids},
                open(outfilename, 'w'))
def timing_triple_map_cloud():
    execfile('picloud_venture_credentials.py')
    exp_params = experiment.exp_param_defaults({})
    exp_params['intermediate_iter'] = 1
    exp_params['max_initial_run_time'] = 30
    exp_params['max_burn_time'] = 30
    exp_params['max_sample_time'] = 30
    exp_params['n_samples'] = 25
    print experiment.exp_params_to_str(exp_params)

    data = scipy.io.loadmat("../data/irm_synth/irm_synth_20.mat", squeeze_me=True)
    observed = list(zip(data['train_i'].flat, data['train_j'].flat, data['train_v'].flat))
    missing = list(zip(data['test_i'].flat, data['test_j'].flat, data['test_v'].flat))
    data = {'observations': observed, 'missing': missing}

    model = models.product_IRM
    model_params = {'D': 1, 'alpha': 1, 'symmetric': True}

    # Timing run
    print 'Timing'
    print 'Mapping'
    job_id = cloud.call(experiment.network_cv_timing_run, data, model, exp_params,
                        model_params, _max_runtime=5, _env=cloud_environment)
    print 'Waiting'
    time_per_mh_iter = cloud.result(job_id)['time_per_mh_iter']

    # Live run
    print 'Live'
    print 'Mapping'
    exp_params['intermediate_iter'] = max(1, int(round(
        0.9 * exp_params['max_sample_time'] /
        (exp_params['n_samples'] * time_per_mh_iter))))
    job_ids = cloud.map(experiment.network_cv_single_run,
                        itertools.repeat(data, 5),
                        itertools.repeat(model, 5),
                        itertools.repeat(exp_params, 5),
                        itertools.repeat(model_params, 5),
                        _max_runtime=5, _env=cloud_environment)
    print 'Waiting'
    cloud.join(job_id)
    print cloud.result(job_ids)
def multimap(fun, args, naccounts=None):
    if naccounts is None:
        naccounts = len(api_keys)
    max_parallel = sum(array(parallelism[:naccounts]))
    if len(args) <= max_parallel:
        naccounts = nonzero(cumsum(parallelism) - len(args) >= 0)[0][0] + 1
        size = parallelism[:naccounts]
    else:
        size = [len(args) / naccounts for _ in xrange(naccounts)]
        if naccounts > 1:
            size[-1] = len(args) - sum(array(size[:-1]))
        else:
            size[0] = len(args)
    # jids[i] contains the job indices for account i
    jids = [None for _ in xrange(naccounts)]
    # Launches the jobs
    k = 0
    for i in xrange(naccounts):
        api_key = api_keys[i]
        api_secretkey = api_secretkeys[i]
        n = size[i]
        args_tmp = args[k:k + n]
        if len(args_tmp) > 0:
            print "Launching %d jobs with account %d..." % (len(args_tmp), i + 1)
            cloud.setkey(api_key=api_key, api_secretkey=api_secretkey)
            jids[i] = cloud.map(fun, args_tmp, _high_cpu=True)
            print " Jobs:", jids[i]
        k += n
    return jids
def cloudHandEVByCount(players=6, TOTAL_SHOES=100000000, CHUNKING=100000):
    '''
    Accumulate the calcHandEVByCount of __many__ games using pycloud.
    '''
    CHUNKS = TOTAL_SHOES / CHUNKING
    totEV = {}
    totEVName = "HandEVByCount-%d-p%d.p" % (TOTAL_SHOES, players)

    # benchmarking
    start = time.clock()

    cloudIds = cloud.map(workerHandEVByCount, (players,) * CHUNKS, (CHUNKING,) * CHUNKS)
    for ev in cloud.iresult(cloudIds, ignore_errors=True):
        if not isinstance(ev, dict):
            print "error"
            print ev
        else:
            for (key, (gResult, gTotHands)) in ev.iteritems():
                result, totHands = totEV.get(key, (0, 0))
                result += gResult
                totHands += gTotHands
                totEV[key] = (result, totHands)

    # benchmarking
    end = time.clock()
    print end - start

    totEVFile = open(totEVName, 'w')
    pickle.dump(totEV, totEVFile)
    totEVFile.close()
    return totEV
def main(n_rbms=5, save_folder='../data/mnist/many-rbm-samples/default',
         cloud_simulation=True):
    execfile('picloud_misc_credentials.py')
    if cloud_simulation:
        cloud.start_simulator()
    #n_rbms = 4
    #save_folder = 'picloud_test'
    if not os.path.isdir(save_folder):
        os.makedirs(save_folder)
    seeds = [np.random.randint(2 ** 31) for dummy in range(n_rbms)]
    print 'Sending jobs'
    job_ids = cloud.map(train_and_sample, seeds, _type='f2', _cores=1)
    print 'Jobs sent'
    images = np.zeros((0, 28 * 28))
    labels = np.zeros((0, 1))
    count = 1
    for (some_images, some_labels) in cloud.iresult(job_ids):
        print 'Job %d of %d complete' % (count, n_rbms)
        count += 1
        images = np.vstack((images, some_images))
        labels = np.vstack((labels, some_labels))
    np.savetxt(os.path.join(save_folder, 'images.csv'), images, delimiter=',')
    np.savetxt(os.path.join(save_folder, 'labels.csv'), labels, delimiter=',')
    return (images, labels)
def run_jobs(self, f, jobs):
    if self.usecloud:
        jids = cloud.map(f, jobs, _env=self.cloud_env, _profile=True,
                         _depends_on=self.preprocess_job)
        ires = cloud.iresult(jids)
    else:
        pool = ThreadPool(processes=cv2.getNumberOfCPUs())
        ires = pool.imap_unordered(f, jobs)
    return ires
def outer_map(y):
    jids = cloud.map(inner_map, range(y))
    cloud.join(jids)
    results = cloud.result(jids)
    return list(results)
def run_experiment(experiment, reset_database=False):
    """Create and run all jobs for experiment."""
    url = sql.get_database_url()
    if reset_database:
        print "Resetting database..."
        sql.reset_database(experiment.SQLBase, url)
    print "Creating jobs..."
    jobs = experiment.create_jobs(num_jobs=500)
    session = sql.get_session(url)
    session.add_all(jobs)
    session.commit()
    job_ids = [job.id for job in jobs]
    session.close()
    print "Running jobs..."
    run = lambda job_id: run_job(experiment, job_id, url)
    cloud.map(run, job_ids, _type="f2")
    print "Done!"
def traintst(Gs, ls, L):
    m = np.shape(Gs)[0]
    traintest = []
    for i in range(10):
        sl = slice(i * m // 10, (i + 1) * m // 10)
        traintest.append((np.delete(Gs, sl, 0), np.delete(ls, sl, 0),
                          Gs[sl, :], ls[sl, :], L))
    jids = cloud.map(crossvalidate, traintest, _type='c1')
    return cloud.result(jids)
def run_experiments(experiment_args):
    if args.picloud:
        import cloud
        jids = cloud.map(run_single_experiment_wrapper, experiment_args,
                         _env="test", _type="c2")
        print "Now waiting for results..."
        results = cloud.result(jids)
        return zip(experiment_args, results)
    else:
        return zip(experiment_args,
                   [run_single_experiment(**a) for a in experiment_args])
def main():
    jobs = []
    for seed in [0, 1, 2, 3, 4]:
        for learner_class_index in [0, 1]:
            jobs.append((learner_class_index, seed))
    print "Scheduling jobs..."
    jids = cloud.map(run, jobs, _type="f2")
    print "Waiting for results..."
    results = cloud.result(jids)
    for job, result in zip(jobs, results):
        print job, result
def simulate_all_worlds(scenario, K):
    '''
    Simulate a given parameter scenario K times.
    '''
    print 'Simulating %d worlds.' % K
    # create K copies of the scenario
    scenarios = []
    for k in xrange(K):
        scenarios.append(copy.copy(scenario))
    jobs = cloud.map(get_stats, scenarios, _label='%d worlds' % K, _type='c2')
    return jobs
def hessian_on_the_cloud(name, args, chunk_size=500):
    print("submitting")
    dims = [A.size for A in args] + [len(args), len(args)]
    H = np.empty(dims, 'double')
    ws_product_chunked = grouper(itertools.product(*args), chunk_size)
    jids = cloud.map(lambda ws: _hessian_on_the_cloud(name, ws), ws_product_chunked)
    print("waiting")
    chunked_results = cloud.result(jids)
    print("assembling")
    H.flat = list(itertools.chain.from_iterable(chunked_results))
    return H
def process_s3_files(input_bucket_name, key_glob='*',
                     output_bucket_name_1ms=None,
                     output_bucket_name_100ms=None,
                     overwrite=False, use_cloud=True):
    if output_bucket_name_1ms is None:
        output_bucket_name_1ms = input_bucket_name + "-hdf-1ms"
    if output_bucket_name_100ms is None:
        output_bucket_name_100ms = input_bucket_name + "-hdf"
    s3_cxn = get_s3_cxn()
    in_bucket = s3_cxn.get_bucket(input_bucket_name)
    # create output buckets if they don't already exist
    # it's better to do this before launching remote computations
    s3_cxn.create_bucket(output_bucket_name_1ms)
    s3_cxn.create_bucket(output_bucket_name_100ms)
    matching_keys = []
    for k in in_bucket:
        if fnmatch.fnmatch(k.name, key_glob):
            matching_keys.append(k.name)
    if use_cloud:
        print "Launching %d jobs" % len(matching_keys)

        def do_work(key_name):
            return process_s3_file(input_bucket_name, key_name,
                                   output_bucket_name_1ms,
                                   output_bucket_name_100ms,
                                   overwrite)

        jids = cloud.map(do_work, matching_keys, _type='f2', _label='generate HDF')
        progress = progressbar.ProgressBar(len(jids)).start()
        n_finished = 0
        for _ in cloud.iresult(jids):
            n_finished += 1
            progress.update(n_finished)
        progress.finish()
    else:
        print "Running locally..."
        print "%d keys match the pattern '%s'" % (len(matching_keys), key_glob)
        for key in matching_keys:
            process_s3_file(input_bucket_name, key,
                            output_bucket_name_1ms, output_bucket_name_100ms)
    print "Done!"
def create_alternative_worlds(self):
    '''
    Simulate each of the economies.
    '''
    T = self.T
    I = self.I
    print 'Simulating a world with %d countries.' % I
    # these are the individual simulation jobs
    jobs = cloud.map(simulate_an_economy, self.economies, [T] * I,
                     _label='A world with %d countries.' % I, _type='c2')
    # this is a harvester job, collecting the results of each individual simulation
    return cloud.call(GDP_collector, jobs, T, T1, T2,
                      _depends_on=jobs, _label='Collecting GDP results.')
def distrubutedly_train_terms(train_terms, terms, chunk_size=1000, use_cloud=False):
    logging.info(u'Starting distributed training <{0}>'.format(
        ['locally', 'remotely'][int(use_cloud)]))
    if use_cloud:
        import cloud
        job_ids = cloud.map(train_terms, terms)
        term_iters = cloud.result(job_ids)
    else:
        import collections
        chunks = collections.defaultdict(list)
        chunk_count = len(terms) / chunk_size + 1
        for term in terms:
            chunks[hash(term) % chunk_count].append(term)
        term_iters = map(train_terms, chunks.values())
    return itertools.chain.from_iterable(term_iters)
def Jgrad_picloud(params, encoding_size, decoded_size, training, wd, num_cores):
    W_e, W_d, b_e, b_d = unroll_params(params, encoding_size, decoded_size)
    gradW_e = np.zeros(W_e.shape)
    gradW_d = np.zeros(W_d.shape)
    gradb_e = np.zeros(b_e.shape)
    gradb_d = np.zeros(b_d.shape)

    # split the training set into batches, send out to picloud cores for backprop
    # offset = num_cores - len(training)%num_cores
    # for index in range(offset):
    #     training.
    split = len(training) / num_cores
    final_training = []
    for i in range(num_cores):
        final_training.append(training[i * split:(i + 1) * split])
    offset = len(training) % num_cores
    if offset > 0:
        final_training.append(training[len(training) - offset:])

    jids = cloud.map(Jgrad_picloud_sub,
                     [params] * num_cores,
                     [encoding_size] * num_cores,
                     [decoded_size] * num_cores,
                     final_training,
                     _type="c2")
    # call for results
    results = cloud.result(jids)
    for result in results:
        gradW_e += result[0]
        gradW_d += result[1]
        gradb_e += result[2]
        gradb_d += result[3]

    # add weight decay factor and normalization coefficient
    a = 1.0 / len(training)
    grad_J_W_e = a * gradW_e + wd * W_e
    grad_J_W_d = a * gradW_d + wd * W_d
    grad_J_b_e = a * gradb_e
    grad_J_b_d = a * gradb_d
    # roll up and return as 1-d array
    return np.concatenate((grad_J_W_e.flatten(), grad_J_W_d.flatten(),
                           grad_J_b_e.flatten(), grad_J_b_d.flatten()))
def main():
    names = list(uai_net.names(500))
    print "Submitting {} jobs...".format(len(names))
    jids = cloud.map(run_job, names, _type="f2")
    print "Waiting for results..."
    for name, marginals_string, is_exact in cloud.result(jids):
        print name
        exact = "true" if is_exact else "approx"
        f = open(os.path.join(os.path.dirname(__file__),
                              "../data/marginals/uai/{}.{}.mar".format(name, exact)), "w")
        f.write(marginals_string)
        f.close()
def calc_pi():
    num_jobs = 8
    tests_per_call = total_tests / num_jobs
    # argument list has 8 duplicate elements
    jids = cloud.map(monte_carlo, [tests_per_call] * num_jobs, _type='c2')
    # get list of all counts
    num_in_circle_list = cloud.result(jids)
    # sum all counts
    num_in_circle = sum(num_in_circle_list)
    pi = (4 * num_in_circle) / float(total_tests)
    return pi
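# The 'monte_carlo' worker is not shown in the snippet above. A plausible
# sketch (an assumption, not the original implementation): count uniform
# random points in the unit square that land inside the quarter circle,
# so that pi is approximately 4 * num_in_circle / total_tests, matching
# the final line of calc_pi.
import random

def monte_carlo(num_tests):
    inside = 0
    for _ in xrange(num_tests):
        x = random.random()
        y = random.random()
        if x * x + y * y <= 1.0:
            inside += 1
    return inside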
def test(batch, remote, debug, dependency=[]):
    params = cache.get("batch/%s/params" % batch, remote)
    numEpisodes = params['episodes']['num']
    i_ = range(numEpisodes)
    f = lambda i: tester.test(batch, params, i, remote, debug)
    logging.info("running %s test instances" % len(i_))
    if (remote):
        k_ = cloud.map(f, i_, _label="%s/test" % batch, _depends_on=dependency,
                       _type='c1', _max_runtime=30)
        logging.info("k_ %s" % k_)
        return k_
    else:
        results = map(f, i_)
        return results
def process_s3_files(input_bucket_name, key_glob="*",
                     output_bucket_name_1ms=None,
                     output_bucket_name_100ms=None,
                     overwrite=False, use_cloud=True):
    if output_bucket_name_1ms is None:
        output_bucket_name_1ms = input_bucket_name + "-hdf-1ms"
    if output_bucket_name_100ms is None:
        output_bucket_name_100ms = input_bucket_name + "-hdf"
    matching_keys = cloud_helpers.get_matching_key_names(input_bucket_name, key_glob)
    s3_cxn = cloud_helpers.get_s3_cxn()
    # create output buckets if they don't already exist
    # it's better to do this before launching remote computations
    s3_cxn.create_bucket(output_bucket_name_1ms)
    s3_cxn.create_bucket(output_bucket_name_100ms)
    if use_cloud:
        print "Launching %d jobs" % len(matching_keys)

        def do_work(key_name):
            return process_s3_file(input_bucket_name, key_name,
                                   output_bucket_name_1ms,
                                   output_bucket_name_100ms,
                                   overwrite)

        label = "Generate HDF files for %s/%s" % (input_bucket_name, key_glob)
        jids = cloud.map(do_work, matching_keys, _type="f2", _label=label)
        progress = progressbar.ProgressBar(len(jids)).start()
        n_finished = 0
        for _ in cloud.iresult(jids):
            n_finished += 1
            progress.update(n_finished)
        progress.finish()
    else:
        print "Running locally..."
        print "%d keys match the pattern '%s'" % (len(matching_keys), key_glob)
        for key in matching_keys:
            process_s3_file(input_bucket_name, key,
                            output_bucket_name_1ms, output_bucket_name_100ms)
    print "Done!"
def launch_jobs(bucket, key_names, work_fn, combine, acc, label, _type,
                accept_none_as_result, retry_timeouts=True):
    jids = cloud.map(lambda name: work_fn(bucket, name), key_names,
                     _type=_type, _label=label, _env='compute')
    timed_out = []
    try:
        progress = progressbar.ProgressBar(len(jids)).start()
        for (i, result) in enumerate(cloud.iresult(jids, num_in_parallel=25)):
            if result is None and not accept_none_as_result:
                print "Job #", jids[i], key_names[i], "returned None"
            elif combine:
                # client-side reduction! Be careful about not doing too much
                # work here
                new_acc = combine(acc, result)
                if new_acc is not None:
                    acc = new_acc
            progress.update(i + 1)
    except KeyboardInterrupt:
        print "Caught keyboard interrupt, killing active workers..."
        cloud.kill(jids)
        return acc
    except cloud.CloudException as e:
        if isinstance(e.parameter, ssl.SSLError) and retry_timeouts:
            print "Job #", jids[i], "timed out"
            timed_out.append(key_names[i])
        else:
            print "Killing workers..."
            cloud.kill(jids)
            raise
    except:
        print "Killing workers..."
        cloud.kill(jids)
        raise
    finally:
        progress.finish()
    if len(timed_out) > 0:
        return launch_jobs(bucket, key_names, work_fn, combine, acc, label,
                           _type, accept_none_as_result, retry_timeouts=None)
    return acc
def train(batch, remote, debug, dependency=[]):
    params = cache.get("batch/%s/params" % batch, remote)
    numEpisodes = params['episodes']['num']
    trainParams = params['train']
    numIters = trainParams['iters']
    ij_ = [(i, j) for i, j in it.product(range(numEpisodes), range(numIters))]
    f = lambda (i, j): trainer.train(batch, params, i, j, remote, debug)
    logging.info("running %s train instances" % len(ij_))
    if (remote):
        k_ = cloud.map(f, ij_, _label="%s/train" % batch, _depends_on=dependency,
                       _type='c1', _max_runtime=30)
        logging.info("k_ %s" % k_)
        return k_
    else:
        results = map(f, ij_)
        return results
def get_shares_bulk(urls, limit=-1, use_cloud=False):
    if use_cloud:
        # using picloud, parallelizing over chunks of URLs
        chunks = list(parallel.partitions(urls, 100))

        def f(chunk):
            return map(lambda url: (url, get_shares(url)), chunk)

        jids = cloud.map(f, chunks)
        ret = list(itertools.chain(*cloud.result(jids)))
    else:
        # local, parallelizing using a thread pool
        ret = list(parallel.imap(get_shares, urls, threads=10))
    ret.sort(key=lambda (u, r): -r['shares_count'])
    if limit == -1:
        return ret
    else:
        return ret[:limit]
def call_on_cloud(cmd_params, core_type, num_batches, start_batch_num, end_batch_num):
    ntests = len(cmd_params)
    batch_size = int(math.ceil(ntests / (num_batches + 0.0)))
    batch_edges = batch_size * np.array(xrange(num_batches))[start_batch_num:end_batch_num]
    print batch_edges

    for i in xrange(len(batch_edges)):
        if i == len(batch_edges) - 1:
            cmds = cmd_params[batch_edges[i]:]
        else:
            cmds = cmd_params[batch_edges[i]:min(batch_edges[i + 1], len(cmd_params))]

        print colorize("calling on cloud..", "yellow", True)
        try:
            jids = cloud.map(run_sim_test, cmds, _vol='rss_dat', _env='RSS3', _type=core_type)
            res = cloud.result(jids)
            print colorize("got results for batch %d/%d " % (i, len(batch_edges)), "green", True)
            save_results(res)
        except Exception as e:
            print "Found exception %s. Not saving data for this demo." % e
def picloud(func, *args, **kwargs):
    """
    Runs the given function in parallel over the PiCloud cluster.

    Parameters
    ----------
    func : function
        Function to run in parallel.

    In addition to the function 'func' to be run in parallel, the picloud
    function accepts a series of arguments that are passed to the function as
    variables. In general, the function can have multiple input variables, and
    these arguments must be passed in the same order as they are defined in
    the function definition.

    Furthermore, several keyword arguments may be given that set the settings
    for the PiCloud cluster:

    _type  - Type of core used in picloud: 'c1', 'c2', 'f2' (default), 'm1', 's1'
    _cores - Number of cores used: 1 (default)
    _env   - Custom environment for computation. Set to current version of qutip.
    _label - Provide a label for the current computation.

    For more information see the PiCloud website: http://www.picloud.com/
    """
    kw = _default_cloud_settings()
    for keys in kwargs.keys():
        if keys not in kw.keys():
            raise Exception(str(keys) + ' is not a valid kwarg.')
        else:
            kw[keys] = kwargs[keys]
    job_ids = cloud.map(func, *args, **kw)
    results = cloud.result(job_ids)
    if isinstance(results[0], tuple):
        par_return = [elem for elem in results]
        num_elems = len(results[0])
        return [np.array([elem[ii] for elem in results])
                for ii in range(num_elems)]
    else:
        return list(results)
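# A hedged usage sketch for the wrapper above. 'simulate' and its argument
# lists are hypothetical stand-ins; the keyword settings are the ones the
# docstring lists. As with cloud.map itself, each positional argument is a
# sequence, and element i of every sequence goes to one parallel call.
def simulate(omega, gamma):
    return omega / gamma

omegas = [1.0, 2.0, 3.0]
gammas = [0.1, 0.2, 0.3]
results = picloud(simulate, omegas, gammas, _type='f2', _label='demo run')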
def cloud_map(func, args, jobs=None, return_jobs=False, **cloud_opts):
    """
    Call cloud.map, with some standard logging info

    Parameters
    ----------
    func : function to map
    args : list of mapping arguments
    jobs : list of pre-existing job ids, or None
        If present, will fetch the results from these jobs
    return_jobs : boolean (optional, default false)
        If True, return the job IDs instead of the job results
    cloud_opts : dict (optional)
        Extra keyword arguments to pass to cloud.map

    Returns
    -------
    Result of cloud.map if return_jobs=False, else the job ids
    """
    import cloud

    cloud_opts.setdefault('_env', 'mwp')
    cloud_opts.setdefault('_type', 'c2')
    cloud_opts.setdefault('_label', func.__name__)

    if jobs is None:
        log = logging.getLogger(func.__module__)
        log.debug(
            "Starting %i jobs on PiCloud for %s" % (len(args), func.__name__))
        jobs = cloud.map(func, args, **cloud_opts)
        log.debug("To re-fetch results, use \n"
                  "%s(jobs=range(%i, %i))" %
                  (func.__name__, min(jobs), max(jobs) + 1))

    if return_jobs:
        return jobs
    return cloud.result(jobs)
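# A brief usage sketch, assuming a hypothetical worker 'process_tile'. It
# shows the two paths the docstring describes: a fresh submission with one
# default keyword overridden, and re-fetching results from previously
# returned job ids.
def process_tile(tile_id):
    return tile_id ** 2

# Fresh submission; '_type' overrides the 'c2' default set above.
job_ids = cloud_map(process_tile, range(100), return_jobs=True, _type='f2')

# Later (even from another session): fetch results of those same jobs.
results = cloud_map(process_tile, range(100), jobs=job_ids)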
def url_chunker(url, chunksize=1024):
    """Returns an iterator over contents of a file

    *Params*
    #file - an open FILE object
    #chunksize - how many lines to read at once?
    """
    #url=book[0]
    #bookname=book[1]
    user_agent = {'User-agent': 'Mozilla/5.0'}
    result = requests.get(url, headers=user_agent)
    try:
        doc = result.content
    except:
        raise Exception("URL " + url + " not responding")

    text_in = StringIO(doc)
    chunks = []
    stop = False
    while not stop:
        text = ""
        for x in range(chunksize):
            try:
                text += text_in.next()
            except StopIteration:
                chunks.append(text)
                stop = True
                break
        chunks.append(text)

    jobids = cloud.map(wordcount, [(url, c) for c in chunks])
    cloud.join(jobids, deadlock_check=False)
    results = cloud.result(jobids)
    index = reduce_results(results)
    mongo_insert(index)
    return "OK"
def run_ip():
    #Figure out how many jobs I want to create and how many requests per job
    job_count = int(sys.argv[1])
    job_rows = range(0, job_count)

    #Now actually map them to run in the cloud
    #The "s1" type gives unique IP addresses. Eek
    print "Creating job map for {0} jobs.".format(len(job_rows))
    jids = cloud.map(download_ip, job_rows, _type="s1")
    print "Waiting for jobs to complete."

    #The possible statuses and the statuses we are waiting for
    possible_job_statutes = ["waiting", "queued", "processing", "done",
                             "error", "killed", "stalled"]
    pending_job_statuses = Set(["waiting", "queued", "processing"])

    #Keep looping until no job statuses are in the pending_job_statuses
    statuses = []
    while True:
        statuses = cloud.status(jids)
        tally = Counter()
        for status in statuses:
            tally[status] += 1
        print "Status of jobs: " + str(tally)

        #If none of the statuses are in pending_job_statuses, we are done!
        if len(pending_job_statuses.intersection(Set(statuses))) == 0:
            break

        #Wait for 5 seconds between checks
        sleep(5)

    #Now loop through the jobs and retrieve the results
    ip_counter = Counter()
    results = cloud.result(jids)
    for result in results:
        ip_counter[result] += 1
    print "IP Addresses: " + str(ip_counter)
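# A minimal alternative to the polling loop above, sketched here as an
# assumption: other snippets in this collection block on cloud.join instead
# of polling cloud.status, and the same pattern would apply here once the
# jobs have been mapped.
cloud.join(jids)
results = cloud.result(jids)
ip_counter = Counter()
for result in results:
    ip_counter[result] += 1
print "IP Addresses: " + str(ip_counter)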
def detect_trials(pos_path, neg_path, threshold, test_frac, cmpr_window,
                  cmpr_step, w_smooth, gamma, p_sample, detection_step,
                  min_dist_step, detection_window_hrs, req_consec_detections):
    trials = 5
    pos_path_ = [pos_path] * trials
    neg_path_ = [neg_path] * trials
    threshold_ = [threshold] * trials
    test_frac_ = [test_frac] * trials
    cmpr_window_ = [cmpr_window] * trials
    cmpr_step_ = [cmpr_step] * trials
    w_smooth_ = [w_smooth] * trials
    gamma_ = [gamma] * trials
    p_sample_ = [p_sample] * trials
    detection_step_ = [detection_step] * trials
    min_dist_step_ = [min_dist_step] * trials
    detection_window_hrs_ = [detection_window_hrs] * trials
    req_consec_detections_ = [req_consec_detections] * trials

    jids = cloud.map(detect_trial, pos_path_, neg_path_, threshold_, test_frac_,
                     cmpr_window_, cmpr_step_, w_smooth_, gamma_, p_sample_,
                     detection_step_, min_dist_step_, detection_window_hrs_,
                     req_consec_detections_, _type='f2')

    params = Params(pos_path, neg_path, threshold, test_frac, cmpr_window,
                    cmpr_step, w_smooth, gamma, p_sample, detection_step,
                    min_dist_step, detection_window_hrs, req_consec_detections)
    return params, jids
def extract_snapshots_on_cloud(demo_type, core_type):
    """
    runs snapshot extraction on the cloud and saves the result on local machine.
    """
    demo_testing_dir = osp.join(testing_results_dir, demo_type)
    env_state_files = find_recursive(demo_testing_dir, '*.cp')

    state_infos = []
    for env_state_file in env_state_files[0:2]:
        with open(env_state_file, "r") as fh:
            seg_info = cp.load(fh)['seg_info']
        if seg_info == None:
            continue
        save_dir = osp.join(osp.dirname(env_state_file), 'snapshots',
                            osp.splitext(osp.basename(env_state_file))[0])
        state_infos.append((seg_info, save_dir))

    print colorize("calling on cloud..", "yellow", True)
    jids = cloud.map(get_state_snapshots, state_infos, _env='RSS3', _type=core_type)
    res = cloud.result(jids)
    print colorize("got snapshots from cloud for : %s. Saving..." % demo_type, "green", True)
    save_snapshots_from_cloud(res)
TRIALS = int(sys.argv[1])
NUM_PARTICLES = int(sys.argv[2])
DELTA = int(sys.argv[3])
INTEGRAL_PATHS = int(sys.argv[4])


def run_on_instance(trial_id):
    global number_of_clusters
    global if_zero_shortlearning
    global experiment_name
    import subprocess
    import os
    os.environ['DISPLAY'] = ":1"
    print "Starting"
    ls_output = subprocess.Popen(["/home/picloud/julia/julia", "runner.jl",
                                  str(NUM_PARTICLES), str(DELTA), str(INTEGRAL_PATHS)],
                                 cwd="/home/picloud/DPMixtureModel/DPMM_SMC/",
                                 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = ls_output.communicate()
    return out


#result = run_on_instance([1])
jids = cloud.map(run_on_instance, range(TRIALS), _env=cloud_environment,
                 _type='c2', _cores=1)
print jids
result = cloud.result(jids)
pickle.dump(result, open("result_" + str(NUM_PARTICLES) + "particles_" +
                         str(DELTA) + "delta_" + str(INTEGRAL_PATHS) + "path.pkl", "wb"))
print "RESULT:", result
def calc_pure_python(show_output):
    # make a list of x and y values which will represent q
    # xx and yy are the co-ordinates, for the default configuration they'll look like:
    # if we have a 1000x1000 plot
    # xx = [-2.13, -2.1242, -2.1184000000000003, ..., 0.7526000000000064, 0.7584000000000064, 0.7642000000000064]
    # yy = [1.3, 1.2948, 1.2895999999999999, ..., -1.2844000000000058, -1.2896000000000059, -1.294800000000006]
    x_step = (float(x2 - x1) / float(w)) * 2
    y_step = (float(y1 - y2) / float(h)) * 2
    x = []
    y = []
    ycoord = y2
    while ycoord > y1:
        y.append(ycoord)
        ycoord += y_step
    xcoord = x1
    while xcoord < x2:
        x.append(xcoord)
        xcoord += x_step
    q = []
    for ycoord in y:
        for xcoord in x:
            q.append(complex(xcoord, ycoord))

    print "Total elements:", len(q)

    # split work list into contiguous chunks, one per CPU
    # build this into chunks which we'll apply to map_async
    nbr_chunks = 128  # experiment with different nbrs of chunks
    #nbr_chunks = multiprocessing.cpu_count()
    chunk_size = len(q) / nbr_chunks

    # split our long work list into smaller chunks
    # make sure we handle the edge case where nbr_chunks doesn't evenly fit into len(q)
    if len(q) % nbr_chunks != 0:
        # make sure we get the last few items of data when we have
        # an odd size to chunks (e.g. len(q) == 100 and nbr_chunks == 3)
        nbr_chunks += 1
    chunks = [(q[x * chunk_size:(x + 1) * chunk_size], maxiter)
              for x in xrange(nbr_chunks)]
    print chunk_size, len(chunks), len(chunks[0][0])

    print "Size of complex nbr:", sys.getsizeof(0 + 0j)
    print "We have %d complex numbers in the q array" % (len(q))
    print "In total we're sending %d bytes" % (len(q) * sys.getsizeof(0 + 0j))
    print "And receiving %d bytes" % (len(q) * sys.getsizeof(1))

    # create a Pool which will create Python processes
    start_time = datetime.datetime.now()
    print "Running cloud.map on %d chunks" % (len(chunks))
    jids = cloud.map(calculate_z, chunks)
    # we get a list of lists back, one per chunk, so we have to
    # flatten them back together
    # return a list of lists of results
    print "Jobs submitted, waiting on results"
    results = cloud.result(jids)  # [[ints...], [ints...], []]
    print "cloud.result completed"
    output = []
    for res in results:
        output += res
    end_time = datetime.datetime.now()
    secs = end_time - start_time
    print "Main took", secs

    validation_sum = sum(output)
    print "Total sum of elements (for validation):", validation_sum

    if show_output:
        show(output)

    return validation_sum
def run_exp((data_filename, inits), wait_file, kernel_config_name):
    # put the filenames in the data
    irm.experiments.to_bucket(data_filename, BUCKET_BASE)
    [irm.experiments.to_bucket(init_f, BUCKET_BASE) for init_f in inits]

    kc = KERNEL_CONFIGS[kernel_config_name]
    CHAINS_TO_RUN = len(inits)
    ITERS = kc['ITERS']
    kernel_config = kc['kernels']
    fixed_k = kc.get('fixed_k', False)

    jids = cloud.map(irm.experiments.inference_run,
                     inits,
                     [data_filename] * CHAINS_TO_RUN,
                     [kernel_config] * CHAINS_TO_RUN,
                     [ITERS] * CHAINS_TO_RUN,
                     range(CHAINS_TO_RUN),
                     [BUCKET_BASE] * CHAINS_TO_RUN,
                     [None] * CHAINS_TO_RUN,
                     [fixed_k] * CHAINS_TO_RUN,
                     _label="%s-%s-%s" % (data_filename, inits[0], kernel_config_name),
                     _env='connectivitymotif',
                     _type='f2')

    pickle.dump({'jids': jids,
                 'data_filename': data_filename,
                 'inits': inits,
                 'kernel_config_name': kernel_config_name},
                open(wait_file, 'w'))


@transform(run_exp, suffix('.wait'), '.samples')
def launch_jobs(hdf_bucket, training_keys, testing_keys, raw_features=None,
                start_hour=3, end_hour=7, num_features=1, future_offset=600,
                profile=True):
    all_possible_params = gen_feature_params(raw_features)
    chosen_params = []

    def worker_wrapper(new_param):
        return download_and_eval(hdf_bucket, training_keys, testing_keys,
                                 chosen_params, new_param,
                                 start_hour=start_hour, end_hour=end_hour,
                                 future_offset=future_offset)

    for feature_num in xrange(num_features):
        print "=== Searching for feature #%d ===" % (feature_num + 1)
        print "Launching %d jobs over %d training files and %d testing files" % \
            (len(all_possible_params), len(training_keys), len(testing_keys))
        label = 'Evaluating %d parameter sets for feature #%d' % \
            (len(all_possible_params), feature_num + 1)
        jids = \
            cloud.map(worker_wrapper, all_possible_params,
                      _env='compute', _label=label, _type='f2', _profile=profile)
        results = {}
        best_result = None
        best_model = None
        for (i, (curr_best_score, curr_best_model, results)) in enumerate(cloud.iresult(jids)):
            if results is None:
                results = {}
            else:
                assert isinstance(results, dict)
            print "Got %d results with best score = %s" % (len(results), curr_best_score)
            for (param, r) in results.items():
                key = tuple(chosen_params + [param])
                results[key] = r
                #if r is None:
                #    print param, "<skipped>"
                if r is not None:
                    train_result, test_result = r
                    score = train_result.score
                    if best_result is None or not np.isfinite(best_result['train'].score) or \
                            best_result['train'].score < score:
                        print "New best:", key
                        print "result for training data:", train_result
                        if test_result:
                            print "result for testing data:", test_result
                        print
                        best_result = {'params': key,
                                       'train': train_result,
                                       'test': test_result}
                        best_model = curr_best_model
        print
        print "Current best for %d features: %s" % (feature_num + 1, best_result)
        print
        curr_best_params = best_result['params']
        if len(curr_best_params) < feature_num + 1:
            print "Got no improvement from adding %d'th feature, stopping..."
            break
        else:
            chosen_params.append(curr_best_params[-1])
    return best_result, best_model, results
    ==PARAMS==
    results:
        A list of Counter() objects, as produced by cloud.result() method

    ==RETURNS==
    a Counter() object with total word-counts for the whole body of text
    """
    total_wordcount = Counter()
    for r in results:
        total_wordcount.update(r)
    return total_wordcount


##job_ids=cloud.map(wordcount,chunker(f))

## where are the files we care about?
path = '../www.gutenberg.lib.md.us/etext00'

## start cloud jobs over chunks of text
job_ids = cloud.map(wordcount, filechunker(path))

while True:
    c = cloud_status(job_ids)
    print c
    if c['processing'] == 0:
        break
    else:
        sleep(10)

res = cloud.result(job_ids)
@files(experiment_generator)
def run_exp((data_filename, inits), wait_file, kernel_config_name):
    # put the filenames in the data
    irm.experiments.to_bucket(data_filename, BUCKET_BASE)
    [irm.experiments.to_bucket(init_f, BUCKET_BASE) for init_f in inits]

    kc = KERNEL_CONFIGS[kernel_config_name]
    CHAINS_TO_RUN = len(inits)
    ITERS = kc['ITERS']
    kernel_config = kc['kernels']
    init_type = kc.get('init', None)

    jids = cloud.map(irm.experiments.inference_run,
                     inits,
                     [data_filename] * CHAINS_TO_RUN,
                     [kernel_config] * CHAINS_TO_RUN,
                     [ITERS] * CHAINS_TO_RUN,
                     range(CHAINS_TO_RUN),
                     [BUCKET_BASE] * CHAINS_TO_RUN,
                     [init_type] * CHAINS_TO_RUN,
                     _env='connectivitymotif',
                     _type='f2')

    pickle.dump({'jids': jids,
                 'data_filename': data_filename,
                 'inits': inits,
                 'kernel_config_name': kernel_config_name},
                open(wait_file, 'w'))


@transform(run_exp, suffix('.wait'), '.samples')
def get_results(exp_wait, exp_results):
def param_search(features, train_files, test_files,
                 debug=False,
                 regression=False,
                 signal=signals.bid_offer_cross,
                 ensemble=ClassifierEnsemble,
                 base_models=[ClusteredClassifier(20)],
                 num_models=[25],
                 bagging_percents=[0.75],
                 dict_types=[None],  # , 'kmeans'
                 dict_sizes=[None],  # , 50
                 pca_types=[None, 'whiten'],
                 compute_pairwise_products=[False],
                 binning=[False],
                 stacking_models=[None, LogisticRegression()],
                 start_hour=None,
                 end_hour=None):
    print "Features:", features
    print "Training files:", train_files
    print "Testing files:", test_files

    def do_work(p):
        return worker(p, features, train_files, test_files, start_hour, end_hour)

    oversampling_factors = [0]
    possible_encoder_params = {
        'dictionary_type': dict_types,
        'dictionary_size': dict_sizes,
        'pca_type': pca_types,
        'compute_pairwise_products': compute_pairwise_products,
        'binning': binning,
    }
    all_encoders = [FeatureEncoder(**p)
                    for p in cartesian_product(possible_encoder_params)
                    if (p['dictionary_size'] is not None or p['dictionary_type'] is None)]

    possible_ensemble_params = {
        'base_model': base_models,
        'num_models': num_models,
        'stacking_model': stacking_models,
        'verbose': [True],
        'feature_subset_percent': [0.75],
        'bagging_percent': bagging_percents,
    }
    # classification ensembles get weighted by F-score
    if not regression:
        possible_ensemble_params['weighting'] = [0.25]
    all_ensembles = [ensemble(**params)
                     for params in cartesian_product(possible_ensemble_params)]

    if regression:
        train_params = {}
    else:
        train_params = {'class_weight': {0: 1, 1: 5, -1: 10}}

    worklist = []
    for smote_factor in oversampling_factors:
        general_params = {
            'oversampling_factor': smote_factor,
            'signal': signal,
            'regression': regression,
        }
        for encoder in all_encoders:
            for model in all_ensembles:
                params = {
                    'general': general_params,
                    'encoder': encoder,
                    'model': model,
                    'training': train_params
                }
                worklist.append(params)

    if debug:
        print "[Debug mode]"
        result_list = map(do_work, worklist[:1])
        for params, features, e, svm, result in result_list:
            print params, "=>", result
    else:
        init_cloud()
        label = ", ".join(train_files)
        jobids = cloud.map(do_work, worklist,
                           _fast_serialization=2,
                           _type='m1',
                           _label=label,
                           _env='param_search')
        results = []
        print "Launched", len(worklist), "jobs, waiting for results..."
        for x in cloud.iresult(jobids):
            if x is not None:
                results.append(x)
                print x['params']
                print x['model']
                print x['result']
                print "---"

        def cmp(x, y):
            return int(np.sign(x['result']['cmp'] - y['result']['cmp']))

        results.sort(cmp=cmp)
        print "Best:"
        for item in results[-3:]:
            print item['params']
            r = item['result']
            print [(k, r[k]) for k in sorted(r.keys())]
# now the input args
chunk_size = 80000
chunks = int(np.ceil(len(sv) / float(chunk_size)))
args = []
for i in range(chunks):
    args += [(i * chunk_size, (i + 1) * chunk_size)]
CN = chunks

results = []
if USE_CLOUD:
    print "MAPPING TO THE CLOUD"
    jids = cloud.map(picloud_score_frame,
                     [dataset_name] * CN,
                     [x_range] * CN,
                     [y_range] * CN,
                     [phi_range] * CN,
                     [theta_range] * CN,
                     args,
                     [frame] * CN,
                     [EO] * CN,
                     [likelihood_i] * CN,
                     _type='f2', _vol="my-vol", _env="base/precise")
else:
    jids = map(picloud_score_frame,
               [dataset_name] * CN,
               [x_range] * CN,
               [y_range] * CN,
               [phi_range] * CN,
               [theta_range] * CN,
               args,
               [frame] * CN,
               [EO] * CN,
               [likelihood_i] * CN)

np.savez_compressed(outfile_npz,
                    x_range=x_range, y_range=y_range,
                    phi_range=phi_range, theta_range=theta_range)

pickle.dump(
    {
                     cmpr_step, w_smooth, gamma, p_sample, detection_step,
                     min_dist_step, detection_window_hrs, req_consec_detections)
    param_product_old = set(param_product_old)
    param_product_new = set(param_product_new)
    param_product = param_product_new.difference(param_product_old)
    """
    jids = cloud.map(detect_trials, *zip(*param_product), _type='f2')
    params_sub_jids = cloud.result(jids)
    params = [elt[0] for elt in params_sub_jids]
    sub_jids = [elt[1] for elt in params_sub_jids]
    stats = cloud.result(sub_jids)

    dt = datetime.now()
    # Write out as plain text just in case.
    out_path_txt = 'data/param_explore_%d%d%d%d%d%d.txt' % \
        (dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second)
    open(out_path_txt, 'w').write(str((params, stats)))

    params, stats = fix_results_nesting((params, stats))
    out_path_pkl = 'data/param_explore_%d%d%d%d%d%d.pkl' % \