def main():
    Log.set_loglevel(logging.DEBUG)

    modulename = "sample_ozone_posterior_average_slurm"

    if not FileSystem.cmd_exists("sbatch"):
        engine = SerialComputationEngine()
    else:
        # alternative partition list including workstations:
        # johns_slurm_hack = "#SBATCH --partition=intel-ivy,wrkstn,compute"
        johns_slurm_hack = "#SBATCH --partition=intel-ivy,compute"
        folder = os.sep + os.sep.join(["nfs", "data3", "ucabhst", modulename])
        batch_parameters = BatchClusterParameters(foldername=folder,
                                                  max_walltime=24 * 60 * 60,
                                                  resubmit_on_timeout=False,
                                                  memory=3,
                                                  parameter_prefix=johns_slurm_hack)
        engine = SlurmComputationEngine(batch_parameters, check_interval=1,
                                        do_clean_up=True)

    prior = Gaussian(Sigma=eye(2) * 100)
    num_estimates = 100

    posterior = OzonePosteriorAverageEngine(computation_engine=engine,
                                            num_estimates=num_estimates,
                                            prior=prior)
    posterior.logdet_method = "shogun_estimate"

    proposal_cov = diag([4.000000000000000e-05, 1.072091680000000e+02])
    mcmc_sampler = StandardMetropolis(posterior, scale=1.0, cov=proposal_cov)

    start = asarray([-11.35, -13.1])
    mcmc_params = MCMCParams(start=start, num_iterations=2000)
    chain = MCMCChain(mcmc_sampler, mcmc_params)

    chain.append_mcmc_output(StatisticsOutput(print_from=1, lag=1))

    home = expanduser("~")
    folder = os.sep.join([home, modulename])
    store_chain_output = StoreChainOutput(folder)
    chain.append_mcmc_output(store_chain_output)

    # resume from the last stored chain if one exists
    loaded = store_chain_output.load_last_stored_chain()
    if loaded is None:
        logging.info("Running chain from scratch")
    else:
        logging.info("Running chain from iteration %d" % loaded.iteration)
        chain = loaded

    chain.run()

    f = open(folder + os.sep + "final_chain", "wb")  # binary mode for pickling
    dump(chain, f)
    f.close()
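# The pickled chain written at the end of main() can be loaded back for
# inspection; a minimal sketch, assuming dump() above is pickle.dump and the
# same folder layout:
from pickle import load

def load_final_chain(folder):
    # reads the chain object that main() wrote to <folder>/final_chain
    with open(folder + os.sep + "final_chain", "rb") as f:
        return load(f)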
def test_shogun_on_serial_engine(self):
    home = expanduser("~")
    folder = os.sep.join([home, "unit_test_shogun_on_sge_dummy_result"])
    try:
        shutil.rmtree(folder)
    except OSError:
        pass
    engine = SerialComputationEngine()
    num_submissions = 3
    sleep_times = randint(0, 3, num_submissions)
    self.engine_tester(engine, sleep_times)
def main():
    Log.set_loglevel(logging.DEBUG)

    prior = Gaussian(Sigma=eye(2) * 100)
    num_estimates = 2

    home = expanduser("~")
    folder = os.sep.join([home, "sample_ozone_posterior_rr_sge"])

    computation_engine = SerialComputationEngine()

    rr_instance = RussianRoulette(1e-3, block_size=10)

    posterior = OzonePosteriorRREngine(rr_instance=rr_instance,
                                       computation_engine=computation_engine,
                                       num_estimates=num_estimates,
                                       prior=prior)
    posterior.logdet_method = "shogun_estimate"

    proposal_cov = diag([4.000000000000000e-05, 1.072091680000000e+02])
    mcmc_sampler = StandardMetropolis(posterior, scale=1.0, cov=proposal_cov)

    start = asarray([-11.35, -13.1])
    mcmc_params = MCMCParams(start=start, num_iterations=200)
    chain = MCMCChain(mcmc_sampler, mcmc_params)

    # chain.append_mcmc_output(PlottingOutput(None, plot_from=1, lag=1))
    chain.append_mcmc_output(StatisticsOutput(print_from=1, lag=1))

    store_chain_output = StoreChainOutput(folder, lag=50)
    chain.append_mcmc_output(store_chain_output)

    # resume from the last stored chain if one exists
    loaded = store_chain_output.load_last_stored_chain()
    if loaded is None:
        logging.info("Running chain from scratch")
    else:
        logging.info("Running chain from iteration %d" % loaded.iteration)
        chain = loaded

    chain.run()

    f = open(folder + os.sep + "final_chain", "wb")  # binary mode for pickling
    dump(chain, f)
    f.close()
    return job


if __name__ == "__main__":
    logger.setLevel(10)
    num_repetitions = 10

    # plain MCMC parameters, plan is to use every 200th sample
    thin_step = 1
    num_iterations = 5200
    num_warmup = 200

    compute_local = False

    if not FileSystem.cmd_exists("sbatch") or compute_local:
        engine = SerialComputationEngine()
    else:
        johns_slurm_hack = "#SBATCH --partition=intel-ivy,wrkstn,compute"
        # modulename is defined at module level, outside this excerpt
        folder = os.sep + os.sep.join(["nfs", "data3", "ucabhst", modulename])
        batch_parameters = BatchClusterParameters(foldername=folder,
                                                  resubmit_on_timeout=False,
                                                  parameter_prefix=johns_slurm_hack)
        engine = SlurmComputationEngine(batch_parameters, check_interval=1,
                                        do_clean_up=True)
        engine.max_jobs_in_queue = 1000
        engine.store_fire_and_forget = True

    aggs = []
No aggregators are stored and results can be picked up from disc when ready.

This script also illustrates a typical use case in scientific computing:
running the same function with different parameters a certain number of times.

Make sure to read the minimal example first.
"""

Log.set_loglevel(10)

# filename of the result database
home = expanduser("~")
foldername = os.path.join(home, "test")
db_fname = os.path.join(foldername, "test.txt")

batch_parameters = BatchClusterParameters(foldername=foldername)
engine = SerialComputationEngine()
# engine = SlurmComputationEngine(batch_parameters)

# here are some example parameters for jobs;
# we create all combinations and then shuffle them,
# which randomizes the runs over the parameter space
params_x = np.linspace(-3, 3, num=25)
params_y = np.linspace(-2, 2, num=12)
all_parameters = list(itertools.product(params_x, params_y))
shuffle(all_parameters)

print("Number of parameter combinations: %d" % len(all_parameters))

# integer division keeps the slice index an int
for params in all_parameters[:len(all_parameters) // 300]:
    x = params[0]
    y = params[1]
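    # The excerpt ends inside this loop. A plausible continuation, sketched
    # under the assumption of a hypothetical MyJob class that takes the two
    # parameters (not shown in this excerpt), would submit one job per
    # combination in fire-and-forget mode:
    #
    #     job = MyJob(ScalarResultAggregator(), x, y)
    #     engine.submit_job(job)
    #
    # with results picked up later from the result database at db_fname.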
def test_serial_engine(self):
    num_submissions = 3
    sleep_times = randint(0, 3, num_submissions)
    self.engine_helper(SerialComputationEngine(), sleep_times)
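# engine_helper is defined elsewhere in the test class. A minimal sketch of
# what such a helper typically does, assuming a hypothetical SleepJob that
# sleeps for the given time and then submits a scalar result (SleepJob is an
# assumption, not part of this excerpt):
def engine_helper(self, engine, sleep_times):
    aggregators = []
    for t in sleep_times:
        job = SleepJob(ScalarResultAggregator(), sleep_time=t)
        aggregators.append(engine.submit_job(job))

    # block until every submitted job has finished
    engine.wait_for_all()

    # pull results back and check that each job produced one
    for agg in aggregators:
        agg.finalize()
        result = agg.get_final_result()
        agg.clean_up()
        self.assertIsNotNone(result)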
this script, we can collect results from the cluster and potentially submit
more jobs.
"""

Log.set_loglevel(10)

# folder for all job files
home = expanduser("~")
foldername = os.sep.join([home, "minimal_example"])

# parameters for the cluster (folder, name, etc.)
batch_parameters = BatchClusterParameters(foldername=foldername)

# the engine is the object that jobs are submitted to;
# there are implementations for different batch cluster systems,
# and the serial one runs everything locally
engine = SerialComputationEngine()
# engine = SGEComputationEngine(batch_parameters)
# engine = SlurmComputationEngine(batch_parameters)

# On submission, the engine returns aggregators that can be
# used to retrieve results after potentially doing postprocessing
returned_aggregators = []
for i in range(3):
    job = MyJob(ScalarResultAggregator())
    agg = engine.submit_job(job)
    returned_aggregators.append(agg)

# This call blocks until all jobs are finished (magic happens here)
logger.info("Waiting for all jobs to be completed.")
engine.wait_for_all()
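# The example assumes a user-defined MyJob class. A minimal sketch of what
# such a job might look like; the import paths and base-class names below
# follow the independent-jobs package layout and are assumptions:
from independent_jobs.jobs.IndependentJob import IndependentJob
from independent_jobs.results.ScalarResult import ScalarResult


class MyJob(IndependentJob):
    def __init__(self, aggregator):
        IndependentJob.__init__(self, aggregator)

    def compute(self):
        # do the actual work here, then hand a result to the aggregator,
        # which ships it back to the submitting process
        result = ScalarResult(42)
        self.aggregator.submit_result(result)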
num_warmup = 500
thin_step = 1
num_iterations = 2000 + num_warmup

# note: the two assignments below override the settings above
num_iterations = 100
num_warmup = 0

# hmc parameters
num_steps_min = 10
num_steps_max = 100
sigma_p = 1.
momentum_seed = np.random.randint(int(time.time()))

compute_local = False

if not FileSystem.cmd_exists("sbatch") or compute_local:
    engine = SerialComputationEngine()
else:
    johns_slurm_hack = "#SBATCH --partition=intel-ivy,wrkstn,compute"
    # modulename is defined at module level, outside this excerpt
    folder = os.sep + os.sep.join(["nfs", "data3", "ucabhst", modulename])
    batch_parameters = BatchClusterParameters(foldername=folder,
                                              resubmit_on_timeout=False,
                                              parameter_prefix=johns_slurm_hack)
    engine = SlurmComputationEngine(batch_parameters, check_interval=1,
                                    do_clean_up=True)
    engine.max_jobs_in_queue = 1000
    engine.store_fire_and_forget = True

aggs_hmc_kmc = {}
def run_problem(prob_label):
    """Run the experiment"""
    # /////// submit jobs //////////
    # create folder name string
    # result_folder = glo.result_folder()
    from kcgof.config import get_default_config
    config = get_default_config()
    tmp_dir = config['ex_scratch_path']
    foldername = os.path.join(tmp_dir, 'kcgof_slurm', 'e%d' % ex)
    logger.info("Setting engine folder to %s" % foldername)

    # create parameter instance that is needed for any batch computation engine
    logger.info("Creating batch parameter instance")
    batch_parameters = BatchClusterParameters(
        foldername=foldername, job_name_base="e%d_" % ex, parameter_prefix="")

    use_cluster = glo._get_key_from_default_config('ex_use_slurm_cluster')
    if use_cluster:
        # use a Slurm cluster
        partitions = config['ex_slurm_partitions']
        if partitions is None:
            engine = SlurmComputationEngine(batch_parameters)
        else:
            engine = SlurmComputationEngine(batch_parameters, partition=partitions)
    else:
        # serial engine, used when the Slurm queue is not available
        engine = SerialComputationEngine()

    n_methods = len(method_funcs)

    # problem setting
    ns, p, rx, cs = get_ns_model_source(prob_label)

    # repetitions x len(ns) x #methods
    aggregators = np.empty((reps, len(ns), n_methods), dtype=object)

    for r in range(reps):
        for ni, n in enumerate(ns):
            for mi, f in enumerate(method_funcs):
                # name used to save the result
                func_name = f.__name__
                fname = '%s-%s-n%d_r%d_a%.3f.p' \
                    % (prob_label, func_name, n, r, alpha)
                if not is_rerun and glo.ex_file_exists(ex, prob_label, fname):
                    logger.info('%s exists. Load and return.' % fname)
                    job_result = glo.ex_load_result(ex, prob_label, fname)

                    sra = SingleResultAggregator()
                    sra.submit_result(SingleResult(job_result))
                    aggregators[r, ni, mi] = sra
                else:
                    # result does not exist, or a rerun is requested
                    job = Ex1Job(SingleResultAggregator(), prob_label, r, f, n)
                    agg = engine.submit_job(job)
                    aggregators[r, ni, mi] = agg

    # let the engine finish its business
    logger.info("Waiting for all jobs in the engine")
    engine.wait_for_all()

    # ////// collect the results ///////////
    logger.info("Collecting results")
    job_results = np.empty((reps, len(ns), n_methods), dtype=object)
    for r in range(reps):
        for ni, n in enumerate(ns):
            for mi, f in enumerate(method_funcs):
                logger.info("Collecting result (%s, r=%d, n=%d)" %
                            (f.__name__, r, n))
                # let the aggregator finalize things
                aggregators[r, ni, mi].finalize()

                # get_final_result() returns a SingleResult instance,
                # from which we extract the actual result
                job_result = aggregators[r, ni, mi].get_final_result().result
                job_results[r, ni, mi] = job_result

    # func_names = [f.__name__ for f in method_funcs]
    # func2labels = exglobal.get_func2label_map()
    # method_labels = [func2labels[f] for f in func_names if f in func2labels]

    # save results
    results = {
        'job_results': job_results,
        # 'p': p,
        # 'cond_source': cs,
        'alpha': alpha,
        'repeats': reps,
        'ns': ns,
        'method_funcs': method_funcs,
        'prob_label': prob_label,
    }

    # name of the file the aggregated results are saved to
    fname = 'ex%d-%s-me%d_rs%d_nmi%d_nma%d_a%.3f.p' \
        % (ex, prob_label, n_methods, reps, min(ns), max(ns), alpha)
    glo.ex_save_result(ex, results, fname)
    logger.info('Saved aggregated results to %s' % fname)
home = expanduser("~")
foldername = os.sep.join([home, "minimal_example"])
logger.info("Setting engine folder to %s" % foldername)

# create parameter instance that is needed for any batch computation engine
logger.info("Creating batch parameter instance")
batch_parameters = BatchClusterParameters(foldername=foldername)

# one could create an SGE engine instance here instead, which jobs can be
# submitted to; there are more engines available
# logger.info("Creating SGE engine instance")
# engine = SGEComputationEngine(batch_parameters, check_interval=1)

# create serial engine (which works locally)
logger.info("Creating serial engine instance")
engine = SerialComputationEngine()

# we have to collect aggregators somehow
aggregators = []

# submit job three times
logger.info("Starting loop over job submission")
for i in range(3):
    logger.info("Submitting job %d" % i)
    job = MyJob(ScalarResultAggregator())
    aggregators.append(engine.submit_job(job))

# let the engine finish its business
logger.info("Waiting for all jobs in the engine")
engine.wait_for_all()
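# After wait_for_all() returns, results can be pulled from the collected
# aggregators, following the finalize / get_final_result / clean_up pattern
# used in the other scripts; the .result attribute access assumes a
# ScalarResult-style wrapper around the raw value:
results = []
for i, agg in enumerate(aggregators):
    agg.finalize()
    result = agg.get_final_result().result
    agg.clean_up()
    logger.info("Job %d returned %s" % (i, str(result)))
    results.append(result)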
def compute(fname_base, job_generator, Ds, Ns, num_repetitions, num_steps,
            step_size, max_steps=None, compute_local=False):
    if not FileSystem.cmd_exists("sbatch") or compute_local:
        engine = SerialComputationEngine()
    else:
        johns_slurm_hack = "#SBATCH --partition=intel-ivy,wrkstn,compute"
        folder = os.sep + os.sep.join(["nfs", "data3", "ucabhst", fname_base])
        batch_parameters = BatchClusterParameters(foldername=folder,
                                                  resubmit_on_timeout=False,
                                                  parameter_prefix=johns_slurm_hack)
        engine = SlurmComputationEngine(batch_parameters, check_interval=1,
                                        do_clean_up=True)
        engine.max_jobs_in_queue = 1000
        engine.store_fire_and_forget = True

    # fixed order of aggregators
    aggregators = []
    for D in Ds:
        for N in Ns:
            for j in range(num_repetitions):
                logger.info("%s trajectory, D=%d/%d, N=%d/%d repetition %d/%d" %
                            (str(job_generator), D, np.max(Ds), N, np.max(Ns),
                             j + 1, num_repetitions))
                job = job_generator(D, N, N)
                aggregators += [engine.submit_job(job)]
                time.sleep(0.1)

    # block until all done
    engine.wait_for_all()

    avg_accept = np.zeros((num_repetitions, len(Ds), len(Ns)))
    avg_accept_est = np.zeros((num_repetitions, len(Ds), len(Ns)))
    log_dets = np.zeros((num_repetitions, len(Ds), len(Ns)))
    log_dets_est = np.zeros((num_repetitions, len(Ds), len(Ns)))
    avg_steps_taken = np.zeros((num_repetitions, len(Ds), len(Ns)))

    agg_counter = 0
    for i in range(len(Ds)):
        for k in range(len(Ns)):
            for j in range(num_repetitions):
                agg = aggregators[agg_counter]
                agg_counter += 1
                agg.finalize()
                result = agg.get_final_result()
                agg.clean_up()

                avg_accept[j, i, k] = result.acc_mean
                avg_accept_est[j, i, k] = result.acc_est_mean
                log_dets[j, i, k] = result.vol
                log_dets_est[j, i, k] = result.vol_est
                avg_steps_taken[j, i, k] = result.steps_taken

                with open(fname_base + ".csv", 'a+') as f:
                    line = np.array([
                        Ds[i], Ns[k],
                        avg_accept[j, i, k],
                        avg_accept_est[j, i, k],
                        log_dets[j, i, k],
                        log_dets_est[j, i, k],
                        avg_steps_taken[j, i, k],
                    ])
                    f.write(" ".join(map(str, line)) + os.linesep)
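# Each repetition appends one space-separated row to <fname_base>.csv, so the
# file can be loaded back for analysis; columns follow the order of the `line`
# array above: D, N, avg_accept, avg_accept_est, log_det, log_det_est,
# avg_steps_taken. A small sketch:
def load_results(fname_base):
    # np.loadtxt splits on whitespace by default, matching the writer above;
    # atleast_2d keeps the shape consistent if the file has a single row
    return np.atleast_2d(np.loadtxt(fname_base + ".csv"))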