def process_omniglot(data_dir, quiet):
    try:
        omniglot_dir = process_path(os.path.join(data_dir, 'omniglot'))

        if _validate_omniglot(omniglot_dir):
            print("Omniglot data seems to be present already.")
            return
        else:
            try:
                shutil.rmtree(omniglot_dir)
            except FileNotFoundError:
                pass

        os.makedirs(omniglot_dir, exist_ok=False)

        with cd(omniglot_dir):
            subprocess.run("git clone https://github.com/brendenlake/omniglot --depth=1".split(), check=True)

            with cd('omniglot/python'):
                zip_ref = zipfile.ZipFile('images_evaluation.zip', 'r')
                zip_ref.extractall('.')
                zip_ref.close()

                zip_ref = zipfile.ZipFile('images_background.zip', 'r')
                zip_ref.extractall('.')
                zip_ref.close()

            subprocess.run('mv omniglot/python/images_background/* .', shell=True, check=True)
            subprocess.run('mv omniglot/python/images_evaluation/* .', shell=True, check=True)

        print("Done setting up Omniglot data.")
    finally:
        try:
            shutil.rmtree(os.path.join(omniglot_dir, 'omniglot'))
        except FileNotFoundError:
            pass
def download_backgrounds(data_dir):
    """ Download background images.

    Result is that a directory called `backgrounds` is created inside `data_dir`.

    Parameters
    ----------
    data_dir: str
        Path to directory where files should be stored.

    """
    with cd(data_dir):
        if not os.path.exists('backgrounds'):
            command = "git clone {}".format(background_url).split()
            subprocess.run(command, check=True)
def _download_emnist(data_dir):
    """ Download the EMNIST data.

    Result is that a directory called "emnist_raw" is created inside `data_dir`,
    containing 4 files.

    Parameters
    ----------
    data_dir: str
        Path to directory where files should be stored.

    """
    emnist_raw_dir = os.path.join(data_dir, "emnist_raw")
    os.makedirs(emnist_raw_dir, exist_ok=True)

    with cd(emnist_raw_dir):
        if not os.path.exists('gzip.zip'):
            print("Downloading...")
            command = "wget --output-document=gzip.zip {}".format(emnist_url).split()
            subprocess.run(command, check=True)
        else:
            print("Found existing copy of gzip.zip, not downloading.")

        print("Extracting...")
        for fname in emnist_gz_names:
            if not os.path.exists(fname):
                subprocess.run('unzip gzip.zip gzip/{}'.format(fname), shell=True, check=True)
                shutil.move('gzip/{}'.format(fname), '.')
            else:
                print("{} already exists, skipping extraction.".format(fname))

        try:
            shutil.rmtree('gzip')
        except FileNotFoundError:
            pass

    return emnist_raw_dir
def submit_job(
        archive_path, category, exp_name, wall_time="1year", tasks_per_node=1, cpus_per_task=1,
        mem_per_cpu=0, queue="", kind="local", gpu_set="", project="rrg-bengioy-ad_gpu",
        installation_script_path=None, gpu_kind=None, **run_kwargs):

    assert kind in "slurm slurm-local".split()

    run_kwargs.update(
        wall_time=wall_time, tasks_per_node=tasks_per_node, cpus_per_task=cpus_per_task,
        kind=kind, gpu_set=gpu_set, mem_per_cpu=mem_per_cpu)

    run_kwargs['env_vars'] = dict(TF_CPP_MIN_LOG_LEVEL=3, CUDA_VISIBLE_DEVICES='-1')
    run_kwargs['dry_run'] = False

    scratch = os.path.join(cfg.parallel_experiments_run_dir, category)

    session = ParallelSession(exp_name, archive_path, 'map', scratch=scratch, **run_kwargs)

    job_path = session.job_path

    # Not strictly required if kind == "slurm-local", but do it anyway for completeness.
    with open(os.path.join(job_path, "session.pkl"), 'wb') as f:
        dill.dump(session, f, protocol=dill.HIGHEST_PROTOCOL, recurse=True)

    if kind == "slurm-local":
        session.run()
        return session

    if not installation_script_path:
        raise Exception("Must supply a value for installation_script_path when using SLURM.")

    installation_script_path = os.path.realpath(installation_script_path)

    entry_script = """#!/bin/bash
echo "Building venv..."
echo "Command: "
echo "srun -v --nodes=$SLURM_JOB_NUM_NODES --ntasks=$SLURM_JOB_NUM_NODES {installation_script_path}"
srun -v --nodes="$SLURM_JOB_NUM_NODES" --ntasks=$SLURM_JOB_NUM_NODES {installation_script_path}
echo "Sourcing venv..."
source "$SLURM_TMPDIR/env/bin/activate"
cd {job_path}
echo "Dropping into python..."
python run.py
""".format(installation_script_path=installation_script_path, job_path=job_path)

    with open(os.path.join(job_path, "run.sh"), 'w') as f:
        f.write(entry_script)

    python_script = """#!{}
import datetime
start = datetime.datetime.now()
print("Starting job at " + str(start))

import dill
with open("./session.pkl", "rb") as f:
    session = dill.load(f)

session.run()

end = datetime.datetime.now()
print("Finishing job at " + str(end))
print(str((end - start).total_seconds()) + " seconds elapsed between start and finish.")
""".format(sys.executable)

    with open(os.path.join(job_path, "run.py"), 'w') as f:
        f.write(python_script)

    wall_time_minutes = int(np.ceil(session.wall_time_seconds / 60))
    resources = "--nodes={} --ntasks-per-node={} --cpus-per-task={} --time={}".format(
        session.n_nodes, session.tasks_per_node, cpus_per_task, wall_time_minutes)

    if mem_per_cpu:
        resources = "{} --mem-per-cpu={}".format(resources, mem_per_cpu)

    if gpu_set:
        n_gpus = len([int(i) for i in gpu_set.split(',')])
        if gpu_kind:
            gpu_string = f"--gres=gpu:{gpu_kind}:{n_gpus}"
        else:
            gpu_string = f"--gres=gpu:{n_gpus}"
        resources = resources + ' ' + gpu_string

    email = "*****@*****.**"
    if queue:
        queue = "-p " + queue

    command = (
        "sbatch --job-name {exp_name} -D {job_path} --mail-type=ALL --mail-user={email} "
        "-A {project} {queue} --export=ALL {resources} "
        "-o stdout -e stderr run.sh".format(
            exp_name=exp_name, job_path=job_path, email=email,
            project=project, queue=queue, resources=resources
        )
    )

    print("\n" + "~" * 40)
    print(command)

    with cd(job_path):
        subprocess.run(command.split())

    return session
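# Hypothetical usage sketch (not part of the original module). It only shows how
# the SLURM path of submit_job above might be driven; "experiment.zip" is an
# assumed job archive produced by dps.hyper, and "install_env.sh" is an assumed
# script that builds a virtualenv under $SLURM_TMPDIR/env on each node, which is
# what the generated run.sh expects.
def _example_submit_slurm_job():
    return submit_job(
        archive_path="experiment.zip",
        category="grid_arithmetic",
        exp_name="rl_test",
        wall_time="1hour",
        tasks_per_node=8,
        cpus_per_task=1,
        mem_per_cpu=3800,
        kind="slurm",
        gpu_set="0,1",
        installation_script_path="install_env.sh",
    )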
def run(self):
    if self.dry_run:
        print("Dry run, so not running.")
        return

    # Have to jump through a hoop to get the proper node-local storage on cedar/graham.
    self.local_scratch_prefix = self.get_slurm_var("SLURM_TMPDIR")
    self.local_scratch = os.path.join(
        self.local_scratch_prefix, os.path.basename(self.job_path))

    # Compute new time limits based on the actual time remaining (protect against e.g. job starting late).
    print("Time limits before adjustment:")
    self.print_time_limits()

    job_id = os.getenv("SLURM_JOBID")
    command = 'squeue -h -j {} -o "%L"'.format(job_id)
    returncode, stdout, stderr = self.execute_command(command, frmt=False, robust=False)

    days = 0
    if "-" in stdout:
        days, time = stdout.split("-")
        days = int(days)
    else:
        time = stdout

    time = time.split(":")

    hours = int(time[-3]) if len(time) > 2 else 0
    minutes = int(time[-2]) if len(time) > 1 else 0
    seconds = int(time[-1])

    wall_time_delta = datetime.timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds)
    wall_time_seconds = int(wall_time_delta.total_seconds())

    print("Actual remaining walltime: {}".format(wall_time_delta))
    print("Time limits after adjustment:")

    (self.wall_time_seconds, self.total_seconds_per_step,
     self.parallel_seconds_per_step, self.python_seconds_per_step) = \
        self.compute_time_limits(
            wall_time_seconds, self.cleanup_time, self.slack_time, self.step_time_limit, self.n_steps)

    self.print_time_limits()

    with cd(self.job_path):
        print("\n" + ("=" * 80))
        job_start = datetime.datetime.now()
        print("Starting job at {}".format(job_start))

        job = ReadOnlyJob(self.input_zip)
        subjobs_remaining = sorted([op.idx for op in job.ready_incomplete_ops(sort=False)])

        n_failures = defaultdict(int)
        dead_jobs = set()

        i = 0
        while subjobs_remaining:
            step_start = datetime.datetime.now()

            print("\nStarting step {} at: ".format(i) + "=" * 90)
            print("{} ({} since start of job)".format(step_start, step_start - job_start))

            p = subprocess.run(
                'scontrol show hostnames $SLURM_JOB_NODELIST', stdout=subprocess.PIPE, shell=True)
            host_pool = list(set([host.strip() for host in p.stdout.decode().split('\n') if host]))

            self.hosts, n_tasks_for_step = self.recruit_hosts(
                host_pool, self.tasks_per_node, max_tasks=len(subjobs_remaining))

            indices_for_step = subjobs_remaining[:n_tasks_for_step]
            self._step(i, indices_for_step)
            self._checkpoint(i)

            job = ReadOnlyJob(self.archive_root)
            subjobs_remaining = set([op.idx for op in job.ready_incomplete_ops(sort=False)])

            for j in indices_for_step:
                if j in subjobs_remaining:
                    n_failures[j] += 1
                    if n_failures[j] > self.n_retries:
                        print("All {} attempts at completing job with index {} have failed, "
                              "permanently removing it from set of eligible jobs.".format(n_failures[j], j))
                        dead_jobs.add(j)

            subjobs_remaining = [idx for idx in subjobs_remaining if idx not in dead_jobs]
            subjobs_remaining = sorted(subjobs_remaining)

            i += 1

            print("Step duration: {}.".format(datetime.datetime.now() - step_start))

        self.execute_command("rm -rf {archive_root}", robust=True)

        print("Cleaning up dirty hosts...")
        command = "rm -rf {local_scratch}"
        for host in self.dirty_hosts:
            print("Cleaning host {}...".format(host))
            self.ssh_execute(command, host, robust=True)
def __init__(
        self, name, input_zip, pattern, scratch, local_scratch_prefix='/tmp/dps/hyper/',
        ppn=12, cpp=1, pmem=None, wall_time="1hour", cleanup_time="1min", slack_time="1min",
        add_date=True, dry_run=0, parallel_exe=None, kind="parallel", host_pool=None,
        load_avg_threshold=8., min_hosts=None, max_hosts=1, env_vars=None, output_to_files=True,
        n_retries=0, gpu_set="", copy_venv="", python_startup=False, step_time_limit=None,
        ignore_gpu=False, ssh_options=None, loud_output=True, rsync_verbosity=0):

    args = locals().copy()
    del args['self']

    print("\nParallelSession args:")
    print(args)

    launch_venv = os.getenv('VIRTUAL_ENV')
    if launch_venv:
        launch_venv = os.path.split(launch_venv)[1]

    if not parallel_exe:
        parallel_exe = "$HOME/.local/bin/parallel"

    if ssh_options is None:
        ssh_options = (
            "-oPasswordAuthentication=no "
            "-oStrictHostKeyChecking=no "
            "-oConnectTimeout=5 "
            "-oServerAliveInterval=2"
        )

    if kind == "pbs":
        local_scratch_prefix = "\\$RAMDISK"

    assert kind in "parallel pbs slurm slurm-local".split()
    hpc = kind != "parallel"

    # Create directory to run the job from - should be on scratch.
    scratch = os.path.abspath(os.path.expandvars(scratch))

    es = ExperimentStore(scratch, prefix="run_search")

    job_dir = es.new_experiment(name, 0, add_date=add_date, force_fresh=1)
    job_dir.record_environment()

    with open(job_dir.path_for('run_kwargs.json'), 'w') as f:
        json.dump(args, f, default=str, indent=4, sort_keys=True)
    del f
    del args

    job_path = job_dir.path
    job_dir.make_directory('experiments')

    input_zip_stem = path_stem(input_zip)
    input_zip = shutil.copy(input_zip, job_dir.path_for("orig.zip"))
    input_zip_abs = process_path(input_zip)
    input_zip_base = os.path.basename(input_zip)
    archive_root = zip_root(input_zip)

    self.copy_files(
        job_dir, input_zip, archive_root,
        ["README.md", "sampled_configs.txt", "config.json", "config.pkl"])

    # Storage local to each node, from the perspective of that node.
    local_scratch = os.path.join(local_scratch_prefix, os.path.basename(job_path))

    output_to_files = "--output-to-files" if output_to_files else ""

    env = os.environ.copy()

    env_vars = env_vars or {}

    env.update({e: str(v) for e, v in env_vars.items()})
    env_vars = ' '.join('--env ' + k for k in env_vars)

    rsync_verbosity = "" if not rsync_verbosity else "-" + "v" * rsync_verbosity

    ro_job = ReadOnlyJob(input_zip)
    indices_to_run = sorted([op.idx for op in ro_job.ready_incomplete_ops(sort=False)])
    del ro_job
    n_jobs_to_run = len(indices_to_run)
    if n_jobs_to_run == 0:
        print("All jobs are finished! Exiting.")
        return

    dirty_hosts = set()

    if hpc:
        host_pool = []
        n_nodes = max_hosts
        n_procs = n_nodes * ppn
        n_steps = int(np.ceil(n_jobs_to_run / n_procs))
    else:
        self.__dict__.update(locals())

        host_pool = host_pool or DEFAULT_HOST_POOL
        if isinstance(host_pool, str):
            host_pool = host_pool.split()

        # Get an estimate of the number of hosts we'll have available.
        with cd(job_path):
            hosts, n_procs = self.recruit_hosts(
                hpc, min_hosts, max_hosts, host_pool, ppn, max_procs=np.inf)
        n_nodes = len(hosts)

        if n_jobs_to_run < n_procs:
            n_steps = 1
            n_nodes = int(np.ceil(n_jobs_to_run / ppn))
            n_procs = n_nodes * ppn
            hosts = hosts[:n_nodes]
        else:
            n_steps = int(np.ceil(n_jobs_to_run / n_procs))

    node_file = " --sshloginfile nodefile.txt "

    wall_time_seconds, total_seconds_per_step, parallel_seconds_per_step, python_seconds_per_step = \
        self.compute_time_limits(wall_time, cleanup_time, slack_time, step_time_limit, n_steps)

    self.__dict__.update(locals())

    self.print_time_limits()
def submit_job(
        archive_path, name, wall_time="1year", ppn=1, cpp=1, pmem=0, queue="",
        kind="local", gpu_set="", project="rpp-bengioy", **run_kwargs):

    assert kind in "pbs slurm slurm-local parallel".split()

    if "slurm" in kind and not pmem:
        raise Exception("Must supply a value for pmem (per-process-memory in mb) when using SLURM")

    run_kwargs.update(
        wall_time=wall_time, ppn=ppn, cpp=cpp, kind=kind, gpu_set=gpu_set, pmem=pmem)

    run_kwargs['env_vars'] = dict(TF_CPP_MIN_LOG_LEVEL=3, CUDA_VISIBLE_DEVICES='-1')
    run_kwargs['dry_run'] = False

    session = ParallelSession(
        name, archive_path, 'map', cfg.parallel_experiments_run_dir, **run_kwargs)

    job_path = session.job_path

    # Not strictly required if kind == "parallel", but do it anyway for completeness.
    with open(os.path.join(job_path, "session.pkl"), 'wb') as f:
        dill.dump(session, f, protocol=dill.HIGHEST_PROTOCOL, recurse=True)

    if kind in "parallel slurm-local".split():
        session.run()
        return session

    python_script = """#!{}
import datetime
start = datetime.datetime.now()
print("Starting job at " + str(start))

import dill
with open("./session.pkl", "rb") as f:
    session = dill.load(f)

session.run()

end = datetime.datetime.now()
print("Finishing job at " + str(end))
print(str((end - start).total_seconds()) + " seconds elapsed between start and finish.")
""".format(sys.executable)

    with open(os.path.join(job_path, "run.py"), 'w') as f:
        f.write(python_script)

    if kind == "pbs":
        resources = "nodes={}:ppn={},walltime={}".format(
            session.n_nodes, session.ppn, session.wall_time_seconds)
        if pmem:
            resources = "{},pmem={}mb".format(resources, pmem)

        email = "*****@*****.**"
        if queue:
            queue = "-q " + queue

        command = (
            "qsub -N {name} -d {job_path} -w {job_path} -m abe -M {email} "
            "-A {project} {queue} -V -l {resources} "
            "-j oe output.txt run.py".format(
                name=name, job_path=job_path, email=email,
                project=project, queue=queue, resources=resources
            )
        )

    elif kind == "slurm":
        wall_time_minutes = int(np.ceil(session.wall_time_seconds / 60))
        resources = "--nodes={} --ntasks-per-node={} --cpus-per-task={} --time={}".format(
            session.n_nodes, session.ppn, cpp, wall_time_minutes)

        if pmem:
            resources = "{} --mem-per-cpu={}mb".format(resources, pmem)

        if gpu_set:
            n_gpus = len([int(i) for i in gpu_set.split(',')])
            resources = "{} --gres=gpu:{}".format(resources, n_gpus)

        email = "*****@*****.**"
        if queue:
            queue = "-p " + queue

        command = (
            "sbatch --job-name {name} -D {job_path} --mail-type=ALL --mail-user={email} "
            "-A {project} {queue} --export=ALL {resources} "
            "-o stdout -e stderr run.py".format(
                name=name, job_path=job_path, email=email,
                project=project, queue=queue, resources=resources
            )
        )

    else:
        raise Exception("Unknown kind: {}".format(kind))

    print("\n" + "~" * 40)
    print(command)

    with cd(job_path):
        subprocess.run(command.split())

    return session
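# Hypothetical usage sketch (not part of the original module) for the PBS path of
# the submit_job variant above. "experiment.zip" is an assumed job archive
# produced by dps.hyper; note that a nonzero pmem (per-process memory in MB)
# would be required for kind="slurm", as enforced above.
def _example_submit_pbs_job():
    return submit_job(
        archive_path="experiment.zip",
        name="rl_test",
        wall_time="1hour",
        ppn=8,
        cpp=1,
        pmem=3800,
        kind="pbs",
    )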
import argparse
import os
import socket
import subprocess

import dps
from dps.utils import cd

parser = argparse.ArgumentParser(
    "Test reinforcement learning on grid_arithmetic. "
    "Run for each new commit to make sure that it still works."
)
parser.add_argument("kind", choices="parallel slurm".split())
parser.add_argument("length", choices="short long".split())
parser.add_argument("queue", choices="cpu gpu".split())
args = parser.parse_args()

if args.kind == "parallel":
    pass
elif args.kind == "slurm":
    with cd(os.path.dirname(dps.__file__)):
        sha = subprocess.check_output("git rev-parse --verify --short HEAD".split()).decode().strip()

    hostname = socket.gethostname()

    if "gra" in hostname:
        resources = "--max-hosts=4 --ppn=8 --pmem=3800"
        gpu = "--gpu-set=0,1 --ignore-gpu=True"
    elif "cedar" in hostname:
        if args.queue == "gpu":
            resources = "--max-hosts=5 --ppn=6 --pmem=7700"
        else:
            resources = "--max-hosts=4 --ppn=8 --pmem=3800"
        gpu = "--gpu-set=0,1,2,3 --ignore-gpu=True"
    else:
        raise Exception("Unknown host: {}".format(hostname))
def maybe_download_emnist(data_dir, quiet=0, shape=None):
    """ Download EMNIST data if it hasn't already been downloaded, and do some
    post-processing to put it in a more useful format. End result is a directory
    called `emnist` which contains a separate pklz file for each EMNIST class.

    Pixel values of stored images are uint8 values up to 255. Images for each
    class are put into a numpy array with shape (n_images_in_class, 28, 28).
    This numpy array is pickled and stored in a zip file with name
    `<class char>.pklz`.

    Parameters
    ----------
    data_dir: str
        Directory where files should be stored.

    """
    emnist_dir = os.path.join(data_dir, 'emnist')

    if _validate_emnist(emnist_dir):
        print("EMNIST data seems to be present already.")
    else:
        print("EMNIST data not found, downloading and processing...")

        try:
            shutil.rmtree(emnist_dir)
        except FileNotFoundError:
            pass

        raw_dir = _download_emnist(data_dir)

        with cd(raw_dir):
            images, labels = _emnist_load_helper(emnist_gz_names[0], emnist_gz_names[1])
            images1, labels1 = _emnist_load_helper(emnist_gz_names[2], emnist_gz_names[3])

        with cd(data_dir):
            os.makedirs('emnist', exist_ok=False)

            print("Processing...")
            with cd('emnist'):
                x = np.concatenate((images, images1), 0)
                y = np.concatenate((labels, labels1), 0)

                # Give images the right orientation so that plt.imshow(x[0]) just works.
                x = np.moveaxis(x.reshape(-1, 28, 28), 1, 2)

                for i in sorted(set(y.flatten())):
                    keep = y == i
                    x_i = x[keep.flatten(), :]

                    if i >= 36:
                        char = chr(i - 36 + ord('a'))
                    elif i >= 10:
                        char = chr(i - 10 + ord('A'))
                    else:
                        char = str(i)

                    if quiet >= 2:
                        pass
                    elif quiet == 1:
                        print(char)
                    elif quiet <= 0:
                        print(char)
                        print(image_to_string(x_i[0, ...]))

                    file_i = char + '.pklz'
                    with gzip.open(file_i, 'wb') as f:
                        dill.dump(x_i, f, protocol=dill.HIGHEST_PROTOCOL)

    if shape is not None:
        maybe_convert_emnist_shape(data_dir, shape)
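# Hypothetical usage sketch (not part of the original module): reading back one of
# the per-class files written above. Each class array is written with dill.dump
# into a gzip-compressed file, so gzip.open + dill.load (using this module's
# gzip and dill imports) recovers it. The default path is an assumed example
# location corresponding to data_dir='data'.
def _example_load_emnist_class(path='data/emnist/A.pklz'):
    with gzip.open(path, 'rb') as f:
        x = dill.load(f)  # uint8 array with shape (n_images_in_class, 28, 28)
    return x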
def _validate_omniglot(path):
    if not os.path.isdir(path):
        return False

    with cd(path):
        return set(os.listdir(path)) == set(omniglot_alphabets)
def make_dataset_in_parallel(run_kwargs, dataset_cls, param_values=None):
    """ Uses dps.hyper.parallel_session.ParallelSession to create a dataset in parallel. """

    # Get run_kwargs from command line.
    sig = inspect.signature(ParallelSession.__init__)
    default_run_kwargs = sig.bind_partial()
    default_run_kwargs.apply_defaults()
    cl_run_kwargs = clify.command_line(default_run_kwargs.arguments).parse()
    run_kwargs.update(cl_run_kwargs)

    param_values = param_values or dataset_cls._capture_param_values()
    param_values = Config(param_values)
    seed = param_values["seed"]
    if seed is None or seed < 0:
        seed = gen_seed()

    n_examples = param_values["n_examples"]
    n_examples_per_shard = run_kwargs["n_examples_per_shard"]

    experiment_store = ExperimentStore(
        cfg.parallel_experiments_build_dir, prefix="build_{}".format(dataset_cls.__name__))

    count = 0
    name = "attempt=0"
    has_built = False
    while not has_built:
        try:
            exp_dir = experiment_store.new_experiment(name, seed, add_date=True, force_fresh=True)
            has_built = True
        except FileExistsError:
            count += 1
            name = "attempt_{}".format(count)

    print("Building dataset.")

    job = Job(exp_dir.path)
    n_examples_remaining = n_examples

    with NumpySeed(seed):
        inputs = []
        idx = 0
        while n_examples_remaining:
            seed = gen_seed()
            cur_n_examples = min(n_examples_remaining, n_examples_per_shard)
            n_examples_remaining -= cur_n_examples

            inputs.append((idx, seed, cur_n_examples))
            idx += 1

        job.map(_BuildDataset(dataset_cls, param_values), inputs)
        job.save_object('metadata', 'param_values', param_values)

    print(job.summary())
    archive_path = job.zip(delete=True)
    print("Zipped {} as {}.".format(exp_dir.path, archive_path))

    run_kwargs = run_kwargs.copy()
    del run_kwargs['n_examples_per_shard']
    run_kwargs.update(
        archive_path=archive_path, name=name, kind="parallel",
        parallel_exe=cfg.parallel_exe)

    parallel_session = submit_job(**run_kwargs)

    with cd(os.path.join(parallel_session.job_path, 'experiments')):
        dataset_files = []
        for dir_path, dirs, files in os.walk('.'):
            if not dir_path.startswith("./exp__seed="):
                continue

            df = [f for f in files if not f.endswith('.cfg')]
            assert len(df) == 1
            dataset_files.append(os.path.join(dir_path, df[0]))

        cached_filename = os.path.join(
            cfg.data_dir, "cached_datasets", dataset_cls.__name__, str(get_param_hash(param_values)))

        command = "cat " + " ".join(dataset_files) + " > " + cached_filename

        print("Running command: \n" + command)
        subprocess.run(command, shell=True, check=True)
        print("Done.")

        with open(cached_filename + ".cfg", 'w') as f:
            f.write(pprint.pformat(param_values))

    return parallel_session
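# Hypothetical usage sketch (not part of the original module). It assumes
# dataset_cls is a dps dataset class whose captured parameters include `seed` and
# `n_examples`, and that run_kwargs carries ParallelSession-style options plus
# `n_examples_per_shard`, which make_dataset_in_parallel consumes before
# submitting the job. The option values shown are illustrative, not recommended
# settings.
def _example_build_dataset_in_parallel(dataset_cls):
    run_kwargs = dict(
        max_hosts=1,
        ppn=4,
        wall_time="1hour",
        cleanup_time="1min",
        slack_time="1min",
        n_examples_per_shard=1000,
    )
    return make_dataset_in_parallel(run_kwargs, dataset_cls)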