def start_stage(self):
    if self.n_procs > 1:
        config = cfg.freeze()
        self.inter_comm.bcast(config, root=MPI.ROOT)

        seeds = [gen_seed() for _ in range(self.n_procs - 1)]
        self.inter_comm.scatter(seeds, root=MPI.ROOT)
def framework_initialize_stage(self, stack):
    # Configure and create session and graph for stage.
    session_config = tf.ConfigProto()
    session_config.intra_op_parallelism_threads = cfg.get('intra_op_parallelism_threads', 0)
    session_config.inter_op_parallelism_threads = cfg.get('inter_op_parallelism_threads', 0)
    session_config.log_device_placement = cfg.get('log_device_placement', 0)

    if cfg.use_gpu:
        per_process_gpu_memory_fraction = getattr(cfg, 'per_process_gpu_memory_fraction', None)
        if per_process_gpu_memory_fraction:
            session_config.gpu_options.per_process_gpu_memory_fraction = per_process_gpu_memory_fraction

        gpu_allow_growth = getattr(cfg, 'gpu_allow_growth', None)
        if gpu_allow_growth:
            session_config.gpu_options.allow_growth = gpu_allow_growth

        _print("Using GPU if available.")
        _print("Using {}% of GPU memory.".format(
            100 * session_config.gpu_options.per_process_gpu_memory_fraction))
        _print("Allowing growth of GPU memory: {}".format(session_config.gpu_options.allow_growth))

    graph = tf.Graph()
    sess = tf.Session(graph=graph, config=session_config)

    # This HAS to come after the creation of the session, otherwise
    # it allocates all GPU memory if using the GPU.
    _print("\nAvailable devices: ")
    from tensorflow.python.client import device_lib
    _print(device_lib.list_local_devices())

    if not cfg.use_gpu:
        _print("Not using GPU.")
        stack.enter_context(graph.device("/cpu:0"))

    stack.enter_context(graph.as_default())
    stack.enter_context(sess)
    stack.enter_context(sess.as_default())

    # Set the seed for the stage.
    tf_seed = gen_seed()
    _print("Setting tensorflow seed to generated seed: {}\n".format(tf_seed))
    tf.set_random_seed(tf_seed)

    tf.logging.set_verbosity(tf.logging.ERROR)
def sample_configs(distributions, n_repeats, n_samples=None):
    """ Samples configs from a distribution for hyper-parameter search.

    Parameters
    ----------
    distributions: dict or None
        Mapping from parameter names to distributions (objects with member
        function ``rvs`` which accepts a shape and produces an array of
        samples with that shape).
    n_repeats: int > 0
        Number of different seeds to use for each sampled configuration.
    n_samples: int > 0
        Number of configs to sample.

    """
    samples = []

    if distributions is None:
        samples = [Config()]
    elif isinstance(distributions, list):
        samples = distributions + []
        if n_samples:
            samples = list(np.random.permutation(samples)[:n_samples])
    else:
        if not n_samples:
            samples = generate_all(distributions)
        else:
            samples = nested_sample(distributions, n_samples)

    print("Sampled configs:")
    pprint(samples)

    configs = []
    for i, s in enumerate(samples):
        s['idx'] = i
        for r in range(n_repeats):
            _new = copy.deepcopy(s)
            _new['repeat'] = r
            _new['seed'] = gen_seed()
            configs.append(_new)

    return configs
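
# A minimal usage sketch (assumed, not part of the library): `sample_configs`
# also accepts a plain list of parameter settings in place of a distribution
# dict, in which case it simply attaches 'idx', 'repeat' and 'seed' to each
# entry. The parameter names below are illustrative only.
candidate_settings = [dict(lr=1e-3, n_layers=2), dict(lr=1e-4, n_layers=3)]
configs = sample_configs(candidate_settings, n_repeats=2)
assert len(configs) == 4  # 2 settings x 2 seeds each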
def framework_initialize_stage(self, stack):
    # Set the seed for the stage.
    torch_seed = gen_seed()
    _print("Setting pytorch seed to generated seed: {}\n".format(torch_seed))
    torch.manual_seed(torch_seed)

    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = cfg.pytorch_cudnn_benchmark
    torch.backends.cudnn.deterministic = cfg.pytorch_cudnn_deterministic

    if cfg.use_gpu:
        _print("Trying to use GPU...")

        try:
            device = torch.cuda.current_device()
            use_gpu = True
        except AssertionError:
            tb.print_exc()
            use_gpu = False
    else:
        use_gpu = False

    if use_gpu:
        _print("Using GPU.")
        _print("Device count: {}".format(torch.cuda.device_count()))
        _print("Device idx: {}".format(device))
        _print("Device name: {}".format(torch.cuda.get_device_name(device)))
        _print("Device capability: {}".format(torch.cuda.get_device_capability(device)))

        set_pytorch_device('cuda')
    else:
        _print("Not using GPU.")
        set_pytorch_device('cpu')

    torch.set_printoptions(profile='full')
def build_and_submit(
        category, exp_name, config, distributions, n_param_settings=0, n_repeats=1,
        do_local_test=False, kind="local", readme="", tasks_per_gpu=1, **run_kwargs):
    """ Build a job and submit it. Meant to be called from within a script.

    Parameters
    ----------
    category: str
        High-level category of the experiment. Determines the ExperimentStore
        where the experiment data will be stored.
    exp_name: str
        Low-level name of the experiment.
    config: Config instance or dict
        Configuration to use as the base config for all jobs.
    distributions: dict
        Object used to generate variations of the base config (so that
        different jobs test different parameters).
    n_param_settings: int
        Number of different configurations to sample from `distributions`.
        If not supplied, it is assumed that `distributions` actually specifies
        a grid search, and an attempt is made to generate all possible
        configurations in that grid search.
    n_repeats: int
        Number of experiments to run (with different random seeds) for each
        generated configuration.
    do_local_test: bool
        If True, sample one of the generated configurations and use it to run
        a short test locally, to ensure that the jobs will run properly.
    kind: str
        One of pbs, slurm, slurm-local, parallel, local. Specifies which method
        should be used to run the jobs in parallel.
    readme: str
        A string outlining the purpose/context for the created experiment.
    tasks_per_gpu: int
        Number of tasks to run on each GPU; used to compute the resources to
        request when submitting the job.
    **run_kwargs:
        Additional arguments that are ultimately passed to `ParallelSession`
        in order to run the job.

    """
    # Get run_kwargs from command line
    sig = inspect.signature(ParallelSession.__init__)
    default_run_kwargs = sig.bind_partial()
    default_run_kwargs.apply_defaults()
    cl_run_kwargs = clify.command_line(default_run_kwargs.arguments).parse()
    run_kwargs.update(cl_run_kwargs)

    if config.seed is None or config.seed < 0:
        config.seed = gen_seed()

    assert kind in "pbs slurm slurm-local parallel local".split()
    assert 'build_command' not in config
    config['build_command'] = ' '.join(sys.argv)
    print(config['build_command'])

    if kind == "local":
        with config:
            from dps.train import training_loop
            return training_loop()
    else:
        config.name = category

        config = config.copy()

        if readme == "_vim_":
            readme = edit_text(prefix="dps_readme_", editor="vim", initial_text="README.md: \n")

        scratch = os.path.join(cfg.parallel_experiments_build_dir, category)

        archive_path, n_tasks = build_search(
            scratch, exp_name, distributions, config,
            add_date=1, _zip=True, do_local_test=do_local_test,
            n_param_settings=n_param_settings, n_repeats=n_repeats, readme=readme)

        run_kwargs.update(
            archive_path=archive_path, category=category, exp_name=exp_name, kind=kind)

        gpu_kind = run_kwargs.get('gpu_kind', None)
        resources = compute_required_resources(n_tasks, tasks_per_gpu, gpu_kind)
        run_kwargs.update(resources)

        parallel_session = submit_job(**run_kwargs)

        return parallel_session
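
# Hedged usage sketch for `build_and_submit`; the config values and the
# distributions grid below are assumptions for illustration, not taken from
# the library. Omitting `n_param_settings` makes `distributions` be treated
# as a grid search (2 x 2 = 4 settings here).
base_config = Config(env_name='my_env', seed=-1, max_steps=10000)
distributions = dict(lr=[1e-3, 1e-4], batch_size=[32, 64])
session = build_and_submit(
    category='my_category', exp_name='lr_search', config=base_config,
    distributions=distributions, n_repeats=2, kind='slurm',
    readme='Grid search over learning rate and batch size.')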
def build_search(
        path, name, distributions, config, n_repeats, n_param_settings=None,
        _zip=True, add_date=0, do_local_test=True, readme=""):
    """ Create a job implementing a hyper-parameter search.

    Parameters
    ----------
    path: str
        Path to the directory where the search archive will be saved.
    name: str
        Name for the search.
    distributions: dict (str -> (list or distribution))
        Distributions to sample from. Can also be a list of samples.
    config: Config instance
        The base configuration.
    n_repeats: int
        Number of different random seeds to run each sample with.
    n_param_settings: int
        Number of parameter settings to sample. If not supplied, all
        possibilities are generated.
    _zip: bool
        Whether to zip the created search directory.
    add_date: bool
        Whether to add the date to the name of the experiment directory.
    do_local_test: bool
        If True, run a short test using one of the sampled configs on the
        local machine to catch any dumb errors before starting the real
        experiment.
    readme: str
        String specifying context/purpose of search.

    """
    if config.get('seed', None) is None:
        config.seed = gen_seed()

    with NumpySeed(config.seed):
        es = ExperimentStore(path, prefix="build_search")

        count = 0
        base_name = name
        has_built = False
        while not has_built:
            try:
                exp_dir = es.new_experiment(
                    name, config.seed, add_date=add_date, force_fresh=1)
                has_built = True
            except FileExistsError:
                name = "{}_{}".format(base_name, count)
                count += 1

        if readme:
            with open(exp_dir.path_for('README.md'), 'w') as f:
                f.write(readme)

        print(config)

        exp_dir.record_environment(config=config)

        print("Building parameter search at {}.".format(exp_dir.path))

        job = Job(exp_dir.path)

        new_configs = sample_configs(distributions, n_repeats, n_param_settings)

        with open(exp_dir.path_for("sampled_configs.txt"), "w") as f:
            f.write("\n".join("idx={}: {}".format(c["idx"], pformat(c)) for c in new_configs))

        print("{} configs were sampled for parameter search.".format(len(new_configs)))

        if do_local_test:
            print("\nStarting local test " + ("=" * 80))
            test_config = new_configs[0].copy()
            test_config.update(max_steps=1000, render_hook=None)
            _RunTrainingLoop(config)(test_config)
            print("Done local test " + ("=" * 80) + "\n")

        job.map(_RunTrainingLoop(config.copy()), new_configs)

        job.save_object('metadata', 'distributions', distributions)
        job.save_object('metadata', 'config', config)

        print(job.summary())

        if _zip:
            path = job.zip(delete=True)
        else:
            path = exp_dir.path

        print("Zipped {} as {}.".format(exp_dir.path, path))

        return path, len(new_configs)
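
# Illustrative direct call to `build_search` (the path and base config are
# assumptions made for this sketch); `build_and_submit` above is the usual
# entry point that calls this on your behalf.
archive_path, n_tasks = build_search(
    '/tmp/dps_searches', 'lr_search', distributions=dict(lr=[1e-3, 1e-4]),
    config=base_config, n_repeats=3, _zip=True, add_date=1,
    do_local_test=False, readme='Search over learning rates.')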
def seed(self, seed=None):
    np.random.seed(seed)
    for env in self._env_copies:
        s = gen_seed()
        env.seed(s)
def _run(self):
    print(cfg.to_string())

    threshold_reached = True
    self.global_step = 0
    self.n_global_experiences = 0
    self.curriculum_remaining = self.curriculum + []
    self.curriculum_complete = []

    stage_idx = 0
    while self.curriculum_remaining:
        print("\n" + "=" * 50)
        self.timestamp("Starting stage {}".format(stage_idx))
        print("\n")

        if cfg.start_tensorboard:
            restart_tensorboard(self.experiment_store.path, cfg.tbport, cfg.reload_interval)

        stage_config = self.curriculum_remaining.pop(0)
        stage_config = Config(stage_config)

        self.data.start_stage(stage_idx, stage_config)

        with ExitStack() as stack:

            # --------------- Stage set-up -------------------

            print("\n" + "-" * 10 + " Stage set-up " + "-" * 10)

            print("\nNew config values for this stage are: \n{}\n".format(pformat(stage_config)))
            stack.enter_context(stage_config)

            stage_prepare_func = cfg.get("stage_prepare_func", None)
            if callable(stage_prepare_func):
                stage_prepare_func()  # Modify the stage config in arbitrary ways before starting stage

            self.mpi_context.start_stage()

            # Configure and create session and graph for stage.
            session_config = tf.ConfigProto()
            session_config.intra_op_parallelism_threads = cfg.get('intra_op_parallelism_threads', 0)
            session_config.inter_op_parallelism_threads = cfg.get('inter_op_parallelism_threads', 0)
            session_config.log_device_placement = cfg.get('log_device_placement', 0)

            if cfg.use_gpu:
                per_process_gpu_memory_fraction = getattr(cfg, 'per_process_gpu_memory_fraction', None)
                if per_process_gpu_memory_fraction:
                    session_config.gpu_options.per_process_gpu_memory_fraction = per_process_gpu_memory_fraction

                gpu_allow_growth = getattr(cfg, 'gpu_allow_growth', None)
                if gpu_allow_growth:
                    session_config.gpu_options.allow_growth = gpu_allow_growth

            if cfg.use_gpu:
                print("Using GPU if available.")
                print("Using {}% of GPU memory.".format(
                    100 * session_config.gpu_options.per_process_gpu_memory_fraction))
                print("Allowing growth of GPU memory: {}".format(session_config.gpu_options.allow_growth))

            graph = tf.Graph()
            sess = tf.Session(graph=graph, config=session_config)

            # This HAS to come after the creation of the session, otherwise
            # it allocates all GPU memory if using the GPU.
            print("\nAvailable devices: ")
            from tensorflow.python.client import device_lib
            print(device_lib.list_local_devices())

            if not cfg.use_gpu:
                print("Not using GPU.")
                stack.enter_context(graph.device("/cpu:0"))

            stack.enter_context(graph.as_default())
            stack.enter_context(sess)
            stack.enter_context(sess.as_default())

            # Set the seed for the stage. Notice we generate a new tf seed for each stage.
            tf_seed = gen_seed()
            print("Setting tensorflow seed to generated seed: {}\n".format(tf_seed))
            tf.set_random_seed(tf_seed)

            # Set limit on CPU RAM for the stage
            cpu_ram_limit_mb = cfg.get("cpu_ram_limit_mb", None)
            if cpu_ram_limit_mb is not None:
                stack.enter_context(memory_limit(cfg.cpu_ram_limit_mb))

            print("Building env...\n")

            # Maybe build env
            if stage_idx == 0 or not cfg.preserve_env:
                if getattr(self, 'env', None):
                    self.env.close()

                self.env = cfg.build_env()

            if hasattr(self.env, "print_memory_footprint"):
                self.env.print_memory_footprint()

            print("\nDone building env.\n")
            print("Building updater...\n")

            import warnings
            with warnings.catch_warnings():
                warnings.simplefilter('once')

                if cfg.n_procs > 1:
                    updater = cfg.get_updater(self.env, mpi_context=self.mpi_context)
                else:
                    updater = cfg.get_updater(self.env)

                updater.stage_idx = stage_idx
                updater.exp_dir = self.exp_dir

                updater.build_graph()
                print("\nDone building updater.\n")

            walk_variable_scopes(max_depth=3)

            # Maybe initialize network weights.
            # Let a *path_specification* be one of three things:
            # 1. An integer specifying a stage to load the best hypothesis from.
            # 2. A string of format: "stage_idx,kind" where `stage_idx` specifies a stage to load from
            #    and `kind` is either "final" or "best", specifying whether to load final or best
            #    hypothesis from that stage.
            # 3. A path on the filesystem that gives a prefix for a tensorflow checkpoint file to load from.
            #
            # Then cfg.load_path can either be a path_specification itself, in which case all variables
            # in the network will be loaded from that path_specification, or a dictionary mapping from
            # variable scope names to path specifications, in which case all variables in each supplied
            # variable scope name will be loaded from the path_specification paired with that scope name.
            load_path = cfg.load_path
            if load_path is not None:
                if isinstance(load_path, str) or isinstance(load_path, int):
                    load_path = {"": load_path}

                load_path = dict(load_path)

                # Sort in increasing order, so that if one variable scope lies within another scope,
                # the outer scope gets loaded before the inner scope, rather than having the outer
                # scope wipe out the inner scope.
                items = sorted(load_path.items())
                for var_scope, path in items:
                    variables = {v.name: v for v in trainable_variables(var_scope, for_opt=False)}
                    if not variables:
                        print("No variables to load in scope {}.".format(str(var_scope)))
                        continue

                    saver = tf.train.Saver(variables)

                    load_stage, kind = None, None

                    if isinstance(path, int):
                        load_stage = path
                        kind = "best"
                    elif isinstance(path, str):
                        try:
                            split = path.split(',')
                            load_stage = int(split[0])
                            kind = split[1] if len(split) > 1 else 'best'
                            assert kind in 'best final'.split(), "path={}".format(path)
                        except Exception:
                            load_stage, kind = None, None

                    if load_stage is not None:
                        if stage_idx == 0:
                            print(
                                "Not loading var scope \"{}\" from stage {}, "
                                "currently in stage 0.".format(var_scope, load_stage))
                            continue
                        else:
                            key = kind + '_path'
                            completed_history = self.data.history[:-1]
                            path = completed_history[load_stage][key]

                    path = os.path.realpath(path)

                    saver.restore(tf.get_default_session(), path)

                    print("Loading var scope \"{}\" from {}.".format(var_scope, path))
            else:
                print("Using a fresh set of weights, not loading anything.")

            tf.train.get_or_create_global_step()
            sess.run(uninitialized_variables_initializer())
            sess.run(tf.assert_variables_initialized())

            for hook in cfg.hooks:
                assert isinstance(hook, Hook)
                hook.start_stage(self, updater, stage_idx)

            threshold_reached = False
            reason = None

            try:
                # --------------- Run stage -------------------

                start = time.time()
                phys_memory_before = memory_usage(physical=True)
                gpu_memory_before = gpu_memory_usage()

                threshold_reached, reason = self._run_stage(stage_idx, updater)

            except KeyboardInterrupt:
                reason = "User interrupt"

            except NotImplementedError as e:
                # There is a bug in pdb_postmortem that prevents instances of `NotImplementedError`
                # from being handled properly, so replace it with an instance of `Exception`.
                if cfg.robust:
                    traceback.print_exc()
                    reason = "Exception occurred ({})".format(repr(e))
                else:
                    raise Exception("NotImplemented") from e

            except Exception as e:
                reason = "Exception occurred ({})".format(repr(e))

                if cfg.robust:
                    traceback.print_exc()
                else:
                    raise

            except Alarm:
                reason = "Time limit exceeded"
                raise

            finally:
                phys_memory_after = memory_usage(physical=True)
                gpu_memory_after = gpu_memory_usage()

                self.data.record_values_for_stage(
                    stage_duration=time.time()-start,
                    phys_memory_before_mb=phys_memory_before,
                    phys_memory_delta_mb=phys_memory_after - phys_memory_before,
                    gpu_memory_before_mb=gpu_memory_before,
                    gpu_memory_delta_mb=gpu_memory_after - gpu_memory_before
                )

            self.data.record_values_for_stage(reason=reason)

            print("\n" + "-" * 10 + " Optimization complete " + "-" * 10)
            print("\nReason: {}.\n".format(reason))

            final_path = self.data.path_for('weights/final_for_stage_{}'.format(stage_idx))
            final_path = cfg.get('save_path', final_path)
            final_path = updater.save(tf.get_default_session(), final_path)
            self.data.record_values_for_stage(final_path=final_path)

            # --------------- Maybe render performance of best hypothesis -------------------

            do_final_testing = (
                "Exception occurred" not in reason
                and reason != "Time limit exceeded"
                and 'best_path' in self.data.current_stage_record)

            if do_final_testing:
                try:
                    print("\n" + "-" * 10 + " Final testing/rendering " + "-" * 10)

                    print("Best hypothesis for this stage was found on "
                          "step (l: {best_local_step}, g: {best_global_step}) "
                          "with stopping criteria ({sc_name}) of {best_stopping_criteria}.".format(
                              sc_name=self.stopping_criteria_name, **self.data.current_stage_record))

                    best_path = self.data.current_stage_record['best_path']
                    print("Loading best hypothesis for this stage "
                          "from file {}...".format(best_path))
                    updater.restore(sess, best_path)

                    test_record = updater.evaluate(cfg.batch_size, mode="test")

                    for hook in cfg.hooks:
                        if hook.call_per_timestep and hook.final:
                            hook_record = hook.step(self, updater)

                            if hook_record:
                                assert len(hook_record) == 1
                                for k, d in dict(hook_record).items():
                                    test_record.update(d)

                    self.data.record_values_for_stage(
                        **{'_test_' + k: v for k, v in test_record.items()})

                    if cfg.render_step > 0 and cfg.render_hook is not None:
                        print("Rendering...")
                        cfg.render_hook(updater)
                        print("Done rendering.")

                except BaseException:
                    print("Exception occurred while performing final testing/rendering: ")
                    traceback.print_exc()

            else:
                print("\n" + "-" * 10 + " Skipping final testing/rendering " + "-" * 10)

            # --------------- Finish up the stage -------------------

            self.data.end_stage(updater.n_updates)

            print("\n" + "-" * 10 + " Running end-of-stage hooks " + "-" * 10 + "\n")
            for hook in cfg.hooks:
                hook.end_stage(self, stage_idx)

            print()
            self.timestamp("Done stage {}".format(stage_idx))
            print("=" * 50)

            stage_idx += 1
            self.curriculum_complete.append(stage_config)

        if not (threshold_reached or cfg.power_through):
            print("Failed to reach stopping criteria threshold on stage {} "
                  "of the curriculum, terminating.".format(stage_idx))
            break
def run(self, start_time):
    """ Run the training loop.

    Parameters
    ----------
    start_time: int
        Start time (in seconds since epoch) for measuring elapsed time for
        purposes of interrupting the training loop.

    """
    if start_time is None:
        start_time = time.time()
    self.start_time = start_time

    self.timestamp("Entering TrainingLoop.run")

    prepare_func = cfg.get("prepare_func", None)
    if callable(prepare_func):
        prepare_func()  # Modify the config in arbitrary ways before training
    else:
        try:
            prepare_funcs = list(prepare_func)
        except (TypeError, ValueError):
            pass
        else:
            for f in prepare_funcs:
                if callable(f):
                    f()

    self.curriculum = cfg.curriculum + []

    if cfg.seed is None or cfg.seed < 0:
        cfg.seed = gen_seed()

    # Create a directory to store the results of the training session.
    self.experiment_store = ExperimentStore(os.path.join(cfg.local_experiments_dir, cfg.env_name))
    exp_dir = self.experiment_store.new_experiment(
        self.exp_name, cfg.seed, add_date=1, force_fresh=1, update_latest=False)
    self.exp_dir = exp_dir
    cfg.path = exp_dir.path

    breaker = "-" * 40
    header = "{}\nREADME.md - {}\n{}\n\n\n".format(breaker, os.path.basename(exp_dir.path), breaker)
    readme = header + (cfg.readme if cfg.readme else "") + "\n\n"
    with open(exp_dir.path_for('README.md'), 'w') as f:
        f.write(readme)

    self.data = _TrainingLoopData(exp_dir)
    self.data.setup()

    frozen_data = None

    with ExitStack() as stack:
        if cfg.pdb:
            stack.enter_context(pdb_postmortem())
            print("`pdb` is turned on, so forcing setting robust=False")
            cfg.robust = False

        stack.enter_context(redirect_stream('stdout', self.data.path_for('stdout'), tee=cfg.tee))
        stack.enter_context(redirect_stream('stderr', self.data.path_for('stderr'), tee=cfg.tee))

        print("\n\n" + "=" * 80)
        self.timestamp("Starting training run (name={})".format(self.exp_name))

        print("\nDirectory for this training run is {}.".format(exp_dir.path))

        stack.enter_context(NumpySeed(cfg.seed))
        print("\nSet numpy random seed to {}.\n".format(cfg.seed))

        limiter = time_limit(
            self.time_remaining, verbose=True,
            timeout_callback=lambda limiter: print("Training run exceeded its time limit."))

        self.mpi_context = MPI_MasterContext(cfg.get('n_procs', 1), exp_dir)

        try:
            with limiter:
                self._run()
        finally:
            self.data.summarize()

            self.timestamp("Done training run (name={})".format(self.exp_name))
            print("=" * 80)
            print("\n\n")

            frozen_data = self.data.freeze()

    self.timestamp("Leaving TrainingLoop.run")

    return frozen_data
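
# In practice this method is normally reached through dps.train.training_loop()
# with an active config, roughly as sketched below; the config keys and values
# are assumptions made for illustration only.
from dps.train import training_loop

with Config(env_name='my_env', seed=-1, max_steps=1000, curriculum=[dict()]):
    frozen_data = training_loop()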
def make_dataset_in_parallel(run_kwargs, dataset_cls, param_values=None):
    """ Uses dps.hyper.parallel_session.ParallelSession to create a dataset in parallel. """

    # Get run_kwargs from command line
    sig = inspect.signature(ParallelSession.__init__)
    default_run_kwargs = sig.bind_partial()
    default_run_kwargs.apply_defaults()
    cl_run_kwargs = clify.command_line(default_run_kwargs.arguments).parse()
    run_kwargs.update(cl_run_kwargs)

    param_values = param_values or dataset_cls._capture_param_values()
    param_values = Config(param_values)
    seed = param_values["seed"]
    if seed is None or seed < 0:
        seed = gen_seed()

    n_examples = param_values["n_examples"]
    n_examples_per_shard = run_kwargs["n_examples_per_shard"]

    experiment_store = ExperimentStore(
        cfg.parallel_experiments_build_dir, prefix="build_{}".format(dataset_cls.__name__))

    count = 0
    name = "attempt=0"
    has_built = False
    while not has_built:
        try:
            exp_dir = experiment_store.new_experiment(name, seed, add_date=True, force_fresh=True)
            has_built = True
        except FileExistsError:
            count += 1
            name = "attempt_{}".format(count)

    print("Building dataset.")

    job = Job(exp_dir.path)
    n_examples_remaining = n_examples

    with NumpySeed(seed):
        inputs = []
        idx = 0
        while n_examples_remaining:
            seed = gen_seed()
            cur_n_examples = min(n_examples_remaining, n_examples_per_shard)
            n_examples_remaining -= cur_n_examples

            inputs.append((idx, seed, cur_n_examples))
            idx += 1

        job.map(_BuildDataset(dataset_cls, param_values), inputs)
        job.save_object('metadata', 'param_values', param_values)

    print(job.summary())
    archive_path = job.zip(delete=True)
    print("Zipped {} as {}.".format(exp_dir.path, archive_path))

    run_kwargs = run_kwargs.copy()
    del run_kwargs['n_examples_per_shard']
    run_kwargs.update(
        archive_path=archive_path, name=name, kind="parallel",
        parallel_exe=cfg.parallel_exe)
    parallel_session = submit_job(**run_kwargs)

    with cd(os.path.join(parallel_session.job_path, 'experiments')):
        dataset_files = []
        for dir_path, dirs, files in os.walk('.'):
            if not dir_path.startswith("./exp__seed="):
                continue

            df = [f for f in files if not f.endswith('.cfg')]
            assert len(df) == 1

            dataset_files.append(os.path.join(dir_path, df[0]))

        cached_filename = os.path.join(
            cfg.data_dir, "cached_datasets", dataset_cls.__name__, str(get_param_hash(param_values)))

        command = "cat " + " ".join(dataset_files) + " > " + cached_filename
        print("Running command: \n" + command)
        subprocess.run(command, shell=True, check=True)
        print("Done.")

        with open(cached_filename + ".cfg", 'w') as f:
            f.write(pprint.pformat(param_values))

    return parallel_session
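
# Hedged usage sketch: `MyDataset` stands in for a dataset class exposing
# `_capture_param_values()`, as this function assumes. Only
# `n_examples_per_shard` is required in `run_kwargs` here; remaining
# ParallelSession arguments can be supplied on the command line, since they
# are parsed via clify above.
run_kwargs = dict(n_examples_per_shard=5000)
session = make_dataset_in_parallel(run_kwargs, MyDataset)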
def run_stage(mpi_context, env, stage_idx, exp_dir):
    config, seed = mpi_context.start_stage()

    with ExitStack() as stack:
        stack.enter_context(config)
        stack.enter_context(NumpySeed(seed))

        # Accept config for new stage
        print("\n" + "-" * 10 + " Stage set-up " + "-" * 10)
        print(cfg.to_string())

        # Configure and create session and graph for stage.
        session_config = tf.ConfigProto()
        session_config.intra_op_parallelism_threads = cfg.get('intra_op_parallelism_threads', 0)
        session_config.inter_op_parallelism_threads = cfg.get('inter_op_parallelism_threads', 0)

        # if cfg.use_gpu:
        #     per_process_gpu_memory_fraction = getattr(cfg, 'per_process_gpu_memory_fraction', None)
        #     if per_process_gpu_memory_fraction:
        #         session_config.gpu_options.per_process_gpu_memory_fraction = \
        #             per_process_gpu_memory_fraction

        #     gpu_allow_growth = getattr(cfg, 'gpu_allow_growth', None)
        #     if gpu_allow_growth:
        #         session_config.gpu_options.allow_growth = gpu_allow_growth

        # if cfg.use_gpu:
        #     print("Using GPU if available.")
        #     print("Using {}% of GPU memory.".format(
        #         100 * session_config.gpu_options.per_process_gpu_memory_fraction))
        #     print("Allowing growth of GPU memory: {}".format(session_config.gpu_options.allow_growth))

        graph = tf.Graph()
        sess = tf.Session(graph=graph, config=session_config)

        # This HAS to come after the creation of the session, otherwise
        # it allocates all GPU memory if using the GPU.
        print("\nAvailable devices:")
        from tensorflow.python.client import device_lib
        print(device_lib.list_local_devices())

        # if not cfg.use_gpu:
        #     print("Not using GPU.")
        #     stack.enter_context(graph.device("/cpu:0"))

        stack.enter_context(graph.device("/cpu:0"))

        stack.enter_context(graph.as_default())
        stack.enter_context(sess)
        stack.enter_context(sess.as_default())

        tf_seed = gen_seed()
        print("Setting tensorflow seed to generated seed: {}\n".format(tf_seed))
        tf.set_random_seed(tf_seed)

        # Set limit on CPU RAM for the stage
        cpu_ram_limit_mb = cfg.get("cpu_ram_limit_mb", None)
        if cpu_ram_limit_mb is not None:
            stack.enter_context(memory_limit(cfg.cpu_ram_limit_mb))

        print("Building env...\n")

        # Maybe build env
        if stage_idx == 0 or not cfg.preserve_env:
            if env is not None:
                env.close()

            env = cfg.build_env()

        if hasattr(env, "print_memory_footprint"):
            env.print_memory_footprint()

        print("\nDone building env.\n")
        print("Building updater...\n")

        updater = cfg.get_updater(env, mpi_context=mpi_context)
        updater.stage_idx = stage_idx
        updater.exp_dir = exp_dir

        updater.build_graph()
        print("\nDone building updater.\n")

        # walk_variable_scopes(max_depth=3)

        tf.train.get_or_create_global_step()
        sess.run(uninitialized_variables_initializer())
        sess.run(tf.assert_variables_initialized())

        updater.worker_code()

        stage_idx += 1

    return env