Example #1
    def start_stage(self):
        if self.n_procs > 1:
            config = cfg.freeze()
            self.inter_comm.bcast(config, root=MPI.ROOT)

            seeds = [gen_seed() for _ in range(self.n_procs - 1)]
            self.inter_comm.scatter(seeds, root=MPI.ROOT)
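On the worker side, the project's `mpi_context.start_stage()` (see Example #11) receives these values. A minimal sketch of what that reception looks like with mpi4py, assuming the workers were spawned over an intercommunicator; the helper name `worker_start_stage` is illustrative, not from the project:

from mpi4py import MPI

def worker_start_stage():
    # Assumption: workers were spawned via MPI.COMM_SELF.Spawn, so the parent
    # intercommunicator is available through Get_parent().
    parent = MPI.Comm.Get_parent()
    # Matches the master's bcast(config, root=MPI.ROOT): root is rank 0 of the parent group.
    config = parent.bcast(None, root=0)
    # Matches the master's scatter(seeds, root=MPI.ROOT): each worker receives one seed.
    seed = parent.scatter(None, root=0)
    return config, seed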
Example #2
    def framework_initialize_stage(self, stack):
        # Configure and create session and graph for stage.
        session_config = tf.ConfigProto()
        session_config.intra_op_parallelism_threads = cfg.get(
            'intra_op_parallelism_threads', 0)
        session_config.inter_op_parallelism_threads = cfg.get(
            'inter_op_parallelism_threads', 0)
        session_config.log_device_placement = cfg.get('log_device_placement',
                                                      0)

        if cfg.use_gpu:
            per_process_gpu_memory_fraction = getattr(
                cfg, 'per_process_gpu_memory_fraction', None)
            if per_process_gpu_memory_fraction:
                session_config.gpu_options.per_process_gpu_memory_fraction = per_process_gpu_memory_fraction

            gpu_allow_growth = getattr(cfg, 'gpu_allow_growth', None)
            if gpu_allow_growth:
                session_config.gpu_options.allow_growth = gpu_allow_growth

            _print("Using GPU if available.")
            _print("Using {}% of GPU memory.".format(
                100 *
                session_config.gpu_options.per_process_gpu_memory_fraction))
            _print("Allowing growth of GPU memory: {}".format(
                session_config.gpu_options.allow_growth))

        graph = tf.Graph()
        sess = tf.Session(graph=graph, config=session_config)

        # This HAS to come after the creation of the session, otherwise
        # it allocates all GPU memory if using the GPU.
        _print("\nAvailable devices: ")
        from tensorflow.python.client import device_lib
        _print(device_lib.list_local_devices())

        if not cfg.use_gpu:
            _print("Not using GPU.")
            stack.enter_context(graph.device("/cpu:0"))

        stack.enter_context(graph.as_default())
        stack.enter_context(sess)
        stack.enter_context(sess.as_default())

        # Set the seed for the stage.
        tf_seed = gen_seed()
        _print(
            "Setting tensorflow seed to generated seed: {}\n".format(tf_seed))
        tf.set_random_seed(tf_seed)

        tf.logging.set_verbosity(tf.logging.ERROR)
Example #3
def sample_configs(distributions, n_repeats, n_samples=None):
    """ Samples configs from a distribution for hyper-parameter search.

    Parameters
    ----------
    distributions: dict or None
        Mapping from parameter names to distributions (objects with
        member function ``rvs`` which accepts a shape and produces
        an array of samples with that shape).
    n_repeats: int > 0
        Number of different seeds to use for each sampled configuration.
    n_samples: int > 0
        Number of configs to sample. If not supplied, all possible
        configurations are generated.

    """
    samples = []

    if distributions is None:
        samples = [Config()]
    elif isinstance(distributions, list):
        samples = distributions + []

        if n_samples:
            samples = list(np.random.permutation(samples)[:n_samples])
    else:
        if not n_samples:
            samples = generate_all(distributions)
        else:
            samples = nested_sample(distributions, n_samples)

    print("Sampled configs:")
    pprint(samples)

    configs = []
    for i, s in enumerate(samples):
        s['idx'] = i
        for r in range(n_repeats):
            _new = copy.deepcopy(s)
            _new['repeat'] = r
            _new['seed'] = gen_seed()
            configs.append(_new)

    return configs
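A minimal usage sketch, using scipy.stats frozen distributions as the ``rvs`` objects; the hyper-parameter names and ranges are illustrative, not from the project:

from scipy import stats

distributions = {
    'lr': stats.loguniform(1e-5, 1e-2),    # any object with an ``rvs`` method works
    'dropout': stats.uniform(0.0, 0.5),
}
configs = sample_configs(distributions, n_repeats=3, n_samples=10)
# 10 sampled settings x 3 repeats = 30 configs, each carrying 'idx', 'repeat' and 'seed'.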
Example #4
    def framework_initialize_stage(self, stack):
        # Set the seed for the stage.
        torch_seed = gen_seed()
        _print(
            "Setting pytorch seed to generated seed: {}\n".format(torch_seed))
        torch.manual_seed(torch_seed)

        torch.backends.cudnn.enabled = True

        torch.backends.cudnn.benchmark = cfg.pytorch_cudnn_benchmark
        torch.backends.cudnn.deterministic = cfg.pytorch_cudnn_deterministic

        if cfg.use_gpu:
            _print("Trying to use GPU...")
            try:
                device = torch.cuda.current_device()
                use_gpu = True
            except AssertionError:
                tb.print_exc()
                use_gpu = False
        else:
            use_gpu = False

        if use_gpu:
            _print("Using GPU.")

            _print("Device count: {}".format(torch.cuda.device_count()))
            _print("Device idx: {}".format(device))
            _print("Device name: {}".format(
                torch.cuda.get_device_name(device)))
            _print("Device capability: {}".format(
                torch.cuda.get_device_capability(device)))

            set_pytorch_device('cuda')
        else:
            _print("Not using GPU.")
            set_pytorch_device('cpu')

        torch.set_printoptions(profile='full')
Example #5
def build_and_submit(category,
                     exp_name,
                     config,
                     distributions,
                     n_param_settings=0,
                     n_repeats=1,
                     do_local_test=False,
                     kind="local",
                     readme="",
                     tasks_per_gpu=1,
                     **run_kwargs):
    """ Build a job and submit it. Meant to be called from within a script.

    Parameters
    ----------
    category: str
        High-level category of the experiment. Determines the ExperimentStore
        where the experiment data will be stored.
    exp_name: str
        Low-level name of the experiment.
    config: Config instance or dict
        Configuration to use as the base config for all jobs.
    distributions: dict
        Object used to generate variations of the base config (so that different
        jobs test different parameters).
    n_param_settings: int
        Number of different configurations to sample from `distributions`. If not
        supplied, it is assumed that `distributions` actually specifies a grid
        search, and an attempt is made to generate all possible configurations in
        that grid search.
    n_repeats: int
        Number of experiments to run (with different random seeds) for each
        generated configuration.
    do_local_test: bool
        If True, sample one of the generated configurations and use it to run a
        short test locally, to ensure that the jobs will run properly.
    kind: str
        One of pbs, slurm, slurm-local, parallel, local. Specifies which method
        should be used to run the jobs in parallel.
    readme: str
        A string outlining the purpose/context for the created experiment.
    tasks_per_gpu: int
        Number of tasks to run on each GPU; used to compute the resources to
        request for the job.
    **run_kwargs:
        Additional arguments that are ultimately passed to `ParallelSession` in
        order to run the job.

    """
    # Get run_kwargs from command line
    sig = inspect.signature(ParallelSession.__init__)
    default_run_kwargs = sig.bind_partial()
    default_run_kwargs.apply_defaults()
    cl_run_kwargs = clify.command_line(default_run_kwargs.arguments).parse()
    run_kwargs.update(cl_run_kwargs)

    if config.seed is None or config.seed < 0:
        config.seed = gen_seed()

    assert kind in "pbs slurm slurm-local parallel local".split()
    assert 'build_command' not in config
    config['build_command'] = ' '.join(sys.argv)
    print(config['build_command'])

    if kind == "local":
        with config:
            from dps.train import training_loop
            return training_loop()
    else:
        config.name = category
        config = config.copy()

        if readme == "_vim_":
            readme = edit_text(prefix="dps_readme_",
                               editor="vim",
                               initial_text="README.md: \n")

        scratch = os.path.join(cfg.parallel_experiments_build_dir, category)

        archive_path, n_tasks = build_search(scratch,
                                             exp_name,
                                             distributions,
                                             config,
                                             add_date=1,
                                             _zip=True,
                                             do_local_test=do_local_test,
                                             n_param_settings=n_param_settings,
                                             n_repeats=n_repeats,
                                             readme=readme)

        run_kwargs.update(archive_path=archive_path,
                          category=category,
                          exp_name=exp_name,
                          kind=kind)

        gpu_kind = run_kwargs.get('gpu_kind', None)
        resources = compute_required_resources(n_tasks, tasks_per_gpu,
                                               gpu_kind)
        run_kwargs.update(resources)

        parallel_session = submit_job(**run_kwargs)

        return parallel_session
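A hedged usage sketch; the category, experiment name, config fields and grid values are illustrative, not from the project:

config = Config(env_name='my_env', seed=-1, max_steps=100000)    # illustrative fields
distributions = dict(lr=[1e-4, 1e-3, 1e-2])                      # grid of learning rates

parallel_session = build_and_submit(
    category='my_category', exp_name='lr_search',
    config=config, distributions=distributions,
    n_param_settings=0,       # 0/None: enumerate the full grid rather than sampling
    n_repeats=5, kind='slurm',
    readme='Sweep over learning rates.')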
Example #6
def build_search(path,
                 name,
                 distributions,
                 config,
                 n_repeats,
                 n_param_settings=None,
                 _zip=True,
                 add_date=0,
                 do_local_test=True,
                 readme=""):
    """ Create a job implementing a hyper-parameter search.

    Parameters
    ----------
    path: str
        Path to the directory where the search archive will be saved.
    name: str
        Name for the search.
    distributions: dict (str -> (list or distribution))
        Distributions to sample from. Can also be a list of samples.
    config: Config instance
        The base configuration.
    n_repeats: int
        Number of different random seeds to run each sample with.
    n_param_settings: int
        Number of parameter settings to sample. If not supplied, all
        possibilities are generated.
    _zip: bool
        Whether to zip the created search directory.
    add_date: bool
        Whether to add the date to the name of the experiment directory.
    do_local_test: bool
        If True, run a short test using one of the sampled
        configs on the local machine to catch any dumb errors
        before starting the real experiment.
    readme: str
        String specifying the context/purpose of the search.

    """
    if config.get('seed', None) is None:
        config.seed = gen_seed()

    with NumpySeed(config.seed):
        es = ExperimentStore(path, prefix="build_search")

        count = 0
        base_name = name
        has_built = False
        while not has_built:
            try:
                exp_dir = es.new_experiment(name,
                                            config.seed,
                                            add_date=add_date,
                                            force_fresh=1)
                has_built = True
            except FileExistsError:
                name = "{}_{}".format(base_name, count)
                count += 1

        if readme:
            with open(exp_dir.path_for('README.md'), 'w') as f:
                f.write(readme)

        print(config)
        exp_dir.record_environment(config=config)

        print("Building parameter search at {}.".format(exp_dir.path))

        job = Job(exp_dir.path)

        new_configs = sample_configs(distributions, n_repeats,
                                     n_param_settings)

        with open(exp_dir.path_for("sampled_configs.txt"), "w") as f:
            f.write("\n".join("idx={}: {}".format(c["idx"], pformat(c))
                              for c in new_configs))

        print("{} configs were sampled for parameter search.".format(
            len(new_configs)))

        if do_local_test:
            print("\nStarting local test " + ("=" * 80))
            test_config = new_configs[0].copy()
            test_config.update(max_steps=1000, render_hook=None)
            _RunTrainingLoop(config)(test_config)
            print("Done local test " + ("=" * 80) + "\n")

        job.map(_RunTrainingLoop(config.copy()), new_configs)

        job.save_object('metadata', 'distributions', distributions)
        job.save_object('metadata', 'config', config)

        print(job.summary())

        if _zip:
            path = job.zip(delete=True)
            print("Zipped {} as {}.".format(exp_dir.path, path))
        else:
            path = exp_dir.path

        return path, len(new_configs)
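The docstring notes that `distributions` may also be a list of pre-built samples; a hedged sketch of that form, with illustrative values:

samples = [dict(lr=1e-3, n_layers=2), dict(lr=1e-4, n_layers=3)]    # pre-sampled settings

archive_path, n_tasks = build_search(
    path='/tmp/dps_search',    # illustrative scratch directory
    name='lr_vs_depth',
    distributions=samples,
    config=config,             # a Config instance, as in the sketch after Example #5
    n_repeats=2,
    do_local_test=False)
# n_tasks == 4 here: 2 sampled settings x 2 repeats.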
Example #7
File: env.py Project: alcinos/dps
    def seed(self, seed=None):
        np.random.seed(seed)
        for env in self._env_copies:
            s = gen_seed()
            env.seed(s)
Example #8
File: train.py Project: alcinos/dps
    def _run(self):
        print(cfg.to_string())

        threshold_reached = True
        self.global_step = 0
        self.n_global_experiences = 0
        self.curriculum_remaining = self.curriculum + []
        self.curriculum_complete = []

        stage_idx = 0
        while self.curriculum_remaining:
            print("\n" + "=" * 50)
            self.timestamp("Starting stage {}".format(stage_idx))
            print("\n")

            if cfg.start_tensorboard:
                restart_tensorboard(self.experiment_store.path, cfg.tbport, cfg.reload_interval)

            stage_config = self.curriculum_remaining.pop(0)
            stage_config = Config(stage_config)

            self.data.start_stage(stage_idx, stage_config)

            with ExitStack() as stack:

                # --------------- Stage set-up -------------------

                print("\n" + "-" * 10 + " Stage set-up " + "-" * 10)

                print("\nNew config values for this stage are: \n{}\n".format(pformat(stage_config)))
                stack.enter_context(stage_config)

                stage_prepare_func = cfg.get("stage_prepare_func", None)
                if callable(stage_prepare_func):
                    stage_prepare_func()  # Modify the stage config in arbitrary ways before starting stage

                self.mpi_context.start_stage()

                # Configure and create session and graph for stage.
                session_config = tf.ConfigProto()
                session_config.intra_op_parallelism_threads = cfg.get('intra_op_parallelism_threads', 0)
                session_config.inter_op_parallelism_threads = cfg.get('inter_op_parallelism_threads', 0)
                session_config.log_device_placement = cfg.get('log_device_placement', 0)

                if cfg.use_gpu:
                    per_process_gpu_memory_fraction = getattr(cfg, 'per_process_gpu_memory_fraction', None)
                    if per_process_gpu_memory_fraction:
                        session_config.gpu_options.per_process_gpu_memory_fraction = per_process_gpu_memory_fraction

                    gpu_allow_growth = getattr(cfg, 'gpu_allow_growth', None)
                    if gpu_allow_growth:
                        session_config.gpu_options.allow_growth = gpu_allow_growth

                if cfg.use_gpu:
                    print("Using GPU if available.")
                    print("Using {}% of GPU memory.".format(
                        100 * session_config.gpu_options.per_process_gpu_memory_fraction))
                    print("Allowing growth of GPU memory: {}".format(session_config.gpu_options.allow_growth))

                graph = tf.Graph()
                sess = tf.Session(graph=graph, config=session_config)

                # This HAS to come after the creation of the session, otherwise
                # it allocates all GPU memory if using the GPU.
                print("\nAvailable devices: ")
                from tensorflow.python.client import device_lib
                print(device_lib.list_local_devices())

                if not cfg.use_gpu:
                    print("Not using GPU.")
                    stack.enter_context(graph.device("/cpu:0"))

                stack.enter_context(graph.as_default())
                stack.enter_context(sess)
                stack.enter_context(sess.as_default())

                # Set the seed for the stage. Notice we generate a new tf seed for each stage.
                tf_seed = gen_seed()
                print("Setting tensorflow seed to generated seed: {}\n".format(tf_seed))
                tf.set_random_seed(tf_seed)

                # Set limit on CPU RAM for the stage
                cpu_ram_limit_mb = cfg.get("cpu_ram_limit_mb", None)
                if cpu_ram_limit_mb is not None:
                    stack.enter_context(memory_limit(cfg.cpu_ram_limit_mb))

                print("Building env...\n")

                # Maybe build env
                if stage_idx == 0 or not cfg.preserve_env:
                    if getattr(self, 'env', None):
                        self.env.close()

                    self.env = cfg.build_env()

                if hasattr(self.env, "print_memory_footprint"):
                    self.env.print_memory_footprint()

                print("\nDone building env.\n")
                print("Building updater...\n")

                import warnings
                with warnings.catch_warnings():
                    warnings.simplefilter('once')

                    if cfg.n_procs > 1:
                        updater = cfg.get_updater(self.env, mpi_context=self.mpi_context)
                    else:
                        updater = cfg.get_updater(self.env)

                    updater.stage_idx = stage_idx
                    updater.exp_dir = self.exp_dir

                    updater.build_graph()
                    print("\nDone building updater.\n")

                walk_variable_scopes(max_depth=3)

                # Maybe initialize network weights.
                # Let a *path_specification* be one of three things:
                #     1. An integer specifying a stage to load the best hypothesis from.
                #     2. A string of format: "stage_idx,kind" where `stage_idx` specifies a stage to load from
                #        and `kind` is either "final" or "best", specifying whether to load final or best
                #        hypothesis from that stage.
                #     3. A path on the filesystem that gives a prefix for a tensorflow checkpoint file to load from.
                #
                # Then cfg.load_path can either be a path_specification itself, in which case all variables
                # in the network will be loaded from that path_specification, or a dictionary mapping from
                # variable scope names to path specifications, in which case all variables in each supplied
                # variable scope name will be loaded from the path_specification paired with that scope name.
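                # Illustrative load_path values (not taken from any actual config):
                #     cfg.load_path = 2                        # best hypothesis from stage 2
                #     cfg.load_path = "3,final"                # final hypothesis from stage 3
                #     cfg.load_path = "/path/to/ckpt_prefix"   # tensorflow checkpoint prefix
                #     cfg.load_path = {"encoder": 1, "decoder": "/path/to/ckpt_prefix"}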
                load_path = cfg.load_path
                if load_path is not None:
                    if isinstance(load_path, str) or isinstance(load_path, int):
                        load_path = {"": load_path}

                    load_path = dict(load_path)

                    # Sort in increasing order, so that it if one variable scope lies within another scope,
                    # the outer scope gets loaded before the inner scope, rather than having the outer scope
                    # wipe out the inner scope.
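                    # e.g. scopes {"encoder": ..., "encoder/rnn": ...} sort to
                    # ["encoder", "encoder/rnn"]: "encoder" is restored first, then the
                    # nested "encoder/rnn" load overrides that part of it.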
                    items = sorted(load_path.items())

                    for var_scope, path in items:
                        variables = {v.name: v for v in trainable_variables(var_scope, for_opt=False)}
                        if not variables:
                            print("No variables to load in scope {}.".format(str(var_scope)))
                            continue

                        saver = tf.train.Saver(variables)

                        load_stage, kind = None, None

                        if isinstance(path, int):
                            load_stage = path
                            kind = "best"
                        elif isinstance(path, str):
                            try:
                                split = path.split(',')
                                load_stage = int(split[0])
                                kind = split[1] if len(split) > 1 else 'best'
                                assert kind in 'best final'.split(), "path={}".format(path)
                            except Exception:
                                load_stage, kind = None, None

                        if load_stage is not None:
                            if stage_idx == 0:
                                print(
                                    "Not loading var scope \"{}\" from stage {}, "
                                    "currently in stage 0.".format(var_scope, load_stage))
                                continue
                            else:
                                key = kind + '_path'
                                completed_history = self.data.history[:-1]
                                path = completed_history[load_stage][key]

                        path = os.path.realpath(path)

                        saver.restore(tf.get_default_session(), path)

                        print("Loading var scope \"{}\" from {}.".format(var_scope, path))
                else:
                    print("Using a fresh set of weights, not loading anything.")

                tf.train.get_or_create_global_step()
                sess.run(uninitialized_variables_initializer())
                sess.run(tf.assert_variables_initialized())

                for hook in cfg.hooks:
                    assert isinstance(hook, Hook)
                    hook.start_stage(self, updater, stage_idx)

                threshold_reached = False
                reason = None

                try:
                    # --------------- Run stage -------------------

                    start = time.time()
                    phys_memory_before = memory_usage(physical=True)
                    gpu_memory_before = gpu_memory_usage()

                    threshold_reached, reason = self._run_stage(stage_idx, updater)

                except KeyboardInterrupt:
                    reason = "User interrupt"

                except NotImplementedError as e:
                    # There is a bug in pdb_postmortem that prevents instances of `NotImplementedError`
                    # from being handled properly, so replace it with an instance of `Exception`.
                    if cfg.robust:
                        traceback.print_exc()
                        reason = "Exception occurred ({})".format(repr(e))
                    else:
                        raise Exception("NotImplemented") from e

                except Exception as e:
                    reason = "Exception occurred ({})".format(repr(e))
                    if cfg.robust:
                        traceback.print_exc()
                    else:
                        raise

                except Alarm:
                    reason = "Time limit exceeded"
                    raise

                finally:
                    phys_memory_after = memory_usage(physical=True)
                    gpu_memory_after = gpu_memory_usage()

                    self.data.record_values_for_stage(
                        stage_duration=time.time()-start,
                        phys_memory_before_mb=phys_memory_before,
                        phys_memory_delta_mb=phys_memory_after - phys_memory_before,
                        gpu_memory_before_mb=gpu_memory_before,
                        gpu_memory_delta_mb=gpu_memory_after - gpu_memory_before
                    )

                    self.data.record_values_for_stage(reason=reason)

                    print("\n" + "-" * 10 + " Optimization complete " + "-" * 10)
                    print("\nReason: {}.\n".format(reason))

                    final_path = self.data.path_for('weights/final_for_stage_{}'.format(stage_idx))
                    final_path = cfg.get('save_path', final_path)
                    final_path = updater.save(tf.get_default_session(), final_path)
                    self.data.record_values_for_stage(final_path=final_path)

                    # --------------- Maybe render performance of best hypothesis -------------------

                    do_final_testing = (
                        "Exception occurred" not in reason
                        and reason != "Time limit exceeded"
                        and 'best_path' in self.data.current_stage_record)

                    if do_final_testing:
                        try:
                            print("\n" + "-" * 10 + " Final testing/rendering " + "-" * 10)

                            print("Best hypothesis for this stage was found on "
                                  "step (l: {best_local_step}, g: {best_global_step}) "
                                  "with stopping criteria ({sc_name}) of {best_stopping_criteria}.".format(
                                      sc_name=self.stopping_criteria_name, **self.data.current_stage_record))

                            best_path = self.data.current_stage_record['best_path']
                            print("Loading best hypothesis for this stage "
                                  "from file {}...".format(best_path))
                            updater.restore(sess, best_path)

                            test_record = updater.evaluate(cfg.batch_size, mode="test")

                            for hook in cfg.hooks:
                                if hook.call_per_timestep and hook.final:
                                    hook_record = hook.step(self, updater)

                                    if hook_record:
                                        assert len(hook_record) == 1
                                        for k, d in dict(hook_record).items():
                                            test_record.update(d)

                            self.data.record_values_for_stage(
                                **{'_test_' + k: v for k, v in test_record.items()})

                            if cfg.render_step > 0 and cfg.render_hook is not None:
                                print("Rendering...")
                                cfg.render_hook(updater)
                                print("Done rendering.")

                        except BaseException:
                            print("Exception occurred while performing final testing/rendering: ")
                            traceback.print_exc()

                    else:
                        print("\n" + "-" * 10 + " Skipping final testing/rendering " + "-" * 10)

                    # --------------- Finish up the stage -------------------

                    self.data.end_stage(updater.n_updates)

                    print("\n" + "-" * 10 + " Running end-of-stage hooks " + "-" * 10 + "\n")
                    for hook in cfg.hooks:
                        hook.end_stage(self, stage_idx)

                    print()
                    self.timestamp("Done stage {}".format(stage_idx))
                    print("=" * 50)

                    stage_idx += 1
                    self.curriculum_complete.append(stage_config)

                if not (threshold_reached or cfg.power_through):
                    print("Failed to reach stopping criteria threshold on stage {} "
                          "of the curriculum, terminating.".format(stage_idx))
                    break
Example #9
File: train.py Project: alcinos/dps
    def run(self, start_time):
        """ Run the training loop.

        Parameters
        ----------
        start_time: int
            Start time (in seconds since epoch) for measuring elapsed time for
            purposes of interrupting the training loop.

        """
        if start_time is None:
            start_time = time.time()
        self.start_time = start_time

        self.timestamp("Entering TrainingLoop.run")

        prepare_func = cfg.get("prepare_func", None)
        if callable(prepare_func):
            prepare_func()  # Modify the config in arbitrary ways before training
        else:
            try:
                prepare_funcs = list(prepare_func)
            except (TypeError, ValueError):
                pass
            else:
                for f in prepare_funcs:
                    if callable(f):
                        f()

        self.curriculum = cfg.curriculum + []

        if cfg.seed is None or cfg.seed < 0:
            cfg.seed = gen_seed()

        # Create a directory to store the results of the training session.
        self.experiment_store = ExperimentStore(os.path.join(cfg.local_experiments_dir, cfg.env_name))
        exp_dir = self.experiment_store.new_experiment(
            self.exp_name, cfg.seed, add_date=1, force_fresh=1, update_latest=False)
        self.exp_dir = exp_dir
        cfg.path = exp_dir.path

        breaker = "-" * 40
        header = "{}\nREADME.md - {}\n{}\n\n\n".format(breaker, os.path.basename(exp_dir.path), breaker)
        readme = header + (cfg.readme if cfg.readme else "") + "\n\n"

        with open(exp_dir.path_for('README.md'), 'w') as f:
            f.write(readme)

        self.data = _TrainingLoopData(exp_dir)
        self.data.setup()

        frozen_data = None

        with ExitStack() as stack:
            if cfg.pdb:
                stack.enter_context(pdb_postmortem())
                print("`pdb` is turned on, so forcing setting robust=False")
                cfg.robust = False

            stack.enter_context(redirect_stream('stdout', self.data.path_for('stdout'), tee=cfg.tee))
            stack.enter_context(redirect_stream('stderr', self.data.path_for('stderr'), tee=cfg.tee))

            print("\n\n" + "=" * 80)
            self.timestamp("Starting training run (name={})".format(self.exp_name))

            print("\nDirectory for this training run is {}.".format(exp_dir.path))

            stack.enter_context(NumpySeed(cfg.seed))
            print("\nSet numpy random seed to {}.\n".format(cfg.seed))

            limiter = time_limit(
                self.time_remaining, verbose=True,
                timeout_callback=lambda limiter: print("Training run exceeded its time limit."))

            self.mpi_context = MPI_MasterContext(cfg.get('n_procs', 1), exp_dir)

            try:
                with limiter:
                    self._run()

            finally:
                self.data.summarize()

                self.timestamp("Done training run (name={})".format(self.exp_name))
                print("=" * 80)
                print("\n\n")

                frozen_data = self.data.freeze()

        self.timestamp("Leaving TrainingLoop.run")

        return frozen_data
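The `prepare_func` handling above accepts either a single callable or an iterable of callables; a hedged sketch of the two config forms, using throwaway lambdas as stand-ins:

# Single callable:
cfg.prepare_func = lambda: print("adjusting config before training")
# ... or an iterable of callables, each invoked in turn:
cfg.prepare_func = [lambda: print("first tweak"), lambda: print("second tweak")]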
Example #10
def make_dataset_in_parallel(run_kwargs, dataset_cls, param_values=None):
    """ Uses dps.hyper.parallel_session.ParallelSession to create a dataset in parallel. """

    # Get run_kwargs from command line
    sig = inspect.signature(ParallelSession.__init__)
    default_run_kwargs = sig.bind_partial()
    default_run_kwargs.apply_defaults()
    cl_run_kwargs = clify.command_line(default_run_kwargs.arguments).parse()
    run_kwargs.update(cl_run_kwargs)

    param_values = param_values or dataset_cls._capture_param_values()
    param_values = Config(param_values)
    seed = param_values["seed"]
    if seed is None or seed < 0:
        seed = gen_seed()

    n_examples = param_values["n_examples"]
    n_examples_per_shard = run_kwargs["n_examples_per_shard"]

    experiment_store = ExperimentStore(
        cfg.parallel_experiments_build_dir, prefix="build_{}".format(dataset_cls.__name__))

    count = 0
    name = "attempt=0"
    has_built = False
    while not has_built:
        try:
            exp_dir = experiment_store.new_experiment(name, seed, add_date=True, force_fresh=True)
            has_built = True
        except FileExistsError:
            count += 1
            name = "attempt_{}".format(count)

    print("Building dataset.")

    job = Job(exp_dir.path)
    n_examples_remaining = n_examples

    with NumpySeed(seed):
        inputs = []
        idx = 0
        while n_examples_remaining:
            seed = gen_seed()
            cur_n_examples = min(n_examples_remaining, n_examples_per_shard)
            n_examples_remaining -= cur_n_examples

            inputs.append((idx, seed, cur_n_examples))
            idx += 1

        job.map(_BuildDataset(dataset_cls, param_values), inputs)
        job.save_object('metadata', 'param_values', param_values)

    print(job.summary())
    archive_path = job.zip(delete=True)
    print("Zipped {} as {}.".format(exp_dir.path, archive_path))

    run_kwargs = run_kwargs.copy()

    del run_kwargs['n_examples_per_shard']

    run_kwargs.update(
        archive_path=archive_path, name=name, kind="parallel",
        parallel_exe=cfg.parallel_exe)
    parallel_session = submit_job(**run_kwargs)

    with cd(os.path.join(parallel_session.job_path, 'experiments')):
        dataset_files = []
        for dir_path, dirs, files in os.walk('.'):
            if not dir_path.startswith("./exp__seed="):
                continue

            df = [f for f in files if not f.endswith('.cfg')]
            assert len(df) == 1
            dataset_files.append(os.path.join(dir_path, df[0]))

        cached_filename = os.path.join(cfg.data_dir, "cached_datasets", dataset_cls.__name__, str(get_param_hash(param_values)))

        command = "cat " + " ".join(dataset_files) + " > " + cached_filename
        print("Running command: \n" + command)
        subprocess.run(command, shell=True, check=True)
        print("Done.")

        with open(cached_filename + ".cfg", 'w') as f:
            f.write(pprint.pformat(param_values))

    return parallel_session
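The loop under `NumpySeed(seed)` above splits `n_examples` into shards of at most `n_examples_per_shard`, each with a fresh seed; a standalone sketch of that arithmetic with made-up numbers:

n_examples, n_examples_per_shard = 2500, 1000
inputs, idx, n_remaining = [], 0, n_examples
while n_remaining:
    cur = min(n_remaining, n_examples_per_shard)
    n_remaining -= cur
    inputs.append((idx, cur))    # the real loop also attaches a per-shard seed
    idx += 1
print(inputs)    # [(0, 1000), (1, 1000), (2, 500)]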
Example #11
def run_stage(mpi_context, env, stage_idx, exp_dir):
    config, seed = mpi_context.start_stage()

    with ExitStack() as stack:
        stack.enter_context(config)
        stack.enter_context(NumpySeed(seed))

        # Accept config for new stage
        print("\n" + "-" * 10 + " Stage set-up " + "-" * 10)

        print(cfg.to_string())

        # Configure and create session and graph for stage.
        session_config = tf.ConfigProto()
        session_config.intra_op_parallelism_threads = cfg.get(
            'intra_op_parallelism_threads', 0)
        session_config.inter_op_parallelism_threads = cfg.get(
            'inter_op_parallelism_threads', 0)

        # if cfg.use_gpu:
        #     per_process_gpu_memory_fraction = getattr(cfg, 'per_process_gpu_memory_fraction', None)
        #     if per_process_gpu_memory_fraction:
        #         session_config.gpu_options.per_process_gpu_memory_fraction = \
        #             per_process_gpu_memory_fraction

        #     gpu_allow_growth = getattr(cfg, 'gpu_allow_growth', None)
        #     if gpu_allow_growth:
        #         session_config.gpu_options.allow_growth = gpu_allow_growth

        # if cfg.use_gpu:
        #     print("Using GPU if available.")
        #     print("Using {}% of GPU memory.".format(
        #         100 * session_config.gpu_options.per_process_gpu_memory_fraction))
        #     print("Allowing growth of GPU memory: {}".format(session_config.gpu_options.allow_growth))

        graph = tf.Graph()
        sess = tf.Session(graph=graph, config=session_config)

        # This HAS to come after the creation of the session, otherwise
        # it allocates all GPU memory if using the GPU.
        print("\nAvailable devices:")
        from tensorflow.python.client import device_lib
        print(device_lib.list_local_devices())

        # if not cfg.use_gpu:
        #     print("Not using GPU.")
        #     stack.enter_context(graph.device("/cpu:0"))

        stack.enter_context(graph.device("/cpu:0"))

        stack.enter_context(graph.as_default())
        stack.enter_context(sess)
        stack.enter_context(sess.as_default())

        tf_seed = gen_seed()
        print(
            "Setting tensorflow seed to generated seed: {}\n".format(tf_seed))
        tf.set_random_seed(tf_seed)

        # Set limit on CPU RAM for the stage
        cpu_ram_limit_mb = cfg.get("cpu_ram_limit_mb", None)
        if cpu_ram_limit_mb is not None:
            stack.enter_context(memory_limit(cfg.cpu_ram_limit_mb))

        print("Building env...\n")

        # Maybe build env
        if stage_idx == 0 or not cfg.preserve_env:
            if env is not None:
                env.close()
            env = cfg.build_env()

        if hasattr(env, "print_memory_footprint"):
            env.print_memory_footprint()

        print("\nDone building env.\n")
        print("Building updater...\n")

        updater = cfg.get_updater(env, mpi_context=mpi_context)
        updater.stage_idx = stage_idx
        updater.exp_dir = exp_dir

        updater.build_graph()
        print("\nDone building updater.\n")

        # walk_variable_scopes(max_depth=3)

        tf.train.get_or_create_global_step()
        sess.run(uninitialized_variables_initializer())
        sess.run(tf.assert_variables_initialized())

        updater.worker_code()

        stage_idx += 1

    return env