Example #1
def play(policy_file, seed, n_test_rollouts, render):
    set_global_seeds(seed)

    # Load policy.
    with open(policy_file, 'rb') as f:
        policy = pickle.load(f)
    env_name = policy.info['env_name']

    # Load params
    with open(PARAMS_FILE) as json_file:
        params = json.load(json_file)

    params['env_name'] = env_name

    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    structure = params['structure']
    task_selection = params['task_selection']
    goal_selection = params['goal_selection']

    dims = config.configure_dims(params)

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
        'structure': structure,
        'task_selection': task_selection,
        'goal_selection': goal_selection,
        'queue_length': params['queue_length'],
        'eval': True,
        'render': render
    }

    for name in ['T', 'gamma', 'noise_eps', 'random_eps']:
        eval_params[name] = params[name]

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger,
                              **eval_params)
    evaluator.seed(seed)

    # Run evaluation.
    evaluator.clear_history()
    for _ in range(n_test_rollouts):
        evaluator.generate_rollouts()

    # record logs
    for key, val in evaluator.logs('test'):
        logger.record_tabular(key, np.mean(val))
    logger.dump_tabular()
Example #2
def main(policy_file, seed, n_test_rollouts, render):
    """
    run HER from a saved policy

    :param policy_file: (str) pickle path to a saved policy
    :param seed: (int) initial seed
    :param n_test_rollouts: (int) the number of test rollouts
    :param render: (bool) if rendering should be done
    """
    set_global_seeds(seed)

    # Load policy.
    with open(policy_file, 'rb') as file_handler:
        policy = pickle.load(file_handler)
    env_name = policy.info['env_name']

    # Prepare params.
    params = config.DEFAULT_PARAMS
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name]
                      )  # merge env-specific parameters in
    params['env_name'] = env_name
    params = config.prepare_params(params)
    config.log_params(params, logger_input=logger)

    dims = config.configure_dims(params)

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'compute_q': True,
        'rollout_batch_size': 1,
        'render': bool(render),
    }

    for name in ['time_horizon', 'gamma', 'noise_eps', 'random_eps']:
        eval_params[name] = params[name]

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger,
                              **eval_params)
    evaluator.seed(seed)

    # Run evaluation.
    evaluator.clear_history()
    for _ in range(n_test_rollouts):
        evaluator.generate_rollouts()

    # record logs
    for key, val in evaluator.logs('test'):
        logger.record_tabular(key, np.mean(val))
    logger.dump_tabular()
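A minimal sketch of how this main() might be exposed on the command line; the argparse wrapper below and its flag names are illustrative assumptions, not part of the original snippet.

import argparse

if __name__ == '__main__':
    # Hypothetical CLI wrapper around the main() defined above; flag names are assumed.
    parser = argparse.ArgumentParser(description='Evaluate a saved HER policy.')
    parser.add_argument('--policy_file', type=str, required=True,
                        help='pickle path to a saved policy')
    parser.add_argument('--seed', type=int, default=0, help='initial seed')
    parser.add_argument('--n_test_rollouts', type=int, default=10,
                        help='number of test rollouts')
    parser.add_argument('--render', type=int, default=1,
                        help='whether rollouts should be rendered')
    args = parser.parse_args()
    main(args.policy_file, args.seed, args.n_test_rollouts, bool(args.render))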
Example #3
def main(policy_file, seed, n_test_rollouts, render, record):
    set_global_seeds(seed)

    # Load policy.
    with open(policy_file, 'rb') as f:
        policy = pickle.load(f)
    env_name = policy.info['env_name']

    # Prepare params.
    params = config.DEFAULT_PARAMS
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params['env_name'] = env_name
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'compute_Q': True,
        'rollout_batch_size': 1,
        'render': bool(render),
    }

    for name in ['T', 'gamma', 'noise_eps', 'random_eps']:
        eval_params[name] = params[name]

    if record:
        make_env = params['make_env']
        def video_callable(episode_id):
            return True
        def make_record_env():
            env = make_env()
            return gym.wrappers.Monitor(env, '../../../results/video/' + env_name, force=True, video_callable=video_callable)
        params['make_env'] = make_record_env
    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(seed)

    # Run evaluation.
    evaluator.clear_history()
    for _ in range(n_test_rollouts):
        evaluator.generate_rollouts()

    # record logs
    for key, val in evaluator.logs('test'):
        logger.record_tabular(key, np.mean(val))
    logger.dump_tabular()
Example #4
def main(policy_file, seed, n_test_rollouts, render, with_forces):
    set_global_seeds(seed)

    # Load policy.
    with open(policy_file, 'rb') as f:
        policy = pickle.load(f)
    env_name = policy.info['env_name']

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['with_forces'] = with_forces
    params['plot_forces'] = False
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name]
                      )  # merge env-specific parameters in
    params['env_name'] = env_name
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'compute_Q': True,
        'rollout_batch_size': 1,
        'with_forces': with_forces,
        'plot_forces': False,
        'render': bool(render),
    }

    for name in ['T', 'gamma', 'noise_eps', 'random_eps']:
        eval_params[name] = params[name]

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger,
                              **eval_params)
    evaluator.seed(seed)

    # Run evaluation.
    evaluator.clear_history()
    for _ in range(n_test_rollouts):
        evaluator.generate_rollouts()

    # record logs
    for key, val in evaluator.logs('test'):
        logger.record_tabular(key, np.mean(val))
    logger.dump_tabular()
Example #5
def main(policy_file, seed, n_test_rollouts, render):
    set_global_seeds(seed)

    # Load policy.
    with open(policy_file, 'rb') as f:
        policy = pickle.load(f)
    env_name = policy.info['env_name']

    # Prepare params.
    params = config.DEFAULT_PARAMS
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params['env_name'] = env_name
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'compute_Q': True,
        'rollout_batch_size': 1,
        'render': bool(render),
    }

    for name in ['T', 'gamma', 'noise_eps', 'random_eps']:
        eval_params[name] = params[name]
    
    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(seed)

    # Run evaluation.
    evaluator.clear_history()
    for _ in range(n_test_rollouts):
        evaluator.generate_rollouts()

    # record logs
    for key, val in evaluator.logs('test'):
        logger.record_tabular(key, np.mean(val))
    logger.dump_tabular()
Example #6
def launch(env,
           logdir,
           n_epochs,
           num_cpu,
           seed,
           replay_strategy,
           policy_save_interval,
           clip_return,
           k,
           n,
           override_params={},
           save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
        except CalledProcessError:
            # fancy version of mpi call failed, try simple version
            whoami = mpi_fork(num_cpu)

        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env
    params['replay_strategy'] = replay_strategy
    if env in config.DEFAULT_ENV_PARAMS:
        params.update(
            config.DEFAULT_ENV_PARAMS[env])  # merge env-specific parameters in
    params.update(
        **override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the '
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) '
            'were obtained with --num_cpu 19. This makes a significant difference and if you '
            'are looking to reproduce those results, be aware of this. Please also refer to '
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger,
                                   **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger,
                              **eval_params)
    evaluator.seed(rank_seed)
    train(logdir=logdir,
          policy=policy,
          rollout_worker=rollout_worker,
          k=k,
          n=n,
          evaluator=evaluator,
          n_epochs=n_epochs,
          n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'],
          n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval,
          save_policies=save_policies,
          threshold=params['threshold'])
Example #7
def launch(env_name,
           logdir,
           n_epochs,
           num_cpu,
           seed,
           replay_strategy,
           policy_save_interval,
           clip_return,
           override_params={},
           save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name]
                      )  # merge env-specific parameters in
    params.update(
        **override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger,
                                   **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger,
                              **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir,
          policy=policy,
          rollout_worker=rollout_worker,
          evaluator=evaluator,
          n_epochs=n_epochs,
          n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'],
          n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval,
          save_policies=save_policies)
Example #8
def main(policy_file, seed, n_test_rollouts, render, exploit, compute_q,
         collect_data, goal_generation, note):
    set_global_seeds(seed)

    # Load policy.
    with open(policy_file, 'rb') as f:
        policy = pickle.load(f)
    env_name = policy.info['env_name']

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['note'] = note or params['note']
    if note:
        with open('params/' + env_name + '/' + note + '.json', 'r') as file:
            override_params = json.loads(file.read())
            params.update(**override_params)

    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name]
                      )  # merge env-specific parameters in
    params['env_name'] = env_name
    goal_generation = params['goal_generation']
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)

    eval_params = {
        'exploit': exploit,  # eval: True, train: False
        'use_target_net': params['test_with_polyak'],  # eval/train: False
        'compute_Q': compute_q,  # eval: True, train: False
        'rollout_batch_size': 1,
        'render': render,
    }

    for name in ['T', 'gamma', 'noise_eps', 'random_eps']:
        eval_params[name] = params[name]

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger,
                              **eval_params)
    evaluator.seed(seed)

    # Run evaluation.
    evaluator.clear_history()
    num_skills = params['num_skills']

    if goal_generation == 'Zero':
        generated_goal = np.zeros(evaluator.g.shape)
    else:
        generated_goal = False

    for z in range(num_skills):
        assert (evaluator.rollout_batch_size == 1)
        z_s_onehot = np.zeros([evaluator.rollout_batch_size, num_skills])
        z_s_onehot[0, z] = 1

        base = os.path.splitext(policy_file)[0]
        for i_test_rollouts in range(n_test_rollouts):
            if render == 'rgb_array' or render == 'human':

                imgs, episode = evaluator.generate_rollouts(
                    generated_goal=generated_goal, z_s_onehot=z_s_onehot)
                end = '_test_{:02d}_exploit_{}_compute_q_{}_skill_{}.avi'.format(
                    i_test_rollouts, exploit, compute_q, z)
                test_filename = base + end
                save_video(imgs[0], test_filename, lib='cv2')
            else:
                episode = evaluator.generate_rollouts(
                    generated_goal=generated_goal, z_s_onehot=z_s_onehot)

            if collect_data:
                end = '_test_{:02d}_exploit_{}_compute_q_{}_skill_{}.txt'.format(
                    i_test_rollouts, exploit, compute_q, z)
                test_filename = base + end
                with open(test_filename, 'w') as file:
                    file.write(json.dumps(episode['o'].tolist()))

    # record logs
    for key, val in evaluator.logs('test'):
        logger.record_tabular(key, np.mean(val))
    logger.dump_tabular()
Example #9
def launch(env_name,
           n_epochs,
           num_cpu,
           seed,
           replay_strategy,
           policy_save_interval,
           clip_return,
           temperature,
           prioritization,
           binding,
           logging,
           version,
           dump_buffer,
           n_cycles,
           rank_method,
           w_potential,
           w_linear,
           w_rotational,
           clip_energy,
           override_params={},
           save_policies=True):

    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu, binding)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging

    if logging:
        logdir = 'logs/'+str(env_name)+'-temperature'+str(temperature)+\
                 '-prioritization'+str(prioritization)+'-replay_strategy'+str(replay_strategy)+\
                 '-n_epochs'+str(n_epochs)+'-num_cpu'+str(num_cpu)+'-seed'+str(seed)+\
                 '-n_cycles'+str(n_cycles)+'-rank_method'+str(rank_method)+\
                 '-w_potential'+str(w_potential)+'-w_linear'+str(w_linear)+'-w_rotational'+str(w_rotational)+\
                 '-clip_energy'+str(clip_energy)+\
                 '-version'+str(version)
    else:
        logdir = osp.join(
            tempfile.gettempdir(),
            datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))

    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    params['temperature'] = temperature
    params['prioritization'] = prioritization
    params['binding'] = binding
    params['max_timesteps'] = n_epochs * params['n_cycles'] * params[
        'n_batches'] * num_cpu
    params['version'] = version
    params['dump_buffer'] = dump_buffer
    params['n_cycles'] = n_cycles
    params['rank_method'] = rank_method
    params['w_potential'] = w_potential
    params['w_linear'] = w_linear
    params['w_rotational'] = w_rotational
    params['clip_energy'] = clip_energy
    params['n_epochs'] = n_epochs
    params['num_cpu'] = num_cpu

    if params['dump_buffer']:
        params['alpha'] = 0

    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name]
                      )  # merge env-specific parameters in
    params.update(
        **override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger,
                                   **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger,
                              **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir,
          policy=policy,
          rollout_worker=rollout_worker,
          evaluator=evaluator,
          n_epochs=n_epochs,
          n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'],
          n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval,
          save_policies=save_policies,
          num_cpu=num_cpu,
          dump_buffer=dump_buffer,
          w_potential=params['w_potential'],
          w_linear=params['w_linear'],
          w_rotational=params['w_rotational'],
          rank_method=rank_method,
          clip_energy=clip_energy)
Example #10
def launch(
    env_name, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return,
    override_params={}, save_policies=True
):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(
        logdir=logdir, policy=policy, rollout_worker=rollout_worker,
        evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
        n_cycles=params['n_cycles'], n_batches=params['n_batches'],
        policy_save_interval=policy_save_interval, save_policies=save_policies)
Example #11
def launch(env_name,
           n_epochs,
           num_cpu,
           seed,
           replay_strategy,
           policy_save_interval,
           clip_return,
           binding,
           logging,
           version,
           n_cycles,
           note,
           override_params={},
           save_policies=True):

    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu, binding)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging

    if logging:
        logdir = 'logs/' + str(env_name) + '-replay_strategy' + str(
            replay_strategy) + '-n_epochs' + str(n_epochs) + '-num_cpu' + str(
                num_cpu) + '-seed' + str(seed) + '-n_cycles' + str(
                    n_cycles) + '-version' + str(
                        version) + '-T-' + datetime.datetime.now().strftime(
                            "%Y-%m-%d-%H-%M-%S")
    else:
        logdir = osp.join(
            tempfile.gettempdir(),
            datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))

    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()  # use temp folder for other rank
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    params['binding'] = binding
    params['max_timesteps'] = n_epochs * params['n_cycles'] * params[
        'n_batches'] * num_cpu
    params['version'] = version
    params['n_cycles'] = n_cycles
    params['num_cpu'] = num_cpu
    params['note'] = note or params['note']
    if note:
        with open('params/' + env_name + '/' + note + '.json', 'r') as file:
            override_params = json.loads(file.read())
            params.update(**override_params)

    if params['load_weight']:
        if type(params['load_weight']) is list:
            params['load_weight'] = params['load_weight'][seed]
        base = os.path.splitext(params['load_weight'])[0]
        policy_weight_file = open(base + '_weight.pkl', 'rb')
        pretrain_weights = pickle.load(policy_weight_file)
        policy_weight_file.close()
    else:
        pretrain_weights = None

    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name]
                      )  # merge env-specific parameters in
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   pretrain_weights=pretrain_weights,
                                   clip_return=clip_return)

    render = False
    if params['collect_video']:
        render = 'rgb_array'

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
        'render': render,
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger,
                                   **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger,
                              **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir,
          policy=policy,
          rollout_worker=rollout_worker,
          evaluator=evaluator,
          n_epochs=n_epochs,
          n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'],
          n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval,
          save_policies=save_policies,
          num_cpu=num_cpu,
          collect_data=params['collect_data'],
          collect_video=params['collect_video'],
          goal_generation=params['goal_generation'],
          num_skills=params['num_skills'],
          use_skill_n=params['use_skill_n'],
          batch_size=params['_batch_size'],
          mi_r_scale=params['mi_r_scale'],
          mi_end_epoch=params['mi_end_epoch'],
          sk_r_scale=params['sk_r_scale'],
          no_train_mi=params['no_train_mi'])
Example #12
def launch(env,
           logdir,
           n_epochs,
           num_cpu,
           seed,
           replay_strategy,
           policy_save_interval,
           clip_return,
           override_params={},
           save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
        except CalledProcessError:
            # fancy version of mpi call failed, try simple version
            whoami = mpi_fork(num_cpu)

        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    # if rank == 0:
    #     if logdir or logger.get_dir() is None:
    #         logger.configure(dir=logdir)
    # else:
    #     logger.configure()
    # logdir = logger.get_dir()
    # assert logdir is not None
    # os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env
    params['replay_strategy'] = replay_strategy
    if env in config.DEFAULT_ENV_PARAMS:
        params.update(
            config.DEFAULT_ENV_PARAMS[env])  # merge env-specific parameters in
    params.update(
        **override_params)  # makes it possible to override any parameter
    # with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
    #     json.dump(params, f)
    # {'Q_lr': 0.001, 'action_l2': 1.0, 'batch_size': 256, 'buffer_size': 1000000, 'clip_obs': 200.0,
    # 'env_name': u'GazeboSmartBotPinc...Kinect-v0', 'hidden': 256, 'layers': 3, 'max_u': 1.0, 'n_batches': 40,
    # 'n_cycles': 50, 'n_test_rollouts': 10, 'network_class': 'baselines.her.actor...torCritic', 'noise_eps': 0.2, ...}
    params = config.prepare_params(
        params
    )  # {'T': 200, '_Q_lr': 0.001, '_action_l2': 1.0, '_batch_size': 256, '_buffer_size': 1000000, '_clip_obs': 200.0, '_hidden': 256, '_layers': 3, '_max_u': 1.0, '_network_class': 'baselines.her.actor...torCritic', '_norm_clip': 5, '_norm_eps': 0.01, '_pi_lr': 0.001, '_polyak': 0.95, ...}
    config.log_params(params, logger=logger)
    # T: 200
    # _Q_lr: 0.001
    # _action_l2: 1.0
    # _batch_size: 256
    # _buffer_size: 1000000
    # _clip_obs: 200.0
    # _hidden: 256
    # _layers: 3
    # _max_u: 1.0
    # _network_class: baselines.her.actor_critic:ActorCritic
    # _norm_clip: 5
    # _norm_eps: 0.01
    # _pi_lr: 0.001
    # _polyak: 0.95
    # _relative_goals: False
    # _scope: ddpg
    # ddpg_params: {'layers': 3, 'norm_clip': 5, 'action_l2': 1.0, 'pi_lr': 0.001, 'norm_eps': 0.01, 'batch_size': 256, 'polyak': 0.95, 'clip_obs': 200.0, 'network_class': 'baselines.her.actor_critic:ActorCritic', 'max_u': 1.0, 'Q_lr': 0.001, 'scope': 'ddpg', 'buffer_size': 1000000, 'hidden': 256, 'relative_goals': False}
    # env_name: GazeboSmartBotPincherKinect-v0
    # gamma: 0.995
    # make_env: <function make_env at 0x7f7de3cb2050>
    # n_batches: 40
    # n_cycles: 50
    # n_test_rollouts: 10
    # noise_eps: 0.2
    # random_eps: 0.3
    # replay_k: 4
    # replay_strategy: future
    # rollout_batch_size: 2
    # test_with_polyak: False

    # if num_cpu == 1:
    #     logger.warn()
    #     logger.warn('*** Warning ***')
    #     logger.warn(
    #         'You are running HER with just a single MPI worker. This will work, but the ' +
    #         'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
    #         'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
    #         'are looking to reproduce those results, be aware of this. Please also refer to ' +
    #         'https://github.com/openai/baselines/issues/314 for further details.')
    #     logger.warn('****************')
    #     logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger,
                                   **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger,
                              **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir,
          policy=policy,
          rollout_worker=rollout_worker,
          evaluator=evaluator,
          n_epochs=n_epochs,
          n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'],
          n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval,
          save_policies=save_policies)
Example #13
def launch(env,
           logdir,
           n_epochs,
           num_cpu,
           seed,
           replay_strategy,
           policy_save_interval,
           clip_return,
           override_params=None,
           save_policies=True):
    """
    launch training with mpi

    :param env: (str) environment ID
    :param logdir: (str) the log directory
    :param n_epochs: (int) the number of training epochs
    :param num_cpu: (int) the number of CPUs to run on
    :param seed: (int) the initial random seed
    :param replay_strategy: (str) the type of replay strategy ('future' or 'none')
    :param policy_save_interval: (int) the interval with which policy pickles are saved.
        If set to 0, only the best and latest policy will be pickled.
    :param clip_return: (float): clip returns to be in [-clip_return, clip_return]
    :param override_params: (dict) override any parameter for training
    :param save_policies: (bool) whether or not to save the policies
    """

    if override_params is None:
        override_params = {}
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
        except CalledProcessError:
            # fancy version of mpi call failed, try simple version
            whoami = mpi_fork(num_cpu)

        if whoami == 'parent':
            sys.exit(0)
        tf_util.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(folder=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env
    params['replay_strategy'] = replay_strategy
    if env in config.DEFAULT_ENV_PARAMS:
        params.update(
            config.DEFAULT_ENV_PARAMS[env])  # merge env-specific parameters in
    params.update(
        **override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'),
              'w') as file_handler:
        json.dump(params, file_handler)
    params = config.prepare_params(params)
    config.log_params(params, logger_input=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the '
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) '
            'were obtained with --num_cpu 19. This makes a significant difference and if you '
            'are looking to reproduce those results, be aware of this. Please also refer to '
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        # 'use_demo_states': True,
        'compute_q': False,
        'time_horizon': params['time_horizon'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        # 'use_demo_states': False,
        'compute_q': True,
        'time_horizon': params['time_horizon'],
    }

    for name in [
            'time_horizon', 'rollout_batch_size', 'gamma', 'noise_eps',
            'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger,
                                   **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger,
                              **eval_params)
    evaluator.seed(rank_seed)

    train(policy=policy,
          rollout_worker=rollout_worker,
          evaluator=evaluator,
          n_epochs=n_epochs,
          n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'],
          n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval,
          save_policies=save_policies)
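For illustration only, a call to the launch() documented above might look like the sketch below; the environment id and all hyper-parameter values are assumptions, not taken from the example.

if __name__ == '__main__':
    # Illustrative invocation of launch(); every value here is an assumption.
    launch(env='FetchReach-v1',            # goal-based Gym environment id
           logdir='/tmp/her_fetchreach',   # where logs and params.json are written
           n_epochs=50,
           num_cpu=1,                      # num_cpu > 1 forks MPI workers (see above)
           seed=0,
           replay_strategy='future',       # HER relabelling strategy ('future' or 'none')
           policy_save_interval=5,
           clip_return=50.)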
Example #14
def launch(env, trial_id, n_epochs, num_cpu, seed, policy_save_interval, clip_return, normalize_obs,
           structure, task_selection, goal_selection, goal_replay, task_replay, perturb, save_policies=True):

    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
        except CalledProcessError:
            # fancy version of mpi call failed, try simple version
            whoami = mpi_fork(num_cpu)

        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        save_dir = find_save_path('./save/' + env + "/", trial_id)
        logger.configure(dir=save_dir)
    else:
        save_dir = None

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params, add main function arguments and log all parameters
    if structure == 'curious' or structure == 'task_experts':
        params = config.MULTI_TASK_PARAMS
    else:
        params = config.DEFAULT_PARAMS

    time = str(datetime.datetime.now())
    params['time'] = time
    params['env_name'] = env
    params['task_selection'] = task_selection
    params['goal_selection'] = goal_selection
    params['task_replay'] = task_replay
    params['goal_replay'] = goal_replay
    params['structure'] = structure
    params['normalize_obs'] = normalize_obs
    params['num_cpu'] = num_cpu
    params['clip_return'] = clip_return
    params['trial_id'] = trial_id
    params['seed'] = seed
    if rank == 0:
        with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
            json.dump(params, f)
    params = config.prepare_params(params)
    params['ddpg_params']['normalize_obs'] = normalize_obs
    if rank == 0:
        config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Colas et al. (2018, https://arxiv.org/abs/1810.06284) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)

    buffers = config.configure_buffer(dims=dims, params=params)

    # creates several policies with shared buffers in the task-experts structure, otherwise use just one policy
    if structure == 'task_experts':
        policy = [config.configure_ddpg(dims=dims, params=params, buffers=buffers, clip_return=clip_return, t_id=i) for i in range(params['nb_tasks'])]
    else:
        policy = config.configure_ddpg(dims=dims, params=params, buffers=buffers, clip_return=clip_return)


    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
        'structure': structure,
        'task_selection': task_selection,
        'goal_selection': goal_selection,
        'queue_length': params['queue_length'],
        'eval': False,
        'eps_task': params['eps_task']
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
        'structure' : structure,
        'task_selection': task_selection,
        'goal_selection' : goal_selection,
        'queue_length': params['queue_length'],
        'eval': True,
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    if structure == 'task_experts':
        # create one rollout worker per policy/task
        rollout_worker = [RolloutWorker(params['make_env'], policy[i], dims, logger, unique_task=i, **rollout_params) for i in range(params['nb_tasks'])]
        for i in range(params['nb_tasks']):
            rollout_worker[i].seed(rank_seed + i)
    else:
        rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
        rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed + 100)

    train(logdir=save_dir, policy=policy, rollout_worker=rollout_worker, evaluator=evaluator, n_epochs=n_epochs,
          n_test_rollouts=params['n_test_rollouts'], n_cycles=params['n_cycles'], n_batches=params['n_batches'], perturbation_study=perturb,
          policy_save_interval=policy_save_interval, save_policies=save_policies, structure=structure, task_selection=task_selection, params=params)
Example #15
def launch(
    env_name, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return,
    override_params={}, save_policies=True
):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this. Please also refer to ' + 
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(
        logdir=logdir, policy=policy, rollout_worker=rollout_worker,
        evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
        n_cycles=params['n_cycles'], n_batches=params['n_batches'],
        policy_save_interval=policy_save_interval, save_policies=save_policies)
Example #16
def run_task(vv, log_dir=None, exp_name=None):
    override_params = {}
    # Fork for multi-CPU MPI implementation.
    if vv['num_cpu'] > 1:
        whoami = mpi_fork(vv['num_cpu'])
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    log_dir = '/media/part/cmu_ri/deep/deep_RL/data/local/square2d-debug/square2d_debug_2018_06_17/'  #hack for now, fix later

    # Configure logging
    if rank == 0:
        if log_dir or logger.get_dir() is None:
            from pathlib import Path
            logger.configure(dir=log_dir, exp_name=exp_name)
    else:
        if log_dir or logger.get_dir() is None:
            from pathlib import Path
            logger.configure(dir=log_dir, exp_name=exp_name)

    logdir = logger.get_dir()
    #logdir = ''# a quick hack, fix later
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = vv['seed'] + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = vv['env_name']
    params['replay_strategy'] = vv['replay_strategy']
    params['replay_sample_strategy'] = vv['replay_sample_strategy']
    params['reward_type'] = vv['reward_type']
    params['replay_k'] = vv['replay_k']
    if vv['network'] == 'fc':
        params['network_class'] = 'baselines.her.actor_critic:ActorCritic'
    elif vv['network'] == 'cnn_fc':
        params['network_class'] = 'baselines.her.cnn_actor_critic:CNNActorCritic'

    if vv['env_name'] in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[
            vv['env_name']])  # merge env-specific parameters in
    params.update(
        **override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'variant.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    shapes = config.configure_shapes(params)
    dims = shapes_to_dims(shapes)
    policy = config.configure_ddpg(dims=dims,
                                   shapes=shapes,
                                   params=params,
                                   clip_return=vv['clip_return'])

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger,
                                   **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger,
                              **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir,
          policy=policy,
          rollout_worker=rollout_worker,
          evaluator=evaluator,
          n_epochs=vv['n_epochs'],
          n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'],
          n_batches=params['n_batches'],
          policy_save_interval=vv['policy_save_interval'],
          save_policies=vv['save_policies'])