Example #1
def test_rlpyt_simple():
    """ partially copied from example 1 """
    game = "pong"
    run_ID = 0
    cuda_idx = None
    n_steps = 1
    sampler = SerialSampler(
        EnvCls=AtariEnv,
        TrajInfoCls=AtariTrajInfo,  # default traj info + GameScore
        env_kwargs=dict(game=game),
        eval_env_kwargs=dict(game=game),
        batch_T=4,  # Four time-steps per sampler iteration.
        batch_B=1,
        max_decorrelation_steps=0,
        eval_n_envs=10,
        eval_max_steps=int(10e3),
        eval_max_trajectories=5,
    )
    algo = DQN(min_steps_learn=1e3, replay_size=1e3)  # Small values to limit memory use.
    agent = AtariDqnAgent()
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=n_steps,
        log_interval_steps=1e3,
        affinity=dict(cuda_idx=cuda_idx),
    )
    config = dict(game=game)
    name = "dqn_" + game
    log_dir = "test_example_1"
    with logger_context(log_dir, run_ID, name, config, snapshot_mode="last"):
        runner.train()
Example #2
def build_and_train(log_dir, game="pong", run_ID=0, cuda_idx=None, eval=False):
    sampler = SerialSampler(
        EnvCls=AtariEnv,
        TrajInfoCls=AtariTrajInfo,  # default traj info + GameScore
        env_kwargs=dict(game=game),
        eval_env_kwargs=dict(game=game),
        batch_T=4,  # Four time-steps per sampler iteration.
        batch_B=1,
        max_decorrelation_steps=0,
        eval_n_envs=10,
        eval_max_steps=int(10e3),
        eval_max_trajectories=5,
    )
    algo = Dreamer()  # Run with defaults.
    agent = AtariDreamerAgent()
    runner_cls = MinibatchRlEval if eval else MinibatchRl
    runner = runner_cls(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=1e3,
        affinity=dict(cuda_idx=cuda_idx),
    )
    config = dict(game=game)
    name = "dreamer_" + game
    with logger_context(log_dir, run_ID, name, config, snapshot_mode="last", override_prefix=True,
                        use_summary_writer=True):
        runner.train()
Example #3
def build_and_train(game="academy_empty_goal_close", run_ID=0, cuda_idx=None):
    sampler = SerialSampler(
        EnvCls=create_single_football_env,
        env_kwargs=dict(game=game),
        eval_env_kwargs=dict(game=game),
        batch_T=4,  # Four time-steps per sampler iteration.
        batch_B=1,
        max_decorrelation_steps=0,
        eval_n_envs=10,
        eval_max_steps=int(10e3),
        eval_max_trajectories=5,
    )
    algo = DQN(min_steps_learn=1e3)  # Run with defaults.
    agent = AtariDqnAgent()
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=1e3,
        affinity=dict(cuda_idx=cuda_idx),
    )
    config = dict(game=game)
    name = "dqn_" + game
    log_dir = "example_1"
    with logger_context(log_dir, run_ID, name, config, snapshot_mode="last"):
        runner.train()
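The build_and_train() functions above omit the code that actually launches them. Below is a minimal launcher sketch, assuming the usual rlpyt example pattern of parsing command-line flags and forwarding them to the function from Example #3; the flag names and defaults simply mirror that function's signature and are illustrative only.

import argparse

if __name__ == "__main__":
    # Hypothetical CLI wrapper; flags mirror build_and_train() in Example #3.
    parser = argparse.ArgumentParser()
    parser.add_argument('--game', type=str, default='academy_empty_goal_close')
    parser.add_argument('--run_ID', type=int, default=0)
    parser.add_argument('--cuda_idx', type=int, default=None)
    args = parser.parse_args()
    build_and_train(game=args.game, run_ID=args.run_ID, cuda_idx=args.cuda_idx)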
Example #4
def build_and_train(env_id="CartPole-v0", run_ID=0, cuda_idx=None):
    sampler = SerialSampler(
        EnvCls=gym_make,
        env_kwargs=dict(id=env_id),
        eval_env_kwargs=dict(id=env_id),
        batch_T=1,  # One time-step per sampler iteration.
        batch_B=1,  # One environment (i.e. sampler Batch dimension).
        max_decorrelation_steps=0,
        eval_n_envs=10,
        eval_max_steps=int(51e3),
        eval_max_trajectories=50,
    )
    algo = PPO()  # Run with defaults.
    agent = RecurrentCategoricalPgAgent()
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=1e6,
        log_interval_steps=1e4,
        affinity=dict(cuda_idx=cuda_idx),
    )
    config = dict(env_id=env_id)
    name = "ppo_" + env_id
    log_dir = "ppo_test"
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #5
def build_and_train(run_id=0, greedy_eval=False):
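    # Note: `horizon`, `MyEnv`, and `MyAgent` are assumed to be defined
    # elsewhere in the original script; they are not shown in this excerpt.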
    sampler = SerialSampler(
        EnvCls=MyEnv,
        env_kwargs=dict(),
        eval_env_kwargs=dict(),
        batch_T=horizon,
        batch_B=64,
        max_decorrelation_steps=0,
        eval_n_envs=64,
        eval_max_steps=int(1e6),
        eval_max_trajectories=64,
    )
    runner = MinibatchRl(
        algo=PPO(entropy_loss_coeff=0., learning_rate=3e-4),
        agent=MyAgent(greedy_eval),
        sampler=sampler,
        n_steps=int(400 * horizon * 64),
        log_interval_steps=int(10 * horizon * 64),
    )
    log_params = dict()

    log_dir = "data/rl_example_2/{}".format(
        datetime.datetime.today().strftime("%Y%m%d_%H%M"))
    with logger_context(log_dir,
                        run_id,
                        'Reacher2D',
                        log_params=log_params,
                        snapshot_mode="last",
                        use_summary_writer=True,
                        override_prefix=True):
        runner.train()
Example #6
def build_and_train(env_id="HalfCheetah-Directional-v0", run_ID=0, cuda_idx=None, n_parallel=6):
    affinity = dict(cuda_idx=cuda_idx, workers_cpus=list(range(n_parallel)), alternating=True)
    env_args = dict(id=env_id)
    env_args[RLPYT_WRAPPER_KEY] = [ClipActionsWrapper]
    # sampler = GpuSampler(
    #     EnvCls=gym_make,
    #     env_kwargs=env_args,
    #     eval_env_kwargs=env_args,
    #     batch_T=256,  # 256 time-steps per sampler iteration.
    #     batch_B=8,  # Eight environments (i.e. sampler batch dimension).
    #     max_decorrelation_steps=100,
    #     eval_n_envs=5,
    #     eval_max_steps=int(25e3),
    #     eval_max_trajectories=30
    # )
    # agent = MujocoFfOcAgent(model_kwargs={'option_size': 2})

    # sampler = AlternatingSampler(
    #     EnvCls=gym_make,
    #     env_kwargs=env_args,
    #     eval_env_kwargs=env_args,
    #     batch_T=256,  # 256 time-steps per sampler iteration.
    #     batch_B=8,  # Eight environments (i.e. sampler batch dimension).
    #     max_decorrelation_steps=100,
    #     eval_n_envs=5,
    #     eval_max_steps=int(25e3),
    #     eval_max_trajectories=30
    # )
    # agent = AlternatingMujocoFfOcAgent(model_kwargs={'option_size': 2})

    sampler = SerialSampler(
        EnvCls=gym_make,
        env_kwargs=env_args,
        eval_env_kwargs=env_args,
        batch_T=256,  # 256 time-steps per sampler iteration.
        batch_B=8,  # Eight environments (i.e. sampler batch dimension).
        max_decorrelation_steps=0,
        # eval_n_envs=2,
        # eval_max_steps=int(51e2),
        # eval_max_trajectories=5,
    )
    agent = MujocoFfOcAgent(model_kwargs={'option_size': 2})

    algo = PPOC(clip_vf_loss=False, normalize_rewards='return')  # Run with defaults.
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=1e6,
        log_interval_steps=1e3,
        affinity=affinity,
        transfer=True,
        transfer_iter=150,
        log_traj_window=10
    )
    config = dict(env_id=env_id)
    name = "ppoc_" + env_id
    log_dir = "example_2a_ppoc"
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #7
def build_and_train(level="nav_maze_random_goal_01", run_ID=0, cuda_idx=None):
    affinity = dict(cuda_idx=cuda_idx, workers_cpus=list(range(8)))
    sampler = SerialSampler(
        EnvCls=DeepmindLabEnv,
        env_kwargs=dict(level=level),
        eval_env_kwargs=dict(level=level),
        batch_T=4,  # Four time-steps per sampler iteration.
        batch_B=1,
        max_decorrelation_steps=0,
        eval_n_envs=5,
        eval_max_steps=int(10e3),
        eval_max_trajectories=5,
    )
    algo = PPO()
    agent = AtariFfAgent()
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=1e3,
        affinity=affinity,
    )
    config = dict(level=level)
    name = "lab_ppo"
    log_dir = "lab_example_3"
    with logger_context(log_dir, run_ID, name, config, snapshot_mode="last"):
        runner.train()
Example #8
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    print('Variant', variant)
    config = update_config(config, variant)

    sampler = SerialSampler(EnvCls=DMControlEnv,
                            env_kwargs=config["env"],
                            CollectorCls=CpuResetCollector,
                            eval_env_kwargs=config["eval_env"],
                            **config["sampler"])
    algo = SAC(optim_kwargs=config["optim"], **config["algo"])
    agent = SacAgent(**config["agent"])
    runner = MinibatchRlEval(algo=algo,
                             agent=agent,
                             sampler=sampler,
                             affinity=affinity,
                             **config["runner"])
    name = "sac_{}_{}".format(config['env']['domain'], config['env']['task'])

    with logger_context(log_dir,
                        run_ID,
                        name,
                        log_params=config,
                        snapshot_mode='last'):
        runner.train()
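Launchers like the one above pull all of their settings from a shared configs dict (indexed by config_key) and then overlay a variant loaded from the log directory. The sketch below shows a hypothetical entry consistent with how the config is unpacked above; the nested key names follow those accesses, and every value is a placeholder rather than a recommended setting.

# Hypothetical configs entry; the nested keys mirror how the launcher above
# indexes config (env, eval_env, sampler, optim, algo, agent, runner).
configs = dict(
    sac_dmc_state=dict(
        env=dict(domain="cheetah", task="run"),
        eval_env=dict(domain="cheetah", task="run"),
        sampler=dict(batch_T=1, batch_B=1, max_decorrelation_steps=0,
                     eval_n_envs=2, eval_max_steps=int(10e3),
                     eval_max_trajectories=4),
        optim=dict(),  # optimizer kwargs forwarded to SAC
        algo=dict(batch_size=256, replay_size=int(1e6)),
        agent=dict(),
        runner=dict(n_steps=1e6, log_interval_steps=1e4),
    ),
)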
Example #9
def build_and_train(
    slot_affinity_code="0slt_1gpu_1cpu",
    log_dir="test",
    run_ID="0",
    config_key="ppo_ul_16env",
):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)

    pprint.pprint(config)

    sampler = SerialSampler(
        EnvCls=AtariEnv84,
        env_kwargs=config["env"],
        CollectorCls=CpuResetCollector,
        TrajInfoCls=AtariTrajInfo,
        eval_env_kwargs=config["env"],  # Same args!
        **config["sampler"])
    algo = PpoUl(optim_kwargs=config["optim"], **config["algo"])
    agent = AtariPgAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRl(algo=algo,
                         agent=agent,
                         sampler=sampler,
                         affinity=affinity,
                         **config["runner"])
    name = config["env"]["game"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #10
def build_and_train(cfg, game="ftwc", run_ID=0):
    # NOTE (GVS): for ftwc/qait, consider CpuWaitResetCollector (or CpuResetCollector).
    sampler = SerialSampler(
        EnvCls=AtariEnv,
        TrajInfoCls=AtariTrajInfo,  # default traj info + GameScore
        env_kwargs=dict(game=game),
        eval_env_kwargs=dict(game=game),
        batch_T=4,  # Four time-steps per sampler iteration.
        batch_B=1,
        max_decorrelation_steps=0,
        eval_n_envs=10,
        eval_max_steps=int(10e2),
        eval_max_trajectories=5,
    )
    algo = DQN(min_steps_learn=1e2)  # Run with defaults.
    agent = AtariDqnAgent()
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=1e3,
        affinity=dict(cuda_idx=cfg.cuda_idx),
    )
    config = dict(game=game)
    name = "dqn_" + game
    log_dir = "ftwc"
    with logger_context(log_dir, run_ID, name, config, snapshot_mode="last"):
        runner.train()
Example #11
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)
    # config["eval_env"]["id"] = config["env"]["id"]

    sampler = SerialSampler(
        EnvCls=gym_make,
        env_kwargs=config["env"],
        CollectorCls=CpuResetCollector,
        eval_env_kwargs=config["env"],
        **config["sampler"]
    )
    algo = SAC(optim_kwargs=config["optim"], **config["algo"])
    agent = SacAgent(**config["agent"])
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"]
    )
    name = "sac_" + config["env"]["id"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #12
def build_and_train(
    slot_affinity_code="0slt_1gpu_1cpu",
    log_dir="test",
    run_ID="0",
    config_key="sac_with_ul",
):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)

    pprint.pprint(config)

    sampler = SerialSampler(
        EnvCls=make,
        env_kwargs=config["env"],
        CollectorCls=CpuResetCollector,
        # TrajInfoCls=AtariTrajInfo,
        eval_env_kwargs=config["env"],  # Same args!
        **config["sampler"])
    algo = SacWithUl(**config["algo"])
    agent = SacAgent(conv_kwargs=config["conv"],
                     fc1_kwargs=config["fc1"],
                     pi_model_kwargs=config["pi_model"],
                     q_model_kwargs=config["q_model"],
                     **config["agent"])
    runner = MinibatchRlEvalEnvStep(algo=algo,
                                    agent=agent,
                                    sampler=sampler,
                                    affinity=affinity,
                                    frame_skip=config["env"]["frame_skip"],
                                    **config["runner"])
    name = config["env"]["domain_name"] + "_" + config["env"]["task_name"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #13
def build_and_train(env_id="Hopper-v3", run_ID=0, cuda_idx=None):
    env_args = dict(id=env_id)
    env_args[RLPYT_WRAPPER_KEY] = [ClipActionsWrapper]
    sampler = SerialSampler(
        EnvCls=gym_make,
        env_kwargs=env_args,
        eval_env_kwargs=env_args,
        batch_T=1,  # One time-step per sampler iteration.
        batch_B=1,  # One environment (i.e. sampler Batch dimension).
        max_decorrelation_steps=0,
        eval_n_envs=10,
        eval_max_steps=int(51e3),
        eval_max_trajectories=50,
    )
    algo = SAC()  # Run with defaults.
    agent = SacAgent()
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=1e6,
        log_interval_steps=1e4,
        affinity=dict(cuda_idx=cuda_idx),
    )
    config = dict(env_id=env_id)
    name = "sac_" + env_id
    log_dir = "example_2"
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #14
def build_and_train(game="pong", run_ID=0, cuda_idx=None):
    sampler = SerialSampler(
        EnvCls=AtariEnv,
        TrajInfoCls=AtariTrajInfo,  # default traj info + GameScore
        env_kwargs=dict(game=game),
        eval_env_kwargs=dict(game=game),
        batch_T=4,  # Four time-steps per sampler iteration.
        batch_B=1,
        max_decorrelation_steps=0,
        eval_n_envs=10,
        eval_max_steps=int(10e3),
        eval_max_trajectories=5,
    )
    algo = DQN(min_steps_learn=1e3)  # Run with defaults.
    agent = AtariDqnAgent()
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=1e3,
        affinity=dict(cuda_idx=cuda_idx),
    )
    config = dict(game=game)
    name = "dqn_" + game
    #log_dir = "example_1"
    log_dir = get_outputs_path()
    with logger_context(log_dir, run_ID, name, config, snapshot_mode="last"):
        runner.train()
Example #15
def build_and_train(level="nav_maze_random_goal_01", run_ID=0, cuda_idx=None):
    sampler = SerialSampler(
        EnvCls=DeepmindLabEnv,
        env_kwargs=dict(level=level),
        eval_env_kwargs=dict(level=level),
        batch_T=4,  # Four time-steps per sampler iteration.
        batch_B=1,
        max_decorrelation_steps=0,
        eval_n_envs=5,
        eval_max_steps=int(10e3),
        eval_max_trajectories=5,
    )
    algo = DQN(min_steps_learn=1e3)  # Run with defaults.
    agent = AtariDqnAgent()
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=1e5,
        affinity=dict(cuda_idx=cuda_idx),
    )
    config = dict(level=level)
    name = "lab_dqn"
    log_dir = "lab_example_1"
    with logger_context(log_dir, run_ID, name, config, snapshot_mode="last"):
        runner.train()
Example #16
def build_and_train(env_id="Hopper-v3", run_ID=0, cuda_idx=None):
    sampler = SerialSampler(
        EnvCls=gym_make,
        env_kwargs=dict(id=env_id),
        eval_env_kwargs=dict(id=env_id),
        batch_T=50,  # 50 time-steps per sampler iteration.
        batch_B=1,  # One environment (i.e. sampler Batch dimension).
        max_decorrelation_steps=0,
        eval_n_envs=2,
        eval_max_steps=int(51e3),
        eval_max_trajectories=200,
    )
    # The cost function for InvertedPendulumBulletEnv
    def obs_cost_fn(x):
        target = torch.FloatTensor([0,0,1,0,0])
        c = (x - target)**2
        c = -c.sum(dim=1)
        return -c.exp()
    algo = GP_Mlp(obs_cost_fn=obs_cost_fn)  # Run with defaults.
    agent = GP_MlpAgent()
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=1e6,
        log_interval_steps=200,
        affinity=dict(cuda_idx=cuda_idx),
    )
    config = dict(env_id=env_id)
    name = "gp_mlp_" + env_id
    log_dir = "example_1"
    with logger_context(log_dir, run_ID, name, config, snapshot_mode='last'):
        runner.train()
Example #17
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('snapshot_dir', type=str)
    parser.add_argument('max_q_eval_mode', type=str)
    parser.add_argument('--n_rollouts', type=int, default=10)
    args = parser.parse_args()

    snapshot_file = join(args.snapshot_dir, 'params.pkl')
    config_file = join(args.snapshot_dir, 'params.json')

    params = torch.load(snapshot_file, map_location='cpu')
    with open(config_file, 'r') as f:
        config = json.load(f)
    config['sampler']['batch_B'] = 1
    config['sampler']['eval_n_envs'] = 1
    config['sampler']['eval_max_trajectories'] = args.n_rollouts
    config['env']['task_kwargs']['maxq'] = True

    itr, cum_steps = params['itr'], params['cum_steps']
    print(f'Loading experiment at itr {itr}, cum_steps {cum_steps}')

    agent_state_dict = params['agent_state_dict']

    sac_agent_module = 'rlpyt.agents.qpg.{}'.format(config['sac_agent_module'])
    sac_agent_module = importlib.import_module(sac_agent_module)
    SacAgent = sac_agent_module.SacAgent

    agent = SacAgent(max_q_eval_mode=args.max_q_eval_mode, **config["agent"])
    sampler = SerialSampler(
        EnvCls=DMControlEnv,
        env_kwargs=config["env"],
        eval_env_kwargs=config["env"],
        **config["sampler"]
    )
    sampler.initialize(agent)
    agent.load_state_dict(agent_state_dict)

    agent.to_device(cuda_idx=0)
    agent.eval_mode(0)

    traj_infos = sampler.evaluate_agent(0)
    returns = [traj_info.Return for traj_info in traj_infos]
    lengths = [traj_info.Length for traj_info in traj_infos]

    print('Returns', returns)
    print(f'Average Return {np.mean(returns)}, Average Length {np.mean(lengths)}')
Example #18
def build_and_train(log_dir,
                    game="cartpole_balance",
                    run_ID=0,
                    cuda_idx=None,
                    eval=False,
                    save_model='last',
                    load_model_path=None):
    params = torch.load(load_model_path) if load_model_path else {}
    agent_state_dict = params.get('agent_state_dict')
    optimizer_state_dict = params.get('optimizer_state_dict')

    action_repeat = 2
    factory_method = make_wapper(DeepMindControl,
                                 [ActionRepeat, NormalizeActions, TimeLimit], [
                                     dict(amount=action_repeat),
                                     dict(),
                                     dict(duration=1000 / action_repeat)
                                 ])
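    # Note: `args` is assumed to be parsed at module level in the original
    # script (not shown here); only `args.state` is used below.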
    sampler = SerialSampler(
        EnvCls=factory_method,
        TrajInfoCls=TrajInfo,
        env_kwargs=dict(name=game, use_state=args.state),
        eval_env_kwargs=dict(name=game, use_state=args.state),
        batch_T=1,
        batch_B=1,
        max_decorrelation_steps=0,
        eval_n_envs=10,
        eval_max_steps=int(10e3),
        eval_max_trajectories=5,
    )
    algo = Dreamer(
        initial_optim_state_dict=optimizer_state_dict)  # Run with defaults.
    agent = DMCDreamerAgent(train_noise=0.3,
                            eval_noise=0,
                            expl_type="additive_gaussian",
                            expl_min=None,
                            expl_decay=None,
                            initial_model_state_dict=agent_state_dict)
    runner_cls = MinibatchRlEval if eval else MinibatchRl
    runner = runner_cls(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=5e6,
        log_interval_steps=1e3,
        affinity=dict(cuda_idx=cuda_idx),
    )
    config = dict(game=game)
    name = "dreamer_" + game
    with logger_context(log_dir,
                        run_ID,
                        name,
                        config,
                        snapshot_mode=save_model,
                        override_prefix=True,
                        use_summary_writer=True):
        runner.train()
Example #19
def build_and_train(game="fruitbot", run_ID=0, cuda_idx=None, n_parallel=6):
    affinity = dict(cuda_idx=cuda_idx, workers_cpus=list(range(n_parallel)), alternating=True)
    env_args = dict(game=game, start_level=0, num_levels=1)
    # sampler = AlternatingSampler(
    #     EnvCls=ProcgenEnv,
    #     env_kwargs=env_args,
    #     eval_env_kwargs=env_args,
    #     batch_T=256,  # 256 time-steps per sampler iteration.
    #     batch_B=12,  # Twelve environments (i.e. sampler batch dimension).
    #     max_decorrelation_steps=100,
    #     # eval_n_envs=5,
    #     # eval_max_steps=int(25e3),
    #     # eval_max_trajectories=30
    # )
    # sampler = GpuSampler(
    #     EnvCls=ProcgenEnv,
    #     env_kwargs=env_args,
    #     eval_env_kwargs=env_args,
    #     batch_T=256,  # 256 time-steps per sampler iteration.
    #     batch_B=12,  # Twelve environments (i.e. sampler batch dimension).
    #     max_decorrelation_steps=100,
    #     # eval_n_envs=5,
    #     # eval_max_steps=int(25e3),
    #     # eval_max_trajectories=30
    # )
    #
    sampler = SerialSampler(
        EnvCls=ProcgenEnv,
        env_kwargs=env_args,
        eval_env_kwargs=env_args,
        batch_T=256,  # 256 time-steps per sampler iteration.
        batch_B=8,  # Eight environments (i.e. sampler batch dimension).
        max_decorrelation_steps=0,
        # eval_n_envs=2,
        # eval_max_steps=int(51e2),
        # eval_max_trajectories=5,
    )

    algo = PPOC(clip_vf_loss=False, normalize_rewards=None)  # Run with defaults.
    agent = Agent(model_kwargs={'option_size': 2})
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=1e6,
        log_interval_steps=1e3,
        affinity=affinity,
        # transfer=True,
        # transfer_iter=150,
        # log_traj_window=10
    )
    config = dict(game=game)
    name = "ppo_" + game
    log_dir = "example_2a_fruitbot"
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #20
def estimateForState(s):
    cpus = list(range(C.N_PARALLEL))
    affinity = dict(cuda_idx=C.CUDA_IDX, workers_cpus=cpus)
    agent_ = CategoricalPgAgent(
        AcrobotNet, initial_model_state_dict=agent.state_dict())
    sampler = SerialSampler(
        EnvCls=rlpyt_make,
        env_kwargs=dict(id=C.ENV,
                        reward=rewardFn,
                        internalStateFn=C.INTERNAL_STATE_FN,
                        s0=s),
        batch_T=C.HORIZON,
        batch_B=C.BATCH_B,
        max_decorrelation_steps=0,
    )
    sampler.initialize(agent=agent_, affinity=affinity, seed=C.SEED)
    _, traj_info = sampler.obtain_samples(0)
    returns = [t['DiscountedReturn'] for t in traj_info]
    return np.mean(returns)
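A hypothetical usage sketch for estimateForState() above, assuming the surrounding module (not shown) provides C, agent, rewardFn, AcrobotNet, and rlpyt_make: it averages the policy's discounted return from a few start states. The state vectors here are placeholders; the real format depends on C.ENV and C.INTERNAL_STATE_FN.

import numpy as np

# Hypothetical start states for an Acrobot-style environment.
start_states = [np.zeros(4), np.array([0.1, 0.0, -0.1, 0.0])]
values = [estimateForState(s) for s in start_states]
print("Estimated discounted returns:", np.round(values, 3))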
Example #21
def build_and_train(game="pong", run_ID=0, cuda_idx=None, eval=False):
    action_repeat = 2
    env_kwargs = dict(
        name=game,
        action_repeat=action_repeat,
        size=(64, 64),
        grayscale=False,
        life_done=True,
        sticky_actions=True,
    )
    factory_method = make_wapper(
        AtariEnv, [OneHotAction, TimeLimit],
        [dict(), dict(duration=1000 / action_repeat)])
    sampler = SerialSampler(
        EnvCls=factory_method,
        TrajInfoCls=AtariTrajInfo,  # default traj info + GameScore
        env_kwargs=env_kwargs,
        eval_env_kwargs=env_kwargs,
        batch_T=1,
        batch_B=1,
        max_decorrelation_steps=0,
        eval_n_envs=10,
        eval_max_steps=int(10e3),
        eval_max_trajectories=5,
    )
    algo = Dreamer(
        batch_size=1,
        batch_length=5,
        train_every=10,
        train_steps=2,
        prefill=10,
        horizon=5,
        replay_size=100,
        log_video=False,
        kl_scale=0.1,
        use_pcont=True,
    )
    agent = AtariDreamerAgent(train_noise=0.4,
                              eval_noise=0,
                              expl_type="epsilon_greedy",
                              expl_min=0.1,
                              expl_decay=2000 / 0.3,
                              model_kwargs=dict(use_pcont=True))
    runner_cls = MinibatchRlEval if eval else MinibatchRl
    runner = runner_cls(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=20,
        log_interval_steps=10,
        affinity=dict(cuda_idx=cuda_idx),
    )
    runner.train()
Example #22
def build_and_train(game="montezuma_revenge",
                    run_ID=0,
                    cuda_idx=None,
                    n_parallel=6):
    affinity = dict(cuda_idx=cuda_idx,
                    workers_cpus=list(range(n_parallel)),
                    alternating=True)
    env_args = dict(id=game)
    # env_args[RLPYT_WRAPPER_KEY] = [ClipActionsWrapper]
    # sampler = AlternatingSampler(
    #     EnvCls=AtariEnv,
    #     TrajInfoCls=AtariTrajInfo,
    #     env_kwargs=dict(game=game),
    #     batch_T=64,  # 64 time-steps per sampler iteration.
    #     batch_B=36,  # 36 environments (i.e. sampler batch dimension).
    #     max_decorrelation_steps=1000,
    #     # eval_n_envs=5,
    #     # eval_max_steps=int(25e3),
    #     # eval_max_trajectories=30
    # )
    #
    sampler = SerialSampler(
        EnvCls=AtariEnv,
        TrajInfoCls=AtariTrajInfo,
        env_kwargs=dict(game=game),
        batch_T=256,  # 256 time-steps per sampler iteration.
        batch_B=8,  # Eight environments (i.e. sampler batch dimension).
        max_decorrelation_steps=1000,
        # eval_n_envs=2,
        # eval_max_steps=int(51e2),
        # eval_max_trajectories=5,
    )

    # algo = PPO(clip_vf_loss=False, normalize_rewards=None)  # Run with defaults.
    algo = A2OC(normalize_rewards=None)
    agent = AtariOcAgent(model_kwargs={'option_size': 4})
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=1e6,
        log_interval_steps=1e3,
        affinity=affinity,
        # transfer=True,
        # transfer_iter=150,
        # log_traj_window=10
    )
    config = dict(game=game)
    name = "ppo_" + game
    log_dir = "example_2a_atari"
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #23
def build_and_train(
    slot_affinity_code="0slt_1gpu_1cpu",
    log_dir="test",
    run_ID="0",
    config_key="serial_radsac",
    experiment_title="exp",
):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)

    # Hack so that the first part of the log_dir matches the source of the pretrained model.
    model_base_dir = config["pretrain"]["model_dir"]
    if model_base_dir is not None:
        # Strip the leading ~/GitRepos/adam/rlpyt/data/local/<timestamp>/ prefix.
        raw_log_dir = log_dir.split(experiment_title)[-1].lstrip("/")
        # Keep the UL part of the path, which comes first.
        model_sub_dir = raw_log_dir.split("/RlFromUl/")[0]
        pretrain_ID = config["pretrain"]["run_ID"]
        config["agent"]["state_dict_filename"] = osp.join(
            model_base_dir, model_sub_dir, f"run_{pretrain_ID}/params.pkl")

    pprint.pprint(config)

    sampler = SerialSampler(
        EnvCls=make,
        env_kwargs=config["env"],
        CollectorCls=CpuResetCollector,
        eval_env_kwargs=config["env"],  # Same args!
        **config["sampler"],
    )
    algo = RadSacFromUl(**config["algo"])
    agent = SacAgent(
        conv_kwargs=config["conv"],
        fc1_kwargs=config["fc1"],
        pi_model_kwargs=config["pi_model"],
        q_model_kwargs=config["q_model"],
        **config["agent"],
    )
    runner = MinibatchRlEvalEnvStep(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        frame_skip=config["env"]["frame_skip"],
        **config["runner"],
    )
    name = config["env"]["domain_name"] + "_" + config["env"]["task_name"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #24
def build_and_train(
    slot_affinity_code="0slt_0gpu_4cpu_4cpr",
    log_dir="test",
    run_ID="0",
    config_key="sac_ul_compress",
):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    # variant = load_variant(log_dir)
    # config = update_config(config, variant)

    config["algo"]["min_steps_rl"] = 100
    config["algo"]["min_steps_ul"] = 150
    config["algo"]["replay_size"] = 1e4
    config["algo"]["batch_size"] = 64
    config["algo"]["ul_batch_size"] = 32
    config["runner"]["n_steps"] = 1e3
    config["runner"]["log_interval_steps"] = 1e2
    config["sampler"]["eval_n_envs"] = 1
    config["sampler"]["eval_max_steps"] = 500
    config["algo"]["stop_rl_conv_grad"] = True
    config["algo"]["ul_update_schedule"] = "cosine_8"

    pprint.pprint(config)

    sampler = SerialSampler(
        EnvCls=make,
        env_kwargs=config["env"],
        CollectorCls=CpuResetCollector,
        # TrajInfoCls=AtariTrajInfo,
        eval_env_kwargs=config["env"],  # Same args!
        **config["sampler"])
    algo = SacUl(**config["algo"])
    agent = SacWithUlAgent(conv_kwargs=config["conv"],
                           fc1_kwargs=config["fc1"],
                           pi_model_kwargs=config["pi_model"],
                           q_model_kwargs=config["q_model"],
                           **config["agent"])
    runner = MinibatchRlEvalEnvStep(algo=algo,
                                    agent=agent,
                                    sampler=sampler,
                                    affinity=affinity,
                                    frame_skip=config["env"]["frame_skip"],
                                    **config["runner"])
    name = config["env"]["domain_name"] + "_" + config["env"]["task_name"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #25
def build_and_train(level="nav_maze_random_goal_01", run_ID=0, cuda_idx=None):
    config = configs['r2d1']
    config['eval_env'] = dict(level=level)
    config['env'] = dict(level=level)

    affinity = make_affinity(
        run_slot=0,
        n_cpu_core=4,  # Use 4 cores across all experiments.
        n_gpu=1,  # Use 1 gpu across all experiments.
        hyperthread_offset=6,  # If machine has 12 cores.
        n_socket=2,  # Presume CPU socket affinity to lower/upper half GPUs.
        gpu_per_run=1,  # How many GPUs to parallelize one run across.
    )

    # sampler = GpuSampler(
    #     EnvCls=DeepmindLabEnv,
    #     env_kwargs=config['env'],
    #     eval_env_kwargs=config['eval_env'],
    #     CollectorCls=GpuWaitResetCollector,
    #     TrajInfoCls=LabTrajInfo,
    #     **config["sampler"]
    # )
    sampler = SerialSampler(
        EnvCls=DeepmindLabEnv,
        env_kwargs=config['env'],
        eval_env_kwargs=config['env'],
        batch_T=16,  # 16 time-steps per sampler iteration.
        batch_B=1,
        max_decorrelation_steps=0,
        eval_n_envs=10,
        eval_max_steps=int(10e3),
        eval_max_trajectories=5,
    )
    algo = R2D1(optim_kwargs=config["optim"], **config["algo"])
    agent = AtariR2d1Agent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"]
    )
    name = "lab_dqn_" + level
    log_dir = "lab_example_2"
    with logger_context(log_dir, run_ID, name, config, snapshot_mode="last"):
        runner.train()
Example #26
def build_and_train(log_dir, level="Level_GoToLocalAvoidLava", run_ID=0, cuda_idx=None, eval=False, save_model='last', load_model_path=None):
    params = torch.load(load_model_path) if load_model_path else {}
    agent_state_dict = params.get('agent_state_dict')
    optimizer_state_dict = params.get('optimizer_state_dict')

    env_kwargs = dict(
        level=level,
        slipperiness=0.0,
        one_hot_obs=True,
    )
    factory_method = make_wapper(
        Minigrid,
        [OneHotAction, TimeLimit],
        [dict(), dict(duration=64)])
    sampler = SerialSampler(
        EnvCls=factory_method,
        TrajInfoCls=TrajInfo,
        env_kwargs=env_kwargs,
        eval_env_kwargs=env_kwargs,
        batch_T=1,
        batch_B=1,
        max_decorrelation_steps=0,
        eval_n_envs=10,
        eval_max_steps=int(10e3),
        eval_max_trajectories=5,
    )
    algo = Dreamer(horizon=10, kl_scale=0.1, use_pcont=True, initial_optim_state_dict=optimizer_state_dict,
                   env=OneHotAction(TimeLimit(Minigrid(**env_kwargs), 64)), save_env_videos=True)
    agent = MinigridDreamerAgent(train_noise=0.4, eval_noise=0, expl_type="epsilon_greedy",
                              expl_min=0.1, expl_decay=2000 / 0.3, initial_model_state_dict=agent_state_dict,
                              model_kwargs=dict(use_pcont=True, stride=1, shape=(20, 7, 7), depth=1, padding=2,
                                                full_conv=False))
    runner_cls = MinibatchRlEval if eval else MinibatchRl
    runner = runner_cls(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=5e6,
        log_interval_steps=1e3,
        affinity=dict(cuda_idx=cuda_idx),
    )
    config = dict(level=level)
    name = "dreamer_" + level
    with logger_context(log_dir, run_ID, name, config, snapshot_mode=save_model, override_prefix=True,
                        use_summary_writer=True):
        runner.train()
Example #27
def build_and_train(slot_affinity_code="0slt_0gpu_4cpu_4cpr",
                    log_dir="test",
                    run_ID="0",
                    config_key="ppo_ul_16env"):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    # variant = load_variant(log_dir)
    # config = update_config(config, variant)

    # config["sampler"]["batch_B"] = 4
    # config["sampler"]["batch_T"] = 5
    # config["runner"]["log_interval_steps"] = 100
    # config["runner"]["n_steps"] = 1000
    config["algo"]["ul_update_schedule"] = "constant_1"
    config["algo"]["min_steps_rl"] = 1e3
    config["algo"]["min_steps_ul"] = 200
    config["algo"]["max_steps_ul"] = 20e6
    config["model"]["stop_conv_grad"] = True
    config["sampler"]["max_decorrelation_steps"] = 0
    config["sampler"]["batch_B"] = 3
    config["sampler"]["batch_T"] = 20
    config["algo"]["ul_pri_alpha"] = 1.
    config["algo"]["ul_pri_n_step_return"] = 10
    config["algo"]["ul_replay_size"] = 900

    pprint.pprint(config)

    sampler = SerialSampler(
        EnvCls=AtariEnv84,
        env_kwargs=config["env"],
        CollectorCls=CpuResetCollector,
        TrajInfoCls=AtariTrajInfo,
        eval_env_kwargs=config["env"],  # Same args!
        **config["sampler"])
    algo = PpoUl(optim_kwargs=config["optim"], **config["algo"])
    agent = AtariPgRlWithUlAgent(model_kwargs=config["model"],
                                 **config["agent"])
    runner = MinibatchRl(algo=algo,
                         agent=agent,
                         sampler=sampler,
                         affinity=affinity,
                         **config["runner"])
    name = config["env"]["game"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #28
File: run.py Project: kevinghst/SPR
def build_and_train(game="pong", run_ID=0, cuda_idx=0, args=None):
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    env = AtariEnv
    config = set_config(args, game)

    sampler = SerialSampler(
        EnvCls=env,
        TrajInfoCls=AtariTrajInfo,  # default traj info + GameScore
        env_kwargs=config["env"],
        eval_env_kwargs=config["eval_env"],
        batch_T=config['sampler']['batch_T'],
        batch_B=config['sampler']['batch_B'],
        max_decorrelation_steps=0,
        eval_CollectorCls=OneToOneSerialEvalCollector,
        eval_n_envs=config["sampler"]["eval_n_envs"],
        eval_max_steps=config['sampler']['eval_max_steps'],
        eval_max_trajectories=config["sampler"]["eval_max_trajectories"],
    )
    args.discount = config["algo"]["discount"]
    algo = SPRCategoricalDQN(optim_kwargs=config["optim"],
                             jumps=args.jumps,
                             **config["algo"])  # Run with defaults.
    agent = SPRAgent(ModelCls=SPRCatDqnModel,
                     model_kwargs=config["model"],
                     **config["agent"])

    wandb.config.update(config)
    runner = MinibatchRlEvalWandb(algo=algo,
                                  agent=agent,
                                  sampler=sampler,
                                  n_steps=args.n_steps,
                                  affinity=dict(cuda_idx=cuda_idx),
                                  log_interval_steps=args.n_steps //
                                  args.num_logs,
                                  seed=args.seed,
                                  final_eval_only=args.final_eval_only,
                                  skip_init_eval=args.skip_init_eval)
    config = dict(game=game)
    name = "dqn_" + game
    log_dir = "logs"
    with logger_context(log_dir, run_ID, name, config, snapshot_mode="last"):
        runner.train()

    return None
def build_and_train(log_dir, game="traffic", run_ID=0, cuda_idx=None, eval=False, save_model='last', load_model_path=None, action_repeat=1, **kwargs):
    params = torch.load(load_model_path) if load_model_path else {}
    agent_state_dict = params.get('agent_state_dict')
    optimizer_state_dict = params.get('optimizer_state_dict')

    env_kwargs = dict(
        name=game,
        render=False,
        **kwargs
    )
    factory_method = make_wapper(
        TrafficEnv,
        [ActionRepeat, OneHotAction, TimeLimit],
        [dict(amount=action_repeat), dict(), dict(duration=1000 / action_repeat)])
    sampler = SerialSampler(
        EnvCls=factory_method,
        TrajInfoCls=AtariTrajInfo,  # default traj info + GameScore
        env_kwargs=env_kwargs,
        eval_env_kwargs=env_kwargs,
        batch_T=1,
        batch_B=1,
        max_decorrelation_steps=0,
        eval_n_envs=10,
        eval_max_steps=int(10e3),
        eval_max_trajectories=5,
    )
    algo = Dreamer(horizon=10, kl_scale=0.1, use_pcont=True, initial_optim_state_dict=optimizer_state_dict)
    agent = AtariDreamerAgent(train_noise=0.4, eval_noise=0, expl_type="epsilon_greedy",
                              expl_min=0.1, expl_decay=2000 / 0.3, initial_model_state_dict=agent_state_dict,
                              model_kwargs=dict(use_pcont=True))
    runner_cls = MinibatchRlEval if eval else MinibatchRl
    runner = runner_cls(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=5e6,
        log_interval_steps=1e3,
        affinity=dict(cuda_idx=cuda_idx),
    )
    config = dict(game=game)
    name = "dreamer_" + game
    with logger_context(log_dir, run_ID, name, config, snapshot_mode=save_model, override_prefix=True,
                        use_summary_writer=True):
        runner.train()
Example #30
def build_and_train(env_id="Cassie-v0", run_ID=0, cuda_idx=None, snapshot_file=None):

    if snapshot_file is None:
        initial_optim_state_dict = None
        initial_model_state_dict = None
    else:
        snapshot = torch.load(snapshot_file)
        initial_optim_state_dict = snapshot['optimizer_state_dict']
        initial_model_state_dict = snapshot['agent_state_dict']

    sampler = SerialSampler(
        EnvCls=gym_make,
        env_kwargs=dict(id=env_id,
            xml_file=get_full_path('resources/cassie.xml')),
        eval_env_kwargs=dict(id=env_id,
            xml_file=get_full_path('resources/cassie.xml')),
        batch_T=1,  # One time-step per sampler iteration.
        batch_B=1,  # One environment (i.e. sampler Batch dimension).
        max_decorrelation_steps=0,
        eval_n_envs=1,
        eval_max_steps=int(1000),
        eval_max_trajectories=50,
    )
    algo = SAC(
            initial_optim_state_dict=initial_optim_state_dict)
    agent = SacAgent(
            initial_model_state_dict=initial_model_state_dict)
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=1e6,
        log_interval_steps=5e4,
        affinity=dict(cuda_idx=cuda_idx),
    )
    other_param = dict(
            env_id=env_id,
            forward_reward_weight=0,
            shift_cost=True,
            cum_steps='1M')
    name = "sac_" + env_id
    log_dir = "Cassie_stand"
    with logger_context(log_dir, run_ID, name, other_param, snapshot_mode='last', use_summary_writer=True):
        runner.train()