def build_and_train(game="fruitbot", run_ID=0, cuda_idx=None, n_parallel=6): affinity = dict(cuda_idx=cuda_idx, workers_cpus=list(range(n_parallel)), alternating=True) env_args = dict(game=game, start_level=0, num_levels=1) # sampler = AlternatingSampler( # EnvCls=ProcgenEnv, # env_kwargs=env_args, # eval_env_kwargs=env_args, # batch_T=256, # One time-step per sampler iteration. # batch_B=12, # One environment (i.e. sampler Batch dimension). # max_decorrelation_steps=100, # # eval_n_envs=5, # # eval_max_steps=int(25e3), # # eval_max_trajectories=30 # ) # sampler = GpuSampler( # EnvCls=ProcgenEnv, # env_kwargs=env_args, # eval_env_kwargs=env_args, # batch_T=256, # One time-step per sampler iteration. # batch_B=12, # One environment (i.e. sampler Batch dimension). # max_decorrelation_steps=100, # # eval_n_envs=5, # # eval_max_steps=int(25e3), # # eval_max_trajectories=30 # ) # sampler = SerialSampler( EnvCls=ProcgenEnv, env_kwargs=env_args, eval_env_kwargs=env_args, batch_T=256, # One time-step per sampler iteration. batch_B=8, # One environment (i.e. sampler Batch dimension). max_decorrelation_steps=0, # eval_n_envs=2, # eval_max_steps=int(51e2), # eval_max_trajectories=5, ) algo = PPOC(clip_vf_loss=False, normalize_rewards=None) # Run with defaults. agent = Agent(model_kwargs={'option_size': 2}) runner = MinibatchRl( algo=algo, agent=agent, sampler=sampler, n_steps=1e6, log_interval_steps=1e3, affinity=affinity, # transfer=True, # transfer_iter=150, # log_traj_window=10 ) config = dict(game=game) name = "ppo_" + game log_dir = "example_2a_fruitbot" with logger_context(log_dir, run_ID, name, config): runner.train()
def build_and_train(game="doom_benchmark", run_ID=0, cuda_idx=None, n_parallel=-1, n_env=-1, n_timestep=-1, sample_mode=None): affinity = dict(cuda_idx=cuda_idx, workers_cpus=list(range(n_parallel))) gpu_cpu = "CPU" if cuda_idx is None else f"GPU {cuda_idx}" if sample_mode == "serial": Sampler = SerialSampler # (Ignores workers_cpus.) print(f"Using serial sampler, {gpu_cpu} for sampling and optimizing.") elif sample_mode == "cpu": Sampler = CpuSampler print(f"Using CPU parallel sampler (agent in workers), {gpu_cpu} for optimizing.") elif sample_mode == "gpu": Sampler = GpuSampler print(f"Using GPU parallel sampler (agent in master), {gpu_cpu} for sampling and optimizing.") elif sample_mode == "alternating": Sampler = AlternatingSampler affinity["workers_cpus"] += affinity["workers_cpus"] # (Double list) affinity["alternating"] = True # Sampler will check for this. print(f"Using Alternating GPU parallel sampler, {gpu_cpu} for sampling and optimizing.") # !!! # COMMENT: to use alternating sampler here we had to comment lines 126-127 in action_server.py # if "bootstrap_value" in self.samples_np.agent: # self.bootstrap_value_pair[alt][:] = self.agent.value(*agent_inputs_pair[alt]) # otherwise it crashes # !!! sampler = Sampler( EnvCls=VizdoomEnv, env_kwargs=dict(game=game), batch_T=n_timestep, batch_B=n_env, max_decorrelation_steps=0, ) algo = PPO(minibatches=1, epochs=1) agent = DoomLstmAgent() # Maybe AsyncRL could give better performance? # In the current version however PPO + AsyncRL does not seem to be working (not implemented) runner = MinibatchRl( algo=algo, agent=agent, sampler=sampler, n_steps=50e6, log_interval_steps=1e5, affinity=affinity, ) config = dict(game=game) name = "ppo_" + game + str(n_env) log_dir = "doom_ppo" with logger_context(log_dir, run_ID, name, config): runner.train()
def build_and_train( slot_affinity_code="0slt_1gpu_1cpu", log_dir="test", run_ID="0", config_key="ppo_16env", experiment_title="exp", snapshot_mode="none", snapshot_gap=None, ): affinity = affinity_from_code(slot_affinity_code) config = configs[config_key] variant = load_variant(log_dir) config = update_config(config, variant) # Hack that the first part of the log_dir matches the source of the model model_base_dir = config["pretrain"]["model_dir"] if model_base_dir is not None: raw_log_dir = log_dir.split(experiment_title)[-1].lstrip( "/") # get rid of ~/GitRepos/adam/rlpyt/data/local/<timestamp>/ model_sub_dir = raw_log_dir.split("/RlFromUl/")[ 0] # keep the UL part, which comes first config["agent"]["state_dict_filename"] = osp.join( model_base_dir, model_sub_dir, "run_0/params.pkl") pprint.pprint(config) sampler = AlternatingSampler( EnvCls=DmlabEnv, env_kwargs=config["env"], CollectorCls=GpuWaitResetCollector, # TrajInfoCls=AtariTrajInfo, # eval_env_kwargs=config["env"], # Same args! **config["sampler"]) algo = PPO(optim_kwargs=config["optim"], **config["algo"]) agent = DmlabPgLstmAlternatingAgent(model_kwargs=config["model"], **config["agent"]) runner = MinibatchRl(algo=algo, agent=agent, sampler=sampler, affinity=affinity, **config["runner"]) name = config["env"]["level"] if snapshot_gap is not None: snapshot_gap = int(snapshot_gap) with logger_context( log_dir, run_ID, name, config, snapshot_mode=snapshot_mode, snapshot_gap=snapshot_gap, ): runner.train()
def build_and_train(game="montezuma_revenge", run_ID=0, cuda_idx=None, n_parallel=6): affinity = dict(cuda_idx=cuda_idx, workers_cpus=list(range(n_parallel)), alternating=True) env_args = dict(id=game) # env_args[RLPYT_WRAPPER_KEY] = [ClipActionsWrapper] # sampler = AlternatingSampler( # EnvCls=AtariEnv, # TrajInfoCls=AtariTrajInfo, # env_kwargs=dict(game=game), # batch_T=64, # One time-step per sampler iteration. # batch_B=36, # One environment (i.e. sampler Batch dimension). # max_decorrelation_steps=1000, # # eval_n_envs=5, # # eval_max_steps=int(25e3), # # eval_max_trajectories=30 # ) # sampler = SerialSampler( EnvCls=AtariEnv, TrajInfoCls=AtariTrajInfo, env_kwargs=dict(game=game), batch_T=256, # One time-step per sampler iteration. batch_B=8, # One environment (i.e. sampler Batch dimension). max_decorrelation_steps=1000, # eval_n_envs=2, # eval_max_steps=int(51e2), # eval_max_trajectories=5, ) # algo = PPO(clip_vf_loss=False, normalize_rewards=None) # Run with defaults. algo = A2OC(normalize_rewards=None) agent = AtariOcAgent(model_kwargs={'option_size': 4}) runner = MinibatchRl( algo=algo, agent=agent, sampler=sampler, n_steps=1e6, log_interval_steps=1e3, affinity=affinity, # transfer=True, # transfer_iter=150, # log_traj_window=10 ) config = dict(game=game) name = "ppo_" + game log_dir = "example_2a_atari" with logger_context(log_dir, run_ID, name, config): runner.train()
def build_and_train(game="pong", run_ID=0, cuda_idx=None, sample_mode="serial", n_parallel=2): affinity = dict(cuda_idx=cuda_idx, workers_cpus=list(range(n_parallel))) gpu_cpu = "CPU" if cuda_idx is None else f"GPU {cuda_idx}" if sample_mode == "serial": Sampler = SerialSampler # (Ignores workers_cpus.) print(f"Using serial sampler, {gpu_cpu} for sampling and optimizing.") elif sample_mode == "cpu": Sampler = CpuSampler print( f"Using CPU parallel sampler (agent in workers), {gpu_cpu} for optimizing." ) elif sample_mode == "gpu": Sampler = GpuSampler print( f"Using GPU parallel sampler (agent in master), {gpu_cpu} for sampling and optimizing." ) elif sample_mode == "alternating": Sampler = AlternatingSampler affinity["workers_cpus"] += affinity["workers_cpus"] # (Double list) affinity["alternating"] = True # Sampler will check for this. print( f"Using Alternating GPU parallel sampler, {gpu_cpu} for sampling and optimizing." ) sampler = Sampler( EnvCls=AtariEnv, TrajInfoCls=AtariTrajInfo, env_kwargs=dict(game=game), batch_T=5, # 5 time-steps per sampler iteration. batch_B=16, # 16 parallel environments. max_decorrelation_steps=400, ) algo = A2C() # Run with defaults. agent = AtariFfAgent() runner = MinibatchRl( algo=algo, agent=agent, sampler=sampler, n_steps=50e6, log_interval_steps=1e5, affinity=affinity, ) config = dict(game=game) name = "a2c_" + game log_dir = "example_3" with logger_context(log_dir, run_ID, name, config): runner.train()
def build_and_train():
    p = psutil.Process()
    cpus = p.cpu_affinity()
    affinity = dict(cuda_idx=None, master_cpus=cpus,
                    workers_cpus=list([x] for x in cpus), set_affinity=True)
    sampler = CpuSampler(
        EnvCls=_make_env,
        env_kwargs=dict(rank=0),
        max_decorrelation_steps=0,
        batch_T=6000,
        batch_B=len(cpus),  # One parallel environment per available CPU.
    )
    model_kwargs = dict(model_kwargs=dict(hidden_sizes=[256, 256]))
    ppo_config = {
        "discount": 0.98,
        "entropy_loss_coeff": 0.01,
        "learning_rate": 0.00025,
        "value_loss_coeff": 0.5,
        "clip_grad_norm": 0.5,
        "minibatches": 40,
        "gae_lambda": 0.95,
        "ratio_clip": 0.2,
        "epochs": 4,
    }
    algo = PPO(**ppo_config)
    agent = MujocoFfAgent(**model_kwargs)
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=int(60e6),
        log_interval_steps=int(1e6),
        affinity=affinity,
    )
    config = dict(rank=0, env_id='picking')
    name = "ppo_rlpyt_pushing"
    log_dir = os.path.join(os.path.dirname(__file__), name)
    with logger_context(log_dir, 0, name, config, use_summary_writer=True,
                        snapshot_mode='all'):
        runner.train()
def run(self):
    config = self.getConfig()
    sampler = CpuSampler(EnvCls=make_env,
                         env_kwargs={"num_levels": config["num_levels"]},
                         batch_T=256,
                         batch_B=8,
                         max_decorrelation_steps=0)
    optim_args = dict(weight_decay=config["l2_penalty"]) if "l2_penalty" in config else None
    algo = PPO(discount=config["discount"],
               entropy_loss_coeff=config["entropy_bonus"],
               gae_lambda=config["lambda"],
               minibatches=config["minibatches_per_epoch"],
               epochs=config["epochs_per_rollout"],
               ratio_clip=config["ppo_clip"],
               learning_rate=config["learning_rate"],
               normalize_advantage=True,
               optim_kwargs=optim_args)
    agent = ImpalaAgent(model_kwargs={
        "in_channels": config["in_channels"],
        "out_channels": config["out_channels"],
        "hidden_size": config["hidden_size"],
    })
    affinity = dict(cuda_idx=0, workers_cpus=list(range(config["workers"])))
    runner = MinibatchRl(algo=algo,
                         agent=agent,
                         sampler=sampler,
                         n_steps=25e6,
                         log_interval_steps=500,
                         affinity=affinity,
                         seed=42069)
    log_dir = "./logs"
    name = config["name"]
    run_ID = name
    with logger_context(log_dir, run_ID, name, config, use_summary_writer=True):
        runner.train()
    torch.save(agent.state_dict(), "./" + name + ".pt")
    wandb.save("./" + name + ".pt")
def build_and_train():
    p = psutil.Process()
    cpus = p.cpu_affinity()
    affinity = dict(cuda_idx=None, master_cpus=cpus,
                    workers_cpus=list([x] for x in cpus), set_affinity=True)
    sampler = CpuSampler(EnvCls=_make_env,
                         env_kwargs=dict(rank=0),
                         batch_T=1,
                         batch_B=4,
                         max_decorrelation_steps=0,
                         CollectorCls=CpuResetCollector)
    algo = SAC(batch_size=256,
               min_steps_learn=10000,
               replay_size=1000000,
               replay_ratio=256 / 4,
               target_update_interval=1,
               target_entropy=-9,
               target_update_tau=0.01,
               learning_rate=0.00025,
               action_prior="uniform",
               reward_scale=1,
               reparameterize=True,
               clip_grad_norm=1e9,
               n_step_return=1,
               updates_per_sync=1,
               bootstrap_timelimit=False)  # Remaining arguments at their defaults.
    agent = SacAgent(model_kwargs={'hidden_sizes': [256, 256]})
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=10000,
        affinity=affinity,
    )
    config = dict(env_id='picking')
    name = "sac_rlpyt_picking"
    log_dir = os.path.join(os.path.dirname(__file__), "sac_rlpyt_picking")
    with logger_context(log_dir, 0, name, config, use_summary_writer=True,
                        snapshot_mode='all'):
        runner.train()
def build_and_train(slot_affinity_code="0slt_0gpu_4cpu_4cpr", log_dir="test", run_ID="0", config_key="ppo_ul_16env"): affinity = affinity_from_code(slot_affinity_code) config = configs[config_key] # variant = load_variant(log_dir) # config = update_config(config, variant) # config["sampler"]["batch_B"] = 4 # config["sampler"]["batch_T"] = 5 # config["runner"]["log_interval_steps"] = 100 # config["runner"]["n_steps"] = 1000 config["algo"]["ul_update_schedule"] = "constant_1" config["algo"]["min_steps_rl"] = 1e3 config["algo"]["min_steps_ul"] = 200 config["algo"]["max_steps_ul"] = 20e6 config["model"]["stop_conv_grad"] = True config["sampler"]["max_decorrelation_steps"] = 0 config["sampler"]["batch_B"] = 3 config["sampler"]["batch_T"] = 20 config["algo"]["ul_pri_alpha"] = 1. config["algo"]["ul_pri_n_step_return"] = 10 config["algo"]["ul_replay_size"] = 900 pprint.pprint(config) sampler = SerialSampler( EnvCls=AtariEnv84, env_kwargs=config["env"], CollectorCls=CpuResetCollector, TrajInfoCls=AtariTrajInfo, eval_env_kwargs=config["env"], # Same args! **config["sampler"]) algo = PpoUl(optim_kwargs=config["optim"], **config["algo"]) agent = AtariPgRlWithUlAgent(model_kwargs=config["model"], **config["agent"]) runner = MinibatchRl(algo=algo, agent=agent, sampler=sampler, affinity=affinity, **config["runner"]) name = config["env"]["game"] with logger_context(log_dir, run_ID, name, config): runner.train()
def build_and_train(run_ID=0, cuda_idx=None, n_parallel=2, serial_sampling=False):
    affinity = dict(cuda_idx=cuda_idx, workers_cpus=list(range(n_parallel)))
    device = "CPU" if cuda_idx is None else f"GPU {cuda_idx}"
    if serial_sampling:
        Sampler = SerialSampler  # Ignores workers_cpus.
        print(f"Using serial sampler w/ {device} for action sampling and optimization")
    else:
        Sampler = CpuSampler if cuda_idx is None else GpuSampler
        print(f"Using parallel sampler w/ {device} for action sampling and optimization")
    game = "pong"
    sampler = Sampler(
        EnvCls=AtariEnv,
        TrajInfoCls=AtariTrajInfo,  # Default trajectory info + GameScore.
        env_kwargs=dict(game=game),
        eval_env_kwargs=dict(game=game),
        batch_T=1,
        batch_B=8,  # Number of games running in parallel.
        max_decorrelation_steps=0)
    # ### ALGO GOES HERE ###
    algo = None  # Placeholder: instantiate the (DQN-style) algorithm here.
    agent = AtariDqnAgent()
    runner = MinibatchRl(algo=algo,
                         agent=agent,
                         sampler=sampler,
                         n_steps=50e6,
                         log_interval_steps=1e3,
                         affinity=affinity)
    config = dict(game=game)
    name = "rp_attack_dqn_" + game
    log_dir = "rp_attack"
    with logger_context(log_dir, run_ID, name, config, snapshot_mode="last"):
        runner.train()
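# --- Illustrative way to fill the "ALGO GOES HERE" placeholder (not from the source) ---
# The snippet above leaves `algo = None`, which would fail inside MinibatchRl.  Since
# the agent is AtariDqnAgent, a minimal sketch assuming plain rlpyt DQN is intended
# (the original does not say which DQN variant it used) would replace the placeholder
# inside build_and_train with something like:
from rlpyt.algos.dqn.dqn import DQN

algo = DQN(
    min_steps_learn=1e4,   # Fill the replay buffer before learning starts.
    learning_rate=2.5e-4,  # Common Atari DQN learning rate.
)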
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)
    sampler = AlternatingSampler(EnvCls=gym_make,
                                 env_kwargs=config["env"],
                                 **config["sampler"])
    algo = A2C(optim_kwargs=config["optim"], **config["algo"])
    agent = MujocoFfAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRl(algo=algo, agent=agent, sampler=sampler,
                         affinity=affinity, **config["runner"])
    name = config["env"]["id"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)
    sampler = AlternatingSampler(EnvCls=ProcgenEnv,
                                 env_kwargs=config["env"],
                                 CollectorCls=GpuResetCollector,
                                 **config["sampler"])
    algo = PPO(optim_kwargs=config["optim"], **config["algo"])
    agent = ProcgenFfAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRl(algo=algo, agent=agent, sampler=sampler,
                         affinity=affinity, **config["runner"])
    name = config["env"]["game"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = get_affinity(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)
    sampler = SerialSampler(EnvCls=gym_make,
                            env_kwargs=config["env"],
                            CollectorCls=ResetCollector,
                            **config["sampler"])
    algo = PPO(optim_kwargs=config["optim"], **config["algo"])
    agent = MujocoFfAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRl(algo=algo, agent=agent, sampler=sampler,
                         affinity=affinity, **config["runner"])
    name = "ppo_" + config["env"]["id"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    # variant = load_variant(log_dir)
    # config = update_config(config, variant)
    sampler = CpuSampler(EnvCls=AtariEnv,
                         env_kwargs=config["env"],
                         CollectorCls=EpisodicLivesWaitResetCollector,
                         **config["sampler"])
    algo = A2C(optim_kwargs=config["optim"], **config["algo"])
    agent = AtariLstmAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRl(algo=algo, agent=agent, sampler=sampler,
                         affinity=affinity, **config["runner"])
    name = config["env"]["game"] + str(config["algo"]["entropy_loss_coeff"])
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key): affinity = affinity_from_code(slot_affinity_code) config = configs[config_key] variant = load_variant(log_dir) config = update_config(config, variant) sampler = CpuSampler(EnvCls=gym_make, env_kwargs=config["env"], CollectorCls=CpuResetCollector, **config["sampler"]) algo = DDPG(optim_kwargs=config["optim"], **config["algo"]) agent = DdpgAgent(**config["agent"]) runner = MinibatchRl(algo=algo, agent=agent, sampler=sampler, affinity=affinity, **config["runner"]) name = config["env"]["id"] with logger_context(log_dir, run_ID, name, config): runner.train()
def build_and_train(run_id=0, greedy_eval=False, test=True, test_date=None):
    sampler = BatchedEpisodicSampler(
        EnvCls=MyEnv,
        env_kwargs=dict(),
        batch_T=500,
        batch_B=64,
    )
    log_dir = "data/rl_example_3/"
    init_agent = None
    if test:
        data = load_params(log_dir, run_id, test_date)
        init_agent = data['agent_state_dict']
    runner = MinibatchRl(
        algo=PPO(entropy_loss_coeff=0., learning_rate=3e-4),
        agent=AgentPgDiscrete(
            greedy_eval,
            model_kwargs={
                'policy_hidden_sizes': [64, 64],
                'value_hidden_sizes': [64, 64],
            },
            initial_model_state_dict=init_agent,
        ),
        sampler=sampler,
        n_steps=int(400 * sampler.batch_size),
        log_interval_steps=int(10 * sampler.batch_size),
    )
    if test:
        runner.startup()
        sampler.obtain_samples(0, 'eval')
        obs = sampler.samples_np.env.observation
        plot_obs(obs)
    else:
        with logger_context("{}{}".format(
                log_dir, datetime.datetime.today().strftime("%Y%m%d_%H%M")),
                run_id, 'Reacher2D', snapshot_mode="last",
                use_summary_writer=True, override_prefix=True):
            runner.train()
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = get_affinity(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)
    sampler = GpuParallelSampler(EnvCls=AtariEnv,
                                 env_kwargs=config["env"],
                                 CollectorCls=WaitResetCollector,
                                 TrajInfoCls=AtariTrajInfo,
                                 **config["sampler"])
    algo = PPO(optim_kwargs=config["optim"], **config["algo"])
    agent = AtariLstmAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRl(algo=algo, agent=agent, sampler=sampler,
                         affinity=affinity, **config["runner"])
    name = config["env"]["game"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)
    env = IsaacGymEnv(config['env']['task'])  # Make env.
    import torch.nn as nn
    # Replace string with proper activation.
    config["model"]["hidden_nonlinearity"] = getattr(
        nn, config["model"]["hidden_nonlinearity"])
    sampler = IsaacSampler(env, **config["sampler"])
    algo = PPOC(optim_kwargs=config["optim"], **config["algo"])
    agent = MujocoFfOcAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRl(algo=algo, agent=agent, sampler=sampler,
                         affinity=affinity, **config["runner"])
    name = "ppo_nv_" + config["env"]["task"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
def build_and_train(log_dir, run_ID, config_key):
    # affinity = affinity_from_code(run_slot_affinity_code)
    slot_affinity_code = prepend_run_slot(0, affinity_code)
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    sampler = CpuSampler(EnvCls=make_env,
                         env_kwargs={},
                         CollectorCls=CpuResetCollector,
                         **config["sampler"])
    algo = PPO(optim_kwargs=config["optim"], **config["algo"])
    agent = MultiFfAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRl(algo=algo, agent=agent, sampler=sampler,
                         affinity=affinity, **config["runner"])
    name = config["env"]["id"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key): affinity = affinity_from_code(slot_affinity_code) config = configs[config_key] variant = load_variant(log_dir) config = update_config(config, variant) config["algo_name"] = 'A2OC' env = BatchPOMDPEnv(batch_B=config["sampler"]["batch_B"], **config["env"]) config["algo"]["discount"] = env.discount sampler = BatchPOMDPSampler(env=env, **config["sampler"]) algo = A2OC(optim_kwargs=config["optim"], **config["algo"]) agent = PomdpOcFfAgent(model_kwargs=config["model"], **config["agent"]) runner = MinibatchRl(algo=algo, agent=agent, sampler=sampler, affinity=affinity, **config["runner"]) name = config["env"]["id"] with logger_context(log_dir, run_ID, name, config): runner.train()
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)
    sampler = CpuParallelSampler(EnvCls=AtariEnv,
                                 env_kwargs=config["env"],
                                 CollectorCls=EpisodicLivesWaitResetCollector,
                                 TrajInfoCls=AtariTrajInfo,
                                 **config["sampler"])
    algo = A2C(optim_kwargs=config["optim"], **config["algo"])
    agent = AtariFfAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRl(algo=algo, agent=agent, sampler=sampler,
                         affinity=affinity, **config["runner"])
    name = config["env"]["game"]
    with logger_context(log_dir, run_ID, name, config):  # Might have to flatten config.
        runner.train()
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    # import ipdb; ipdb.set_trace()
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)
    sampler = CpuSampler(EnvCls=gym_make,
                         env_kwargs=config["env"],
                         CollectorCls=CpuResetCollector,
                         **config["sampler"])
    algo = PPO(optim_kwargs=config["optim"], **config["algo"])
    agent = MujocoFfAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRl(algo=algo, agent=agent, sampler=sampler,
                         affinity=affinity, **config["runner"])
    name = config["env"]["id"]
    with logger_context(log_dir, run_ID, name, config, snapshot_mode="all"):
        runner.train()
def debug_build_and_train(game="pong", run_ID=0, cuda_idx=0): config = configs['ernbw'] config['runner']['log_interval_steps'] = 1e5 config['env']['game'] = game config["eval_env"]["game"] = config["env"]["game"] config["algo"]["n_step_return"] = 5 config["algo"]["prioritized_replay"] = True config["algo"]["min_steps_learn"] = 1e3 wandb.config.update(config) sampler = SerialSampler( EnvCls=AtariEnv, TrajInfoCls=AtariTrajInfo, # default traj info + GameScore env_kwargs=dict(game=game), eval_env_kwargs=dict(game=game), batch_T=4, # Four time-steps per sampler iteration. batch_B=1, max_decorrelation_steps=0, eval_n_envs=10, eval_max_steps=int(10e3), eval_max_trajectories=5, ) algo = PizeroCategoricalDQN(optim_kwargs=config["optim"], **config["algo"]) # Run with defaults. agent = AtariCatDqnAgent(ModelCls=PizeroCatDqnModel, model_kwargs=config["model"], **config["agent"]) runner = MinibatchRl( algo=algo, agent=agent, sampler=sampler, n_steps=50e6, log_interval_steps=1e3, affinity=dict(cuda_idx=cuda_idx), ) config = dict(game=game) name = "dqn_" + game log_dir = "example_1" with logger_context(log_dir, run_ID, name, config, snapshot_mode="last"): runner.train()
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key): affinity = affinity_from_code(slot_affinity_code) config = configs[config_key] variant = load_variant(log_dir) config = update_config(config, variant) config["algo_name"] = 'A2OC_D_RNN' t_env = pomdp_interface(**config["env"]) config["algo"]["discount"] = t_env.discount sampler = GpuSampler(EnvCls=pomdp_interface, env_kwargs=config["env"], **config["sampler"]) algo = A2OC(optim_kwargs=config["optim"], **config["algo"]) agent = PomdpOcRnnAgent(model_kwargs=config["model"], **config["agent"]) runner = MinibatchRl(algo=algo, agent=agent, sampler=sampler, affinity=affinity, **config["runner"]) name = config["env"]["id"] with logger_context(log_dir, run_ID, name, config): runner.train()
def build_and_train( slot_affinity_code="0slt_1gpu_1cpu", log_dir="test", run_ID="0", config_key="ppo_ul_16env", snapshot_mode="none", snapshot_gap=None, ): affinity = affinity_from_code(slot_affinity_code) config = configs[config_key] variant = load_variant(log_dir) config = update_config(config, variant) pprint.pprint(config) sampler = AlternatingSampler( EnvCls=DmlabEnv, env_kwargs=config["env"], CollectorCls=GpuWaitResetCollector, **config["sampler"] ) algo = PpoUl(optim_kwargs=config["optim"], **config["algo"]) agent = DmlabPgLstmAlternatingAgent(model_kwargs=config["model"], **config["agent"]) runner = MinibatchRl( algo=algo, agent=agent, sampler=sampler, affinity=affinity, **config["runner"] ) name = config["env"]["level"] if snapshot_gap is not None: snapshot_gap = int(snapshot_gap) with logger_context( log_dir, run_ID, name, config, snapshot_mode=snapshot_mode, snapshot_gap=snapshot_gap, ): runner.train()
def build_and_train(game="pong", run_ID=0, cuda_idx=None): sampler = SerialSampler( # EnvCls=MyEnv, # env_kwargs=dict(), # batch_T=4, # Four time-steps per sampler iteration. # batch_B=1, # max_decorrelation_steps=0, # eval_n_envs=10, # eval_env_kwargs=dict(), # eval_max_steps=int(10e3), # eval_max_trajectories=5, EnvCls=CanvasEnv, env_kwargs=dict(), batch_T=1, # 5 time-steps per sampler iteration. batch_B=16, # 16 parallel environments. max_decorrelation_steps=400, ) algo = PPO() agent = CategoricalPgAgent( ModelCls=MyModel, model_kwargs=dict(image_shape=(1, CANVAS_WIDTH, CANVAS_WIDTH), output_size=N_ACTIONS), initial_model_state_dict=None, ) runner = MinibatchRl( algo=algo, agent=agent, sampler=sampler, n_steps=50e6, log_interval_steps=1e3, affinity=dict(cuda_idx=cuda_idx), ) config = dict() name = "dqn_" + game log_dir = "example_1" with logger_context(log_dir, run_ID, name, config, snapshot_mode="last"): runner.train()
def findOptimalAgent(reward, run_ID=0):
    """
    Find the optimal agent for the MDP (see Config for specification)
    under a custom reward function, using rlpyt's implementation of A2C.
    """
    cpus = list(range(C.N_PARALLEL))
    affinity = dict(cuda_idx=C.CUDA_IDX, workers_cpus=cpus)
    sampler = SerialSampler(EnvCls=rlpyt_make,
                            env_kwargs=dict(id=C.ENV, reward=reward),
                            batch_T=C.BATCH_T,
                            batch_B=C.BATCH_B,
                            max_decorrelation_steps=400,
                            eval_env_kwargs=dict(id=C.ENV),
                            eval_n_envs=5,
                            eval_max_steps=2500)
    algo = A2C(discount=C.DISCOUNT,
               learning_rate=C.LR,
               value_loss_coeff=C.VALUE_LOSS_COEFF,
               entropy_loss_coeff=C.ENTROPY_LOSS_COEFF)
    agent = CategoricalPgAgent(AcrobotNet)
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=C.N_STEPS,
        log_interval_steps=C.LOG_STEP,
        affinity=affinity,
    )
    name = "a2c_" + C.ENV.lower()
    log_dir = name
    with logger_context(log_dir, run_ID, name, snapshot_mode='last',
                        override_prefix=True):
        runner.train()
    return agent
def build_and_train():
    affinity = dict(cuda_idx=None, workers_cpus=list(range(15)))
    sampler = CpuSampler(
        EnvCls=_make_env,
        env_kwargs=dict(rank=0),
        batch_T=6000,
        batch_B=20,
    )
    algo = SAC(bootstrap_timelimit=False)  # Run with defaults.
    agent = SacAgent()
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=600,
        affinity=affinity,
    )
    config = dict(env_id='reaching')
    name = "sac_reaching"
    log_dir = os.path.join(os.path.dirname(__file__), "example")
    with logger_context(log_dir, 0, name, config, use_summary_writer=True):
        runner.train()
def build_and_train(slot_affinity_code, log_dir, run_ID):
    # (Or load from a central store of configs.)
    config = dict(
        env=dict(game="pong"),
        algo=dict(learning_rate=7e-4),
        sampler=dict(batch_B=16),
    )
    affinity = affinity_from_code(slot_affinity_code)
    variant = load_variant(log_dir)
    # global config
    config = update_config(config, variant)
    sampler = GpuSampler(
        EnvCls=AtariEnv,
        TrajInfoCls=AtariTrajInfo,
        env_kwargs=config["env"],
        CollectorCls=GpuWaitResetCollector,
        batch_T=5,
        # batch_B=16,  # Get from config.
        max_decorrelation_steps=400,
        **config["sampler"]
    )
    algo = A2C(**config["algo"])  # Run with defaults.
    agent = AtariFfAgent()
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=1e5,
        affinity=affinity,
    )
    name = "a2c_" + config["env"]["game"]
    # log_dir = "example_6"
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
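# --- Illustrative launch script for the variant-based functions above (not from the source) ---
# Several of the functions here take (slot_affinity_code, log_dir, run_ID[, config_key])
# and read a variant from log_dir via load_variant(); in rlpyt they are normally started
# by a separate launcher rather than called directly.  A minimal sketch using rlpyt's
# standard launching utilities; the script path, experiment title, and swept values are
# placeholders, not taken from the original scripts:
from rlpyt.utils.launching.affinity import encode_affinity
from rlpyt.utils.launching.exp_launcher import run_experiments
from rlpyt.utils.launching.variant import VariantLevel, make_variants

# Hardware to split among the runs (encoded into slot_affinity_code strings).
affinity_code = encode_affinity(n_cpu_core=4, n_gpu=1, cpu_per_run=2)

# One variant level: sweep the learning rate; dir_names label the per-variant log dirs.
variant_levels = [VariantLevel(
    keys=[("algo", "learning_rate")],
    values=[(7e-4,), (1e-4,)],
    dir_names=["lr7e-4", "lr1e-4"],
)]
variants, log_dirs = make_variants(*variant_levels)

run_experiments(
    script="path/to/train_script.py",  # Hypothetical path to one of the scripts above.
    affinity_code=affinity_code,
    experiment_title="a2c_pong_sweep",  # Hypothetical experiment title.
    runs_per_setting=1,
    variants=variants,
    log_dirs=log_dirs,
)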
def build_and_train(run_id=0):
    sampler = SerialSampler(
        EnvCls=MyEnv,
        env_kwargs=dict(),
        eval_env_kwargs=dict(),
        batch_T=horizon,
        batch_B=64,
        max_decorrelation_steps=0,
        eval_n_envs=64,
        eval_max_steps=int(1e6),
        eval_max_trajectories=64,
    )
    algo = PPO(entropy_loss_coeff=0., learning_rate=3e-4)
    agent = GaussianPgAgent(
        ModelCls=MujocoFfModel,
        model_kwargs=model_params,
    )
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=int(400 * horizon * 64),
        log_interval_steps=int(10 * horizon * 64),
    )
    log_params = dict()
    log_dir = "data/rl_example_1/{}".format(
        datetime.datetime.today().strftime("%Y%m%d_%H%M"))
    with logger_context(log_dir, run_id, 'Reacher2D', log_params=log_params,
                        snapshot_mode="last", use_summary_writer=True,
                        override_prefix=True):
        runner.train()