def build_and_train(slot_affinity_code, log_dir, run_ID, config_key): affinity = affinity_from_code(slot_affinity_code) config = configs[config_key] variant = load_variant(log_dir) config = update_config(config, variant) config["eval_env"]["game"] = config["env"]["game"] sampler = GpuSampler( EnvCls=AtariEnv, env_kwargs=config["env"], CollectorCls=WaitResetCollector, TrajInfoCls=AtariTrajInfo, eval_env_kwargs=config["eval_env"], **config["sampler"] ) algo = DQN(optim_kwargs=config["optim"], **config["algo"]) agent = AtariDqnAgent(model_kwargs=config["model"], **config["agent"]) runner = MinibatchRlEval( algo=algo, agent=agent, sampler=sampler, affinity=affinity, **config["runner"] ) name = config["env"]["game"] with logger_context(log_dir, run_ID, name, config): runner.train()
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    print('Variant', variant)
    config = update_config(config, variant)
    sampler = SerialSampler(
        EnvCls=DMControlEnv,
        env_kwargs=config["env"],
        CollectorCls=CpuResetCollector,
        eval_env_kwargs=config["eval_env"],
        **config["sampler"]
    )
    algo = SAC(optim_kwargs=config["optim"], **config["algo"])
    agent = SacAgent(**config["agent"])
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"]
    )
    name = "sac_{}_{}".format(config['env']['domain'], config['env']['task'])
    with logger_context(log_dir, run_ID, name, log_params=config, snapshot_mode='last'):
        runner.train()
def build_and_train(game="pong", run_ID=0, cuda_idx=None): sampler = SerialSampler( EnvCls=AtariEnv, TrajInfoCls=AtariTrajInfo, # default traj info + GameScore env_kwargs=dict(game=game), eval_env_kwargs=dict(game=game), batch_T=4, # Four time-steps per sampler iteration. batch_B=1, max_decorrelation_steps=0, eval_n_envs=10, eval_max_steps=int(10e3), eval_max_trajectories=5, ) algo = DQN(min_steps_learn=1e3) # Run with defaults. agent = AtariDqnAgent() runner = MinibatchRlEval( algo=algo, agent=agent, sampler=sampler, n_steps=50e6, log_interval_steps=1e3, affinity=dict(cuda_idx=cuda_idx), ) config = dict(game=game) name = "dqn_" + game #log_dir = "example_1" log_dir = get_outputs_path() with logger_context(log_dir, run_ID, name, config, snapshot_mode="last"): runner.train()
def build_and_train(cfg, game="ftwc", run_ID=0): #GVS NOTE: for ftwc/qait ?use CpuWaitResetCollector (or CpuResetCollector) sampler = SerialSampler( EnvCls=AtariEnv, TrajInfoCls=AtariTrajInfo, # default traj info + GameScore env_kwargs=dict(game=game), eval_env_kwargs=dict(game=game), batch_T=4, # Four time-steps per sampler iteration. batch_B=1, max_decorrelation_steps=0, eval_n_envs=10, eval_max_steps=int(10e2), eval_max_trajectories=5, ) algo = DQN(min_steps_learn=1e2) # Run with defaults. agent = AtariDqnAgent() runner = MinibatchRlEval( algo=algo, agent=agent, sampler=sampler, n_steps=50e6, log_interval_steps=1e3, affinity=dict(cuda_idx=cfg.cuda_idx), ) config = dict(game=game) name = "dqn_" + game log_dir = "ftwc" with logger_context(log_dir, run_ID, name, config, snapshot_mode="last"): runner.train()
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key): affinity = affinity_from_code(slot_affinity_code) config = configs[config_key] variant = load_variant(log_dir) config = update_config(config, variant) # config["eval_env"]["id"] = config["env"]["id"] sampler = SerialSampler( EnvCls=gym_make, env_kwargs=config["env"], CollectorCls=CpuResetCollector, eval_env_kwargs=config["env"], **config["sampler"] ) algo = SAC(optim_kwargs=config["optim"], **config["algo"]) agent = SacAgent(**config["agent"]) runner = MinibatchRlEval( algo=algo, agent=agent, sampler=sampler, affinity=affinity, **config["runner"] ) name = "sac_" + config["env"]["id"] with logger_context(log_dir, run_ID, name, config): runner.train()
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key): affinity = affinity_from_code(slot_affinity_code) config = configs[config_key] variant = load_variant(log_dir) config = update_config(config, variant) eval_env_config = config["env"].copy() eval_env_config["start_level"] = config["env"]["num_levels"] + 100 eval_env_config["num_levels"] = 100 sampler = GpuSampler(EnvCls=make, env_kwargs=config["env"], CollectorCls=GpuResetCollector, eval_env_kwargs=eval_env_config, **config["sampler"]) if config["checkpoint"]: model_state_dict = torch.load(config["checkpoint"]) print("Loaded.") else: model_state_dict = None algo = PPO_AUG_VAE(optim_kwargs=config["optim"], **config["algo"]) agent = RADPgVaeAgent(ModelCls=RadVaePolicy, model_kwargs=config["model"], initial_model_state_dict=model_state_dict, **config["agent"]) runner = MinibatchRlEval(algo=algo, agent=agent, sampler=sampler, affinity=affinity, **config["runner"]) name = config["env"]["id"] with logger_context(log_dir, run_ID, name, config, snapshot_mode='last'): runner.train()
def build_and_train(game="pong", run_ID=0, cuda_idx=None, n_parallel=2): config = dict( env=dict(game=game), algo=dict(batch_size=128), sampler=dict(batch_T=2, batch_B=32), ) sampler = GpuSampler( EnvCls=AtariEnv, env_kwargs=dict(game=game), CollectorCls=GpuWaitResetCollector, eval_env_kwargs=dict(game=game), max_decorrelation_steps=0, eval_n_envs=10, eval_max_steps=int(10e3), eval_max_trajectories=5, # batch_T=4, # Get from config. # batch_B=1, **config[ "sampler"] # More parallel environments for batched forward-pass. ) algo = DQN(**config["algo"]) # Run with defaults. agent = AtariDqnAgent() runner = MinibatchRlEval( algo=algo, agent=agent, sampler=sampler, n_steps=50e6, log_interval_steps=1e3, affinity=dict(cuda_idx=cuda_idx, workers_cpus=list(range(n_parallel))), ) name = "dqn_" + game log_dir = "example_5" with logger_context(log_dir, run_ID, name, config): runner.train()
def build_and_train(level="nav_maze_random_goal_01", run_ID=0, cuda_idx=None): sampler = SerialSampler( EnvCls=DeepmindLabEnv, env_kwargs=dict(level=level), eval_env_kwargs=dict(level=level), batch_T=4, # Four time-steps per sampler iteration. batch_B=1, max_decorrelation_steps=0, eval_n_envs=5, eval_max_steps=int(10e3), eval_max_trajectories=5, ) algo = DQN(min_steps_learn=1e3) # Run with defaults. agent = AtariDqnAgent() runner = MinibatchRlEval( algo=algo, agent=agent, sampler=sampler, n_steps=50e6, log_interval_steps=1e5, affinity=dict(cuda_idx=cuda_idx), ) config = dict(level=level) name = "lab_dqn" log_dir = "lab_example_1" with logger_context(log_dir, run_ID, name, config, snapshot_mode="last"): runner.train()
def build_and_train( slot_affinity_code="0slt_1gpu_1cpu", log_dir="test", run_ID="0", config_key="scaled_ddqn_ul", ): affinity = affinity_from_code(slot_affinity_code) config = configs[config_key] variant = load_variant(log_dir) config = update_config(config, variant) pprint.pprint(config) sampler = SerialSampler( EnvCls=AtariEnv84, env_kwargs=config["env"], CollectorCls=CpuResetCollector, TrajInfoCls=AtariTrajInfo, eval_env_kwargs=config["env"], # Same args! **config["sampler"]) algo = DqnUl(optim_kwargs=config["optim"], **config["algo"]) agent = AtariDqnAgent(model_kwargs=config["model"], **config["agent"]) runner = MinibatchRlEval(algo=algo, agent=agent, sampler=sampler, affinity=affinity, **config["runner"]) name = config["env"]["game"] with logger_context(log_dir, run_ID, name, config): runner.train()
def build_and_train(env_id="Hopper-v3", run_ID=0, cuda_idx=None): sampler = SerialSampler( EnvCls=gym_make, env_kwargs=dict(id=env_id), eval_env_kwargs=dict(id=env_id), batch_T=50, # One time-step per sampler iteration. batch_B=1, # One environment (i.e. sampler Batch dimension). max_decorrelation_steps=0, eval_n_envs=2, eval_max_steps=int(51e3), eval_max_trajectories=200, ) # The cost function for InvertedPendulumBulletEnv def obs_cost_fn(x): target = torch.FloatTensor([0,0,1,0,0]) c = (x - target)**2 c = -c.sum(dim=1) return -c.exp() algo = GP_Mlp(obs_cost_fn=obs_cost_fn) # Run with defaults. agent = GP_MlpAgent() runner = MinibatchRlEval( algo=algo, agent=agent, sampler=sampler, n_steps=1e6, log_interval_steps=200, affinity=dict(cuda_idx=cuda_idx), ) config = dict(env_id=env_id) name = "gp_mlp_" + env_id log_dir = "example_1" with logger_context(log_dir, run_ID, name, config, snapshot_mode='last'): runner.train()
def build_and_train(game="academy_empty_goal_close", run_ID=0, cuda_idx=None): sampler = SerialSampler( EnvCls=create_single_football_env, env_kwargs=dict(game=game), eval_env_kwargs=dict(game=game), batch_T=4, # Four time-steps per sampler iteration. batch_B=1, max_decorrelation_steps=0, eval_n_envs=10, eval_max_steps=int(10e3), eval_max_trajectories=5, ) algo = DQN(min_steps_learn=1e3) # Run with defaults. agent = AtariDqnAgent() runner = MinibatchRlEval( algo=algo, agent=agent, sampler=sampler, n_steps=50e6, log_interval_steps=1e3, affinity=dict(cuda_idx=cuda_idx), ) config = dict(game=game) name = "dqn_" + game log_dir = "example_1" with logger_context(log_dir, run_ID, name, config, snapshot_mode="last"): runner.train()
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key): affinity = affinity_from_code(slot_affinity_code) config = configs[config_key] variant = load_variant(log_dir) config = update_config(config, variant) eval_env_config = config["env"].copy() eval_env_config["start_level"] = config["env"]["num_levels"] + 100 eval_env_config["num_levels"] = 100 sampler = GpuSampler( EnvCls=make, env_kwargs=config["env"], CollectorCls=GpuResetCollector, eval_env_kwargs=eval_env_config, **config["sampler"] ) algo = PPO(optim_kwargs=config["optim"], **config["algo"]) agent = RADPgAgent(ModelCls=RADModel, model_kwargs=config["model"], **config["agent"]) runner = MinibatchRlEval( algo=algo, agent=agent, sampler=sampler, affinity=affinity, **config["runner"] ) name = config["env"]["id"] with logger_context(log_dir, run_ID, name, config, snapshot_mode='last'): runner.train()
def build_and_train(game="pong", run_ID=0, cuda_idx=None): sampler = SerialSampler( EnvCls=AtariEnv, env_kwargs=dict(game=game), eval_env_kwargs=dict(game=game), batch_T= 4, # Four time-steps per sampler iteration. 在collector中采样数据的时候每个循环走多少个step batch_B=1, # 有多少个并行的environment实例 max_decorrelation_steps=0, eval_n_envs=10, eval_max_steps=int(10e3), eval_max_trajectories=5, ) algo = DQN(min_steps_learn=1e3) # Run with defaults. agent = AtariDqnAgent() # 在sampler中initialize runner = MinibatchRlEval( algo=algo, agent=agent, sampler=sampler, n_steps=50e6, # 总共多少个step log_interval_steps=1e3, # 每多少个step记录一次日志 affinity=dict(cuda_idx=cuda_idx), ) config = dict(game=game) name = "dqn_" + game log_dir = "example_1" with logger_context(log_dir, run_ID, name, config, snapshot_mode="last"): runner.train()
def build_and_train(level="nav_maze_random_goal_01", run_ID=0, cuda_idx=None): affinity = dict(cuda_idx=cuda_idx, workers_cpus=list(range(8))) sampler = SerialSampler( EnvCls=DeepmindLabEnv, env_kwargs=dict(level=level), eval_env_kwargs=dict(level=level), batch_T=4, # Four time-steps per sampler iteration. batch_B=1, max_decorrelation_steps=0, eval_n_envs=5, eval_max_steps=int(10e3), eval_max_trajectories=5, ) algo = PPO() agent = AtariFfAgent() runner = MinibatchRlEval( algo=algo, agent=agent, sampler=sampler, n_steps=50e6, log_interval_steps=1e3, affinity=affinity, ) config = dict(level=level) name = "lab_ppo" log_dir = "lab_example_3" with logger_context(log_dir, run_ID, name, config, snapshot_mode="last"): runner.train()
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = make_affinity(
        run_slot=0,
        n_cpu_core=os.cpu_count(),  # Use all available CPU cores.
        n_gpu=1,  # One GPU for this run.
        gpu_per_run=1,
        sample_gpu_per_run=1,
        async_sample=True,
        optim_sample_share_gpu=True,
    )
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)
    config["eval_env"]["game"] = config["env"]["game"]
    sampler = GpuSampler(
        EnvCls=AtariEnv,
        env_kwargs=config["env"],
        CollectorCls=GpuWaitResetCollector,
        TrajInfoCls=AtariTrajInfo,
        eval_env_kwargs=config["eval_env"],
        **config["sampler"]
    )
    algo = CategoricalDQN(optim_kwargs=config["optim"], **config["algo"])
    agent = AtariCatDqnAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"]
    )
    name = config["env"]["game"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)
    print('Config', config)
    if 'pixel_wrapper_kwargs' in config['env']:
        info_keys = config.get('info_keys', None)
        state_keys = config.get('state_keys', None)
        init_namedtuples(info_keys=info_keys, state_keys=state_keys)
    sampler = CpuSampler(
        EnvCls=DMControlEnv,
        env_kwargs=config["env"],
        CollectorCls=CpuResetCollector,
        eval_env_kwargs=config["env"],
        **config["sampler"]
    )
    algo = SAC(optim_kwargs=config["optim"], **config["algo"])
    agent = SacAgent(**config["agent"])
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"]
    )
    name = "sac_{}_{}".format(config['env']['domain'], config['env']['task'])
    with logger_context(log_dir, run_ID, name, log_params=config, snapshot_mode='last'):
        runner.train()
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)
    sampler = GpuSampler(
        EnvCls=gym.make,
        env_kwargs=config["env"],
        eval_env_kwargs=config["eval_env"],
        **config["sampler"]
    )
    algo = DiscreteSACAE(
        optim_kwargs=config["optim"],
        ae_optim_kwargs=config["ae_optim"],
        **config["algo"]
    )
    agent = DiscreteSacAEAgent(
        **config["agent"],
        encoder_kwargs=config["encoder"],
        model_kwargs=config["actor"],
        critic_kwargs=config["critic"],
    )
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"]
    )
    name = config["env"]["id"]
    with logger_context(log_dir, run_ID, name, config, snapshot_mode='last'):
        runner.train()
def build_and_train(env_id="Hopper-v3", run_ID=0, cuda_idx=None): sampler = SerialSampler( EnvCls=gym_make, env_kwargs=dict(id=env_id), eval_env_kwargs=dict(id=env_id), batch_T=1, # One time-step per sampler iteration. batch_B=1, # One environment (i.e. sampler Batch dimension). max_decorrelation_steps=0, eval_n_envs=10, eval_max_steps=int(51e3), eval_max_trajectories=50, ) algo = SAC() # Run with defaults. agent = SacAgent() runner = MinibatchRlEval( algo=algo, agent=agent, sampler=sampler, n_steps=1e6, log_interval_steps=1e4, affinity=dict(cuda_idx=cuda_idx), ) config = dict(env_id=env_id) name = "sac_" + env_id log_dir = "example_2" with logger_context(log_dir, run_ID, name, config): runner.train()
def run_task(vv, log_dir, exp_name):
    vv = update_env_kwargs(vv)
    run_ID = vv['seed']
    config_key = vv['config_key']
    slot_affinity_code = encode_affinity(
        n_cpu_core=20,
        n_gpu=2,
        n_socket=2,
        run_slot=0,
        set_affinity=True,  # it can help to restrict workers to individual CPUs
    )
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    config.update(**vv)
    # config["env"] = env_arg_dict[config['env_name']]
    vv['env_kwargs']['headless'] = True

    sac_module = 'rlpyt.algos.qpg.{}'.format(config['sac_module'])
    sac_agent_module = 'rlpyt.agents.qpg.{}'.format(config['sac_agent_module'])
    sac_module = importlib.import_module(sac_module)
    sac_agent_module = importlib.import_module(sac_agent_module)
    SAC = sac_module.SAC
    SacAgent = sac_agent_module.SacAgent

    if 'pixel_wrapper_kwargs' in config['env']:
        info_keys = config.get('info_keys', None)
        state_keys = config.get('state_keys', None)
        init_namedtuples(info_keys=info_keys, state_keys=state_keys)

    sampler = CpuSampler(
        EnvCls=SOFTGYM_ENVS[vv['env_name']],
        env_kwargs=vv['env_kwargs'],
        CollectorCls=CpuResetCollector,
        eval_env_kwargs=vv['env_kwargs'],
        **config["sampler"]
    )
    algo = SAC(optim_kwargs=config["optim"], **config["algo"])
    agent = SacAgent(**config["agent"])
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"]
    )
    name = "sac_{}".format(vv['env_name'])
    with logger_context(log_dir, run_ID, name, log_params=config, snapshot_mode='last'):
        runner.train()
def build_and_train(level="nav_maze_random_goal_01", run_ID=0, cuda_idx=None): config = configs['r2d1'] config['eval_env'] = dict(level=level) config['env'] = dict(level=level) affinity = make_affinity( run_slot=0, n_cpu_core=4, # Use 16 cores across all experiments. n_gpu=1, # Use 8 gpus across all experiments. hyperthread_offset=6, # If machine has 24 cores. n_socket=2, # Presume CPU socket affinity to lower/upper half GPUs. gpu_per_run=1, # How many GPUs to parallelize one run across. ) # sampler = GpuSampler( # EnvCls=DeepmindLabEnv, # env_kwargs=config['env'], # eval_env_kwargs=config['eval_env'], # CollectorCls=GpuWaitResetCollector, # TrajInfoCls=LabTrajInfo, # **config["sampler"] # ) sampler = SerialSampler( EnvCls=DeepmindLabEnv, env_kwargs=config['env'], eval_env_kwargs=config['env'], batch_T=16, # Four time-steps per sampler iteration. batch_B=1, max_decorrelation_steps=0, eval_n_envs=10, eval_max_steps=int(10e3), eval_max_trajectories=5, ) algo = R2D1(optim_kwargs=config["optim"], **config["algo"]) agent = AtariR2d1Agent(model_kwargs=config["model"], **config["agent"]) runner = MinibatchRlEval( algo=algo, agent=agent, sampler=sampler, affinity=affinity, **config["runner"] ) name = "lab_dqn_" + level log_dir = "lab_example_2" with logger_context(log_dir, run_ID, name, config, snapshot_mode="last"): runner.train()
def build_and_train(env_id="Cassie-v0", run_ID=0, cuda_idx=None, snapshot_file=None): if snapshot_file is None: initial_optim_state_dict = None initial_model_state_dict = None else: snapshot = torch.load(snapshot_file) initial_optim_state_dict=snapshot['optimizer_state_dict'] initial_model_state_dict=snapshot['agent_state_dict'] sampler = SerialSampler( EnvCls=gym_make, env_kwargs=dict(id=env_id, xml_file=get_full_path('resources/cassie.xml')), eval_env_kwargs=dict(id=env_id, xml_file=get_full_path('resources/cassie.xml')), batch_T=1, # One time-step per sampler iteration. batch_B=1, # One environment (i.e. sampler Batch dimension). max_decorrelation_steps=0, eval_n_envs=1, eval_max_steps=int(1000), eval_max_trajectories=50, # 50 ) algo = SAC( initial_optim_state_dict=initial_optim_state_dict) agent = SacAgent( initial_model_state_dict=initial_model_state_dict) runner = MinibatchRlEval( algo=algo, agent=agent, sampler=sampler, n_steps=1e6, log_interval_steps=5e4, #5e4 affinity=dict(cuda_idx=cuda_idx), ) other_param = dict( env_id=env_id, forward_reward_weight=0, shift_cost=True, cum_steps='1M') name = "sac_" + env_id log_dir = "Cassie_stand" with logger_context(log_dir, run_ID, name, other_param, snapshot_mode='last', use_summary_writer=True): runner.train()
def build_and_train(game="academy_empty_goal_close", run_ID=1, cuda_idx=None): env_vector_size = args.envVectorSize coach = Coach(envOptions=args.envOptions, vectorSize=env_vector_size, algo='Bandit', initialQ=args.initialQ, beta=args.beta) sampler = SerialSampler( EnvCls=create_single_football_env, env_kwargs=dict(game=game), eval_env_kwargs=dict(game=game), batch_T=5, # Four time-steps per sampler iteration. batch_B=env_vector_size, max_decorrelation_steps=0, eval_n_envs=args.evalNumOfEnvs, eval_max_steps=int(10e3), eval_max_trajectories=5, coach=coach, eval_env=args.evalEnv, ) algo = PPO(minibatches=1) # Run with defaults. agent = AtariLstmAgent() # TODO: move to ff runner = MinibatchRlEval( algo=algo, agent=agent, sampler=sampler, n_steps=args.numOfSteps, log_interval_steps=1e3, affinity=dict(cuda_idx=cuda_idx), ) name = args.name log_dir = "example_1" with logger_context(log_dir, run_ID, name, log_params=vars(args), snapshot_mode="last"): runner.train()
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key): affinity = affinity_from_code(slot_affinity_code) config = configs[config_key] variant = load_variant(log_dir) config = update_config(config, variant) sampler = GpuSampler( EnvCls=gym.make, env_kwargs=config["env"], CollectorCls=GpuResetCollector, eval_env_kwargs=config["eval_env"], **config["sampler"] ) if config["checkpoint"]: model_state_dict = torch.load(config["checkpoint"]) else: model_state_dict = None algo = PPO(optim_kwargs=config["optim"], **config["algo"]) agent = CategoricalPgAgent( ModelCls=BaselinePolicy, model_kwargs=config["model"], initial_model_state_dict=model_state_dict, **config["agent"] ) runner = MinibatchRlEval( algo=algo, agent=agent, sampler=sampler, affinity=affinity, **config["runner"] ) name = config["env"]["id"] with logger_context(log_dir, run_ID, name, config, snapshot_mode='last'): runner.train()
def build_and_train(env_id="HalfCheetah-v3", log_dir='results', alg_name='ddpg', run_ID=0, cuda_idx=None, seed=42, q_hidden_sizes=[64, 64], q_nonlinearity='relu', batch_size=32, q_target=None, log_freq=1e3): set_seed(seed) sampler = SerialSampler( EnvCls=gym_make, env_kwargs=dict(id=env_id), eval_env_kwargs=dict(id=env_id), batch_T=1, # One time-step per sampler iteration. batch_B=1, # One environment (i.e. sampler Batch dimension). max_decorrelation_steps=0, eval_n_envs=10, eval_max_steps=int(51e3), eval_max_trajectories=50, ) if q_nonlinearity == 'relu': q_nonlin = torch.nn.ReLU if q_nonlinearity == 'sine': q_nonlin = Sine if q_nonlinearity == 'linear': q_nonlin = Linear if alg_name.lower() == 'ddpg': if q_target is None: q_target = True algo = DDPG(batch_size=batch_size, target=q_target, min_steps_learn=log_freq) agent = DdpgAgent(q_hidden_sizes=q_hidden_sizes, q_nonlinearity=q_nonlin) elif alg_name.lower() == 'preqn': if q_target is None: q_target = False algo = PreQN(batch_size=batch_size, target=q_target, min_steps_learn=log_freq) agent = PreqnAgent(q_hidden_sizes=q_hidden_sizes, q_nonlinearity=q_nonlin) runner = MinibatchRlEval( algo=algo, agent=agent, sampler=sampler, seed=seed, n_steps=1e6, log_interval_steps=log_freq, #1e4, affinity=dict(cuda_idx=cuda_idx), ) config = dict(env_id=env_id) log_dir = os.path.join(log_dir, env_id) log_dir = os.path.join(log_dir, alg_name.lower()) log_dir += '-' + q_nonlinearity log_dir += '-hs' + str(q_hidden_sizes) log_dir += '-qt' + str(q_target) log_dir += '-bs' + str(batch_size) name = '' #env_id with logger_context(log_dir, run_ID, name, config, override_prefix=True, use_summary_writer=True): runner.train()
def build_and_train(env_id="CartPole-v1", run_ID=0, cuda_idx=None, sample_mode="serial", n_parallel=2, args={}): affinity = dict(cuda_idx=cuda_idx, workers_cpus=list(range(n_parallel))) gpu_cpu = "CPU" if cuda_idx is None else f"GPU {cuda_idx}" if sample_mode == "serial": Sampler = SerialSampler # (Ignores workers_cpus.) print(f"Using serial sampler, {gpu_cpu} for sampling and optimizing.") elif sample_mode == "cpu": Sampler = CpuSampler print( f"Using CPU parallel sampler (agent in workers), {gpu_cpu} for optimizing." ) elif sample_mode == "gpu": Sampler = GpuSampler print( f"Using GPU parallel sampler (agent in master), {gpu_cpu} for sampling and optimizing." ) elif sample_mode == "alternating": Sampler = AlternatingSampler affinity["workers_cpus"] += affinity["workers_cpus"] # (Double list) affinity["alternating"] = True # Sampler will check for this. print( f"Using Alternating GPU parallel sampler, {gpu_cpu} for sampling and optimizing." ) sampler = Sampler( EnvCls=gym_make, env_kwargs=dict(id=env_id), eval_env_kwargs=dict(id=env_id), batch_T=5, # 5 time-steps per sampler iteration. batch_B=16, # 16 parallel environments. max_decorrelation_steps=400, eval_n_envs=25, eval_max_steps=12500) algo = PPO(learning_rate=args.lr) agentCls, agent_basis = get_agent_cls_cartpole(args.network) agent = agentCls(model_kwargs={ 'fc_sizes': args.fcs, 'gain_type': args.gain_type, 'basis': agent_basis }) runner = MinibatchRlEval( algo=algo, agent=agent, sampler=sampler, n_steps=1e5, log_interval_steps=5e2, affinity=affinity, ) config = dict(env_id=env_id, lr=args.lr, gain_type=args.gain_type, debug=False, network=args.network, fcs=str(args.fcs)) name = f"{args.folder}_{args.network}" log_dir = f"{args.folder}_{args.network}" with logger_context(log_dir, run_ID, name, config): runner.train()
def build_and_train(args, game="", run_ID=0, config=None): """ 1. Parse the args object into dictionaries understood by rlpyt """ config['env']['id'] = args.env_name config["eval_env"]["id"] = args.env_name config["eval_env"]["horizon"] = args.horizon config["env"]["horizon"] = args.horizon if 'procgen' in args.env_name: for k, v in vars(args).items(): if args.env_name.split('-')[1] in k: config['env'][k] = v config['model']['frame_stack'] = args.frame_stack config['model']['nce_loss'] = args.nce_loss config['model']['algo'] = args.algo config['model']['env_name'] = args.env_name config['model']['dueling'] = args.dueling == 1 config['algo']['double_dqn'] = args.double_dqn == 1 config['algo']['prioritized_replay'] = args.prioritized_replay == 1 config['algo']['n_step_return'] = args.n_step_return config['algo']['learning_rate'] = args.learning_rate config['runner']['log_interval_steps'] = args.log_interval_steps config['cmd_args'] = vars(args) """ 2. Create the CatDQN (C51) agent from custom implementation """ agent = AtariCatDqnAgent(ModelCls=AtariCatDqnModel_nce, model_kwargs=config["model"], **config["agent"]) algo = CategoricalDQN_nce(args=config['cmd_args'], ReplayBufferCls=None, optim_kwargs=config["optim"], **config["algo"]) if args.mode == 'parallel': affinity = make_affinity(n_cpu_core=args.n_cpus, n_gpu=args.n_gpus, n_socket=1 # hyperthread_offset=0 ) """ Some architecture require the following block to be uncommented. Try with and without. This is here to allow scheduling of non-sequential CPU IDs """ # import psutil # psutil.Process().cpu_affinity([]) # cpus = tuple(psutil.Process().cpu_affinity()) # affinity['all_cpus'] = affinity['master_cpus'] = cpus # affinity['workers_cpus'] = tuple([tuple([x]) for x in cpus+cpus]) # env_kwargs = config['env'] sampler = GpuSampler(EnvCls=make_env, env_kwargs=config["env"], CollectorCls=GpuWaitResetCollector, TrajInfoCls=AtariTrajInfo, eval_env_kwargs=config["eval_env"], **config["sampler"]) """ If you don't have a GPU, use the CpuSampler """ # sampler = CpuSampler( # EnvCls=AtariEnv if args.game is not None else make_env, # env_kwargs=config["env"], # CollectorCls=CpuWaitResetCollector, # TrajInfoCls=AtariTrajInfo, # eval_env_kwargs=config["eval_env"], # **config["sampler"] # ) elif args.mode == 'serial': affinity = make_affinity( n_cpu_core=1, # Use 16 cores across all experiments. n_gpu=args.n_gpus, # Use 8 gpus across all experiments. n_socket=1, ) """ Some architecture require the following block to be uncommented. Try with and without. """ # import psutil # psutil.Process().cpu_affinity([]) # cpus = tuple(psutil.Process().cpu_affinity()) # affinity['all_cpus'] = affinity['master_cpus'] = cpus # affinity['workers_cpus'] = tuple([tuple([x]) for x in cpus+cpus]) # env_kwargs = config['env'] sampler = SerialSampler( EnvCls=make_env, env_kwargs=config["env"], # CollectorCls=SerialEvalCollector, TrajInfoCls=AtariTrajInfo, eval_env_kwargs=config["eval_env"], **config["sampler"]) """ 3. 
Bookkeeping, setting up Comet.ml experiments, etc """ folders_name = [args.output_dir, args.env_name, 'run_' + args.run_ID] path = os.path.join(*folders_name) os.makedirs(path, exist_ok=True) experiment = Experiment(api_key='your_key', auto_output_logging=False, project_name='driml', workspace="your_workspace", disabled=True) experiment.add_tag('C51+DIM' if ( args.lambda_LL > 0 or args.lambda_LG > 0 or args.lambda_GL > 0 or args.lambda_GG > 0) else 'C51') experiment.set_name(args.experiment_name) experiment.log_parameters(config) MinibatchRlEval.TF_logger = Logger(path, use_TFX=True, params=config, comet_experiment=experiment, disable_local=True) MinibatchRlEval.log_diagnostics = log_diagnostics_custom MinibatchRlEval._log_infos = _log_infos MinibatchRlEval.evaluate_agent = evaluate_agent """ 4. Define the runner as minibatch """ runner = MinibatchRlEval(algo=algo, agent=agent, sampler=sampler, affinity=affinity, **config["runner"]) runner.algo.opt_info_fields = tuple( list(runner.algo.opt_info_fields) + ['lossNCE'] + ['action%d' % i for i in range(15)]) name = args.mode + "_value_based_nce_" + args.env_name log_dir = os.path.join(args.output_dir, args.env_name) logger.set_snapshot_gap(args.weight_save_interval // config['runner']['log_interval_steps']) """ 6. Run the experiment and optionally save network weights """ with experiment.train(): with logger_context( log_dir, run_ID, name, config, snapshot_mode=( 'last' if args.weight_save_interval == -1 else 'gap' )): # set 'all' to save every it, 'gap' for every X it runner.train()
def start_experiment(args):
    args_json = json.dumps(vars(args), indent=4)
    if not os.path.isdir(args.log_dir):
        os.makedirs(args.log_dir)
    with open(args.log_dir + '/arguments.json', 'w') as jsonfile:
        jsonfile.write(args_json)
    config = dict(env_id=args.env)

    if args.sample_mode == 'gpu':
        assert args.num_gpus > 0
        affinity = dict(cuda_idx=0, workers_cpus=list(range(args.num_cpus)))
        os.environ['CUDA_VISIBLE_DEVICES'] = str(0)
    else:
        affinity = dict(workers_cpus=list(range(args.num_cpus)))

    # Potentially reload models.
    initial_optim_state_dict = None
    initial_model_state_dict = None
    if args.pretrain != 'None':
        os.system(f"find {args.log_dir} -name '*.json' -delete")  # clean up json files for video recorder
        checkpoint = torch.load(os.path.join(_RESULTS_DIR, args.pretrain, 'params.pkl'))
        initial_optim_state_dict = checkpoint['optimizer_state_dict']
        initial_model_state_dict = checkpoint['agent_state_dict']

    # ----------------------------------------------------- POLICY ----------------------------------------------------- #
    model_args = dict(curiosity_kwargs=dict(curiosity_alg=args.curiosity_alg))
    if args.curiosity_alg == 'icm':
        model_args['curiosity_kwargs']['feature_encoding'] = args.feature_encoding
        model_args['curiosity_kwargs']['batch_norm'] = args.batch_norm
        model_args['curiosity_kwargs']['prediction_beta'] = args.prediction_beta
        model_args['curiosity_kwargs']['forward_loss_wt'] = args.forward_loss_wt
    elif args.curiosity_alg == 'disagreement':
        model_args['curiosity_kwargs']['feature_encoding'] = args.feature_encoding
        model_args['curiosity_kwargs']['ensemble_size'] = args.ensemble_size
        model_args['curiosity_kwargs']['batch_norm'] = args.batch_norm
        model_args['curiosity_kwargs']['prediction_beta'] = args.prediction_beta
        model_args['curiosity_kwargs']['forward_loss_wt'] = args.forward_loss_wt
        model_args['curiosity_kwargs']['device'] = args.sample_mode
    elif args.curiosity_alg == 'ndigo':
        model_args['curiosity_kwargs']['feature_encoding'] = args.feature_encoding
        model_args['curiosity_kwargs']['pred_horizon'] = args.pred_horizon
        model_args['curiosity_kwargs']['batch_norm'] = args.batch_norm
        model_args['curiosity_kwargs']['num_predictors'] = args.num_predictors
        model_args['curiosity_kwargs']['device'] = args.sample_mode
    elif args.curiosity_alg == 'rnd':
        model_args['curiosity_kwargs']['feature_encoding'] = args.feature_encoding
        model_args['curiosity_kwargs']['prediction_beta'] = args.prediction_beta
        model_args['curiosity_kwargs']['drop_probability'] = args.drop_probability
        model_args['curiosity_kwargs']['gamma'] = args.discount
        model_args['curiosity_kwargs']['device'] = args.sample_mode

    if args.env in _MUJOCO_ENVS:
        if args.lstm:
            agent = MujocoLstmAgent(initial_model_state_dict=initial_model_state_dict)
        else:
            agent = MujocoFfAgent(initial_model_state_dict=initial_model_state_dict)
    else:
        if args.lstm:
            agent = AtariLstmAgent(
                initial_model_state_dict=initial_model_state_dict,
                model_kwargs=model_args,
                no_extrinsic=args.no_extrinsic)
        else:
            agent = AtariFfAgent(initial_model_state_dict=initial_model_state_dict)

    # ----------------------------------------------------- LEARNING ALG ----------------------------------------------------- #
    if args.alg == 'ppo':
        if args.kernel_mu == 0.:
            kernel_params = None
        else:
            kernel_params = (args.kernel_mu, args.kernel_sigma)
        algo = PPO(
            discount=args.discount,
            learning_rate=args.lr,
            value_loss_coeff=args.v_loss_coeff,
            entropy_loss_coeff=args.entropy_loss_coeff,
            OptimCls=torch.optim.Adam,
            optim_kwargs=None,
            clip_grad_norm=args.grad_norm_bound,
            initial_optim_state_dict=initial_optim_state_dict,  # is None if not reloading a checkpoint
            gae_lambda=args.gae_lambda,
            minibatches=args.minibatches,  # if recurrent: batch_B must be at least this; if not recurrent: batch_B*batch_T must be at least this
            epochs=args.epochs,
            ratio_clip=args.ratio_clip,
            linear_lr_schedule=args.linear_lr,
            normalize_advantage=args.normalize_advantage,
            normalize_reward=args.normalize_reward,
            kernel_params=kernel_params,
            curiosity_type=args.curiosity_alg)
    elif args.alg == 'a2c':
        algo = A2C(
            discount=args.discount,
            learning_rate=args.lr,
            value_loss_coeff=args.v_loss_coeff,
            entropy_loss_coeff=args.entropy_loss_coeff,
            OptimCls=torch.optim.Adam,
            optim_kwargs=None,
            clip_grad_norm=args.grad_norm_bound,
            initial_optim_state_dict=initial_optim_state_dict,
            gae_lambda=args.gae_lambda,
            normalize_advantage=args.normalize_advantage)

    # ----------------------------------------------------- SAMPLER ----------------------------------------------------- #
    # Environment setup.
    traj_info_cl = TrajInfo  # environment specific - potentially overridden below
    if 'mario' in args.env.lower():
        env_cl = mario_make
        env_args = dict(
            game=args.env,
            no_extrinsic=args.no_extrinsic,
            no_negative_reward=args.no_negative_reward,
            normalize_obs=args.normalize_obs,
            normalize_obs_steps=10000)
    elif 'deepmind' in args.env.lower():  # pycolab deepmind environments
        env_cl = deepmind_make
        traj_info_cl = PycolabTrajInfo
        env_args = dict(
            game=args.env,
            no_extrinsic=args.no_extrinsic,
            no_negative_reward=args.no_negative_reward,
            normalize_obs=args.normalize_obs,
            normalize_obs_steps=10000,
            log_heatmaps=args.log_heatmaps,
            logdir=args.log_dir,
            obs_type=args.obs_type,
            max_steps_per_episode=args.max_episode_steps)
    elif args.env in _MUJOCO_ENVS:
        env_cl = gym_make
        env_args = dict(
            id=args.env,
            no_extrinsic=args.no_extrinsic,
            no_negative_reward=args.no_negative_reward,
            normalize_obs=False,
            normalize_obs_steps=10000)
    elif args.env in _ATARI_ENVS:
        env_cl = AtariEnv
        traj_info_cl = AtariTrajInfo
        env_args = dict(
            game=args.env,
            no_extrinsic=args.no_extrinsic,
            no_negative_reward=args.no_negative_reward,
            normalize_obs=args.normalize_obs,
            normalize_obs_steps=10000,
            downsampling_scheme='classical',
            record_freq=args.record_freq,
            record_dir=args.log_dir,
            horizon=args.max_episode_steps,
        )

    if args.sample_mode == 'gpu':
        if args.lstm:
            collector_class = GpuWaitResetCollector
        else:
            collector_class = GpuResetCollector
        sampler = GpuSampler(
            EnvCls=env_cl,
            env_kwargs=env_args,
            eval_env_kwargs=env_args,
            batch_T=args.timestep_limit,
            batch_B=args.num_envs,
            max_decorrelation_steps=0,
            TrajInfoCls=traj_info_cl,
            eval_n_envs=args.eval_envs,
            eval_max_steps=args.eval_max_steps,
            eval_max_trajectories=args.eval_max_traj,
            record_freq=args.record_freq,
            log_dir=args.log_dir,
            CollectorCls=collector_class)
    else:
        if args.lstm:
            collector_class = CpuWaitResetCollector
        else:
            collector_class = CpuResetCollector
        sampler = CpuSampler(
            EnvCls=env_cl,
            env_kwargs=env_args,
            eval_env_kwargs=env_args,
            batch_T=args.timestep_limit,  # timesteps in a trajectory episode
            batch_B=args.num_envs,  # environments distributed across workers
            max_decorrelation_steps=0,
            TrajInfoCls=traj_info_cl,
            eval_n_envs=args.eval_envs,
            eval_max_steps=args.eval_max_steps,
            eval_max_trajectories=args.eval_max_traj,
            record_freq=args.record_freq,
            log_dir=args.log_dir,
            CollectorCls=collector_class)

    # ----------------------------------------------------- RUNNER ----------------------------------------------------- #
    if args.eval_envs > 0:
        runner = MinibatchRlEval(
            algo=algo,
            agent=agent,
            sampler=sampler,
            n_steps=args.iterations,
            affinity=affinity,
            log_interval_steps=args.log_interval,
            log_dir=args.log_dir,
            pretrain=args.pretrain)
    else:
        runner = MinibatchRl(
            algo=algo,
            agent=agent,
            sampler=sampler,
            n_steps=args.iterations,
            affinity=affinity,
            log_interval_steps=args.log_interval,
            log_dir=args.log_dir,
            pretrain=args.pretrain)

    with logger_context(args.log_dir, config, snapshot_mode="last", use_summary_writer=True):
        runner.train()
def build_and_train(bsuite_id, gym_id, run_ID=0, cuda_idx=None,
        results_dir='./bsuite_baseline', n_parallel=8):
    id = bsuite_id if not gym_id else gym_id
    logger._tf_summary_dir = f'./runs/{id.replace("/", "_")}_{run_ID}_baseline_{datetime.now().strftime("%D-%T").replace("/", "_")}'
    logger._tf_summary_writer = SummaryWriter(logger._tf_summary_dir)

    def get_env(*args, **kwargs):
        return GymEnvWrapper(
            TransformObservation(
                env=FrameStack(
                    num_stack=4,
                    env=(gym_wrapper.GymFromDMEnv(
                        bsuite.load_and_record_to_csv(
                            bsuite_id=bsuite_id,
                            results_dir=results_dir,
                            overwrite=True,
                        )) if not gym_id else gym.make(gym_id))),
                f=lambda lazy_frames: np.reshape(np.stack(lazy_frames._frames), -1)))

    sampler = SerialSampler(  # TODO: (Async)GpuSampler
        EnvCls=get_env,
        env_kwargs=dict(game=bsuite_id),
        eval_env_kwargs=dict(game=bsuite_id),
        batch_T=1,  # One time-step per sampler iteration (only influences the step count).
        batch_B=1,
        max_decorrelation_steps=0,
        eval_n_envs=10,
        eval_max_steps=int(10e3),
        eval_max_trajectories=5,
    )
    n_steps = 3e4
    algo = DQN(
        discount=0.995,
        min_steps_learn=1e3,
        eps_steps=n_steps,
        # delta_clip=None,
        # learning_rate=1e-4,
        # target_update_tau=500,
        # target_update_tau=0.01,
        # target_update_interval=100,
        double_dqn=True,
        prioritized_replay=True,
        # clip_grad_norm=1,  # FIXME arbitrary
        # n_step_return=2,  # FIXME arbitrary
        # clip_grad_norm=1000000,
    )  # Run with defaults.
    # agent = MlpDqnAgent(ModelCls=lambda *args, **kwargs: MlpDqnModel(*args, **kwargs, dueling=True))
    agent = MlpDqnAgent(ModelCls=MlpDqnModel)
    p = psutil.Process()
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=n_steps,  # orig 50e6
        log_interval_steps=1e2,  # orig 1e3
        affinity=dict(cuda_idx=cuda_idx),
        # affinity=dict(cuda_idx=cuda_idx, workers_cpus=p.cpu_affinity()[:n_parallel]),
    )
    runner.train()