def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)
    # config["eval_env"]["id"] = config["env"]["id"]
    sampler = SerialSampler(
        EnvCls=gym_make,
        env_kwargs=config["env"],
        CollectorCls=CpuResetCollector,
        eval_env_kwargs=config["env"],
        **config["sampler"]
    )
    algo = SAC(optim_kwargs=config["optim"], **config["algo"])
    agent = SacAgent(**config["agent"])
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"]
    )
    name = "sac_" + config["env"]["id"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
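# How the four positional arguments above are normally supplied: rlpyt's experiment
# launcher invokes the training script with (slot_affinity_code, log_dir, run_ID,
# *common_args) on the command line. A minimal sketch under that assumption -- the
# script path, experiment title, env ids, and config key below are placeholders,
# not taken from the original script.
#
# At the bottom of the training script:
if __name__ == "__main__":
    import sys
    build_and_train(*sys.argv[1:])

# In a separate launch script:
from rlpyt.utils.launching.affinity import encode_affinity
from rlpyt.utils.launching.variant import make_variants, VariantLevel
from rlpyt.utils.launching.exp_launcher import run_experiments

affinity_code = encode_affinity(n_cpu_core=2, n_gpu=0, n_socket=1)

variant_levels = list()
env_ids = ["Hopper-v3", "Walker2d-v3"]  # Placeholder sweep values.
variant_levels.append(VariantLevel(
    keys=[("env", "id")],
    values=list(zip(env_ids)),
    dir_names=env_ids,
))
variants, log_dirs = make_variants(*variant_levels)

run_experiments(
    script="train_sac_gym_serial.py",  # Hypothetical path to the script above.
    affinity_code=affinity_code,
    experiment_title="sac_gym_serial",
    runs_per_setting=1,
    variants=variants,
    log_dirs=log_dirs,
    common_args=("sac_1M_serial",),  # Forwarded to the script as config_key (assumed key name).
)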
def build_and_train(env_id="Hopper-v3", run_ID=0, cuda_idx=None): sampler = SerialSampler( EnvCls=gym_make, env_kwargs=dict(id=env_id), eval_env_kwargs=dict(id=env_id), batch_T=1, # One time-step per sampler iteration. batch_B=1, # One environment (i.e. sampler Batch dimension). max_decorrelation_steps=0, eval_n_envs=10, eval_max_steps=int(51e3), eval_max_trajectories=50, ) algo = SAC() # Run with defaults. agent = SacAgent() runner = MinibatchRlEval( algo=algo, agent=agent, sampler=sampler, n_steps=1e6, log_interval_steps=1e4, affinity=dict(cuda_idx=cuda_idx), ) config = dict(env_id=env_id) name = "sac_" + env_id log_dir = "example_2" with logger_context(log_dir, run_ID, name, config): runner.train()
def build_and_train():
    p = psutil.Process()
    cpus = p.cpu_affinity()
    affinity = dict(
        cuda_idx=None,
        master_cpus=cpus,
        workers_cpus=list([x] for x in cpus),
        set_affinity=True,
    )
    sampler = CpuSampler(
        EnvCls=_make_env,
        env_kwargs=dict(rank=0),
        batch_T=1,
        batch_B=4,
        max_decorrelation_steps=0,
        CollectorCls=CpuResetCollector,
    )
    algo = SAC(
        batch_size=256,
        min_steps_learn=10000,
        replay_size=1000000,
        replay_ratio=256 / 4,
        target_update_interval=1,
        target_entropy=-9,
        target_update_tau=0.01,
        learning_rate=0.00025,
        action_prior="uniform",
        reward_scale=1,
        reparameterize=True,
        clip_grad_norm=1e9,
        n_step_return=1,
        updates_per_sync=1,
        bootstrap_timelimit=False,
    )
    agent = SacAgent(model_kwargs={'hidden_sizes': [256, 256]})
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=10000,
        affinity=affinity,
    )
    config = dict(env_id='picking')
    name = "sac_rlpyt_picking"
    log_dir = os.path.join(os.path.dirname(__file__), "sac_rlpyt_picking")
    with logger_context(log_dir, 0, name, config,
                        use_summary_writer=True, snapshot_mode='all'):
        runner.train()
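# The CpuSampler / AsyncCpuSampler snippets in this section pass EnvCls=_make_env
# with env_kwargs=dict(rank=0), but the helper itself is not shown. A minimal
# sketch of what it could look like, assuming the target task is registered with
# Gym under a placeholder ID; the seeding-by-rank detail is an assumption, not
# taken from the original scripts.
import gym
from rlpyt.envs.gym import GymEnvWrapper

def _make_env(rank=0):
    env = gym.make("Picking-v0")  # Placeholder ID; substitute the real task.
    env.seed(rank)  # Decorrelate parallel workers (assumed convention).
    return GymEnvWrapper(env)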
def build_and_train(env_id="Cassie-v0", run_ID=0, cuda_idx=None, snapshot_file=None): if snapshot_file is None: initial_optim_state_dict = None initial_model_state_dict = None else: snapshot = torch.load(snapshot_file) initial_optim_state_dict=snapshot['optimizer_state_dict'] initial_model_state_dict=snapshot['agent_state_dict'] sampler = SerialSampler( EnvCls=gym_make, env_kwargs=dict(id=env_id, xml_file=get_full_path('resources/cassie.xml')), eval_env_kwargs=dict(id=env_id, xml_file=get_full_path('resources/cassie.xml')), batch_T=1, # One time-step per sampler iteration. batch_B=1, # One environment (i.e. sampler Batch dimension). max_decorrelation_steps=0, eval_n_envs=1, eval_max_steps=int(1000), eval_max_trajectories=50, # 50 ) algo = SAC( initial_optim_state_dict=initial_optim_state_dict) agent = SacAgent( initial_model_state_dict=initial_model_state_dict) runner = MinibatchRlEval( algo=algo, agent=agent, sampler=sampler, n_steps=1e6, log_interval_steps=5e4, #5e4 affinity=dict(cuda_idx=cuda_idx), ) other_param = dict( env_id=env_id, forward_reward_weight=0, shift_cost=True, cum_steps='1M') name = "sac_" + env_id log_dir = "Cassie_stand" with logger_context(log_dir, run_ID, name, other_param, snapshot_mode='last', use_summary_writer=True): runner.train()
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    print('Variant', variant)
    config = update_config(config, variant)
    sampler = SerialSampler(
        EnvCls=DMControlEnv,
        env_kwargs=config["env"],
        CollectorCls=CpuResetCollector,
        eval_env_kwargs=config["eval_env"],
        **config["sampler"]
    )
    algo = SAC(optim_kwargs=config["optim"], **config["algo"])
    agent = SacAgent(**config["agent"])
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"]
    )
    name = "sac_{}_{}".format(config['env']['domain'], config['env']['task'])
    with logger_context(log_dir, run_ID, name, config, snapshot_mode='last'):
        runner.train()
def build_and_train():
    affinity = dict(cuda_idx=None, workers_cpus=list(range(15)))
    sampler = CpuSampler(
        EnvCls=_make_env,
        env_kwargs=dict(rank=0),
        batch_T=6000,
        batch_B=20,
    )
    algo = SAC(bootstrap_timelimit=False)  # Otherwise defaults.
    agent = SacAgent()
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=600,
        affinity=affinity,
    )
    config = dict(env_id='reaching')
    name = "sac_reaching"
    log_dir = os.path.join(os.path.dirname(__file__), "example")
    with logger_context(log_dir, 0, name, config, use_summary_writer=True):
        runner.train()
def build_and_train(env_id="HalfCheetah-v2", run_ID=0, cuda_idx=None, sample_mode='cpu', n_parallel=2, eval=False, wandb_log=True, log_interval_steps=1e5, n_steps=50e6): affinity = dict(cuda_idx=cuda_idx, workers_cpus=list(range(n_parallel))) gpu_cpu = "CPU" if cuda_idx is None else f"GPU {cuda_idx}" if sample_mode == "serial": Sampler = SerialSampler # (Ignores workers_cpus.) if eval: eval_collector_cl = SerialEvalCollector else: eval_collector_cl = None print(f"Using serial sampler, {gpu_cpu} for sampling and optimizing.") elif sample_mode == "cpu": Sampler = CpuSampler if eval: eval_collector_cl = CpuEvalCollector else: eval_collector_cl = None print( f"Using CPU parallel sampler (agent in workers), {gpu_cpu} for optimizing." ) num_envs = 8 b_size = 20 env_kwargs = dict(id=env_id) if eval: eval_env_kwargs = env_kwargs eval_max_steps = 1e4 num_eval_envs = num_envs else: eval_env_kwargs = None eval_max_steps = None num_eval_envs = 0 sampler = Sampler( EnvCls=gym_make, env_kwargs=env_kwargs, eval_env_kwargs=eval_env_kwargs, batch_T=b_size, # One time-step per sampler iteration. batch_B=num_envs, # One environment (i.e. sampler Batch dimension). max_decorrelation_steps=0, eval_n_envs=num_eval_envs, eval_CollectorCls=eval_collector_cl, eval_max_steps=eval_max_steps, ) # eval_max_trajectories=50, # ) algo = SAC() # Run with defaults. agent = SacAgent() if eval: RunnerCl = MinibatchRlEval else: RunnerCl = MinibatchRl runner = RunnerCl(algo=algo, agent=agent, sampler=sampler, n_steps=n_steps, log_interval_steps=log_interval_steps, affinity=affinity, wandb_log=True) config = dict(env_id=env_id) name = "sac_" + env_id log_dir = "baseline_run" with logger_context(log_dir, run_ID, name, config): runner.train()
def build_and_train():
    opt_affinities = list()
    opt_affinity = dict(cpus=[0], cuda_idx=None, torch_threads=1, set_affinity=True)
    opt_affinities.append(opt_affinity)
    smp_affinity = AttrDict(
        all_cpus=[0, 1],
        master_cpus=[0],
        workers_cpus=[1],
        master_torch_threads=1,
        worker_torch_threads=1,
        cuda_idx=None,
        alternating=False,  # Just to pass through a check.
        set_affinity=True,
    )
    affinity = AttrDict(
        all_cpus=[0, 1],  # For the experiment launcher to use taskset.
        optimizer=opt_affinities,
        sampler=smp_affinity,
        set_affinity=True,
    )
    sampler = AsyncCpuSampler(
        EnvCls=_make_env,
        env_kwargs=dict(rank=0),
        batch_T=600,
        batch_B=3,
        max_decorrelation_steps=0,
        CollectorCls=DbCpuResetCollector,
    )
    algo = SAC(
        batch_size=256,
        min_steps_learn=10000,
        replay_size=1000000,
        replay_ratio=1,
        target_update_interval=1,
        target_entropy=-9,
        target_update_tau=0.01,
        learning_rate=0.00025,
        action_prior="uniform",
        reward_scale=1,
        reparameterize=True,
        clip_grad_norm=1e9,
        n_step_return=1,
        updates_per_sync=1,
        bootstrap_timelimit=False,
    )
    agent = SacAgent(model_kwargs={'hidden_sizes': [256, 256]})
    runner = AsyncRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=10000,
        affinity=affinity,
    )
    config = dict(env_id='picking')
    name = "sac_rlpyt_picking"
    log_dir = os.path.join(os.path.dirname(__file__), "sac_rlpyt_picking")
    with logger_context(log_dir, 0, name, config,
                        use_summary_writer=False, snapshot_mode='all'):
        runner.train()
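# The async affinity above is assembled by hand from AttrDicts. rlpyt also ships a
# helper that builds an equivalent structure from a hardware description; a minimal
# sketch under the assumption of a CPU-only machine with two usable cores (the core
# counts are illustrative and should match the host).
from rlpyt.utils.launching.affinity import make_affinity

affinity = make_affinity(
    run_slot=0,
    n_cpu_core=2,       # Total cores to devote to this run.
    n_gpu=0,            # CPU-only; optimizer and sampler both run on CPU.
    async_sample=True,  # Separate sampler and optimizer processes for AsyncRl.
)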
def build_and_train(game="cartpole", run_ID=0, cuda_idx=None, sample_mode="serial", n_parallel=2, eval=False, serial=False, train_mask=[True, True], wandb_log=False, save_models_to_wandb=False, log_interval_steps=1e5, observation_mode="agent", inc_player_last_act=False, alt_train=False, eval_perf=False, n_steps=50e6, one_agent=False): # def envs: if observation_mode == "agent": fully_obs = False rand_obs = False elif observation_mode == "random": fully_obs = False rand_obs = True elif observation_mode == "full": fully_obs = True rand_obs = False n_serial = None if game == "cartpole": work_env = gym.make env_name = 'CartPole-v1' cont_act = False state_space_low = np.asarray([ 0.0, 0.0, 0.0, 0.0, -4.8000002e+00, -3.4028235e+38, -4.1887903e-01, -3.4028235e+38 ]) state_space_high = np.asarray([ 1.0, 1.0, 1.0, 1.0, 4.8000002e+00, 3.4028235e+38, 4.1887903e-01, 3.4028235e+38 ]) obs_space = Box(state_space_low, state_space_high, dtype=np.float32) player_act_space = work_env(env_name).action_space player_act_space.shape = (1, ) print(player_act_space) if inc_player_last_act: observer_obs_space = Box(np.append(state_space_low, 0), np.append(state_space_high, 1), dtype=np.float32) else: observer_obs_space = obs_space player_reward_shaping = player_reward_shaping_cartpole observer_reward_shaping = observer_reward_shaping_cartpole max_decor_steps = 20 b_size = 20 num_envs = 8 max_episode_length = np.inf player_model_kwargs = dict(hidden_sizes=[24], lstm_size=16, nonlinearity=torch.nn.ReLU, normalize_observation=False, norm_obs_clip=10, norm_obs_var_clip=1e-6) observer_model_kwargs = dict(hidden_sizes=[64], lstm_size=16, nonlinearity=torch.nn.ReLU, normalize_observation=False, norm_obs_clip=10, norm_obs_var_clip=1e-6) elif game == "hiv": work_env = wn.gym.make env_name = 'HIV-v0' cont_act = False state_space_low = np.asarray( [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]) state_space_high = np.asarray([ 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf ]) obs_space = Box(state_space_low, state_space_high, dtype=np.float32) player_act_space = work_env(env_name).action_space if inc_player_last_act: observer_obs_space = Box(np.append(state_space_low, 0), np.append(state_space_high, 3), dtype=np.float32) else: observer_obs_space = obs_space player_reward_shaping = player_reward_shaping_hiv observer_reward_shaping = observer_reward_shaping_hiv max_decor_steps = 10 b_size = 32 num_envs = 8 max_episode_length = 100 player_model_kwargs = dict(hidden_sizes=[32], lstm_size=16, nonlinearity=torch.nn.ReLU, normalize_observation=False, norm_obs_clip=10, norm_obs_var_clip=1e-6) observer_model_kwargs = dict(hidden_sizes=[64], lstm_size=16, nonlinearity=torch.nn.ReLU, normalize_observation=False, norm_obs_clip=10, norm_obs_var_clip=1e-6) elif game == "heparin": work_env = HeparinEnv env_name = 'Heparin-Simulator' cont_act = False state_space_low = np.asarray([ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 18728.926, 72.84662, 0.0, 0.0, 0.0, 0.0, 0.0 ]) state_space_high = np.asarray([ 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.7251439e+04, 1.0664291e+02, 200.0, 8.9383472e+02, 1.0025734e+02, 1.5770737e+01, 4.7767456e+01 ]) # state_space_low = np.asarray([0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18728.926,72.84662,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]) # state_space_high = np.asarray([1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.7251439e+04,1.0664291e+02,0.0000000e+00,8.9383472e+02,1.4476662e+02,1.3368750e+02,1.6815166e+02,1.0025734e+02,1.5770737e+01,4.7767456e+01,7.7194958e+00]) obs_space = 
Box(state_space_low, state_space_high, dtype=np.float32) player_act_space = work_env(env_name).action_space if inc_player_last_act: observer_obs_space = Box(np.append(state_space_low, 0), np.append(state_space_high, 4), dtype=np.float32) else: observer_obs_space = obs_space player_reward_shaping = player_reward_shaping_hep observer_reward_shaping = observer_reward_shaping_hep max_decor_steps = 3 b_size = 20 num_envs = 8 max_episode_length = 20 player_model_kwargs = dict(hidden_sizes=[32], lstm_size=16, nonlinearity=torch.nn.ReLU, normalize_observation=False, norm_obs_clip=10, norm_obs_var_clip=1e-6) observer_model_kwargs = dict(hidden_sizes=[128], lstm_size=16, nonlinearity=torch.nn.ReLU, normalize_observation=False, norm_obs_clip=10, norm_obs_var_clip=1e-6) elif game == "halfcheetah": assert not serial assert not one_agent work_env = gym.make env_name = 'HalfCheetah-v2' cont_act = True temp_env = work_env(env_name) state_space_low = np.concatenate([ np.zeros(temp_env.observation_space.low.shape), temp_env.observation_space.low ]) state_space_high = np.concatenate([ np.ones(temp_env.observation_space.high.shape), temp_env.observation_space.high ]) obs_space = Box(state_space_low, state_space_high, dtype=np.float32) player_act_space = temp_env.action_space if inc_player_last_act: observer_obs_space = Box(np.append(state_space_low, 0), np.append(state_space_high, 4), dtype=np.float32) else: observer_obs_space = obs_space player_reward_shaping = None observer_reward_shaping = None temp_env.close() max_decor_steps = 0 b_size = 20 num_envs = 8 max_episode_length = np.inf player_model_kwargs = dict(hidden_sizes=[256, 256]) observer_model_kwargs = dict(hidden_sizes=[256, 256]) player_q_model_kwargs = dict(hidden_sizes=[256, 256]) observer_q_model_kwargs = dict(hidden_sizes=[256, 256]) player_v_model_kwargs = dict(hidden_sizes=[256, 256]) observer_v_model_kwargs = dict(hidden_sizes=[256, 256]) if game == "halfcheetah": observer_act_space = Box( low=state_space_low[:int(len(state_space_low) / 2)], high=state_space_high[:int(len(state_space_high) / 2)]) else: if serial: n_serial = int(len(state_space_high) / 2) observer_act_space = Discrete(2) observer_act_space.shape = (1, ) else: if one_agent: observer_act_space = IntBox( low=0, high=player_act_space.n * int(2**int(len(state_space_high) / 2))) else: observer_act_space = IntBox(low=0, high=int(2**int( len(state_space_high) / 2))) affinity = dict(cuda_idx=cuda_idx, workers_cpus=list(range(n_parallel))) gpu_cpu = "CPU" if cuda_idx is None else f"GPU {cuda_idx}" if sample_mode == "serial": alt = False Sampler = SerialSampler # (Ignores workers_cpus.) if eval: eval_collector_cl = SerialEvalCollector else: eval_collector_cl = None print(f"Using serial sampler, {gpu_cpu} for sampling and optimizing.") elif sample_mode == "cpu": alt = False Sampler = CpuSampler if eval: eval_collector_cl = CpuEvalCollector else: eval_collector_cl = None print( f"Using CPU parallel sampler (agent in workers), {gpu_cpu} for optimizing." 
) env_kwargs = dict(work_env=work_env, env_name=env_name, obs_spaces=[obs_space, observer_obs_space], action_spaces=[player_act_space, observer_act_space], serial=serial, player_reward_shaping=player_reward_shaping, observer_reward_shaping=observer_reward_shaping, fully_obs=fully_obs, rand_obs=rand_obs, inc_player_last_act=inc_player_last_act, max_episode_length=max_episode_length, cont_act=cont_act) if eval: eval_env_kwargs = env_kwargs eval_max_steps = 1e4 num_eval_envs = num_envs else: eval_env_kwargs = None eval_max_steps = None num_eval_envs = 0 sampler = Sampler( EnvCls=CWTO_EnvWrapper, env_kwargs=env_kwargs, batch_T=b_size, batch_B=num_envs, max_decorrelation_steps=max_decor_steps, eval_n_envs=num_eval_envs, eval_CollectorCls=eval_collector_cl, eval_env_kwargs=eval_env_kwargs, eval_max_steps=eval_max_steps, ) if game == "halfcheetah": player_algo = SAC() observer_algo = SACBeta() player = SacAgent(ModelCls=PiMlpModel, QModelCls=QofMuMlpModel, model_kwargs=player_model_kwargs, q_model_kwargs=player_q_model_kwargs, v_model_kwargs=player_v_model_kwargs) observer = SacAgentBeta(ModelCls=PiMlpModelBeta, QModelCls=QofMuMlpModel, model_kwargs=observer_model_kwargs, q_model_kwargs=observer_q_model_kwargs, v_model_kwargs=observer_v_model_kwargs) else: player_model = CWTO_LstmModel observer_model = CWTO_LstmModel player_algo = PPO() observer_algo = PPO() player = CWTO_LstmAgent(ModelCls=player_model, model_kwargs=player_model_kwargs, initial_model_state_dict=None) observer = CWTO_LstmAgent(ModelCls=observer_model, model_kwargs=observer_model_kwargs, initial_model_state_dict=None) if one_agent: agent = CWTO_AgentWrapper(player, observer, serial=serial, n_serial=n_serial, alt=alt, train_mask=train_mask, one_agent=one_agent, nplayeract=player_act_space.n) else: agent = CWTO_AgentWrapper(player, observer, serial=serial, n_serial=n_serial, alt=alt, train_mask=train_mask) if eval: RunnerCl = MinibatchRlEval else: RunnerCl = MinibatchRl runner = RunnerCl(player_algo=player_algo, observer_algo=observer_algo, agent=agent, sampler=sampler, n_steps=n_steps, log_interval_steps=log_interval_steps, affinity=affinity, wandb_log=wandb_log, alt_train=alt_train) config = dict(domain=game) if game == "halfcheetah": name = "sac_" + game else: name = "ppo_" + game log_dir = os.getcwd() + "/cwto_logs/" + name with logger_context(log_dir, run_ID, name, config): runner.train() if save_models_to_wandb: agent.save_models_to_wandb() if eval_perf: eval_n_envs = 10 eval_envs = [CWTO_EnvWrapper(**env_kwargs) for _ in range(eval_n_envs)] set_envs_seeds(eval_envs, make_seed()) eval_collector = SerialEvalCollector(envs=eval_envs, agent=agent, TrajInfoCls=TrajInfo_obs, max_T=1000, max_trajectories=10, log_full_obs=True) traj_infos_player, traj_infos_observer = eval_collector.collect_evaluation( runner.get_n_itr()) observations = [] player_actions = [] returns = [] observer_actions = [] for traj in traj_infos_player: observations.append(np.array(traj.Observations)) player_actions.append(np.array(traj.Actions)) returns.append(traj.Return) for traj in traj_infos_observer: observer_actions.append( np.array([ obs_action_translator(act, eval_envs[0].power_vec, eval_envs[0].obs_size) for act in traj.Actions ])) # save results: open_obs = open('eval_observations.pkl', "wb") pickle.dump(observations, open_obs) open_obs.close() open_ret = open('eval_returns.pkl', "wb") pickle.dump(returns, open_ret) open_ret.close() open_pact = open('eval_player_actions.pkl', "wb") pickle.dump(player_actions, open_pact) open_pact.close() open_oact = 
open('eval_observer_actions.pkl', "wb") pickle.dump(observer_actions, open_oact) open_oact.close()
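# A small follow-up sketch for reading back the evaluation artifacts written above.
# The file names come from the script itself; the loader function and the summary
# print are illustrative additions, not part of the original code.
import pickle
import numpy as np

def load_eval_results():
    with open('eval_returns.pkl', 'rb') as f:
        returns = pickle.load(f)
    with open('eval_player_actions.pkl', 'rb') as f:
        player_actions = pickle.load(f)
    with open('eval_observer_actions.pkl', 'rb') as f:
        observer_actions = pickle.load(f)
    print(f"Episodes: {len(returns)}, mean return: {np.mean(returns):.2f}")
    return returns, player_actions, observer_actions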