def test_trajectory_runner(env_id):
    env = make_vec_env(SerialVecEnv, make_gym_env, env_id, 3, 0, False)
    env_spec = EnvSpec(env)

    agent = RandomAgent(None, env_spec)
    runner = TrajectoryRunner(None, agent, env)

    D = runner(4)
    assert len(D) == 3
    assert all([isinstance(d, Trajectory) for d in D])
    assert all([d.T == 4 for d in D])

    # Check if s in transition is equal to s_next in previous transition
    for d in D:
        for s1, s2 in zip(d.transitions[:-1], d.transitions[1:]):
            assert np.allclose(s1.s_next, s2.s)

    # Long horizon
    D = runner(T=1000)
    for d in D:
        if d.T < 1000:
            assert d.all_done[-1] == True

    with pytest.raises(AssertionError):
        env = make_vec_env(SerialVecEnv, make_gym_env, env_id, 3, 0, True)
        TrajectoryRunner(None, agent, env)
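# A minimal hedged sketch (not a test) of collecting and inspecting trajectories with the
# same positional API exercised above; 'CartPole-v1' is a placeholder env id, and the
# imports are assumed to match those of this test module.
def _rollout_example():
    env = make_vec_env(SerialVecEnv, make_gym_env, 'CartPole-v1', 2, 0, False)  # 2 envs, seed 0, rolling=False
    agent = RandomAgent(None, EnvSpec(env))
    runner = TrajectoryRunner(None, agent, env)
    D = runner(T=20)  # one Trajectory per sub-environment
    for d in D:
        print(d.T, d.all_done[-1])  # trajectory length (<= 20) and whether it ended in a terminal state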
def check(agent_name, env_name):
    # Create environment
    list_make_env = make_envs(make_env=make_gym_env, env_id=env_name, num_env=1, init_seed=0)
    env = SerialVecEnv(list_make_env=list_make_env)
    env_spec = EnvSpec(env)

    # Create agent
    if agent_name == 'random':
        agent = RandomAgent(env_spec=env_spec, config=None)
    elif agent_name == 'agent1':
        agent = Agent1(config=None)
    elif agent_name == 'agent2':
        agent = Agent2(config=None)
    else:
        raise ValueError('Wrong agent name')

    # Test: more than one environment is not allowed for TrajectoryRunner
    with pytest.raises(AssertionError):
        list_make_env2 = make_envs(make_env=make_gym_env, env_id=env_name, num_env=2, init_seed=0)
        env2 = SerialVecEnv(list_make_env=list_make_env2)
        runner2 = TrajectoryRunner(agent=agent, env=env2, gamma=1.0)

    # Create runner
    runner = TrajectoryRunner(agent=agent, env=env, gamma=1.0)

    # Small batch
    D = runner(N=3, T=4)
    assert len(D) == 3
    assert all([isinstance(d, Trajectory) for d in D])
    assert all([d.T == 4 for d in D])
    assert all([d.gamma == 1.0 for d in D])

    # Check additional information
    for d in D:
        for t in d.transitions:
            if agent_name != 'random':
                assert 'action_logprob' in t.info

    # Check if s in transition is equal to s_next in previous transition
    for d in D:
        for s1, s2 in zip(d.transitions[:-1], d.transitions[1:]):
            assert np.allclose(s1.s_next, s2.s)

    # Long horizon
    D = runner(N=3, T=1000)
    for d in D:
        if d.T < 1000:
            assert d.all_done[-1] == True
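# A hedged sketch of how check() might be wired into pytest via parametrization over the
# agent names handled above; the Gym ids are placeholders, not taken from this module.
@pytest.mark.parametrize('env_name', ['CartPole-v1', 'Pendulum-v0'])
@pytest.mark.parametrize('agent_name', ['random', 'agent1', 'agent2'])
def test_check(agent_name, env_name):
    check(agent_name=agent_name, env_name=env_name)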
def f(self, solution, seed, config):
    if self.agent is None:
        self.init(seed, config)

    # Load the solution parameters into the agent's internal network
    msg = f'expected {self.network.num_params}, got {np.asarray(solution).size}'
    assert np.asarray(solution).size == self.network.num_params, msg
    self.agent.policy.network.from_vec(torch.from_numpy(solution).float())

    # Seed the environment
    self.env.list_env[0].seed(seed)

    # Create runner
    runner = TrajectoryRunner(agent=self.agent, env=self.env, gamma=1.0)

    # Take rollouts and compute the mean (undiscounted) return
    with torch.no_grad():
        D = runner(N=config['train.N'], T=config['train.T'])
    mean_return = np.mean([sum(trajectory.all_r) for trajectory in D])

    # Negate the return to obtain the objective value, because ES performs minimization
    function_value = -mean_return

    return function_value
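# A hedged illustration (hypothetical `worker` object and helper, not part of this code base)
# of how the negated-return objective above could be minimized by a naive Gaussian random
# search; a real setup would hand f to an ES master (e.g. CMA-ES) instead.
import numpy as np

def _random_search(worker, num_params, config, iters=10, popsize=8, sigma=0.1, seed=0):
    rng = np.random.RandomState(seed)
    best_x, best_f = np.zeros(num_params), float('inf')
    for _ in range(iters):
        candidates = best_x + sigma*rng.randn(popsize, num_params)  # perturb current best solution
        values = [worker.f(x, seed=seed, config=config) for x in candidates]  # negated mean returns
        i = int(np.argmin(values))
        if values[i] < best_f:
            best_x, best_f = candidates[i], values[i]
    return best_x, -best_f  # best parameters and their (positive) mean return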
def eval(self, n):
    # Set the network to evaluation mode
    self.agent.policy.network.eval()

    # Create a new instance of the environment
    env = make_gym_env(env_id=self.config['env:id'],
                       seed=self.config['seed'],
                       monitor=False,
                       monitor_dir=None)

    # Create a TrajectoryRunner
    runner = TrajectoryRunner(agent=self.agent, env=env, gamma=self.config['algo:gamma'])

    # Evaluate the agent over a set of trajectories
    D = runner(N=self.config['eval:N'], T=self.config['eval:T'])

    # Return evaluation output
    eval_output = {}
    eval_output['D'] = D
    eval_output['n'] = n

    return eval_output
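# A small hedged sketch of how the evaluation output above might be summarized; the
# per-trajectory return follows the same `sum(trajectory.all_r)` convention used by the ES
# objective in this code base, and numpy is assumed imported as np as elsewhere.
import numpy as np

def summarize_eval(eval_output):
    returns = [sum(trajectory.all_r) for trajectory in eval_output['D']]
    return {'n': eval_output['n'],
            'mean_return': float(np.mean(returns)),
            'std_return': float(np.std(returns)),
            'num_trajectories': len(returns)}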
def __call__(self, config, seed, device_str):
    # Set random seeds
    set_global_seeds(seed)
    # Create device
    device = torch.device(device_str)
    # Use log dir for current job (run_experiment)
    logdir = Path(config['log.dir']) / str(config['ID']) / str(seed)

    # Make environments (VecEnv) for training and evaluation
    env = make_vec_env(vec_env_class=SerialVecEnv,
                       make_env=make_gym_env,
                       env_id=config['env.id'],
                       num_env=config['train.N'],  # batch size for multiple environments
                       init_seed=seed)
    eval_env = make_vec_env(vec_env_class=SerialVecEnv,
                            make_env=make_gym_env,
                            env_id=config['env.id'],
                            num_env=1,
                            init_seed=seed)
    if config['env.standardize']:  # wrap with VecStandardize for running averages of observations and rewards
        env = VecStandardize(venv=env,
                             use_obs=True,
                             use_reward=True,
                             clip_obs=10.,
                             clip_reward=10.,
                             gamma=0.99,
                             eps=1e-8)
        eval_env = VecStandardize(venv=eval_env,  # remember to synchronize running averages during evaluation !!!
                                  use_obs=True,
                                  use_reward=False,  # do not process rewards, no training
                                  clip_obs=env.clip_obs,
                                  clip_reward=env.clip_reward,
                                  gamma=env.gamma,
                                  eps=env.eps,
                                  constant_obs_mean=env.obs_runningavg.mu,  # use current running average as constant
                                  constant_obs_std=env.obs_runningavg.sigma)
    env_spec = EnvSpec(env)

    # Create policy
    network = Network(config=config, env_spec=env_spec)
    if env_spec.control_type == 'Discrete':
        policy = CategoricalPolicy(config=config, network=network, env_spec=env_spec, learn_V=True)
    elif env_spec.control_type == 'Continuous':
        policy = GaussianPolicy(config=config,
                                network=network,
                                env_spec=env_spec,
                                learn_V=True,
                                min_std=config['agent.min_std'],
                                std_style=config['agent.std_style'],
                                constant_std=config['agent.constant_std'],
                                std_state_dependent=config['agent.std_state_dependent'],
                                init_std=config['agent.init_std'])
    network = network.to(device)

    # Create optimizer and learning rate scheduler
    optimizer = optim.Adam(policy.network.parameters(), lr=config['algo.lr'])
    if config['algo.use_lr_scheduler']:
        if 'train.iter' in config:  # iteration-based training
            max_epoch = config['train.iter']
        elif 'train.timestep' in config:  # timestep-based training
            max_epoch = config['train.timestep'] + 1  # +1 to avoid 0.0 lr in final iteration
        lambda_f = lambda epoch: 1 - epoch/max_epoch  # decay learning rate for each training epoch
        lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_f)

    # Create agent
    kwargs = {'device': device}
    if config['algo.use_lr_scheduler']:
        kwargs['lr_scheduler'] = lr_scheduler
    agent = A2CAgent(config=config, policy=policy, optimizer=optimizer, **kwargs)

    # Create runner
    runner = SegmentRunner(agent=agent, env=env, gamma=config['algo.gamma'])
    eval_runner = TrajectoryRunner(agent=agent, env=eval_env, gamma=1.0)

    # Create engine
    engine = Engine(agent=agent, runner=runner, config=config, eval_runner=eval_runner)

    # Training and evaluation
    train_logs = []
    eval_logs = []
    for i in count():  # incremental iteration
        if 'train.iter' in config and i >= config['train.iter']:  # enough iterations
            break
        elif 'train.timestep' in config and agent.total_T >= config['train.timestep']:  # enough timesteps
            break

        # Train for one iteration
        train_output = engine.train(n=i)

        # Logging
        if i == 0 or (i+1) % config['log.record_interval'] == 0 or (i+1) % config['log.print_interval'] == 0:
            train_log = engine.log_train(train_output)

            with torch.no_grad():  # disable grad, save memory
                eval_output = engine.eval(n=i)
            eval_log = engine.log_eval(eval_output)

            if i == 0 or (i+1) % config['log.record_interval'] == 0:  # record loggings
                train_logs.append(train_log)
                eval_logs.append(eval_log)

    # Save all loggings
    pickle_dump(obj=train_logs, f=logdir/'train_logs', ext='.pkl')
    pickle_dump(obj=eval_logs, f=logdir/'eval_logs', ext='.pkl')

    return None
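# A standalone hedged sketch (plain PyTorch, no dependency on the classes above) of the
# linear learning-rate decay configured in __call__: LambdaLR multiplies the base lr by
# 1 - epoch/max_epoch at every scheduler step. base_lr and max_epoch are arbitrary
# placeholder values, not taken from any config in this code base.
import torch
import torch.optim as optim

def _lr_decay_demo(base_lr=7e-4, max_epoch=100):
    params = [torch.nn.Parameter(torch.zeros(1))]  # dummy parameter, no gradients needed
    optimizer = optim.Adam(params, lr=base_lr)
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda epoch: 1 - epoch/max_epoch)
    lrs = []
    for _ in range(max_epoch):
        lrs.append(optimizer.param_groups[0]['lr'])  # current lr before this epoch's decay
        optimizer.step()
        scheduler.step()
    return lrs[0], lrs[max_epoch//2], lrs[-1]  # ~base_lr, ~base_lr/2, ~base_lr/max_epoch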
def __call__(self, config, seed, device_str):
    set_global_seeds(seed)
    device = torch.device(device_str)
    logdir = Path(config['log.dir']) / str(config['ID']) / str(seed)

    # Environment related
    env = make_vec_env(vec_env_class=SerialVecEnv,
                       make_env=make_gym_env,
                       env_id=config['env.id'],
                       num_env=config['train.N'],  # batched environment
                       init_seed=seed,
                       rolling=True)
    eval_env = make_vec_env(vec_env_class=SerialVecEnv,
                            make_env=make_gym_env,
                            env_id=config['env.id'],
                            num_env=config['eval.N'],
                            init_seed=seed,
                            rolling=False)
    if config['env.standardize']:  # running averages of observation and reward
        env = VecStandardize(venv=env,
                             use_obs=True,
                             use_reward=False,  # A2C
                             clip_obs=10.,
                             clip_reward=10.,
                             gamma=0.99,
                             eps=1e-8)
        eval_env = VecStandardize(venv=eval_env,  # remember to synchronize running averages during evaluation !!!
                                  use_obs=True,
                                  use_reward=False,  # do not process rewards, no training
                                  clip_obs=env.clip_obs,
                                  clip_reward=env.clip_reward,
                                  gamma=env.gamma,
                                  eps=env.eps,
                                  constant_obs_mean=env.obs_runningavg.mu,  # use current running average as constant
                                  constant_obs_std=env.obs_runningavg.sigma)
    env_spec = EnvSpec(env)

    # Network and policy
    if config['network.recurrent']:
        network = LSTM(config=config, device=device, env_spec=env_spec)
    else:
        network = Network(config=config, device=device, env_spec=env_spec)
    if env_spec.control_type == 'Discrete':
        policy = CategoricalPolicy(config=config,
                                   network=network,
                                   env_spec=env_spec,
                                   device=device,
                                   learn_V=True)
    elif env_spec.control_type == 'Continuous':
        policy = GaussianPolicy(config=config,
                                network=network,
                                env_spec=env_spec,
                                device=device,
                                learn_V=True,
                                min_std=config['agent.min_std'],
                                std_style=config['agent.std_style'],
                                constant_std=config['agent.constant_std'],
                                std_state_dependent=config['agent.std_state_dependent'],
                                init_std=config['agent.init_std'])

    # Optimizer and learning rate scheduler
    optimizer = optim.Adam(policy.network.parameters(), lr=config['algo.lr'])
    if config['algo.use_lr_scheduler']:
        if 'train.iter' in config:  # iteration-based
            max_epoch = config['train.iter']
        elif 'train.timestep' in config:  # timestep-based
            max_epoch = config['train.timestep'] + 1  # avoid zero lr in final iteration
        lambda_f = lambda epoch: 1 - epoch/max_epoch
        lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_f)

    # Agent
    kwargs = {'device': device}
    if config['algo.use_lr_scheduler']:
        kwargs['lr_scheduler'] = lr_scheduler
    agent = A2CAgent(config=config, policy=policy, optimizer=optimizer, **kwargs)

    # Runner
    runner = SegmentRunner(agent=agent, env=env, gamma=config['algo.gamma'])
    eval_runner = TrajectoryRunner(agent=agent, env=eval_env, gamma=1.0)

    # Engine
    engine = Engine(agent=agent, runner=runner, config=config, eval_runner=eval_runner)

    # Training and evaluation
    train_logs = []
    eval_logs = []
    if config['network.recurrent']:
        rnn_states_buffer = agent.policy.rnn_states  # for SegmentRunner
    for i in count():
        if 'train.iter' in config and i >= config['train.iter']:  # enough iterations
            break
        elif 'train.timestep' in config and agent.total_T >= config['train.timestep']:  # enough timesteps
            break

        if config['network.recurrent']:
            if isinstance(rnn_states_buffer, list):  # LSTM: [h, c]
                rnn_states_buffer = [buf.detach() for buf in rnn_states_buffer]
            else:
                rnn_states_buffer = rnn_states_buffer.detach()
            agent.policy.rnn_states = rnn_states_buffer

        train_output = engine.train(n=i)

        # Logging
        if i == 0 or (i+1) % config['log.record_interval'] == 0 or (i+1) % config['log.print_interval'] == 0:
            train_log = engine.log_train(train_output)

            if config['network.recurrent']:
                rnn_states_buffer = agent.policy.rnn_states  # for SegmentRunner
            with torch.no_grad():  # disable grad, save memory
                eval_output = engine.eval(n=i)
            eval_log = engine.log_eval(eval_output)

            if i == 0 or (i+1) % config['log.record_interval'] == 0:
                train_logs.append(train_log)
                eval_logs.append(eval_log)

    # Save all loggings
    pickle_dump(obj=train_logs, f=logdir/'train_logs', ext='.pkl')
    pickle_dump(obj=eval_logs, f=logdir/'eval_logs', ext='.pkl')

    return None
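# A standalone hedged sketch (plain PyTorch, placeholder sizes) of why the recurrent branch
# above detaches agent.policy.rnn_states between iterations: detaching truncates
# backpropagation through time at segment boundaries, so the graph of earlier segments can
# be freed while the hidden state values still carry over.
import torch
import torch.nn as nn

def _tbptt_demo(num_segments=3, seg_len=5):
    lstm = nn.LSTMCell(input_size=4, hidden_size=8)
    h, c = torch.zeros(1, 8), torch.zeros(1, 8)
    for _ in range(num_segments):
        h, c = h.detach(), c.detach()  # same role as the [h, c] detach in the loop above
        loss = 0.
        for _ in range(seg_len):
            h, c = lstm(torch.randn(1, 4), (h, c))
            loss = loss + h.pow(2).mean()
        loss.backward()  # gradients flow only through the current segment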
def __call__(self, config):
    # Set random seeds: PyTorch, numpy.random, random
    set_global_seeds(seed=config['seed'])

    # Create an environment
    env = make_gym_env(env_id=config['env:id'], seed=config['seed'], monitor=False, monitor_dir=None)
    # Create environment specification
    env_spec = EnvSpec(env)

    # Create device
    torch.cuda.set_device(config['cuda_id'])
    device = torch.device(f'cuda:{config["cuda_id"]}' if config['cuda'] else 'cpu')

    # Create policy
    network = MLP(config=config).to(device)
    policy = CategoricalPolicy(network=network, env_spec=env_spec)

    # Create optimizer
    optimizer = optim.Adam(policy.network.parameters(), lr=config['algo:lr'])
    # Create learning rate scheduler
    if config['algo:use_lr_scheduler']:
        max_epoch = config['train:iter']  # maximum number of lr decays; note where lr_scheduler.step() is called
        lambda_f = lambda epoch: 1 - epoch/max_epoch  # decay learning rate for each training epoch
        lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_f)

    # Create agent
    kwargs = {'device': device}
    if config['algo:use_lr_scheduler']:
        kwargs['lr_scheduler'] = lr_scheduler
    agent = ActorCriticAgent(policy=policy, optimizer=optimizer, config=config, **kwargs)

    # Create runner
    runner = TrajectoryRunner(agent=agent, env=env, gamma=config['algo:gamma'])

    # Create engine
    engine = Engine(agent=agent, runner=runner, config=config, logger=None)

    # Training and evaluation
    train_logs = []
    eval_logs = []
    for i in range(config['train:iter']):
        train_output = engine.train(i)

        # Logging and evaluation
        if i == 0 or (i+1) % config['log:interval'] == 0:
            # Log training and record the loggings
            train_logger = engine.log_train(train_output)
            train_logs.append(train_logger.logs)

            # Log evaluation and record the loggings
            eval_output = engine.eval(i)
            eval_logger = engine.log_eval(eval_output)
            eval_logs.append(eval_logger.logs)

    # Save the loggings
    np.save(Path(config['log:dir']) / str(config['ID']) / 'train', train_logs)
    np.save(Path(config['log:dir']) / str(config['ID']) / 'eval', eval_logs)

    return None
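# A hedged usage sketch (hypothetical helper) for reading the arrays saved above: np.save
# pickles the list of log dicts, so loading requires allow_pickle=True, and the path mirrors
# the config['log:dir'] / str(ID) layout used in __call__.
import numpy as np
from pathlib import Path

def load_logs(log_dir, ID):
    train_logs = np.load(Path(log_dir) / str(ID) / 'train.npy', allow_pickle=True)
    eval_logs = np.load(Path(log_dir) / str(ID) / 'eval.npy', allow_pickle=True)
    return list(train_logs), list(eval_logs)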