def eval(self, n):
    self.agent.eval()

    start_time = time()

    if self.config['env.standardize']:
        eval_env = VecStandardize(venv=self.eval_env,
                                  use_obs=True,
                                  use_reward=False,  # do not process rewards, no training
                                  clip_obs=self.runner.env.clip_obs,
                                  clip_reward=self.runner.env.clip_reward,
                                  gamma=self.runner.env.gamma,
                                  eps=self.runner.env.eps,
                                  constant_obs_mean=self.runner.env.obs_runningavg.mu,
                                  constant_obs_std=self.runner.env.obs_runningavg.sigma)
    else:
        eval_env = self.eval_env  # no standardization: evaluate on the raw environment
    eval_runner = EpisodeRunner(self.config, self.agent, eval_env)
    T = eval_env.T
    D = eval_runner(T)

    eval_output = {}
    eval_output['D'] = D
    eval_output['n'] = n
    eval_output['T'] = T
    eval_output['num_sec'] = time() - start_time

    return eval_output
def test_get_wrapper(env_id):
    env = make_vec_env(SerialVecEnv, make_gym_env, env_id, 3, 0)
    env = VecStandardize(env)
    env = VecClipAction(env)

    out = get_wrapper(env, 'VecClipAction')
    assert out.__class__.__name__ == 'VecClipAction'
    del out

    out = get_wrapper(env, 'VecStandardize')
    assert out.__class__.__name__ == 'VecStandardize'
    del out

    out = get_wrapper(env, 'SerialVecEnv')
    assert out.__class__.__name__ == 'SerialVecEnv'
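# Hedged usage sketch building on the test above: get_wrapper looks a wrapper up by
# class name, which is convenient for reaching VecStandardize attributes from a fully
# wrapped environment. It assumes get_wrapper returns None when the wrapper is absent.
def obs_running_average(env):
    std_wrapper = get_wrapper(env, 'VecStandardize')
    if std_wrapper is None:
        return None
    # running mean and standard deviation used to normalize observations
    return std_wrapper.obs_runningavg.mu, std_wrapper.obs_runningavg.sigma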
def _prepare(self, config):
    self.env = make_vec_env(SerialVecEnv, make_gym_env, config['env.id'], config['train.N'], 0)
    self.env = VecClipAction(self.env)
    if config['env.standardize']:
        self.env = VecStandardize(self.env,
                                  use_obs=True,
                                  use_reward=False,
                                  clip_obs=10.0,
                                  clip_reward=10.0,
                                  gamma=0.99,
                                  eps=1e-08)
    self.env_spec = EnvSpec(self.env)

    self.device = torch.device('cpu')

    self.agent = Agent(config, self.env_spec, self.device)
def algorithm(config, seed, device):
    logdir = Path(config['log.dir']) / str(config['ID']) / str(seed)

    seeder = Seeder(seed)
    seeds = seeder(size=config['env.count'])
    env_constructors = []
    for seed in seeds:
        env_constructors.append(partial(CraftingEnv, seed))
    env = VecStandardize(SerialVecEnv(env_constructors), clip_reward=100.0)
    env_spec = EnvSpec(env)

    agent = Agent(config, env_spec, device)
    runner = RollingSegmentRunner(config, agent, env)
    engine = Engine(agent, runner, env)

    for i in range(config['train.iter']):
        training_result = engine.train(i)
        print(f'Training iteration {i} complete.')
        if i % config['log.interval'] == 0:
            logs = engine.log_train(training_result)
            pickle_dump(obj=logs, f=logdir / f'iter_{i}_train_logs', ext='.pkl')

    torch.save(engine.agent.policy.state_dict(), logdir / 'trained_params')
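# Hedged invocation sketch for algorithm() above. Only the config keys read directly in
# algorithm() are listed; Agent, RollingSegmentRunner and Engine are expected to read
# additional keys, so the values here are illustrative assumptions, not project defaults.
if __name__ == '__main__':
    import torch
    from pathlib import Path

    config = {'log.dir': 'logs',
              'ID': 0,
              'env.count': 4,      # number of parallel CraftingEnv instances
              'train.iter': 100,
              'log.interval': 10}
    # algorithm() writes into log.dir/ID/seed, so make sure that directory exists
    Path(config['log.dir'], str(config['ID']), '0').mkdir(parents=True, exist_ok=True)
    algorithm(config, seed=0, device=torch.device('cpu'))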
import torch

from dial_control_rl import agent
from dial_control_rl.env import CraftingEnv
from lagom.utils import Seeder
from lagom.envs.vec_env import SerialVecEnv, VecStandardize
from lagom.envs import EnvSpec
from functools import partial

env = CraftingEnv()

seeder = Seeder(0)
seeds = seeder(size=1)
env_constructors = []
for seed in seeds:
    env_constructors.append(partial(CraftingEnv, seed))
env = VecStandardize(SerialVecEnv(env_constructors), clip_reward=100.0)
env_spec = EnvSpec(env)

policy = agent.Policy({'algo.rl': 0}, env_spec, torch.device('cpu'))
# `params` is assumed to be a previously loaded state dict, e.g. the
# 'trained_params' file saved by algorithm() via torch.save(...)
policy.load_state_dict(params)
policy = policy.double()


def V(x):
    out = policy(torch.tensor(x), ['V'])
    return out['V'][0]


def Q(x):
    out = policy(torch.tensor(x), ['action_dist'])
    out = out['action_dist']
    return out  # assumed completion: hand back the action distribution
def __call__(self, config, seed, device):
    set_global_seeds(seed)
    logdir = Path(config['log.dir']) / str(config['ID']) / str(seed)

    if config['env.time_aware_obs']:
        kwargs = {'extra_wrapper': [TimeAwareObservation]}
    else:
        kwargs = {}
    env = make_vec_env(SerialVecEnv, make_gym_env, config['env.id'], config['train.N'], seed,
                       monitor=True, **kwargs)
    if config['eval.independent']:
        eval_env = make_vec_env(SerialVecEnv, make_gym_env, config['env.id'], config['eval.N'], seed)
    if config['env.clip_action']:
        env = VecClipAction(env)
        if config['eval.independent']:
            eval_env = VecClipAction(eval_env)
    if config['env.standardize']:  # running averages of observation and reward
        env = VecStandardize(venv=env,
                             use_obs=True,
                             use_reward=False,  # A2C specific
                             clip_obs=10.,
                             clip_reward=10.,
                             gamma=0.99,
                             eps=1e-8)
    env_spec = EnvSpec(env)

    agent = Agent(config, env_spec, device)
    runner = RollingSegmentRunner(config, agent, env)
    if config['eval.independent']:
        engine = Engine(agent, runner, config, eval_env=eval_env)
    else:
        engine = Engine(agent, runner, config)

    train_logs = []
    eval_logs = []
    for i in count():
        if 'train.iter' in config and i >= config['train.iter']:  # enough iterations
            break
        elif 'train.timestep' in config and agent.total_T >= config['train.timestep']:  # enough timesteps
            break

        train_output = engine.train(i)

        if i == 0 or (i + 1) % config['log.interval'] == 0:
            train_log = engine.log_train(train_output)
            train_logs.append(train_log)

            if config['eval.independent']:
                with torch.no_grad():  # disable grad, save memory
                    eval_output = engine.eval(n=i)
                    eval_log = engine.log_eval(eval_output)
                eval_logs.append(eval_log)

    pickle_dump(obj=train_logs, f=logdir / 'train_logs', ext='.pkl')
    pickle_dump(obj=eval_logs, f=logdir / 'eval_logs', ext='.pkl')

    return None
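# Hedged configuration sketch for the experiment entry point above. It only lists the
# keys read directly in __call__; Agent, RollingSegmentRunner and Engine will read
# further keys, and the values here are illustrative assumptions.
example_config = {'log.dir': 'logs',
                  'ID': 0,
                  'env.id': 'HalfCheetah-v2',   # illustrative environment id
                  'env.time_aware_obs': False,
                  'env.clip_action': True,
                  'env.standardize': True,
                  'train.N': 16,                # number of parallel training environments
                  'train.iter': 1000,           # alternatively, provide 'train.timestep'
                  'eval.independent': True,
                  'eval.N': 10,
                  'log.interval': 100}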
def __call__(self, config, seed, device_str):
    # Set random seeds
    set_global_seeds(seed)
    # Create device
    device = torch.device(device_str)
    # Use log dir for current job (run_experiment)
    logdir = Path(config['log.dir']) / str(config['ID']) / str(seed)

    # Make environment (VecEnv) for training and evaluating
    env = make_vec_env(vec_env_class=SerialVecEnv,
                       make_env=make_gym_env,
                       env_id=config['env.id'],
                       num_env=config['train.N'],  # batch size for multiple environments
                       init_seed=seed)
    eval_env = make_vec_env(vec_env_class=SerialVecEnv,
                            make_env=make_gym_env,
                            env_id=config['env.id'],
                            num_env=1,
                            init_seed=seed)
    if config['env.standardize']:  # wrap with VecStandardize for running averages of observation and rewards
        env = VecStandardize(venv=env,
                             use_obs=True,
                             use_reward=True,
                             clip_obs=10.,
                             clip_reward=10.,
                             gamma=0.99,
                             eps=1e-8)
        eval_env = VecStandardize(venv=eval_env,  # remember to synchronize running averages during evaluation !!!
                                  use_obs=True,
                                  use_reward=False,  # do not process rewards, no training
                                  clip_obs=env.clip_obs,
                                  clip_reward=env.clip_reward,
                                  gamma=env.gamma,
                                  eps=env.eps,
                                  constant_obs_mean=env.obs_runningavg.mu,  # use current running average as constant
                                  constant_obs_std=env.obs_runningavg.sigma)
    env_spec = EnvSpec(env)

    # Create policy
    network = Network(config=config, env_spec=env_spec)
    if env_spec.control_type == 'Discrete':
        policy = CategoricalPolicy(config=config,
                                   network=network,
                                   env_spec=env_spec,
                                   learn_V=True)
    elif env_spec.control_type == 'Continuous':
        policy = GaussianPolicy(config=config,
                                network=network,
                                env_spec=env_spec,
                                learn_V=True,
                                min_std=config['agent.min_std'],
                                std_style=config['agent.std_style'],
                                constant_std=config['agent.constant_std'],
                                std_state_dependent=config['agent.std_state_dependent'],
                                init_std=config['agent.init_std'])
    network = network.to(device)

    # Create optimizer and learning rate scheduler
    optimizer = optim.Adam(policy.network.parameters(), lr=config['algo.lr'])
    if config['algo.use_lr_scheduler']:
        if 'train.iter' in config:  # iteration-based training
            max_epoch = config['train.iter']
        elif 'train.timestep' in config:  # timestep-based training
            max_epoch = config['train.timestep'] + 1  # +1 to avoid 0.0 lr in final iteration
        lambda_f = lambda epoch: 1 - epoch / max_epoch  # decay learning rate for each training epoch
        lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_f)

    # Create agent
    kwargs = {'device': device}
    if config['algo.use_lr_scheduler']:
        kwargs['lr_scheduler'] = lr_scheduler
    agent = A2CAgent(config=config,
                     policy=policy,
                     optimizer=optimizer,
                     **kwargs)

    # Create runner
    runner = SegmentRunner(agent=agent, env=env, gamma=config['algo.gamma'])
    eval_runner = TrajectoryRunner(agent=agent, env=eval_env, gamma=1.0)

    # Create engine
    engine = Engine(agent=agent, runner=runner, config=config, eval_runner=eval_runner)

    # Training and evaluation
    train_logs = []
    eval_logs = []
    for i in count():  # incremental iteration
        if 'train.iter' in config and i >= config['train.iter']:  # enough iterations
            break
        elif 'train.timestep' in config and agent.total_T >= config['train.timestep']:  # enough timesteps
            break

        # train and evaluation
        train_output = engine.train(n=i)

        # logging
        if i == 0 or (i + 1) % config['log.record_interval'] == 0 or (i + 1) % config['log.print_interval'] == 0:
            train_log = engine.log_train(train_output)

            with torch.no_grad():  # disable grad, save memory
                eval_output = engine.eval(n=i)
                eval_log = engine.log_eval(eval_output)

            if i == 0 or (i + 1) % config['log.record_interval'] == 0:  # record loggings
                train_logs.append(train_log)
                eval_logs.append(eval_log)

    # Save all loggings
    pickle_dump(obj=train_logs, f=logdir / 'train_logs', ext='.pkl')
    pickle_dump(obj=eval_logs, f=logdir / 'eval_logs', ext='.pkl')

    return None
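# Hedged post-hoc sketch: reading the pickled logs back with the standard pickle module.
# It assumes pickle_dump(..., ext='.pkl') appends the extension to the given file name,
# and that `logdir` follows the log.dir/ID/seed layout used above.
import pickle
from pathlib import Path


def load_logs(logdir):
    with open(Path(logdir) / 'train_logs.pkl', 'rb') as f:
        train_logs = pickle.load(f)
    with open(Path(logdir) / 'eval_logs.pkl', 'rb') as f:
        eval_logs = pickle.load(f)
    return train_logs, eval_logs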
def __call__(self, config, seed, device_str):
    set_global_seeds(seed)
    device = torch.device(device_str)
    logdir = Path(config['log.dir']) / str(config['ID']) / str(seed)

    # Environment related
    env = make_vec_env(vec_env_class=SerialVecEnv,
                       make_env=make_gym_env,
                       env_id=config['env.id'],
                       num_env=config['train.N'],  # batched environment
                       init_seed=seed,
                       rolling=True)
    eval_env = make_vec_env(vec_env_class=SerialVecEnv,
                            make_env=make_gym_env,
                            env_id=config['env.id'],
                            num_env=config['eval.N'],
                            init_seed=seed,
                            rolling=False)
    if config['env.standardize']:  # running averages of observation and reward
        env = VecStandardize(venv=env,
                             use_obs=True,
                             use_reward=False,  # A2C
                             clip_obs=10.,
                             clip_reward=10.,
                             gamma=0.99,
                             eps=1e-8)
        eval_env = VecStandardize(venv=eval_env,  # remember to synchronize running averages during evaluation !!!
                                  use_obs=True,
                                  use_reward=False,  # do not process rewards, no training
                                  clip_obs=env.clip_obs,
                                  clip_reward=env.clip_reward,
                                  gamma=env.gamma,
                                  eps=env.eps,
                                  constant_obs_mean=env.obs_runningavg.mu,  # use current running average as constant
                                  constant_obs_std=env.obs_runningavg.sigma)
    env_spec = EnvSpec(env)

    # Network and policy
    if config['network.recurrent']:
        network = LSTM(config=config, device=device, env_spec=env_spec)
    else:
        network = Network(config=config, device=device, env_spec=env_spec)
    if env_spec.control_type == 'Discrete':
        policy = CategoricalPolicy(config=config,
                                   network=network,
                                   env_spec=env_spec,
                                   device=device,
                                   learn_V=True)
    elif env_spec.control_type == 'Continuous':
        policy = GaussianPolicy(config=config,
                                network=network,
                                env_spec=env_spec,
                                device=device,
                                learn_V=True,
                                min_std=config['agent.min_std'],
                                std_style=config['agent.std_style'],
                                constant_std=config['agent.constant_std'],
                                std_state_dependent=config['agent.std_state_dependent'],
                                init_std=config['agent.init_std'])

    # Optimizer and learning rate scheduler
    optimizer = optim.Adam(policy.network.parameters(), lr=config['algo.lr'])
    if config['algo.use_lr_scheduler']:
        if 'train.iter' in config:  # iteration-based
            max_epoch = config['train.iter']
        elif 'train.timestep' in config:  # timestep-based
            max_epoch = config['train.timestep'] + 1  # avoid zero lr in final iteration
        lambda_f = lambda epoch: 1 - epoch/max_epoch
        lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_f)

    # Agent
    kwargs = {'device': device}
    if config['algo.use_lr_scheduler']:
        kwargs['lr_scheduler'] = lr_scheduler
    agent = A2CAgent(config=config,
                     policy=policy,
                     optimizer=optimizer,
                     **kwargs)

    # Runner
    runner = SegmentRunner(agent=agent, env=env, gamma=config['algo.gamma'])
    eval_runner = TrajectoryRunner(agent=agent, env=eval_env, gamma=1.0)

    # Engine
    engine = Engine(agent=agent, runner=runner, config=config, eval_runner=eval_runner)

    # Training and evaluation
    train_logs = []
    eval_logs = []
    if config['network.recurrent']:
        rnn_states_buffer = agent.policy.rnn_states  # for SegmentRunner
    for i in count():
        if 'train.iter' in config and i >= config['train.iter']:  # enough iterations
            break
        elif 'train.timestep' in config and agent.total_T >= config['train.timestep']:  # enough timesteps
            break

        if config['network.recurrent']:
            if isinstance(rnn_states_buffer, list):  # LSTM: [h, c]
                rnn_states_buffer = [buf.detach() for buf in rnn_states_buffer]
            else:
                rnn_states_buffer = rnn_states_buffer.detach()
            agent.policy.rnn_states = rnn_states_buffer

        train_output = engine.train(n=i)

        # Logging
        if i == 0 or (i+1) % config['log.record_interval'] == 0 or (i+1) % config['log.print_interval'] == 0:
            train_log = engine.log_train(train_output)

            if config['network.recurrent']:
                rnn_states_buffer = agent.policy.rnn_states  # for SegmentRunner

            with torch.no_grad():  # disable grad, save memory
                eval_output = engine.eval(n=i)
                eval_log = engine.log_eval(eval_output)

            if i == 0 or (i+1) % config['log.record_interval'] == 0:
                train_logs.append(train_log)
                eval_logs.append(eval_log)

    # Save all loggings
    pickle_dump(obj=train_logs, f=logdir/'train_logs', ext='.pkl')
    pickle_dump(obj=eval_logs, f=logdir/'eval_logs', ext='.pkl')

    return None
def test_vec_standardize(self, vec_env_class):
    venv = make_vec_env(vec_env_class, make_gym_env, 'CartPole-v1', 5, 1, True)
    venv = VecStandardize(venv,
                          use_obs=True,
                          use_reward=True,
                          clip_obs=10.,
                          clip_reward=10.,
                          gamma=0.99,
                          eps=1e-8)
    assert isinstance(venv, VecEnvWrapper) and isinstance(venv, VecStandardize)
    obs = venv.reset()
    assert not np.allclose(venv.obs_runningavg.mu, 0.0)
    assert not np.allclose(venv.obs_runningavg.sigma, 0.0)

    a = [1] * 5
    [venv.step(a) for _ in range(20)]
    assert venv.obs_runningavg.N == 5 + 5 * 20
    assert venv.reward_runningavg.N == 5 * 20
    assert not np.allclose(venv.obs_runningavg.mu, 0.0)
    assert not np.allclose(venv.obs_runningavg.sigma, 0.0)

    running_avg = venv.running_averages
    assert isinstance(running_avg, dict)
    assert len(running_avg) == 2 and 'obs_avg' in running_avg and 'r_avg' in running_avg
    assert 'mu' in running_avg['obs_avg'] and 'sigma' in running_avg['obs_avg']
    assert not np.allclose(running_avg['obs_avg']['mu'], 0.0)
    assert not np.allclose(running_avg['obs_avg']['sigma'], 0.0)
    assert 'mu' not in running_avg['r_avg']
    assert 'sigma' in running_avg['r_avg']
    assert not np.allclose(running_avg['r_avg']['sigma'], 0.0)

    del venv, obs, a

    # other settings: clipping
    venv = make_vec_env(vec_env_class, make_gym_env, 'CartPole-v1', 5, 1, True)
    venv = VecStandardize(venv,
                          use_obs=True,
                          use_reward=True,
                          clip_obs=0.01,
                          clip_reward=0.0001,
                          gamma=0.99,
                          eps=1e-8)
    obs = venv.reset()
    assert np.allclose(np.abs(np.asarray(obs)), 0.01)

    running_avg = venv.running_averages
    assert isinstance(running_avg, dict)
    assert len(running_avg) == 2 and 'obs_avg' in running_avg and 'r_avg' in running_avg
    assert 'mu' in running_avg['obs_avg'] and 'sigma' in running_avg['obs_avg']
    assert not np.allclose(running_avg['obs_avg']['mu'], 0.0)
    assert not np.allclose(running_avg['obs_avg']['sigma'], 0.0)
    assert 'mu' not in running_avg['r_avg']
    assert 'sigma' in running_avg['r_avg']
    assert running_avg['r_avg']['sigma'] is None

    a = [1] * 5
    obs, rewards, _, _ = venv.step(a)
    assert rewards.max() == 0.0001

    del venv, obs, a

    # other settings: turn off use_obs
    venv = make_vec_env(vec_env_class, make_gym_env, 'CartPole-v1', 5, 1, True)
    venv = VecStandardize(venv,
                          use_obs=False,
                          use_reward=False,
                          clip_obs=0.001,
                          clip_reward=0.0001,
                          gamma=0.99,
                          eps=1e-8)
    obs = venv.reset()
    assert np.asarray(obs).max() > 0.001

    a = [1] * 5
    obs, rewards, _, _ = venv.step(a)
    assert np.asarray(rewards).max() >= 0.0001

    del venv, obs, a

    # other settings: gamma
    venv = make_vec_env(vec_env_class, make_gym_env, 'CartPole-v1', 5, 1, True)
    with pytest.raises(AssertionError):
        venv = VecStandardize(venv,
                              use_obs=False,
                              use_reward=False,
                              clip_obs=0.001,
                              clip_reward=0.0001,
                              gamma=1.0,  # not allowed
                              eps=1e-8)

    del venv

    # other settings: constant value
    venv = make_vec_env(vec_env_class, make_gym_env, 'CartPole-v1', 5, 1, True)
    venv = VecStandardize(venv,
                          use_obs=True,
                          use_reward=True,
                          clip_obs=10.,
                          clip_reward=10.,
                          gamma=0.99,
                          eps=1e-8,
                          constant_obs_mean=np.array([5.] * 4),
                          constant_obs_std=np.array([1.] * 4),
                          constant_reward_std=np.array(1000))
    obs = venv.reset()
    assert obs.min() < -4.0

    a = [1] * 5
    obs, rewards, _, _ = venv.step(a)
    assert rewards.min() <= 0.01
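# Hedged sketch of the observation scaling the clipping assertions above rely on,
# assuming VecStandardize standardizes with the running (or constant) mean/std and
# then clips; this mirrors the behavior the test checks, not the library internals.
import numpy as np


def standardize_obs(obs, mu, sigma, clip_obs, eps=1e-8):
    # (obs - running mean) / (running std + eps), clipped to [-clip_obs, clip_obs]
    return np.clip((np.asarray(obs) - mu) / (sigma + eps), -clip_obs, clip_obs)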