def check(agent_name, env_name):
    # Create environment
    list_make_env = make_envs(make_env=make_gym_env, env_id=env_name, num_env=1, init_seed=0)
    env = SerialVecEnv(list_make_env=list_make_env)
    env_spec = EnvSpec(env)

    # Create agent
    if agent_name == 'random':
        agent = RandomAgent(env_spec=env_spec, config=None)
    elif agent_name == 'agent1':
        agent = Agent1(config=None)
    elif agent_name == 'agent2':
        agent = Agent2(config=None)
    else:
        raise ValueError('Wrong agent name')

    # Test: not allowed more than one environment for TrajectoryRunner
    with pytest.raises(AssertionError):
        list_make_env2 = make_envs(make_env=make_gym_env, env_id=env_name, num_env=2, init_seed=0)
        env2 = SerialVecEnv(list_make_env=list_make_env2)
        runner2 = TrajectoryRunner(agent=agent, env=env2, gamma=1.0)

    # Create runner
    runner = TrajectoryRunner(agent=agent, env=env, gamma=1.0)

    # Small batch
    D = runner(N=3, T=4)
    assert len(D) == 3
    assert all([isinstance(d, Trajectory) for d in D])
    assert all([d.T == 4 for d in D])
    assert all([d.gamma == 1.0 for d in D])

    # Check additional information
    for d in D:
        for t in d.transitions:
            if agent_name != 'random':
                assert 'action_logprob' in t.info

    # Check if s in transition is equal to s_next in previous transition
    for d in D:
        for s1, s2 in zip(d.transitions[:-1], d.transitions[1:]):
            assert np.allclose(s1.s_next, s2.s)

    # Long horizon
    D = runner(N=3, T=1000)
    for d in D:
        if d.T < 1000:
            assert d.all_done[-1] == True
def check(agent_name, env_name):
    # Create environment
    list_make_env = make_envs(make_env=make_gym_env, env_id=env_name, num_env=2, init_seed=0)
    env = SerialVecEnv(list_make_env=list_make_env)
    env_spec = EnvSpec(env)
    assert env.num_env == 2

    # Create agent
    if agent_name == 'random':
        agent = RandomAgent(env_spec=env_spec, config=None)
    elif agent_name == 'agent1':
        agent = Agent1(config=None)
    elif agent_name == 'agent2':
        agent = Agent2(config=None)
    else:
        raise ValueError('Wrong agent name')

    # Create runner
    runner = SegmentRunner(agent=agent, env=env, gamma=1.0)

    # Small batch
    D = runner(T=3, reset=False)
    assert len(D) == 2
    assert all([isinstance(d, Segment) for d in D])
    assert all([d.T == 3 for d in D])
    assert all([d.gamma == 1.0 for d in D])

    # Check additional information
    for d in D:
        for t in d.transitions:
            if agent_name != 'random':
                assert 'action_logprob' in t.info

    # Check if s in transition is equal to s_next in previous transition
    for d in D:
        for s1, s2 in zip(d.transitions[:-1], d.transitions[1:]):
            assert np.allclose(s1.s_next, s2.s)

    # Take one more step to test the rolling effect, i.e. the first state of the new
    # segment should equal the last s_next of the previous batch D
    D2 = runner(T=1, reset=False)
    assert len(D2) == 2
    assert all([d.T == 1 for d in D2])
    for d, d2 in zip(D, D2):
        assert np.allclose(d2.all_s[0][0], d.transitions[-1].s_next)

    # Long horizon
    D = runner(T=200, reset=True)
    # Segments should all have the same number of time steps
    assert all([d.T == 200 for d in D])
    # For CartPole, within 200 time steps there should be at least one done=True
    if env_name == 'CartPole-v1':
        assert any([True in d.all_done for d in D])
        assert all([len(d.trajectories) > 1 for d in D])
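# A minimal sketch (not from the source) of how the check helper above could be driven
# by pytest. The agent names mirror the ones accepted by check; the choice of
# 'CartPole-v1' and the exact test matrix are assumptions for illustration only.
import pytest


@pytest.mark.parametrize('agent_name', ['random', 'agent1', 'agent2'])
def test_segment_runner(agent_name):
    check(agent_name, 'CartPole-v1')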
def make_env_spec(self):
    list_make_env = make_envs(make_env=make_gym_env, env_id='CartPole-v1', num_env=3, init_seed=0)
    venv = SerialVecEnv(list_make_env=list_make_env, rolling=True)
    env_spec = EnvSpec(venv)
    return env_spec
def make_env_spec(self):
    list_make_env = make_envs(make_env=make_gym_env, env_id='Pendulum-v0', num_env=1, init_seed=0)
    venv = SerialVecEnv(list_make_env=list_make_env)
    env_spec = EnvSpec(venv)
    return env_spec
def test_make_envs():
    list_make_env = make_envs(make_env=make_gym_env, env_id='Pendulum-v0', num_env=3, init_seed=1)
    assert len(list_make_env) == 3
    assert list_make_env[0] != list_make_env[1] and list_make_env[0] != list_make_env[2]

    # Test if the seedings are correct
    seeder = Seeder(init_seed=1)
    seeds = seeder(3)
    for make_env, seed in zip(list_make_env, seeds):
        assert make_env.keywords['seed'] == seed
    env = list_make_env[0]()
    raw_env = gym.make('Pendulum-v0')
    raw_env.seed(seeds[0])
    assert np.allclose(env.reset(), raw_env.reset())
def __call__(self, config):
    # Set random seeds: PyTorch, numpy.random, random
    set_global_seeds(seed=config['seed'])

    # Make a list of make_env functions
    list_make_env = make_envs(make_env=make_gym_env,
                              env_id=config['env:id'],
                              num_env=config['train:N'],
                              init_seed=config['seed'] * 2)
    # Create vectorized environment
    env = SerialVecEnv(list_make_env=list_make_env)
    # Create environment specification
    env_spec = EnvSpec(env)

    # Create device
    device = torch.device(f'cuda:{config["cuda_id"]}' if config['cuda'] else 'cpu')

    # Create policy
    network = MLP(config=config).to(device)
    policy = CategoricalPolicy(network=network, env_spec=env_spec)

    # Create optimizer
    optimizer = optim.Adam(policy.network.parameters(), lr=config['algo:lr'])
    # Create learning rate scheduler
    if config['algo:use_lr_scheduler']:
        # Maximum number of lr decay steps; note where lr_scheduler.step() is called
        max_epoch = config['train:iter']
        # Decay the learning rate linearly over the training epochs
        lambda_f = lambda epoch: 1 - epoch / max_epoch
        lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_f)

    # Create agent
    kwargs = {'device': device}
    if config['algo:use_lr_scheduler']:
        kwargs['lr_scheduler'] = lr_scheduler
    agent = A2CAgent(policy=policy, optimizer=optimizer, config=config, **kwargs)

    # Create runner
    runner = SegmentRunner(agent=agent, env=env, gamma=config['algo:gamma'])

    # Create engine
    engine = Engine(agent=agent, runner=runner, config=config, logger=None)

    # Training and evaluation
    train_logs = []
    eval_logs = []
    for i in range(config['train:iter']):
        train_output = engine.train(i)

        # Logging and evaluation
        if i == 0 or (i + 1) % config['log:interval'] == 0:
            # Log training and record the loggings
            train_logger = engine.log_train(train_output)
            train_logs.append(train_logger.logs)
            # Log evaluation and record the loggings
            eval_output = engine.eval(i)
            eval_logger = engine.log_eval(eval_output)
            eval_logs.append(eval_logger.logs)

    # Save the loggings
    np.save(Path(config['log:dir']) / str(config['ID']) / 'train', train_logs)
    np.save(Path(config['log:dir']) / str(config['ID']) / 'eval', eval_logs)

    return None
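# A hypothetical example of the config dictionary consumed by __call__ above. The keys
# are exactly the ones read in that function; the concrete values are illustrative
# assumptions, and MLP, A2CAgent and Engine may read additional keys not shown here.
example_config = {
    'ID': 0,                      # experiment identifier, used in the log path
    'seed': 1,                    # global random seed
    'cuda': False,                # whether to run on GPU
    'cuda_id': 0,                 # GPU index, used only if cuda is True
    'env:id': 'CartPole-v1',      # Gym environment id
    'train:N': 16,                # number of parallel environments for SegmentRunner
    'train:iter': 100,            # number of training iterations
    'log:interval': 10,           # log/evaluate every this many iterations
    'log:dir': 'logs',            # root directory for saved loggings
    'algo:lr': 1e-3,              # learning rate for Adam
    'algo:use_lr_scheduler': True,  # enable linear lr decay
    'algo:gamma': 0.99,           # discount factor passed to the runner
}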