def test_finite_shmem_vector_env_complex(): length = 100 dataset = DummyDataset(length) envs = FiniteShmemVectorEnv( DoNothingTracker(), [_finite_env_factory(dataset, 5, i, complex=True) for i in range(5)]) envs._collector_guarded = True policy = AnyPolicy() test_collector = Collector(policy, envs, exploration_noise=True) try: test_collector.collect(n_step=10**18) except StopIteration: pass
def watch( args: argparse.Namespace = get_args(), agent_learn: Optional[BasePolicy] = None, agent_opponent: Optional[BasePolicy] = None, ) -> None: env = TicTacToeEnv(args.board_size, args.win_size) policy, optim = get_agents(args, agent_learn=agent_learn, agent_opponent=agent_opponent) policy.eval() policy.set_eps(args.eps_test) collector = Collector(policy, env) result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}')
def test_collector_envpool_gym_reset_return_info(): envs = envpool.make_gym("Pendulum-v1", num_envs=4, gym_reset_return_info=True) policy = MyPolicy(action_shape=(len(envs), 1)) c0 = Collector(policy, envs, VectorReplayBuffer(len(envs) * 10, len(envs)), exploration_noise=True) c0.collect(n_step=8) env_ids = np.zeros(len(envs) * 10) env_ids[[0, 1, 10, 11, 20, 21, 30, 31]] = [0, 0, 1, 1, 2, 2, 3, 3] assert np.allclose(c0.buffer.info["env_id"], env_ids)
def __init__(self, args, mask=None, name='full'): env = gym.make(args.task) self.stop_fn = lambda x: x >= env.spec.reward_threshold self.state_shape = env.observation_space.shape or env.observation_space.n self.action_shape = env.action_space.shape or env.action_space.n self.buffer = ReplayBuffer(400000) # Env # train_envs = gym.make(args.task) self.train_envs = SubprocVectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) self.test_envs = SubprocVectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # Mask state_dim = int(np.prod(self.state_shape)) self._view_mask = torch.ones(state_dim) if mask == 'even': for i in range(0, state_dim, 2): self._view_mask[i] = 0 elif mask == "odd": for i in range(1, state_dim, 2): self._view_mask[i] = 0 elif type(mask) == int: self._view_mask[mask] = 0 # Model net = NetWithView(args.layer_num, self.state_shape, device=args.device, mask=self._view_mask) self.actor = Actor(net, self.action_shape).to(args.device) self.critic = Critic(net).to(args.device) optim = torch.optim.Adam(list( self.actor.parameters()) + list(self.critic.parameters()), lr=args.lr) dist = torch.distributions.Categorical self.policy = PPOPolicy( self.actor, self.critic, optim, dist, args.gamma, max_grad_norm=args.max_grad_norm, eps_clip=args.eps_clip, vf_coef=args.vf_coef, ent_coef=args.ent_coef, action_range=None) # Collector self.train_collector = Collector( self.policy, self.train_envs, ReplayBuffer(args.buffer_size)) self.test_collector = Collector(self.policy, self.test_envs) # Log self.writer = SummaryWriter(f'{args.logdir}/{args.task}/ppo/{args.note}/{name}')
def test_finite_subproc_vector_env(): length = 100 dataset = DummyDataset(length) envs = FiniteSubprocVectorEnv( MetricTracker(length), [_finite_env_factory(dataset, 5, i) for i in range(5)]) envs._collector_guarded = True policy = AnyPolicy() test_collector = Collector(policy, envs, exploration_noise=True) for _ in range(1): envs._logger = [MetricTracker(length)] try: test_collector.collect(n_step=10**18) except StopIteration: envs._logger[0].validate()
def run(self, base_model, applied_mutators): search_space = dry_run_for_search_space(base_model, applied_mutators) concurrency = query_available_resources() env_fn = lambda: ModelEvaluationEnv(base_model, applied_mutators, search_space) policy = self.policy_fn(env_fn()) env = BaseVectorEnv([env_fn for _ in range(concurrency)], MultiThreadEnvWorker) collector = Collector(policy, env, VectorReplayBuffer(20000, len(env))) for cur_collect in range(1, self.max_collect + 1): _logger.info('Collect [%d] Running...', cur_collect) result = collector.collect(n_episode=self.trial_per_collect) _logger.info('Collect [%d] Result: %s', cur_collect, str(result)) policy.update(0, collector.buffer, batch_size=64, repeat=5)
def watch( args: argparse.Namespace = get_args(), agent_learn: Optional[BasePolicy] = None, agent_opponent: Optional[BasePolicy] = None, ) -> None: env = get_env() policy, optim, agents = get_agents(args, agent_learn=agent_learn, agent_opponent=agent_opponent) policy.eval() policy.policies[agents[args.agent_id - 1]].set_eps(args.eps_test) collector = Collector(policy, env, exploration_noise=True) result = collector.collect(n_episode=1, render=args.render) rews, lens = result["rews"], result["lens"] print( f"Final reward: {rews[:, args.agent_id - 1].mean()}, length: {lens.mean()}" )
def test_episode( policy: BasePolicy, collector: Collector, test_fn: Optional[Callable[[int, Optional[int]], None]], epoch: int, n_episode: Union[int, List[int]], writer: Optional[SummaryWriter] = None, global_step: Optional[int] = None, ) -> Dict[str, float]: """A simple wrapper of testing policy in collector.""" collector.reset_env() collector.reset_buffer() policy.eval() if test_fn: test_fn(epoch, global_step) if collector.get_env_num() > 1 and isinstance(n_episode, int): n = collector.get_env_num() n_ = np.zeros(n) + n_episode // n n_[:n_episode % n] += 1 n_episode = list(n_) result = collector.collect(n_episode=n_episode) if writer is not None and global_step is not None: for k in result.keys(): writer.add_scalar("test/" + k, result[k], global_step=global_step) return result
def test_collector_with_async(): env_lens = [2, 3, 4, 5] writer = SummaryWriter('log/async_collector') logger = Logger(writer) env_fns = [ lambda x=i: MyTestEnv(size=x, sleep=0.1, random_sleep=True) for i in env_lens ] venv = SubprocVectorEnv(env_fns, wait_num=len(env_fns) - 1) policy = MyPolicy() c1 = Collector(policy, venv, ReplayBuffer(size=1000, ignore_obs_next=False), logger.preprocess_fn) c1.collect(n_episode=10) # check if the data in the buffer is chronological # i.e. data in the buffer are full episodes, and each episode is # returned by the same environment env_id = c1.buffer.info['env_id'] size = len(c1.buffer) obs = c1.buffer.obs[:size] done = c1.buffer.done[:size] obs_ground_truth = [] i = 0 while i < size: # i is the start of an episode if done[i]: # this episode has one transition assert env_lens[env_id[i]] == 1 i += 1 continue j = i while True: j += 1 # in one episode, the environment id is the same assert env_id[j] == env_id[i] if done[j]: break j = j + 1 # j is the start of the next episode assert j - i == env_lens[env_id[i]] obs_ground_truth += list(range(j - i)) i = j obs_ground_truth = np.expand_dims(np.array(obs_ground_truth), axis=-1) assert np.allclose(obs, obs_ground_truth)
def test(test_agent: BasePolicy, env_creator: Callable[[], gym.Env], num_episodes: int, verbose: bool = True) -> None: """Test a model on a specific environment using a given agent. It will render the result in the default viewer. .. note:: This function can be terminated early using CTRL+C. :param env_creator: Lambda function without argument used to create a learning environment. :param num_episodes: Max number of episodes to run. :param verbose: Whether or not to print information about what is going on. Optional: True by default. """ env = env_creator() collector = Collector(test_agent, env) result = collector.collect(n_episode=1, render=env.step_dt) if verbose: print(f"Final reward: {result['rew']}, length: {result['len']}")
def test_collector_with_exact_episodes(): env_lens = [2, 6, 3, 10] writer = SummaryWriter('log/exact_collector') logger = Logger(writer) env_fns = [ lambda x=i: MyTestEnv(size=x, sleep=0.1, random_sleep=True) for i in env_lens ] venv = SubprocVectorEnv(env_fns, wait_num=len(env_fns) - 1) policy = MyPolicy() c1 = Collector(policy, venv, ReplayBuffer(size=1000, ignore_obs_next=False), logger.preprocess_fn) n_episode1 = [2, 0, 5, 1] n_episode2 = [1, 3, 2, 0] c1.collect(n_episode=n_episode1) expected_steps = sum([a * b for a, b in zip(env_lens, n_episode1)]) actual_steps = sum(venv.steps) assert expected_steps == actual_steps c1.collect(n_episode=n_episode2) expected_steps = sum( [a * (b + c) for a, b, c in zip(env_lens, n_episode1, n_episode2)]) actual_steps = sum(venv.steps) assert expected_steps == actual_steps
def test_ppo(args=get_args()): torch.set_num_threads(1) # for poor CPU env = gym.make(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n # train_envs = gym.make(args.task) # you can also use tianshou.env.SubprocVectorEnv train_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model net = Net(args.layer_num, args.state_shape, device=args.device) actor = Actor(net, args.action_shape).to(args.device) critic = Critic(net).to(args.device) optim = torch.optim.Adam(list( actor.parameters()) + list(critic.parameters()), lr=args.lr) dist = torch.distributions.Categorical policy = PPOPolicy( actor, critic, optim, dist, args.gamma, max_grad_norm=args.max_grad_norm, eps_clip=args.eps_clip, vf_coef=args.vf_coef, ent_coef=args.ent_coef, action_range=None, gae_lambda=args.gae_lambda, reward_normalization=args.rew_norm, dual_clip=args.dual_clip, value_clip=args.value_clip) # collector train_collector = Collector( policy, train_envs, ReplayBuffer(args.buffer_size)) test_collector = Collector(policy, test_envs)
def watch(): print("Setup test envs ...") policy.eval() policy.set_eps(args.eps_test) test_envs.seed(args.seed) if args.save_buffer_name: print(f"Generate buffer with size {args.buffer_size}") buffer = ReplayBuffer(args.buffer_size, ignore_obs_next=True, save_only_last_obs=True, stack_num=args.frames_stack) collector = Collector(policy, test_envs, buffer) result = collector.collect(n_step=args.buffer_size) print(f"Save buffer into {args.save_buffer_name}") # Unfortunately, pickle will cause oom with 1M buffer size buffer.save_hdf5(args.save_buffer_name) else: print("Testing agent ...") test_collector.reset() result = test_collector.collect(n_episode=[1] * args.test_num, render=args.render) pprint.pprint(result)
def test_episode( policy: BasePolicy, collector: Collector, test_fn: Optional[Callable[[int, Optional[int]], None]], epoch: int, n_episode: int, logger: Optional[BaseLogger] = None, global_step: Optional[int] = None, reward_metric: Optional[Callable[[np.ndarray], np.ndarray]] = None, ) -> Dict[str, Any]: """A simple wrapper of testing policy in collector.""" collector.reset_env() collector.reset_buffer() policy.eval() if test_fn: test_fn(epoch, global_step) result = collector.collect(n_episode=n_episode) if reward_metric: result["rews"] = reward_metric(result["rews"]) if logger and global_step is not None: logger.log_test_data(result, global_step) return result
def test_ddpg(args=get_args()): env = gym.make(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n args.max_action = env.action_space.high[0] # train_envs = gym.make(args.task) train_envs = SubprocVectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = SubprocVectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model net = Net(args.layer_num, args.state_shape, device=args.device) actor = Actor(net, args.action_shape, args.max_action, args.device).to(args.device) actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr) net = Net(args.layer_num, args.state_shape, args.action_shape, concat=True, device=args.device) critic = Critic(net, args.device).to(args.device) critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr) policy = DDPGPolicy( actor, actor_optim, critic, critic_optim, action_range=[env.action_space.low[0], env.action_space.high[0]], tau=args.tau, gamma=args.gamma, exploration_noise=GaussianNoise(sigma=args.exploration_noise), reward_normalization=True, ignore_done=True) # collector train_collector = Collector( policy, train_envs, ReplayBuffer(args.buffer_size)) test_collector = Collector(policy, test_envs) # log writer = SummaryWriter(args.logdir + '/' + 'ddpg') def stop_fn(mean_rewards): return mean_rewards >= env.spec.reward_threshold # trainer result = offpolicy_trainer( policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer) assert stop_fn(result['best_reward']) if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! policy.eval() test_envs.seed(args.seed) test_collector.reset() result = test_collector.collect(n_episode=[1] * args.test_num, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}')
def train(self, vector_env: FiniteVectorEnv) -> dict[str, Any]: """Create a collector and collects ``episode_per_iter`` episodes. Update the policy on the collected replay buffer. """ self.policy.train() with vector_env.collector_guard(): collector = Collector( self.policy, vector_env, VectorReplayBuffer(self.buffer_size, len(vector_env))) # Number of episodes collected in each training iteration can be overridden by fast dev run. if self.trainer.fast_dev_run is not None: episodes = self.trainer.fast_dev_run else: episodes = self.episode_per_iter col_result = collector.collect(n_episode=episodes) update_result = self.policy.update(sample_size=0, buffer=collector.buffer, **self.update_kwargs) res = {**col_result, **update_result} self.log_dict(res) return res
def test(args=get_args()): # Let's watch its performance! env = LimitWrapper( StateBonus(ImgObsWrapper(gym.make('MiniGrid-FourRooms-v0')))) args.state_shape = env.observation_space.shape args.action_shape = env.env.action_space.shape or env.env.action_space.n # model net = DQN(args.state_shape[0], args.state_shape[1], args.action_shape, args.device) net = net.to(args.device) optim = torch.optim.Adam(net.parameters(), lr=args.lr) policy = DQNPolicy(net, optim, args.gamma, args.n_step, use_target_network=args.target_update_freq > 0, target_update_freq=args.target_update_freq) policy.load_state_dict(torch.load('dqn.pth')) collector = Collector(policy, env) result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close()
def watch(): print("Setup test envs ...") policy.eval() policy.set_eps(args.eps_test) test_envs.seed(args.seed) if args.save_buffer_name: print(f"Generate buffer with size {args.buffer_size}") buffer = VectorReplayBuffer( args.buffer_size, buffer_num=len(test_envs), ignore_obs_next=True, save_only_last_obs=True, stack_num=args.frames_stack) collector = Collector(policy, test_envs, buffer, exploration_noise=True) result = collector.collect(n_step=args.buffer_size) print(f"Save buffer into {args.save_buffer_name}") # Unfortunately, pickle will cause oom with 1M buffer size buffer.save_hdf5(args.save_buffer_name) else: print("Testing agent ...") test_collector.reset() result = test_collector.collect(n_episode=args.test_num, render=args.render) rew = result["rews"].mean() print(f'Mean reward (over {result["n/ep"]} episodes): {rew}')
def test_episode(policy: BasePolicy, collector: Collector, test_fn: Callable[[int], None], epoch: int, n_episode: Union[int, List[int]]) -> Dict[str, float]: """A simple wrapper of testing policy in collector.""" collector.reset_env() collector.reset_buffer() policy.eval() if test_fn: test_fn(epoch) if collector.get_env_num() > 1 and np.isscalar(n_episode): n = collector.get_env_num() n_ = np.zeros(n) + n_episode // n n_[:n_episode % n] += 1 n_episode = list(n_) return collector.collect(n_episode=n_episode)
def test_reinforce(args=get_args()): env = gym.make(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n args.max_action = env.action_space.high[0] print("Observations shape:", args.state_shape) print("Actions shape:", args.action_shape) print("Action range:", np.min(env.action_space.low), np.max(env.action_space.high)) # train_envs = gym.make(args.task) train_envs = SubprocVectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)], norm_obs=True) # test_envs = gym.make(args.task) test_envs = SubprocVectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)], norm_obs=True, obs_rms=train_envs.obs_rms, update_obs_rms=False) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model net_a = Net(args.state_shape, hidden_sizes=args.hidden_sizes, activation=nn.Tanh, device=args.device) actor = ActorProb(net_a, args.action_shape, max_action=args.max_action, unbounded=True, device=args.device).to(args.device) torch.nn.init.constant_(actor.sigma_param, -0.5) for m in actor.modules(): if isinstance(m, torch.nn.Linear): # orthogonal initialization torch.nn.init.orthogonal_(m.weight, gain=np.sqrt(2)) torch.nn.init.zeros_(m.bias) # do last policy layer scaling, this will make initial actions have (close to) # 0 mean and std, and will help boost performances, # see https://arxiv.org/abs/2006.05990, Fig.24 for details for m in actor.mu.modules(): if isinstance(m, torch.nn.Linear): torch.nn.init.zeros_(m.bias) m.weight.data.copy_(0.01 * m.weight.data) optim = torch.optim.Adam(actor.parameters(), lr=args.lr) lr_scheduler = None if args.lr_decay: # decay learning rate to 0 linearly max_update_num = np.ceil( args.step_per_epoch / args.step_per_collect) * args.epoch lr_scheduler = LambdaLR( optim, lr_lambda=lambda epoch: 1 - epoch / max_update_num) def dist(*logits): return Independent(Normal(*logits), 1) policy = PGPolicy(actor, optim, dist, discount_factor=args.gamma, reward_normalization=args.rew_norm, action_scaling=True, action_bound_method=args.action_bound_method, lr_scheduler=lr_scheduler, action_space=env.action_space) # load a previous policy if args.resume_path: policy.load_state_dict( torch.load(args.resume_path, map_location=args.device)) print("Loaded agent from: ", args.resume_path) # collector if args.training_num > 1: buffer = VectorReplayBuffer(args.buffer_size, len(train_envs)) else: buffer = ReplayBuffer(args.buffer_size) train_collector = Collector(policy, train_envs, buffer, exploration_noise=True) test_collector = Collector(policy, test_envs) # log t0 = datetime.datetime.now().strftime("%m%d_%H%M%S") log_file = f'seed_{args.seed}_{t0}-{args.task.replace("-", "_")}_reinforce' log_path = os.path.join(args.logdir, args.task, 'reinforce', log_file) writer = SummaryWriter(log_path) writer.add_text("args", str(args)) logger = BasicLogger(writer, update_interval=10, train_interval=100) def save_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) if not args.watch: # trainer result = onpolicy_trainer(policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.repeat_per_collect, args.test_num, args.batch_size, step_per_collect=args.step_per_collect, save_fn=save_fn, logger=logger, test_in_train=False) pprint.pprint(result) # Let's watch its performance! policy.eval() test_envs.seed(args.seed) test_collector.reset() result = test_collector.collect(n_episode=args.test_num, render=args.render) print( f'Final reward: {result["rews"].mean()}, length: {result["lens"].mean()}' )
def test_c51(args=get_args()): env = gym.make(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n # train_envs = gym.make(args.task) # you can also use tianshou.env.SubprocVectorEnv train_envs = DummyVectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)] ) # test_envs = gym.make(args.task) test_envs = DummyVectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)] ) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model net = Net( args.state_shape, args.action_shape, hidden_sizes=args.hidden_sizes, device=args.device, softmax=True, num_atoms=args.num_atoms ) optim = torch.optim.Adam(net.parameters(), lr=args.lr) policy = C51Policy( net, optim, args.gamma, args.num_atoms, args.v_min, args.v_max, args.n_step, target_update_freq=args.target_update_freq ).to(args.device) # buffer if args.prioritized_replay: buf = PrioritizedVectorReplayBuffer( args.buffer_size, buffer_num=len(train_envs), alpha=args.alpha, beta=args.beta ) else: buf = VectorReplayBuffer(args.buffer_size, buffer_num=len(train_envs)) # collector train_collector = Collector(policy, train_envs, buf, exploration_noise=True) test_collector = Collector(policy, test_envs, exploration_noise=True) # policy.set_eps(1) train_collector.collect(n_step=args.batch_size * args.training_num) # log log_path = os.path.join(args.logdir, args.task, 'c51') writer = SummaryWriter(log_path) logger = TensorboardLogger(writer, save_interval=args.save_interval) def save_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) def stop_fn(mean_rewards): return mean_rewards >= env.spec.reward_threshold def train_fn(epoch, env_step): # eps annnealing, just a demo if env_step <= 10000: policy.set_eps(args.eps_train) elif env_step <= 50000: eps = args.eps_train - (env_step - 10000) / \ 40000 * (0.9 * args.eps_train) policy.set_eps(eps) else: policy.set_eps(0.1 * args.eps_train) def test_fn(epoch, env_step): policy.set_eps(args.eps_test) def save_checkpoint_fn(epoch, env_step, gradient_step): # see also: https://pytorch.org/tutorials/beginner/saving_loading_models.html torch.save( { 'model': policy.state_dict(), 'optim': optim.state_dict(), }, os.path.join(log_path, 'checkpoint.pth') ) pickle.dump( train_collector.buffer, open(os.path.join(log_path, 'train_buffer.pkl'), "wb") ) if args.resume: # load from existing checkpoint print(f"Loading agent under {log_path}") ckpt_path = os.path.join(log_path, 'checkpoint.pth') if os.path.exists(ckpt_path): checkpoint = torch.load(ckpt_path, map_location=args.device) policy.load_state_dict(checkpoint['model']) policy.optim.load_state_dict(checkpoint['optim']) print("Successfully restore policy and optim.") else: print("Fail to restore policy and optim.") buffer_path = os.path.join(log_path, 'train_buffer.pkl') if os.path.exists(buffer_path): train_collector.buffer = pickle.load(open(buffer_path, "rb")) print("Successfully restore buffer.") else: print("Fail to restore buffer.") # trainer result = offpolicy_trainer( policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.step_per_collect, args.test_num, args.batch_size, update_per_step=args.update_per_step, train_fn=train_fn, test_fn=test_fn, stop_fn=stop_fn, save_fn=save_fn, logger=logger, resume_from_log=args.resume, save_checkpoint_fn=save_checkpoint_fn ) assert stop_fn(result['best_reward']) if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! env = gym.make(args.task) policy.eval() policy.set_eps(args.eps_test) collector = Collector(policy, env) result = collector.collect(n_episode=1, render=args.render) rews, lens = result["rews"], result["lens"] print(f"Final reward: {rews.mean()}, length: {lens.mean()}")
def onpolicy_trainer( policy: BasePolicy, train_collector: Collector, test_collector: Collector, max_epoch: int, step_per_epoch: int, repeat_per_collect: int, episode_per_test: int, batch_size: int, step_per_collect: Optional[int] = None, episode_per_collect: Optional[int] = None, train_fn: Optional[Callable[[int, int], None]] = None, test_fn: Optional[Callable[[int, Optional[int]], None]] = None, stop_fn: Optional[Callable[[float], bool]] = None, save_fn: Optional[Callable[[BasePolicy], None]] = None, reward_metric: Optional[Callable[[np.ndarray], np.ndarray]] = None, logger: BaseLogger = LazyLogger(), verbose: bool = True, test_in_train: bool = True, ) -> Dict[str, Union[float, str]]: """A wrapper for on-policy trainer procedure. The "step" in trainer means an environment step (a.k.a. transition). :param policy: an instance of the :class:`~tianshou.policy.BasePolicy` class. :param Collector train_collector: the collector used for training. :param Collector test_collector: the collector used for testing. :param int max_epoch: the maximum number of epochs for training. The training process might be finished before reaching ``max_epoch`` if ``stop_fn`` is set. :param int step_per_epoch: the number of transitions collected per epoch. :param int repeat_per_collect: the number of repeat time for policy learning, for example, set it to 2 means the policy needs to learn each given batch data twice. :param int episode_per_test: the number of episodes for one policy evaluation. :param int batch_size: the batch size of sample data, which is going to feed in the policy network. :param int step_per_collect: the number of transitions the collector would collect before the network update, i.e., trainer will collect "step_per_collect" transitions and do some policy network update repeatly in each epoch. :param int episode_per_collect: the number of episodes the collector would collect before the network update, i.e., trainer will collect "episode_per_collect" episodes and do some policy network update repeatly in each epoch. :param function train_fn: a hook called at the beginning of training in each epoch. It can be used to perform custom additional operations, with the signature ``f( num_epoch: int, step_idx: int) -> None``. :param function test_fn: a hook called at the beginning of testing in each epoch. It can be used to perform custom additional operations, with the signature ``f( num_epoch: int, step_idx: int) -> None``. :param function save_fn: a hook called when the undiscounted average mean reward in evaluation phase gets better, with the signature ``f(policy: BasePolicy) -> None``. :param function stop_fn: a function with signature ``f(mean_rewards: float) -> bool``, receives the average undiscounted returns of the testing result, returns a boolean which indicates whether reaching the goal. :param function reward_metric: a function with signature ``f(rewards: np.ndarray with shape (num_episode, agent_num)) -> np.ndarray with shape (num_episode,)``, used in multi-agent RL. We need to return a single scalar for each episode's result to monitor training in the multi-agent RL setting. This function specifies what is the desired metric, e.g., the reward of agent 1 or the average reward over all agents. :param BaseLogger logger: A logger that logs statistics during training/testing/updating. Default to a logger that doesn't log anything. :param bool verbose: whether to print the information. Default to True. :param bool test_in_train: whether to test in the training phase. Default to True. :return: See :func:`~tianshou.trainer.gather_info`. .. note:: Only either one of step_per_collect and episode_per_collect can be specified. """ env_step, gradient_step = 0, 0 last_rew, last_len = 0.0, 0 stat: Dict[str, MovAvg] = defaultdict(MovAvg) start_time = time.time() train_collector.reset_stat() test_collector.reset_stat() test_in_train = test_in_train and train_collector.policy == policy test_result = test_episode(policy, test_collector, test_fn, 0, episode_per_test, logger, env_step, reward_metric) best_epoch = 0 best_reward, best_reward_std = test_result["rew"], test_result["rew_std"] for epoch in range(1, 1 + max_epoch): # train policy.train() with tqdm.tqdm(total=step_per_epoch, desc=f"Epoch #{epoch}", **tqdm_config) as t: while t.n < t.total: if train_fn: train_fn(epoch, env_step) result = train_collector.collect(n_step=step_per_collect, n_episode=episode_per_collect) if reward_metric: result["rews"] = reward_metric(result["rews"]) env_step += int(result["n/st"]) t.update(result["n/st"]) logger.log_train_data(result, env_step) last_rew = result['rew'] if 'rew' in result else last_rew last_len = result['len'] if 'len' in result else last_len data = { "env_step": str(env_step), "rew": f"{last_rew:.2f}", "len": str(int(last_len)), "n/ep": str(int(result["n/ep"])), "n/st": str(int(result["n/st"])), } if test_in_train and stop_fn and stop_fn(result["rew"]): test_result = test_episode(policy, test_collector, test_fn, epoch, episode_per_test, logger, env_step) if stop_fn(test_result["rew"]): if save_fn: save_fn(policy) t.set_postfix(**data) return gather_info(start_time, train_collector, test_collector, test_result["rew"], test_result["rew_std"]) else: policy.train() losses = policy.update(0, train_collector.buffer, batch_size=batch_size, repeat=repeat_per_collect) train_collector.reset_buffer() step = max( [1] + [len(v) for v in losses.values() if isinstance(v, list)]) gradient_step += step for k in losses.keys(): stat[k].add(losses[k]) losses[k] = stat[k].get() data[k] = f"{losses[k]:.3f}" logger.log_update_data(losses, gradient_step) t.set_postfix(**data) if t.n <= t.total: t.update() # test test_result = test_episode(policy, test_collector, test_fn, epoch, episode_per_test, logger, env_step) rew, rew_std = test_result["rew"], test_result["rew_std"] if best_epoch == -1 or best_reward < rew: best_reward, best_reward_std = rew, rew_std best_epoch = epoch if save_fn: save_fn(policy) if verbose: print( f"Epoch #{epoch}: test_reward: {rew:.6f} ± {rew_std:.6f}, best_rew" f"ard: {best_reward:.6f} ± {best_reward_std:.6f} in #{best_epoch}" ) if stop_fn and stop_fn(best_reward): break return gather_info(start_time, train_collector, test_collector, best_reward, best_reward_std)
def test_sac(args=get_args()): env = gym.make(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n args.max_action = env.action_space.high[0] print("Observations shape:", args.state_shape) print("Actions shape:", args.action_shape) print("Action range:", np.min(env.action_space.low), np.max(env.action_space.high)) # train_envs = gym.make(args.task) if args.training_num > 1: train_envs = SubprocVectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)] ) else: train_envs = gym.make(args.task) # test_envs = gym.make(args.task) test_envs = SubprocVectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)] ) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model net_a = Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device) actor = ActorProb( net_a, args.action_shape, max_action=args.max_action, device=args.device, unbounded=True, conditioned_sigma=True ).to(args.device) actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr) net_c1 = Net( args.state_shape, args.action_shape, hidden_sizes=args.hidden_sizes, concat=True, device=args.device ) net_c2 = Net( args.state_shape, args.action_shape, hidden_sizes=args.hidden_sizes, concat=True, device=args.device ) critic1 = Critic(net_c1, device=args.device).to(args.device) critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr) critic2 = Critic(net_c2, device=args.device).to(args.device) critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr) if args.auto_alpha: target_entropy = -np.prod(env.action_space.shape) log_alpha = torch.zeros(1, requires_grad=True, device=args.device) alpha_optim = torch.optim.Adam([log_alpha], lr=args.alpha_lr) args.alpha = (target_entropy, log_alpha, alpha_optim) policy = SACPolicy( actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim, tau=args.tau, gamma=args.gamma, alpha=args.alpha, estimation_step=args.n_step, action_space=env.action_space ) # load a previous policy if args.resume_path: policy.load_state_dict(torch.load(args.resume_path, map_location=args.device)) print("Loaded agent from: ", args.resume_path) # collector if args.training_num > 1: buffer = VectorReplayBuffer(args.buffer_size, len(train_envs)) else: buffer = ReplayBuffer(args.buffer_size) train_collector = Collector(policy, train_envs, buffer, exploration_noise=True) test_collector = Collector(policy, test_envs) train_collector.collect(n_step=args.start_timesteps, random=True) # log t0 = datetime.datetime.now().strftime("%m%d_%H%M%S") log_file = f'seed_{args.seed}_{t0}-{args.task.replace("-", "_")}_sac' log_path = os.path.join(args.logdir, args.task, 'sac', log_file) writer = SummaryWriter(log_path) writer.add_text("args", str(args)) logger = TensorboardLogger(writer) def save_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) if not args.watch: # trainer result = offpolicy_trainer( policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.step_per_collect, args.test_num, args.batch_size, save_fn=save_fn, logger=logger, update_per_step=args.update_per_step, test_in_train=False ) pprint.pprint(result) # Let's watch its performance! policy.eval() test_envs.seed(args.seed) test_collector.reset() result = test_collector.collect(n_episode=args.test_num, render=args.render) print(f'Final reward: {result["rews"].mean()}, length: {result["lens"].mean()}')
def offline_trainer( policy: BasePolicy, buffer: ReplayBuffer, test_collector: Collector, max_epoch: int, update_per_epoch: int, episode_per_test: int, batch_size: int, test_fn: Optional[Callable[[int, Optional[int]], None]] = None, stop_fn: Optional[Callable[[float], bool]] = None, save_fn: Optional[Callable[[BasePolicy], None]] = None, reward_metric: Optional[Callable[[np.ndarray], np.ndarray]] = None, logger: BaseLogger = LazyLogger(), verbose: bool = True, ) -> Dict[str, Union[float, str]]: """A wrapper for offline trainer procedure. The "step" in offline trainer means a gradient step. :param policy: an instance of the :class:`~tianshou.policy.BasePolicy` class. :param Collector test_collector: the collector used for testing. :param int max_epoch: the maximum number of epochs for training. The training process might be finished before reaching ``max_epoch`` if ``stop_fn`` is set. :param int update_per_epoch: the number of policy network updates, so-called gradient steps, per epoch. :param episode_per_test: the number of episodes for one policy evaluation. :param int batch_size: the batch size of sample data, which is going to feed in the policy network. :param function test_fn: a hook called at the beginning of testing in each epoch. It can be used to perform custom additional operations, with the signature ``f( num_epoch: int, step_idx: int) -> None``. :param function save_fn: a hook called when the undiscounted average mean reward in evaluation phase gets better, with the signature ``f(policy: BasePolicy) -> None``. :param function stop_fn: a function with signature ``f(mean_rewards: float) -> bool``, receives the average undiscounted returns of the testing result, returns a boolean which indicates whether reaching the goal. :param function reward_metric: a function with signature ``f(rewards: np.ndarray with shape (num_episode, agent_num)) -> np.ndarray with shape (num_episode,)``, used in multi-agent RL. We need to return a single scalar for each episode's result to monitor training in the multi-agent RL setting. This function specifies what is the desired metric, e.g., the reward of agent 1 or the average reward over all agents. :param BaseLogger logger: A logger that logs statistics during updating/testing. Default to a logger that doesn't log anything. :param bool verbose: whether to print the information. Default to True. :return: See :func:`~tianshou.trainer.gather_info`. """ gradient_step = 0 stat: Dict[str, MovAvg] = defaultdict(MovAvg) start_time = time.time() test_collector.reset_stat() test_result = test_episode(policy, test_collector, test_fn, 0, episode_per_test, logger, gradient_step, reward_metric) best_epoch = 0 best_reward, best_reward_std = test_result["rew"], test_result["rew_std"] for epoch in range(1, 1 + max_epoch): policy.train() with tqdm.trange(update_per_epoch, desc=f"Epoch #{epoch}", **tqdm_config) as t: for i in t: gradient_step += 1 losses = policy.update(batch_size, buffer) data = {"gradient_step": str(gradient_step)} for k in losses.keys(): stat[k].add(losses[k]) losses[k] = stat[k].get() data[k] = f"{losses[k]:.3f}" logger.log_update_data(losses, gradient_step) t.set_postfix(**data) # test test_result = test_episode(policy, test_collector, test_fn, epoch, episode_per_test, logger, gradient_step, reward_metric) rew, rew_std = test_result["rew"], test_result["rew_std"] if best_epoch == -1 or best_reward < rew: best_reward, best_reward_std = rew, rew_std best_epoch = epoch if save_fn: save_fn(policy) if verbose: print( f"Epoch #{epoch}: test_reward: {rew:.6f} ± {rew_std:.6f}, best_rew" f"ard: {best_reward:.6f} ± {best_reward_std:.6f} in #{best_epoch}" ) if stop_fn and stop_fn(best_reward): break return gather_info(start_time, None, test_collector, best_reward, best_reward_std)
def test_ppo(args=get_args()): torch.set_num_threads(1) # for poor CPU env = gym.make(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n # train_envs = gym.make(args.task) # you can also use tianshou.env.SubprocVectorEnv train_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model net = Net(args.layer_num, args.state_shape, device=args.device) actor = Actor(net, args.action_shape).to(args.device) critic = Critic(net).to(args.device) # orthogonal initialization for m in list(actor.modules()) + list(critic.modules()): if isinstance(m, torch.nn.Linear): torch.nn.init.orthogonal_(m.weight) torch.nn.init.zeros_(m.bias) optim = torch.optim.Adam(list(actor.parameters()) + list(critic.parameters()), lr=args.lr) dist = torch.distributions.Categorical policy = PPOPolicy(actor, critic, optim, dist, args.gamma, max_grad_norm=args.max_grad_norm, eps_clip=args.eps_clip, vf_coef=args.vf_coef, ent_coef=args.ent_coef, action_range=None, gae_lambda=args.gae_lambda, reward_normalization=args.rew_norm, dual_clip=args.dual_clip, value_clip=args.value_clip) # collector train_collector = Collector(policy, train_envs, ReplayBuffer(args.buffer_size)) test_collector = Collector(policy, test_envs) # log log_path = os.path.join(args.logdir, args.task, 'ppo') writer = SummaryWriter(log_path) def save_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) def stop_fn(x): return x >= env.spec.reward_threshold # trainer result = onpolicy_trainer(policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.repeat_per_collect, args.test_num, args.batch_size, stop_fn=stop_fn, save_fn=save_fn, writer=writer) assert stop_fn(result['best_reward']) train_collector.close() test_collector.close() if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! env = gym.make(args.task) collector = Collector(policy, env) result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close()
def train_agent(args: argparse.Namespace = get_args(), agent_learn: Optional[BasePolicy] = None, agent_opponent: Optional[BasePolicy] = None, optim: Optional[torch.optim.Optimizer] = None, ) -> Tuple[dict, BasePolicy]: def env_func(): return TicTacToeEnv(args.board_size, args.win_size) train_envs = DummyVectorEnv([env_func for _ in range(args.training_num)]) test_envs = DummyVectorEnv([env_func for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) policy, optim = get_agents( args, agent_learn=agent_learn, agent_opponent=agent_opponent, optim=optim) # collector train_collector = Collector( policy, train_envs, ReplayBuffer(args.buffer_size)) test_collector = Collector(policy, test_envs) # policy.set_eps(1) train_collector.collect(n_step=args.batch_size) # log if not hasattr(args, 'writer'): log_path = os.path.join(args.logdir, 'tic_tac_toe', 'dqn') writer = SummaryWriter(log_path) args.writer = writer else: writer = args.writer def save_fn(policy): if hasattr(args, 'model_save_path'): model_save_path = args.model_save_path else: model_save_path = os.path.join( args.logdir, 'tic_tac_toe', 'dqn', 'policy.pth') torch.save( policy.policies[args.agent_id - 1].state_dict(), model_save_path) def stop_fn(x): return x >= args.win_rate def train_fn(x): policy.policies[args.agent_id - 1].set_eps(args.eps_train) def test_fn(x): policy.policies[args.agent_id - 1].set_eps(args.eps_test) # trainer result = offpolicy_trainer( policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.test_num, args.batch_size, train_fn=train_fn, test_fn=test_fn, stop_fn=stop_fn, save_fn=save_fn, writer=writer, test_in_train=False) return result, policy.policies[args.agent_id - 1]
def test_dqn(args=get_args()): env = gym.make(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n # train_envs = gym.make(args.task) # you can also use tianshou.env.SubprocVectorEnv train_envs = DummyVectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = DummyVectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # Q_param = V_param = {"hidden_sizes": [128]} # model net = Net( args.state_shape, args.action_shape, hidden_sizes=args.hidden_sizes, device=args.device, # dueling=(Q_param, V_param), ).to(args.device) optim = torch.optim.Adam(net.parameters(), lr=args.lr) policy = DQNPolicy(net, optim, args.gamma, args.n_step, target_update_freq=args.target_update_freq) # buffer if args.prioritized_replay: buf = PrioritizedVectorReplayBuffer(args.buffer_size, buffer_num=len(train_envs), alpha=args.alpha, beta=args.beta) else: buf = VectorReplayBuffer(args.buffer_size, buffer_num=len(train_envs)) # collector train_collector = Collector(policy, train_envs, buf, exploration_noise=True) test_collector = Collector(policy, test_envs, exploration_noise=True) # policy.set_eps(1) train_collector.collect(n_step=args.batch_size * args.training_num) # log log_path = os.path.join(args.logdir, args.task, 'dqn') writer = SummaryWriter(log_path) logger = BasicLogger(writer) def save_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) def stop_fn(mean_rewards): return mean_rewards >= env.spec.reward_threshold def train_fn(epoch, env_step): # eps annnealing, just a demo if env_step <= 10000: policy.set_eps(args.eps_train) elif env_step <= 50000: eps = args.eps_train - (env_step - 10000) / \ 40000 * (0.9 * args.eps_train) policy.set_eps(eps) else: policy.set_eps(0.1 * args.eps_train) def test_fn(epoch, env_step): policy.set_eps(args.eps_test) # trainer result = offpolicy_trainer(policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.step_per_collect, args.test_num, args.batch_size, update_per_step=args.update_per_step, train_fn=train_fn, test_fn=test_fn, stop_fn=stop_fn, save_fn=save_fn, logger=logger) assert stop_fn(result['best_reward']) if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! env = gym.make(args.task) policy.eval() policy.set_eps(args.eps_test) collector = Collector(policy, env) result = collector.collect(n_episode=1, render=args.render) rews, lens = result["rews"], result["lens"] print(f"Final reward: {rews.mean()}, length: {lens.mean()}") # save buffer in pickle format, for imitation learning unittest buf = VectorReplayBuffer(args.buffer_size, buffer_num=len(test_envs)) collector = Collector(policy, test_envs, buf) collector.collect(n_step=args.buffer_size) pickle.dump(buf, open(args.save_buffer_name, "wb"))
def test_td3(args=get_args()): torch.set_num_threads(1) # we just need only one thread for NN env = gym.make(args.task) if args.task == 'Pendulum-v0': env.spec.reward_threshold = -250 args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n args.max_action = env.action_space.high[0] # you can also use tianshou.env.SubprocVectorEnv # train_envs = gym.make(args.task) train_envs = DummyVectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = DummyVectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model net = Net(args.layer_num, args.state_shape, device=args.device) actor = Actor(net, args.action_shape, args.max_action, args.device).to(args.device) actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr) net = Net(args.layer_num, args.state_shape, args.action_shape, concat=True, device=args.device) critic1 = Critic(net, args.device).to(args.device) critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr) critic2 = Critic(net, args.device).to(args.device) critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr) policy = TD3Policy(actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim, args.tau, args.gamma, GaussianNoise(sigma=args.exploration_noise), args.policy_noise, args.update_actor_freq, args.noise_clip, [env.action_space.low[0], env.action_space.high[0]], reward_normalization=args.rew_norm, ignore_done=args.ignore_done, estimation_step=args.n_step) # collector train_collector = Collector(policy, train_envs, ReplayBuffer(args.buffer_size)) test_collector = Collector(policy, test_envs) # train_collector.collect(n_step=args.buffer_size) # log log_path = os.path.join(args.logdir, args.task, 'td3') writer = SummaryWriter(log_path) def save_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) def stop_fn(x): return x >= env.spec.reward_threshold # trainer result = offpolicy_trainer(policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.test_num, args.batch_size, stop_fn=stop_fn, save_fn=save_fn, writer=writer) assert stop_fn(result['best_reward']) if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! env = gym.make(args.task) policy.eval() collector = Collector(policy, env) result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}')
def test_a2c_with_il(args=get_args()): torch.set_num_threads(1) # for poor CPU env = gym.make(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n # you can also use tianshou.env.SubprocVectorEnv # train_envs = gym.make(args.task) train_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model net = Net(args.layer_num, args.state_shape, device=args.device) actor = Actor(net, args.action_shape).to(args.device) critic = Critic(net).to(args.device) optim = torch.optim.Adam(list(actor.parameters()) + list(critic.parameters()), lr=args.lr) dist = torch.distributions.Categorical policy = A2CPolicy(actor, critic, optim, dist, args.gamma, gae_lambda=args.gae_lambda, vf_coef=args.vf_coef, ent_coef=args.ent_coef, max_grad_norm=args.max_grad_norm, reward_normalization=args.rew_norm) # collector train_collector = Collector(policy, train_envs, ReplayBuffer(args.buffer_size)) test_collector = Collector(policy, test_envs) # log log_path = os.path.join(args.logdir, args.task, 'a2c') writer = SummaryWriter(log_path) def save_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) def stop_fn(x): return x >= env.spec.reward_threshold # trainer result = onpolicy_trainer(policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.repeat_per_collect, args.test_num, args.batch_size, stop_fn=stop_fn, save_fn=save_fn, writer=writer) assert stop_fn(result['best_reward']) test_collector.close() if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! env = gym.make(args.task) collector = Collector(policy, env) result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close() # here we define an imitation collector with a trivial policy if args.task == 'CartPole-v0': env.spec.reward_threshold = 190 # lower the goal net = Net(1, args.state_shape, device=args.device) net = Actor(net, args.action_shape).to(args.device) optim = torch.optim.Adam(net.parameters(), lr=args.il_lr) il_policy = ImitationPolicy(net, optim, mode='discrete') il_test_collector = Collector(il_policy, test_envs) train_collector.reset() result = offpolicy_trainer(il_policy, train_collector, il_test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.test_num, args.batch_size, stop_fn=stop_fn, save_fn=save_fn, writer=writer) assert stop_fn(result['best_reward']) train_collector.close() il_test_collector.close() if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! env = gym.make(args.task) collector = Collector(il_policy, env) result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close()
def test_sac_with_il(args=get_args()): torch.set_num_threads(1) # we just need only one thread for NN env = gym.make(args.task) if args.task == 'Pendulum-v0': env.spec.reward_threshold = -250 args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n args.max_action = env.action_space.high[0] # you can also use tianshou.env.SubprocVectorEnv # train_envs = gym.make(args.task) train_envs = DummyVectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = DummyVectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model net = Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device) actor = ActorProb(net, args.action_shape, max_action=args.max_action, device=args.device, unbounded=True).to(args.device) actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr) net_c1 = Net(args.state_shape, args.action_shape, hidden_sizes=args.hidden_sizes, concat=True, device=args.device) critic1 = Critic(net_c1, device=args.device).to(args.device) critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr) net_c2 = Net(args.state_shape, args.action_shape, hidden_sizes=args.hidden_sizes, concat=True, device=args.device) critic2 = Critic(net_c2, device=args.device).to(args.device) critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr) policy = SACPolicy( actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim, action_range=[env.action_space.low[0], env.action_space.high[0]], tau=args.tau, gamma=args.gamma, alpha=args.alpha, reward_normalization=args.rew_norm, estimation_step=args.n_step) # collector train_collector = Collector(policy, train_envs, VectorReplayBuffer(args.buffer_size, len(train_envs)), exploration_noise=True) test_collector = Collector(policy, test_envs) # train_collector.collect(n_step=args.buffer_size) # log log_path = os.path.join(args.logdir, args.task, 'sac') writer = SummaryWriter(log_path) logger = BasicLogger(writer) def save_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) def stop_fn(mean_rewards): return mean_rewards >= env.spec.reward_threshold # trainer result = offpolicy_trainer(policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.step_per_collect, args.test_num, args.batch_size, update_per_step=args.update_per_step, stop_fn=stop_fn, save_fn=save_fn, logger=logger) assert stop_fn(result['best_reward']) if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! env = gym.make(args.task) policy.eval() collector = Collector(policy, env) result = collector.collect(n_episode=1, render=args.render) rews, lens = result["rews"], result["lens"] print(f"Final reward: {rews.mean()}, length: {lens.mean()}") # here we define an imitation collector with a trivial policy policy.eval() if args.task == 'Pendulum-v0': env.spec.reward_threshold = -300 # lower the goal net = Actor(Net(args.state_shape, hidden_sizes=args.imitation_hidden_sizes, device=args.device), args.action_shape, max_action=args.max_action, device=args.device).to(args.device) optim = torch.optim.Adam(net.parameters(), lr=args.il_lr) il_policy = ImitationPolicy(net, optim, mode='continuous') il_test_collector = Collector( il_policy, DummyVectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)])) train_collector.reset() result = offpolicy_trainer(il_policy, train_collector, il_test_collector, args.epoch, args.il_step_per_epoch, args.step_per_collect, args.test_num, args.batch_size, stop_fn=stop_fn, save_fn=save_fn, logger=logger) assert stop_fn(result['best_reward']) if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! env = gym.make(args.task) il_policy.eval() collector = Collector(il_policy, env) result = collector.collect(n_episode=1, render=args.render) rews, lens = result["rews"], result["lens"] print(f"Final reward: {rews.mean()}, length: {lens.mean()}")