def test_dqn(args=get_args()):
    """Train DQN on ``args.task`` and assert it reaches the env's
    reward threshold.

    NOTE(review): the ``args=get_args()`` default is evaluated once at
    import time; repeated calls share (and mutate) the same Namespace.
    """
    env = gym.make(args.task)
    # observation/action spaces may be Box (use .shape) or Discrete (use .n)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    # train_envs = gym.make(args.task)
    # you can also use tianshou.env.SubprocVectorEnv
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net(args.layer_num, args.state_shape, args.action_shape, args.device)
    net = net.to(args.device)
    optim = torch.optim.Adam(net.parameters(), lr=args.lr)
    # target network is only enabled when a positive update frequency is given
    policy = DQNPolicy(
        net, optim, args.gamma, args.n_step,
        use_target_network=args.target_update_freq > 0,
        target_update_freq=args.target_update_freq)
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # policy.set_eps(1)
    # pre-fill the replay buffer so the first gradient step has data
    train_collector.collect(n_step=args.batch_size)
    # log
    log_path = os.path.join(args.logdir, args.task, 'dqn')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        # checkpoint the best policy seen so far
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        # NOTE(review): assumes env.spec.reward_threshold is set for this task
        return x >= env.spec.reward_threshold

    def train_fn(x):
        # exploratory epsilon during training
        policy.set_eps(args.eps_train)

    def test_fn(x):
        # (near-)greedy epsilon during evaluation
        policy.set_eps(args.eps_test)

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, train_fn=train_fn, test_fn=test_fn,
        stop_fn=stop_fn, save_fn=save_fn, writer=writer)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    pprint.pprint(result)
def watch(
    args: argparse.Namespace = get_args(),
    agent_learn: Optional[BasePolicy] = None,
    agent_opponent: Optional[BasePolicy] = None,
) -> None:
    """Play (and optionally render) one tic-tac-toe episode with the
    given agents and print the resulting reward/length."""
    board_env = TicTacToeEnv(args.board_size, args.win_size)
    # only the policy is needed here; the optimizer is discarded
    policy, _ = get_agents(args, agent_learn=agent_learn,
                           agent_opponent=agent_opponent)
    eval_collector = Collector(policy, board_env)
    result = eval_collector.collect(n_episode=1, render=args.render)
    print(f'Final reward: {result["rew"]}, length: {result["len"]}')
    eval_collector.close()
def test_ppo(args=get_args()):
    """Train PPO on an Atari task with frame preprocessing.

    Fix: the original ``stop_fn`` tested ``env.env.spec.reward_threshold``
    but compared against ``env.spec.reward_threshold`` — two different
    attribute paths. It now reads the threshold once from the same place
    it was guarded on.
    """
    env = create_atari_environment(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    # NOTE(review): this wrapper apparently exposes action_space as a
    # callable (dm-env style) — confirm against create_atari_environment
    args.action_shape = env.action_space().shape or env.action_space().n
    # train_envs = gym.make(args.task)
    train_envs = SubprocVectorEnv([
        lambda: create_atari_environment(args.task)
        for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv([
        lambda: create_atari_environment(args.task)
        for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model: actor and critic share the feature network `net`
    net = Net(args.layer_num, args.state_shape, device=args.device)
    actor = Actor(net, args.action_shape).to(args.device)
    critic = Critic(net).to(args.device)
    optim = torch.optim.Adam(list(
        actor.parameters()) + list(critic.parameters()), lr=args.lr)
    dist = torch.distributions.Categorical
    policy = PPOPolicy(
        actor, critic, optim, dist, args.gamma,
        max_grad_norm=args.max_grad_norm,
        eps_clip=args.eps_clip,
        vf_coef=args.vf_coef,
        ent_coef=args.ent_coef,
        action_range=None)
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size),
        preprocess_fn=preprocess_fn)
    test_collector = Collector(policy, test_envs, preprocess_fn=preprocess_fn)
    # log
    writer = SummaryWriter(args.logdir + '/' + 'ppo')

    def stop_fn(x):
        # read once from the wrapped env; some Atari tasks have no threshold
        threshold = env.env.spec.reward_threshold
        if threshold:
            return x >= threshold
        return False

    # trainer
    result = onpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
        args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer)
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = create_atari_environment(args.task)
        collector = Collector(policy, env, preprocess_fn=preprocess_fn)
        result = collector.collect(n_step=2000, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
def test_ddpg(args=get_args()):
    """Train DDPG on a continuous-control task and assert the reward
    threshold is reached."""
    env = gym.make(args.task)
    if args.task == 'Pendulum-v0':
        # Pendulum has no registered threshold; -250 is a common proxy
        env.spec.reward_threshold = -250
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # train_envs = gym.make(args.task)
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    actor = Actor(
        args.layer_num, args.state_shape, args.action_shape,
        args.max_action, args.device
    ).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic = Critic(
        args.layer_num, args.state_shape, args.action_shape, args.device
    ).to(args.device)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    # NOTE(review): reward_normalization/ignore_done are hard-coded on,
    # which suits Pendulum; confirm for other tasks
    policy = DDPGPolicy(
        actor, actor_optim, critic, critic_optim,
        args.tau, args.gamma, args.exploration_noise,
        [env.action_space.low[0], env.action_space.high[0]],
        reward_normalization=True, ignore_done=True)
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    writer = SummaryWriter(args.logdir + '/' + 'ddpg')

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
def test_dqn(args=get_args()):
    """Train DQN on ``args.task`` (variant of the version above that logs
    to ``logdir/dqn`` and renders one episode when run as a script)."""
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    # train_envs = gym.make(args.task)
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net(args.layer_num, args.state_shape, args.action_shape, args.device)
    net = net.to(args.device)
    optim = torch.optim.Adam(net.parameters(), lr=args.lr)
    # target network is only enabled when a positive update frequency is given
    policy = DQNPolicy(
        net, optim, args.gamma, args.n_step,
        use_target_network=args.target_update_freq > 0,
        target_update_freq=args.target_update_freq)
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # policy.set_eps(1)
    # pre-fill the replay buffer so the first gradient step has data
    train_collector.collect(n_step=args.batch_size)
    # log
    writer = SummaryWriter(args.logdir + '/' + 'dqn')

    def stop_fn(x):
        # NOTE(review): assumes env.spec.reward_threshold is set for this task
        return x >= env.spec.reward_threshold

    def train_fn(x):
        policy.set_eps(args.eps_train)

    def test_fn(x):
        policy.set_eps(args.eps_test)

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, train_fn=train_fn, test_fn=test_fn,
        stop_fn=stop_fn, writer=writer, task=args.task)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
def test_pg(args=get_args()):
    """Train vanilla policy gradient (REINFORCE) on ``args.task`` and
    assert the reward threshold is reached."""
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    # train_envs = gym.make(args.task)
    # you can also use tianshou.env.SubprocVectorEnv
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model: softmax head so the network outputs action probabilities
    net = Net(args.layer_num, args.state_shape, args.action_shape,
              device=args.device, softmax=True)
    net = net.to(args.device)
    optim = torch.optim.Adam(net.parameters(), lr=args.lr)
    dist = torch.distributions.Categorical
    policy = PGPolicy(net, optim, dist, args.gamma)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    writer = SummaryWriter(args.logdir + '/' + 'pg')

    def stop_fn(x):
        # NOTE(review): assumes env.spec.reward_threshold is set for this task
        return x >= env.spec.reward_threshold

    # trainer
    result = onpolicy_trainer(policy, train_collector, test_collector,
                              args.epoch, args.step_per_epoch,
                              args.collect_per_step, args.repeat_per_collect,
                              args.test_num, args.batch_size,
                              stop_fn=stop_fn, writer=writer)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
def test_a2c(args=get_args()):
    """Train A2C on ``args.task`` with a shared actor/critic backbone and
    assert the reward threshold is reached."""
    torch.set_num_threads(1)  # for poor CPU
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    # you can also use tianshou.env.SubprocVectorEnv
    # train_envs = gym.make(args.task)
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model: actor and critic share the feature network `net`,
    # so one optimizer covers both parameter sets
    net = Net(args.layer_num, args.state_shape, device=args.device)
    actor = Actor(net, args.action_shape).to(args.device)
    critic = Critic(net).to(args.device)
    optim = torch.optim.Adam(list(
        actor.parameters()) + list(critic.parameters()), lr=args.lr)
    dist = torch.distributions.Categorical
    policy = A2CPolicy(
        actor, critic, optim, dist, args.gamma, gae_lambda=args.gae_lambda,
        vf_coef=args.vf_coef, ent_coef=args.ent_coef,
        max_grad_norm=args.max_grad_norm)
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, 'a2c')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        # checkpoint the best policy seen so far
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        # NOTE(review): assumes env.spec.reward_threshold is set for this task
        return x >= env.spec.reward_threshold

    # trainer
    result = onpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
        args.test_num, args.batch_size, stop_fn=stop_fn, save_fn=save_fn,
        writer=writer)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
def test(args=get_args()):
    """Load a saved DQN checkpoint ('dqn.pth') and render one episode on
    the wrapped MiniGrid-FourRooms environment."""
    # Let's watch its performance!
    env = LimitWrapper(
        StateBonus(ImgObsWrapper(gym.make('MiniGrid-FourRooms-v0'))))
    args.state_shape = env.observation_space.shape
    args.action_shape = env.env.action_space.shape or env.env.action_space.n
    # model — same architecture as the one that produced the checkpoint
    q_net = DQN(args.state_shape[0], args.state_shape[1],
                args.action_shape, args.device).to(args.device)
    optimizer = torch.optim.Adam(q_net.parameters(), lr=args.lr)
    policy = DQNPolicy(q_net, optimizer, args.gamma, args.n_step,
                       use_target_network=args.target_update_freq > 0,
                       target_update_freq=args.target_update_freq)
    policy.load_state_dict(torch.load('dqn.pth'))
    eval_collector = Collector(policy, env)
    result = eval_collector.collect(n_episode=1, render=args.render)
    print(f'Final reward: {result["rew"]}, length: {result["len"]}')
    eval_collector.close()
def train(hyper: dict):
    """Train A2C on CartPole-v1 using hyper-parameters from ``hyper``,
    then render one evaluation episode.

    Fix: ``stop_fn`` previously guarded on ``env.env.spec.reward_threshold``
    but compared against ``env.spec.reward_threshold``; it now reads the
    threshold once from a single attribute path.
    """
    env_id = 'CartPole-v1'
    env = gym.make(env_id)
    # CartPole-v1 observation/action dimensions (hard-coded for this env)
    hyper['state_dim'] = 4
    hyper['action_dim'] = 2
    train_envs = VectorEnv([lambda: gym.make(env_id)
                            for _ in range(hyper['training_num'])])
    test_envs = SubprocVectorEnv([lambda: gym.make(env_id)
                                  for _ in range(hyper['test_num'])])
    # 'seed' acts as an on/off flag; the actual seed is 'random_seed'
    if hyper['seed']:
        np.random.seed(hyper['random_seed'])
        torch.manual_seed(hyper['random_seed'])
        train_envs.seed(hyper['random_seed'])
        test_envs.seed(hyper['random_seed'])
    # NOTE(review): Pytorch.device() is a project helper — presumably
    # selects cuda/cpu; confirm its definition
    device = Pytorch.device()
    net = Net(hyper['layer_num'], hyper['state_dim'], device=device)
    actor = Actor(net, hyper['action_dim']).to(device)
    critic = Critic(net).to(device)
    optim = torch.optim.Adam(list(
        actor.parameters()) + list(critic.parameters()),
        lr=hyper['learning_rate'])
    dist = torch.distributions.Categorical
    policy = A2CPolicy(
        actor, critic, optim, dist, hyper['gamma'],
        vf_coef=hyper['vf_coef'], ent_coef=hyper['ent_coef'],
        max_grad_norm=hyper['max_grad_norm'])
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(hyper['capacity']))
    test_collector = Collector(policy, test_envs)
    writer = SummaryWriter('./a2c')

    def stop_fn(x):
        # read once; CartPole-v1 registers a threshold, but stay defensive
        threshold = env.spec.reward_threshold
        if threshold:
            return x >= threshold
        return False

    result = onpolicy_trainer(
        policy, train_collector, test_collector, hyper['epoch'],
        hyper['step_per_epoch'], hyper['collect_per_step'],
        hyper['repeat_per_collect'], hyper['test_num'],
        hyper['batch_size'], stop_fn=stop_fn, writer=writer, task=env_id)
    train_collector.close()
    test_collector.close()
    pprint.pprint(result)
    # watch the trained policy
    env = gym.make(env_id)
    collector = Collector(policy, env)
    result = collector.collect(n_episode=1, render=hyper['render'])
    print(f'Final reward: {result["rew"]}, length: {result["len"]}')
    collector.close()
def test_ppo(args=get_args()):
    """Train PPO on the environment built by ``gym_make()``.

    Fix: ``args.action_range`` was computed from ``env.action_shape.low[0]``
    — gym environments have no ``action_shape`` attribute (the paired
    expression correctly uses ``env.action_space.high[0]``); this would
    raise AttributeError. It now reads ``env.action_space.low[0]``.
    """
    torch.set_num_threads(1)  # we just need only one thread for NN
    env = gym_make()
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.action_range = [env.action_space.low[0], env.action_space.high[0]]
    # you can also use tianshou.env.SubprocVectorEnv
    # train_envs = gym_make()
    train_envs = VectorEnv(
        [lambda: gym_make() for _ in range(args.training_num)])
    # test_envs = gym_make()
    test_envs = VectorEnv([lambda: gym_make() for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    policy = init_policy(args, env)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, 'ppo')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        # checkpoint the best policy seen so far
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    # trainer (no stop_fn: runs all epochs)
    result = onpolicy_trainer(policy, train_collector, test_collector,
                              args.epoch, args.step_per_epoch,
                              args.collect_per_step, args.repeat_per_collect,
                              args.test_num, args.batch_size,
                              save_fn=save_fn, writer=writer)
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym_make()
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
# NOTE(review): this is the trailing fragment of a policy-gradient test
# script whose beginning was lost to a merge conflict; the unresolved
# conflict marker (">>>>>>> 4fd82676...") that made the file a syntax
# error has been removed. The names env/args/policy/train_collector/
# test_collector/log_path/writer are defined in the missing head.
def save_fn(policy):
    # checkpoint the best policy seen so far
    torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))


def stop_fn(x):
    # NOTE(review): assumes env.spec.reward_threshold is set for this task
    return x >= env.spec.reward_threshold


# trainer
result = onpolicy_trainer(
    policy, train_collector, test_collector, args.epoch,
    args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
    args.test_num, args.batch_size, stop_fn=stop_fn, save_fn=save_fn,
    writer=writer)
assert stop_fn(result['best_reward'])
train_collector.close()
test_collector.close()
if __name__ == '__main__':
    pprint.pprint(result)
    # Let's watch its performance!
    env = gym.make(args.task)
    collector = Collector(policy, env)
    result = collector.collect(n_episode=1, render=args.render)
    print(f'Final reward: {result["rew"]}, length: {result["len"]}')
    collector.close()


if __name__ == '__main__':
    # test_fn()
    test_pg()
def test_td3(args=get_args()):
    """Train TD3 on a continuous-control task, logging per-seed, and
    checkpointing the best policy."""
    # initialize environment
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model: one actor, twin critics (TD3's clipped double-Q)
    actor = Actor(args.layer_num, args.state_shape, args.action_shape,
                  args.max_action, args.device,
                  hidden_layer_size=args.hidden_size).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic1 = Critic(args.layer_num, args.state_shape, args.action_shape,
                     args.device,
                     hidden_layer_size=args.hidden_size).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(args.layer_num, args.state_shape, args.action_shape,
                     args.device,
                     hidden_layer_size=args.hidden_size).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
    policy = TD3Policy(
        actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
        args.tau, args.gamma, GaussianNoise(sigma=args.exploration_noise),
        args.policy_noise, args.update_actor_freq, args.noise_clip,
        action_range=[env.action_space.low[0], env.action_space.high[0]],
        reward_normalization=args.rew_norm, ignore_done=False)
    # collector
    # NOTE(review): with training_num == 0 the vector env itself is
    # expected to expose _max_episode_steps — confirm this branch is used
    if args.training_num == 0:
        max_episode_steps = train_envs._max_episode_steps
    else:
        max_episode_steps = train_envs.envs[0]._max_episode_steps
    train_collector = Collector(
        policy, train_envs,
        ReplayBuffer(args.buffer_size, max_ep_len=max_episode_steps))
    test_collector = Collector(policy, test_envs, mode='test')
    # log
    log_path = os.path.join(args.logdir, args.task, 'td3', str(args.seed))
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        # checkpoint the best policy seen so far
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    # effectively disables early stopping (threshold never reached);
    # NOTE(review): the assert below will then fail unless best_reward
    # actually exceeds 100000 — confirm this is intended
    env.spec.reward_threshold = 100000

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = offpolicy_exact_trainer(policy, train_collector, test_collector,
                                     args.epoch, args.step_per_epoch,
                                     args.collect_per_step, args.test_num,
                                     args.batch_size, stop_fn=stop_fn,
                                     save_fn=save_fn, writer=writer)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
def test_ppo(args=get_args()): torch.set_num_threads(1) # for poor CPU env = gym.make(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n # train_envs = gym.make(args.task) # you can also use tianshou.env.SubprocVectorEnv train_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model net = Net(args.layer_num, args.state_shape, device=args.device) actor = Actor(net, args.action_shape).to(args.device) critic = Critic(net).to(args.device) optim = torch.optim.Adam(list(actor.parameters()) + list(critic.parameters()), lr=args.lr) dist = torch.distributions.Categorical policy = PPOPolicy(actor, critic, optim, dist, args.gamma, max_grad_norm=args.max_grad_norm, eps_clip=args.eps_clip, vf_coef=args.vf_coef, ent_coef=args.ent_coef, action_range=None, gae_lambda=args.gae_lambda, reward_normalization=args.rew_norm, dual_clip=args.dual_clip, value_clip=args.value_clip) # collector train_collector = Collector(policy, train_envs, ReplayBuffer(args.buffer_size)) test_collector = Collector(policy, test_envs) # log log_path = os.path.join(args.logdir, args.task, 'ppo') writer = SummaryWriter(log_path) def save_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) def stop_fn(x): return x >= env.spec.reward_threshold # trainer result = onpolicy_trainer(policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.repeat_per_collect, args.test_num, args.batch_size, stop_fn=stop_fn, save_fn=save_fn, writer=writer) assert stop_fn(result['best_reward']) train_collector.close() test_collector.close() pprint.pprint(result)
def run_pg(args=get_args()):
    """Train vanilla policy gradient on ``args.task``, checkpointing the
    best policy under ``logdir/task/pg``.

    Fix: the log directory was created with three nested
    ``os.path.isdir``/``os.mkdir`` calls (check-then-act, race-prone);
    replaced with a single ``os.makedirs(..., exist_ok=True)``, which
    creates the same tree and is a no-op when it already exists.
    """
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    test_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model: softmax head so the network outputs action probabilities
    net = Net(args.layer_num, args.state_shape, args.action_shape,
              device=args.device, softmax=True)
    net = net.to(args.device)
    optim = torch.optim.Adam(net.parameters(), lr=args.lr)
    dist = torch.distributions.Categorical
    policy = PGPolicy(net, optim, dist, args.gamma,
                      reward_normalization=args.rew_norm)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, 'pg')
    os.makedirs(log_path, exist_ok=True)

    def save_fn(policy):
        # checkpoint the best policy seen so far
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        # NOTE(review): assumes env.spec.reward_threshold is set for this task
        return x >= env.spec.reward_threshold

    # trainer (result intentionally unused here)
    onpolicy_trainer(policy, train_collector, test_collector, args.epoch,
                     args.step_per_epoch, args.collect_per_step,
                     args.repeat_per_collect, args.test_num, args.batch_size,
                     stop_fn=stop_fn, save_fn=save_fn)
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
def test_drqn(args=get_args()):
    """Train a recurrent DQN (DRQN) on ``args.task`` using frame-stacked
    replay sampling for RNN training."""
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    # train_envs = gym.make(args.task)
    # you can also use tianshou.env.SubprocVectorEnv
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model: recurrent Q-network instead of a feed-forward one
    net = Recurrent(args.layer_num, args.state_shape,
                    args.action_shape, args.device)
    net = net.to(args.device)
    optim = torch.optim.Adam(net.parameters(), lr=args.lr)
    policy = DQNPolicy(net, optim, args.gamma, args.n_step,
                       use_target_network=args.target_update_freq > 0,
                       target_update_freq=args.target_update_freq)
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(
            args.buffer_size, stack_num=args.stack_num,
            ignore_obs_next=True))
    # the stack_num is for RNN training: sample framestack obs
    test_collector = Collector(policy, test_envs)
    # policy.set_eps(1)
    # pre-fill the replay buffer so the first gradient step has data
    train_collector.collect(n_step=args.batch_size)
    # log
    log_path = os.path.join(args.logdir, args.task, 'drqn')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        # checkpoint the best policy seen so far
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        # NOTE(review): assumes env.spec.reward_threshold is set for this task
        return x >= env.spec.reward_threshold

    def train_fn(x):
        policy.set_eps(args.eps_train)

    def test_fn(x):
        policy.set_eps(args.eps_test)

    # trainer
    result = offpolicy_trainer(policy, train_collector, test_collector,
                               args.epoch, args.step_per_epoch,
                               args.collect_per_step, args.test_num,
                               args.batch_size, train_fn=train_fn,
                               test_fn=test_fn, stop_fn=stop_fn,
                               save_fn=save_fn, writer=writer)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
def _test_ppo(args=get_args()):
    """Continuous-action PPO demo (leading underscore: not collected by
    pytest). The original author notes it does not converge yet."""
    # just a demo, I have not made it work :(
    env = gym.make(args.task)
    if args.task == 'Pendulum-v0':
        # Pendulum has no registered threshold; -250 is a common proxy
        env.spec.reward_threshold = -250
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # train_envs = gym.make(args.task)
    train_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model: Gaussian policy for the continuous action space
    actor = ActorProb(args.layer_num, args.state_shape, args.action_shape,
                      args.max_action, args.device).to(args.device)
    critic = Critic(args.layer_num, args.state_shape,
                    device=args.device).to(args.device)
    optim = torch.optim.Adam(list(actor.parameters()) +
                             list(critic.parameters()), lr=args.lr)
    dist = torch.distributions.Normal
    policy = PPOPolicy(
        actor, critic, optim, dist, args.gamma,
        max_grad_norm=args.max_grad_norm,
        eps_clip=args.eps_clip,
        vf_coef=args.vf_coef,
        ent_coef=args.ent_coef,
        action_range=[env.action_space.low[0], env.action_space.high[0]])
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # warm up the buffer with one epoch's worth of random interaction
    train_collector.collect(n_step=args.step_per_epoch)
    # log
    writer = SummaryWriter(args.logdir + '/' + 'ppo')

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = onpolicy_trainer(policy, train_collector, test_collector,
                              args.epoch, args.step_per_epoch,
                              args.collect_per_step, args.repeat_per_collect,
                              args.test_num, args.batch_size,
                              stop_fn=stop_fn, writer=writer, task=args.task)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
class View(object):
    """A SAC agent whose observation is masked: selected state
    dimensions are zeroed via ``_view_mask`` so the actor/critics see a
    partial 'view' of the environment state.

    mask: 'even' zeroes even indices, 'odd' zeroes odd indices, an int
    zeroes that single index, None keeps the full state.
    """

    def __init__(self, args, mask=None, name='full'):
        env = gym.make(args.task)
        if args.task == 'Pendulum-v0':
            # Pendulum has no registered threshold; -250 is a common proxy
            env.spec.reward_threshold = -250
        self.state_shape = env.observation_space.shape or env.observation_space.n
        self.action_shape = env.action_space.shape or env.action_space.n
        self.max_action = env.action_space.high[0]
        self.stop_fn = lambda x: x >= env.spec.reward_threshold
        # env
        self.train_envs = VectorEnv(
            [lambda: gym.make(args.task) for _ in range(args.training_num)])
        self.test_envs = SubprocVectorEnv(
            [lambda: gym.make(args.task) for _ in range(args.test_num)])
        # mask: 1 keeps a state dimension, 0 hides it from this view
        state_dim = int(np.prod(self.state_shape))
        self._view_mask = torch.ones(state_dim)
        if mask == 'even':
            for i in range(0, state_dim, 2):
                self._view_mask[i] = 0
        elif mask == "odd":
            for i in range(1, state_dim, 2):
                self._view_mask[i] = 0
        elif type(mask) == int:
            self._view_mask[mask] = 0
        # policy: SAC with twin masked critics
        self.actor = ActorProbWithView(
            args.layer_num, self.state_shape, self.action_shape,
            self.max_action, self._view_mask, args.device
        ).to(args.device)
        self.actor_optim = torch.optim.Adam(self.actor.parameters(),
                                            lr=args.actor_lr)
        self.critic1 = CriticWithView(
            args.layer_num, self.state_shape, self._view_mask,
            self.action_shape, args.device
        ).to(args.device)
        self.critic1_optim = torch.optim.Adam(self.critic1.parameters(),
                                              lr=args.critic_lr)
        self.critic2 = CriticWithView(
            args.layer_num, self.state_shape, self._view_mask,
            self.action_shape, args.device
        ).to(args.device)
        self.critic2_optim = torch.optim.Adam(self.critic2.parameters(),
                                              lr=args.critic_lr)
        self.policy = SACPolicy(
            self.actor, self.actor_optim,
            self.critic1, self.critic1_optim,
            self.critic2, self.critic2_optim,
            args.tau, args.gamma, args.alpha,
            [env.action_space.low[0], env.action_space.high[0]],
            reward_normalization=True, ignore_done=True)
        # collector
        self.train_collector = Collector(self.policy, self.train_envs,
                                         ReplayBuffer(args.buffer_size))
        self.test_collector = Collector(self.policy, self.test_envs)
        # log
        self.writer = SummaryWriter(
            f"{args.logdir}/{args.task}/sac/{args.note}/{name}")

    def seed(self, _seed):
        """Seed both vectorized environments."""
        self.train_envs.seed(_seed)
        self.test_envs.seed(_seed)

    def close(self):
        """Close both collectors (and their environments)."""
        self.train_collector.close()
        self.test_collector.close()

    def train(self):
        """Put actor and critics into training mode."""
        self.actor.train()
        self.critic1.train()
        self.critic2.train()

    def learn_from_demos(self, batch, demo, peer=0):
        """One behavior-cloning step towards demo actions; with peer > 0,
        subtract a 'peer' term (MSE against shuffled demo actions) from
        the loss."""
        acts = self.policy(batch).act
        demo = demo.act.detach()
        loss = F.mse_loss(acts, demo)
        if peer != 0:
            # shuffled demo actions act as a negative/peer sample
            peer_demo = demo[torch.randperm(len(demo))]
            loss -= peer * F.mse_loss(acts, peer_demo)
        # NOTE(review): assumes the SACPolicy instance exposes actor_optim
        # as an attribute — confirm against the project's SACPolicy
        self.policy.actor_optim.zero_grad()
        loss.backward()
        self.policy.actor_optim.step()
def test_sac(args=get_args()):
    """Train SAC with continuous-transition construction on ``args.task``.

    Builds a standard SAC actor/twin-critic stack plus two extras used by
    ``SACMUTRIRB2BPolicy``: an energy-based discriminator network and a
    tunable temperature ``beta`` for the triple-transition interpolation.
    Results and checkpoints go to ``<logdir>/<task>/sac_ct/<seed>``.
    """
    # initialize environment
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    actor = ActorProb(args.layer_num, args.state_shape, args.action_shape,
                      args.max_action, args.device,
                      hidden_layer_size=args.hidden_size).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic1 = Critic(args.layer_num, args.state_shape, args.action_shape,
                     args.device,
                     hidden_layer_size=args.hidden_size).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(args.layer_num, args.state_shape, args.action_shape,
                     args.device,
                     hidden_layer_size=args.hidden_size).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
    # energy-based discriminator: takes concat(state, action) (action dim 0
    # here) and outputs state_dim + 1 values
    disc = Critic(
        args.layer_num,
        np.prod(args.state_shape) + np.prod(args.action_shape),
        0,
        args.device,
        hidden_layer_size=args.hidden_size,
        output_dim=np.prod(args.state_shape) + 1,
    ).to(args.device)
    disc_optim = torch.optim.Adam(disc.parameters(), lr=args.critic_lr)
    # tunable temperature (learned scalar)
    beta = torch.ones(1, requires_grad=True, device=args.device)
    beta_optim = torch.optim.Adam([beta], lr=args.critic_lr)
    if args.auto_alpha:
        # automatic entropy tuning: learn log_alpha toward -|A| target entropy
        target_entropy = -np.prod(env.action_space.shape)
        log_alpha = torch.zeros(1, requires_grad=True, device=args.device)
        alpha_optim = torch.optim.Adam([log_alpha], lr=args.alpha_lr)
        alpha = (target_entropy, log_alpha, alpha_optim)
    else:
        alpha = args.alpha
    # dedicated RNG so transition interpolation is reproducible per seed
    rng = np.random.RandomState(seed=args.seed)
    policy = SACMUTRIRB2BPolicy(
        actor,
        actor_optim,
        critic1,
        critic1_optim,
        critic2,
        critic2_optim,
        args.tau,
        args.gamma,
        alpha,
        [env.action_space.low[0], env.action_space.high[0]],
        reward_normalization=False,
        ignore_done=False,
        norm_diff=False,
        use_diff=False,
        process_tri=(lambda x, beta: process_tri(x, rng=rng, beta=beta)
                     ),  # continuous transition construction
        beta=(beta, beta_optim),  # the tunable temperature
        discriminator=(disc, disc_optim),  # the energy-based discriminator
        tor_diff=args.tor_diff  # the tolerance of distance
    )
    # collector — the triple buffer needs the episode-length cap of the env;
    # with training_num == 0, train_envs is queried directly
    if args.training_num == 0:
        max_episode_steps = train_envs._max_episode_steps
    else:
        max_episode_steps = train_envs.envs[0]._max_episode_steps
    train_collector = Collector(
        policy, train_envs,
        ReplayBufferTriple(args.buffer_size, max_ep_len=max_episode_steps))
    test_collector = Collector(policy, test_envs, mode='test')
    # log
    log_path = os.path.join(args.logdir, args.task, 'sac_ct', str(args.seed))
    writer = SummaryWriter(log_path)

    def save_fn(policy, name='policy.pth'):
        torch.save(policy.state_dict(), os.path.join(log_path, name))

    # effectively disables early stopping: train for the full budget
    env.spec.reward_threshold = 100000

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = offpolicy_exact_trainer(policy, train_collector, test_collector,
                                     args.epoch, args.step_per_epoch,
                                     args.collect_per_step, args.test_num,
                                     args.batch_size, stop_fn=stop_fn,
                                     save_fn=save_fn, writer=writer,
                                     epochs_to_save=[1, 50, 100, 150, 200])
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
class View(object):
    """A self-contained PPO training bundle (discrete actions) whose shared
    network observes the state through a binary mask (a partial "view").

    Parameters
    ----------
    args : argparse.Namespace
        Hyper-parameters (task, layer_num, lr, gamma, PPO clip/coef settings,
        buffer_size, training_num/test_num, device, logdir, note, ...).
    mask : 'even' | 'odd' | int | None
        Which state dimensions to zero out; None keeps the full view.
    name : str
        Suffix of the tensorboard log directory.
    """

    def __init__(self, args, mask=None, name='full'):
        env = gym.make(args.task)
        self.stop_fn = lambda x: x >= env.spec.reward_threshold
        self.state_shape = env.observation_space.shape or env.observation_space.n
        self.action_shape = env.action_space.shape or env.action_space.n
        # extra standalone buffer (capacity hard-coded to 400k), separate
        # from the train collector's buffer below
        self.buffer = ReplayBuffer(400000)
        # Env
        # train_envs = gym.make(args.task)
        self.train_envs = SubprocVectorEnv(
            [lambda: gym.make(args.task) for _ in range(args.training_num)])
        # test_envs = gym.make(args.task)
        self.test_envs = SubprocVectorEnv(
            [lambda: gym.make(args.task) for _ in range(args.test_num)])
        # Mask: 1 keeps a state dimension, 0 hides it
        state_dim = int(np.prod(self.state_shape))
        self._view_mask = torch.ones(state_dim)
        if mask == 'even':
            for i in range(0, state_dim, 2):
                self._view_mask[i] = 0
        elif mask == "odd":
            for i in range(1, state_dim, 2):
                self._view_mask[i] = 0
        elif type(mask) == int:
            self._view_mask[mask] = 0
        # Model: actor and critic share the masked feature net, so one
        # optimizer covers both parameter sets
        net = NetWithView(args.layer_num, self.state_shape,
                          device=args.device, mask=self._view_mask)
        self.actor = Actor(net, self.action_shape).to(args.device)
        self.critic = Critic(net).to(args.device)
        optim = torch.optim.Adam(list(
            self.actor.parameters()) + list(self.critic.parameters()),
            lr=args.lr)
        dist = torch.distributions.Categorical
        self.policy = PPOPolicy(
            self.actor, self.critic, optim, dist, args.gamma,
            max_grad_norm=args.max_grad_norm,
            eps_clip=args.eps_clip,
            vf_coef=args.vf_coef,
            ent_coef=args.ent_coef,
            action_range=None)
        # Collector
        self.train_collector = Collector(
            self.policy, self.train_envs, ReplayBuffer(args.buffer_size))
        self.test_collector = Collector(self.policy, self.test_envs)
        # Log
        self.writer = SummaryWriter(
            f'{args.logdir}/{args.task}/ppo/{args.note}/{name}')

    def seed(self, _seed):
        """Seed both vectorized environment sets."""
        self.train_envs.seed(_seed)
        self.test_envs.seed(_seed)

    def close(self):
        """Shut down both collectors (and their environments)."""
        self.train_collector.close()
        self.test_collector.close()

    def train(self):
        """Switch actor and critic to training mode."""
        self.actor.train()
        self.critic.train()

    def learn_from_demos(self, batch, demo, peer=0):
        """One behavior-cloning step for the discrete policy: cross-entropy
        between this policy's logits and the demonstrator's actions.

        When ``peer != 0`` a "peer" penalty is subtracted: cross-entropy
        against a randomly permuted copy of the demo actions.
        """
        logits = self.policy(batch).logits
        demo = demo.act.detach()
        loss = F.cross_entropy(logits, demo)
        if peer != 0:
            peer_demo = demo[torch.randperm(len(demo))]
            loss -= peer * F.cross_entropy(logits, peer_demo)
        self.policy.optim.zero_grad()
        loss.backward()
        self.policy.optim.step()
def test_dqn(args=get_args()):
    """Train a DQN agent on a (mini)grid task wrapped with
    ``LimitWrapper(StateBonus(ImgObsWrapper(...)))`` and save the final
    policy weights to ``dqn.pth``.
    """
    env = LimitWrapper(StateBonus(ImgObsWrapper(gym.make(args.task))))
    # print(env.observation_space.spaces['image'])
    # args.state_shape = env.observation_space.spaces['image'].shape
    args.state_shape = env.observation_space.shape
    # action space is read from the wrapped env (env.env)
    args.action_shape = env.env.action_space.shape or env.env.action_space.n
    # train_envs = gym.make(args.task)
    train_envs = SubprocVectorEnv([
        lambda: LimitWrapper(StateBonus(ImgObsWrapper(gym.make(args.task))))
        for _ in range(args.training_num)
    ])
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv([
        lambda: LimitWrapper(StateBonus(ImgObsWrapper(gym.make(args.task))))
        for _ in range(args.test_num)
    ])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model — convolutional DQN fed the first two dims of the image shape
    net = DQN(args.state_shape[0], args.state_shape[1], args.action_shape,
              args.device)
    net = net.to(args.device)
    optim = torch.optim.Adam(net.parameters(), lr=args.lr)
    policy = DQNPolicy(net, optim, args.gamma, args.n_step,
                       use_target_network=args.target_update_freq > 0,
                       target_update_freq=args.target_update_freq)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # policy.set_eps(1)
    # warm up the replay buffer before training starts
    train_collector.collect(n_step=args.batch_size * 4)
    print(len(train_collector.buffer))
    # log
    writer = SummaryWriter(args.logdir + '/' + 'dqn')

    def stop_fn(x):
        # if env.env.spec.reward_threshold:
        #     return x >= env.spec.reward_threshold
        # else:
        #     return False
        # early stopping deliberately disabled: always train the full budget
        return False

    def train_fn(x):
        policy.set_eps(args.eps_train)

    def test_fn(x):
        policy.set_eps(args.eps_test)

    # trainer
    result = offpolicy_trainer(policy, train_collector, test_collector,
                               args.epoch, args.step_per_epoch,
                               args.collect_per_step, args.test_num,
                               args.batch_size, train_fn=train_fn,
                               test_fn=test_fn, stop_fn=stop_fn,
                               writer=writer, task=args.task)
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = LimitWrapper(StateBonus(ImgObsWrapper(gym.make(args.task))))
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
    # NOTE(review): placed at function level so the checkpoint is written
    # whether or not run as a script — confirm against the original layout.
    torch.save(policy.state_dict(), 'dqn.pth')
# --- Script tail: launch tensorboard, run on-policy training, then replay ---
# NOTE(review): this fragment relies on names defined earlier in the file
# (tb, data_path, policy, train_collector, test_collector, trainer_config,
# env_creator, TARGET_EPISODE_STEPS) — confirm against the full script.
tb.configure(host="0.0.0.0", logdir=data_path)
url = tb.launch()
print(f"Started Tensorboard {url} at {data_path}...")
writer = SummaryWriter(data_path)


### Configure export
def save_fn(policy):
    torch.save(policy.state_dict(), os.path.join(data_path, 'policy.pth'))


### Configure early stopping of training
def stop_fn(x):
    return x >= TARGET_EPISODE_STEPS


### Run the learning process
result = onpolicy_trainer(
    policy, train_collector, test_collector, **trainer_config,
    stop_fn=stop_fn, save_fn=save_fn, writer=writer, verbose=True)
print(f'Finished training! Use {result["duration"]}')

### Stop the data collectors
train_collector.close()
test_collector.close()

# Enjoy a trained agent !
env = env_creator()
collector = Collector(policy, env)
# render period is the env's own timestep (env.dt)
result = collector.collect(n_episode=1, render=env.dt)
print(f'Final reward: {result["rew"]}, length: {result["len"]}')
collector.close()
def train(args=get_args()):
    """Train (or resume training) a DDPG agent on the ``gym_make()`` env and
    checkpoint the policy to ``<logdir>/<task>/ddpg/policy.pth``.
    """
    torch.set_num_threads(1)  # we just need only one thread for NN
    env = gym_make()
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.action_range = [env.action_space.low, env.action_space.high]
    train_envs = VectorEnv(
        [lambda: gym_make() for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = VectorEnv([lambda: gym_make() for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    actor = Actor(args.layer, args.state_shape, args.action_shape,
                  args.action_range, args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic = Critic(args.layer, args.state_shape, args.action_shape,
                    args.device).to(args.device)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    policy = DDPGPolicy(actor, actor_optim, critic, critic_optim, args.tau,
                        args.gamma, args.exploration_noise, args.action_range,
                        reward_normalization=args.rew_norm, ignore_done=True)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, 'ddpg')
    writer = SummaryWriter(log_path)
    # if a model exist, continue to train it
    model_path = os.path.join(log_path, 'policy.pth')
    if os.path.exists(model_path):
        policy.load_state_dict(torch.load(model_path))

    def save_fn(policy):
        torch.save(policy.state_dict(), model_path)

    # NOTE(review): stop_fn is defined but not passed to offpolicy_trainer
    # below, so training never early-stops — confirm this is intentional.
    def stop_fn(x):
        return x >= 100

    # trainer
    result = offpolicy_trainer(policy, train_collector, test_collector,
                               args.epoch, args.step_per_epoch,
                               args.collect_per_step, args.test_num,
                               args.batch_size, save_fn=save_fn,
                               writer=writer)
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        # Let's watch its performance!
        env = gym_make()
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
def train_agent(
    args: argparse.Namespace = get_args(),
    agent_learn: Optional[BasePolicy] = None,
    agent_opponent: Optional[BasePolicy] = None,
    optim: Optional[torch.optim.Optimizer] = None,
) -> Tuple[dict, BasePolicy]:
    """Train the learning agent of a two-player Tic-Tac-Toe multi-agent DQN
    setup and return ``(trainer_result, learned_policy)``.

    ``policy`` is a multi-agent manager; only ``policies[args.agent_id - 1]``
    (the learning side) is saved, epsilon-scheduled, and returned.
    """
    def env_func():
        return TicTacToeEnv(args.board_size, args.win_size)
    train_envs = VectorEnv([env_func for _ in range(args.training_num)])
    test_envs = VectorEnv([env_func for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    policy, optim = get_agents(args, agent_learn=agent_learn,
                               agent_opponent=agent_opponent, optim=optim)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # policy.set_eps(1)
    # warm up the replay buffer before training starts
    train_collector.collect(n_step=args.batch_size)
    # log — reuse a writer cached on args so repeated calls share one run
    if not hasattr(args, 'writer'):
        log_path = os.path.join(args.logdir, 'tic_tac_toe', 'dqn')
        writer = SummaryWriter(log_path)
        args.writer = writer
    else:
        writer = args.writer

    def save_fn(policy):
        if hasattr(args, 'model_save_path'):
            model_save_path = args.model_save_path
        else:
            model_save_path = os.path.join(args.logdir, 'tic_tac_toe', 'dqn',
                                           'policy.pth')
        # save only the learning agent's weights, not the whole manager
        torch.save(policy.policies[args.agent_id - 1].state_dict(),
                   model_save_path)

    def stop_fn(x):
        # stop once the tracked metric reaches the target win rate
        return x >= args.win_rate

    def train_fn(x):
        policy.policies[args.agent_id - 1].set_eps(args.eps_train)

    def test_fn(x):
        policy.policies[args.agent_id - 1].set_eps(args.eps_test)

    # trainer
    result = offpolicy_trainer(policy, train_collector, test_collector,
                               args.epoch, args.step_per_epoch,
                               args.collect_per_step, args.test_num,
                               args.batch_size, train_fn=train_fn,
                               test_fn=test_fn, stop_fn=stop_fn,
                               save_fn=save_fn, writer=writer,
                               test_in_train=False)
    train_collector.close()
    test_collector.close()
    return result, policy.policies[args.agent_id - 1]
def test_sac(args=get_args()):
    """Distill a pre-trained SAC expert into a fresh policy by behavior
    cloning (MSE on actions), optionally with a "peer" penalty term.

    The expert checkpoint is loaded from
    ``<logdir>/<task>/sac/<args.load>/policy.pth`` (``args.load`` required);
    the expert collects the training data, while the student is evaluated on
    the test envs.
    """
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # train_envs = gym.make(args.task)
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model — standard SAC actor + twin critics for the student policy
    actor = ActorProb(args.layer_num, args.state_shape, args.action_shape,
                      args.max_action, args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic1 = Critic(args.layer_num, args.state_shape, args.action_shape,
                     args.device).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(args.layer_num, args.state_shape, args.action_shape,
                     args.device).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
    policy = SACPolicy(actor, actor_optim, critic1, critic1_optim, critic2,
                       critic2_optim, args.tau, args.gamma, args.alpha,
                       [env.action_space.low[0], env.action_space.high[0]],
                       reward_normalization=True, ignore_done=True)
    # Load expert model.
    assert args.load is not None, 'args.load should not be None'
    expert = deepcopy(policy)
    expert.load_state_dict(
        torch.load(f'{args.logdir}/{args.task}/sac/{args.load}/policy.pth'))
    expert.eval()
    # collector — the expert gathers demonstrations, the student is tested
    expert_collector = Collector(expert, train_envs,
                                 ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # train_collector.collect(n_step=args.buffer_size)
    # log
    writer = SummaryWriter(f'{args.logdir}/{args.task}/imitation/{args.note}')

    def stop_fn(x):
        return x >= (args.reward_threshold or env.spec.reward_threshold)

    def learner(pol, batch, batch_size, repeat, peer=0.):
        """Behavior-cloning update: regress student actions onto the expert's
        recorded actions; subtract a peer penalty (MSE against shuffled
        demos) when ``peer != 0``. Returns per-minibatch loss logs.
        """
        losses, peer_terms, ent_losses = [], [], []
        for _ in range(repeat):
            for b in batch.split(batch_size):
                acts = pol(b).act
                demo = torch.tensor(b.act, dtype=torch.float)
                loss = F.mse_loss(acts, demo)
                if peer != 0:
                    peer_demo = demo[torch.randperm(len(demo))]
                    peer_term = peer * F.mse_loss(acts, peer_demo)
                    loss -= peer_term
                    # BUG FIX: was `peer_term.detach().cpu.numpy()` —
                    # `.cpu` is a method, so the missing call parentheses
                    # raised AttributeError the first time peer != 0.
                    peer_terms.append(peer_term.detach().cpu().numpy())
                pol.actor_optim.zero_grad()
                loss.backward()
                pol.actor_optim.step()
                losses.append(loss.detach().cpu().numpy())
        return {
            'loss': losses,
            'loss/ent': ent_losses,
            'loss/peer': peer_terms if peer else None,
            'peer': peer,
        }

    # trainer
    result = imitation_trainer(policy, learner, expert_collector,
                               test_collector, args.epoch,
                               args.step_per_epoch, args.collect_per_step, 1,
                               args.test_num, args.batch_size,
                               stop_fn=stop_fn, writer=writer,
                               task=args.task, peer=args.peer,
                               peer_decay_steps=args.peer_decay_steps)
    assert stop_fn(result['best_reward'])
    expert_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
def test_ppo(args=get_args()):
    """Train PPO with Gaussian actions on ``args.task`` and assert the best
    reward reaches the env's reward threshold; checkpoints and logs go to
    ``<logdir>/<task>/ppo``.
    """
    torch.set_num_threads(1)  # we just need only one thread for NN
    env = gym.make(args.task)
    if args.task == 'Pendulum-v0':
        # Pendulum-v0 ships without a reward threshold; -250 counts as solved
        env.spec.reward_threshold = -250
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # you can also use tianshou.env.SubprocVectorEnv
    # train_envs = gym.make(args.task)
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model — actor and critic trained jointly by a single optimizer
    actor = ActorProb(args.layer_num, args.state_shape, args.action_shape,
                      args.max_action, args.device).to(args.device)
    critic = Critic(args.layer_num, args.state_shape,
                    device=args.device).to(args.device)
    optim = torch.optim.Adam(list(actor.parameters()) +
                             list(critic.parameters()), lr=args.lr)
    dist = torch.distributions.Normal
    policy = PPOPolicy(
        actor, critic, optim, dist, args.gamma,
        max_grad_norm=args.max_grad_norm,
        eps_clip=args.eps_clip,
        vf_coef=args.vf_coef,
        ent_coef=args.ent_coef,
        reward_normalization=args.rew_norm,
        dual_clip=args.dual_clip,
        value_clip=args.value_clip,
        # action_range=[env.action_space.low[0], env.action_space.high[0]],)
        # if clip the action, ppo would not converge :)
        gae_lambda=args.gae_lambda)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, 'ppo')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = onpolicy_trainer(policy, train_collector, test_collector,
                              args.epoch, args.step_per_epoch,
                              args.collect_per_step, args.repeat_per_collect,
                              args.test_num, args.batch_size,
                              stop_fn=stop_fn, save_fn=save_fn, writer=writer)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
def test_dqn(args=get_args()):
    """Train DQN on an Atari task preprocessed with ``AtariPreprocessing``
    (scaled observations) and a 4-frame stack.
    """
    env = FrameStack(AtariPreprocessing(gym.make(args.task), scale_obs=True),
                     4)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    # action space read from the wrapped env (env.env)
    args.action_shape = env.env.action_space.shape or env.env.action_space.n
    # train_envs = gym.make(args.task)
    train_envs = SubprocVectorEnv([
        lambda: FrameStack(
            AtariPreprocessing(gym.make(args.task), scale_obs=True), 4)
        for _ in range(args.training_num)
    ])
    # test_envs = gym.make(args.task)
    test_envs = ShmVecEnv([
        lambda: FrameStack(
            AtariPreprocessing(gym.make(args.task), scale_obs=True), 4)
        for _ in range(args.test_num)
    ])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model — indices [1] and [2] of the stacked shape feed the conv net
    net = DQN(args.state_shape[1], args.state_shape[2], args.action_shape,
              args.device)
    net = net.to(args.device)
    optim = torch.optim.Adam(net.parameters(), lr=args.lr)
    policy = DQNPolicy(net, optim, args.gamma, args.n_step,
                       use_target_network=args.target_update_freq > 0,
                       target_update_freq=args.target_update_freq)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size), episodic=True)
    test_collector = Collector(policy, test_envs)
    # policy.set_eps(1)
    # warm up the replay buffer with 10k sampled steps before training
    train_collector.collect(n_step=10000, sampling=True)
    print(len(train_collector.buffer))
    # log
    writer = SummaryWriter(args.logdir + '/' + 'dqn')

    def stop_fn(x):
        # NOTE(review): the guard reads env.env.spec but the comparison reads
        # env.spec — for these wrappers both may resolve to the same spec, but
        # confirm; otherwise the threshold could come from a different object.
        if env.env.spec.reward_threshold:
            return x >= env.spec.reward_threshold
        else:
            return False

    def train_fn(x):
        policy.set_eps(args.eps_train)

    def test_fn(x):
        policy.set_eps(args.eps_test)

    # trainer
    result = offpolicy_trainer(policy, train_collector, test_collector,
                               args.epoch, args.step_per_epoch,
                               args.collect_per_step, args.test_num,
                               args.batch_size, train_fn=train_fn,
                               test_fn=test_fn, stop_fn=stop_fn,
                               writer=writer, task=args.task)
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = FrameStack(
            AtariPreprocessing(gym.make(args.task), scale_obs=True), 4)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
def test_sac():
    """Train (or, with ``args.mode == 'test'``, replay) a SAC agent using
    shared-memory pipe vector envs; training rewards are scaled 5x through
    ``TransformReward``.
    """
    args, log_path, writer = get_args()
    env = gym.make(args.task)
    if args.task == 'Pendulum-v0':
        # Pendulum-v0 ships without a reward threshold; -250 counts as solved
        env.spec.reward_threshold = -250
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # you can also use tianshou.env.SubprocVectorEnv
    # train_envs = gym.make(args.task)
    # training envs wrap the task and multiply every reward by 5
    train_envs = ShmPipeVecEnv([
        lambda: TransformReward(BipedalWrapper(gym.make(args.task)),
                                lambda reward: 5 * reward)
        for _ in range(args.training_num)
    ])
    # test_envs = gym.make(args.task)
    test_envs = ShmPipeVecEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed — test envs use seed+1 so they differ from the training streams
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed + 1)
    # model — single double-Q critic module plus an explicit target copy
    actor = ActorProb(args.layer_num, args.state_shape, args.action_shape,
                      args.max_action, args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic = DQCritic(args.layer_num, args.state_shape, args.action_shape,
                      args.device).to(args.device)
    critic_target = DQCritic(args.layer_num, args.state_shape,
                             args.action_shape, args.device).to(args.device)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    policy = SACPolicy(actor, actor_optim, critic, critic_optim,
                       critic_target, env.action_space, args.device,
                       args.tau, args.gamma, args.alpha,
                       reward_normalization=args.rew_norm, ignore_done=False)
    if args.mode == 'test':
        # evaluation-only path: load a checkpoint, roll out 10 episodes, exit
        policy.load_state_dict(
            torch.load("{}/{}/{}/policy.pth".format(args.logdir, args.task,
                                                    args.comment),
                       map_location=args.device))
        env = gym.make(args.task)
        collector = Collector(policy, env
                              # Monitor(env, 'video', force=True)
                              )
        result = collector.collect(n_episode=10, render=args.render)
        print(
            f'Final reward: {result["ep/reward"]}, length: {result["ep/len"]}')
        collector.close()
        exit()
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    # warm up the replay buffer with 10k sampled steps
    train_collector.collect(10000, sampling=True)
    test_collector = Collector(policy, test_envs)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        # require slightly more than the env threshold before stopping
        return x >= env.spec.reward_threshold + 5

    # trainer
    result = offpolicy_trainer(policy, train_collector, test_collector,
                               args.epoch, args.step_per_epoch,
                               args.collect_per_step, args.test_episode,
                               args.batch_size, stop_fn=stop_fn,
                               save_fn=save_fn, writer=writer)
    assert stop_fn(result['best_reward'])
    pprint.pprint(result)
def test_sac(args=get_args()):
    """Train SAC with Ornstein-Uhlenbeck exploration noise and optional
    automatic entropy (alpha) tuning; checkpoints go to
    ``<logdir>/<task>/sac/policy.pth``.
    """
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # train_envs = gym.make(args.task)
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    actor = ActorProb(args.layer_num, args.state_shape, args.action_shape,
                      args.max_action, args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic1 = Critic(args.layer_num, args.state_shape, args.action_shape,
                     args.device).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(args.layer_num, args.state_shape, args.action_shape,
                     args.device).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
    if args.auto_alpha:
        # automatic entropy tuning: learn log_alpha toward -|A| target entropy
        target_entropy = -np.prod(env.action_space.shape)
        log_alpha = torch.zeros(1, requires_grad=True, device=args.device)
        alpha_optim = torch.optim.Adam([log_alpha], lr=args.alpha_lr)
        alpha = (target_entropy, log_alpha, alpha_optim)
    else:
        alpha = args.alpha
    policy = SACPolicy(actor, actor_optim, critic1, critic1_optim, critic2,
                       critic2_optim, args.tau, args.gamma, alpha,
                       [env.action_space.low[0], env.action_space.high[0]],
                       reward_normalization=args.rew_norm, ignore_done=True,
                       exploration_noise=OUNoise(0.0, args.noise_std))
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # train_collector.collect(n_step=args.buffer_size)
    # log
    log_path = os.path.join(args.logdir, args.task, 'sac')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = offpolicy_trainer(policy, train_collector, test_collector,
                               args.epoch, args.step_per_epoch,
                               args.collect_per_step, args.test_num,
                               args.batch_size, stop_fn=stop_fn,
                               save_fn=save_fn, writer=writer)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
def test_sac(args=get_args()):
    """Train a plain SAC agent on ``args.task`` until the environment's
    reward threshold is reached, saving the best checkpoint to
    ``<logdir>/<task>/sac/policy.pth``.
    """
    torch.set_num_threads(1)  # we just need only one thread for NN
    env = gym.make(args.task)
    if args.task == 'Pendulum-v0':
        # Pendulum-v0 ships without a threshold; -250 counts as solved here
        env.spec.reward_threshold = -250
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]

    # vectorized environments (tianshou.env.SubprocVectorEnv also works)
    def make_env():
        return gym.make(args.task)

    train_envs = VectorEnv([make_env for _ in range(args.training_num)])
    test_envs = VectorEnv([make_env for _ in range(args.test_num)])

    # seeding — order matters: numpy, torch, then the env sets
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)

    # networks: one stochastic actor plus twin critics (same creation order
    # as before, so parameter initialization draws are unchanged)
    device = args.device
    actor = ActorProb(args.layer_num, args.state_shape, args.action_shape,
                      args.max_action, device).to(device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    q_nets, q_optims = [], []
    for _ in range(2):
        q_net = Critic(args.layer_num, args.state_shape, args.action_shape,
                       device).to(device)
        q_nets.append(q_net)
        q_optims.append(torch.optim.Adam(q_net.parameters(),
                                         lr=args.critic_lr))
    policy = SACPolicy(actor, actor_optim,
                       q_nets[0], q_optims[0],
                       q_nets[1], q_optims[1],
                       args.tau, args.gamma, args.alpha,
                       [env.action_space.low[0], env.action_space.high[0]],
                       reward_normalization=True, ignore_done=True)

    # data collection
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # train_collector.collect(n_step=args.buffer_size)

    # tensorboard logging
    log_path = os.path.join(args.logdir, args.task, 'sac')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        # checkpoint the current best policy
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        # solved once the mean reward reaches the env's threshold
        return x >= env.spec.reward_threshold

    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, stop_fn=stop_fn, save_fn=save_fn, writer=writer)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()

    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
def test_dqn(args=get_args()):
    """Train a DQN path-planning agent on a ROS-registered gym task, save the
    whole network to ``ginger_dqn_pathplanning.pkl``, then reload it and
    watch one rendered episode.
    """
    # env
    task_env = EnvRegister(args.task)
    env = gym.make(task_env)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    rospy.loginfo(args.state_shape)
    rospy.loginfo(args.action_shape)
    train_envs = VectorEnv(
        [lambda: gym.make(task_env) for _ in range(args.training_num)])
    test_envs = VectorEnv(
        [lambda: gym.make(task_env) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net(args.layer_num, args.state_shape, args.action_shape,
              args.device)
    net = net.to(args.device)
    optim = torch.optim.Adam(net.parameters(), lr=args.lr)
    policy = DQNPolicy(net, optim, args.gamma, args.n_step,
                       use_target_network=args.target_update_freq > 0,
                       target_update_freq=args.target_update_freq)
    # collector
    rospy.loginfo("init collector")
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # warm up the replay buffer before training starts
    train_collector.collect(n_step=args.batch_size)
    # log
    writer = SummaryWriter(args.logdir + '/' + 'dqn')
    # accumulates goal-reaching records across stop_fn calls
    rew_record = []

    def stop_fn(x):
        # NOTE(review): this expects x to be an iterable of dicts carrying a
        # 'reach_goal' flag (a custom trainer contract) — the final
        # `assert stop_fn(result['best_reward'])` below would pass a scalar
        # instead; confirm which trainer variant is in use.
        # if x >= 10000:
        for s in x:
            if s.get('reach_goal') == True:
                rew_record.extend(s)
        rospy.loginfo("reach goal times = " + str(len(rew_record)))
        if (len(rew_record) > 1000):
            return True
        else:
            return False
        # else:
        #     rew_record.clear()
        #     return False

    def train_fn(x):
        # epsilon schedule with decay — assumes a custom set_eps signature
        policy.set_eps(args.eps_train, args.eps_decay, args.eps_min)

    def test_fn(x):
        policy.set_eps(args.eps_test, args.eps_decay, args.eps_min)

    # trainer
    rospy.loginfo("start training")
    result = offpolicy_trainer(policy, train_collector, test_collector,
                               args.epoch, args.step_per_epoch,
                               args.collect_per_step, args.test_num,
                               args.batch_size, train_fn=train_fn,
                               test_fn=test_fn, stop_fn=stop_fn,
                               writer=writer)
    assert stop_fn(result['best_reward'])
    pprint.pprint(result)
    train_collector.close()
    test_collector.close()
    # save network — pickles the full module, not just the state_dict
    torch.save(net, 'ginger_dqn_pathplanning.pkl')
    rospy.loginfo("training finish, testing...")
    # Let's watch its performance!
    env_test = gym.make(task_env)
    net_test = torch.load('ginger_dqn_pathplanning.pkl')
    policy_test = DQNPolicy(net_test, optim, args.gamma, args.n_step,
                            use_target_network=args.target_update_freq > 0,
                            target_update_freq=args.target_update_freq)
    collector = Collector(policy_test, env_test)
    result = collector.collect(n_episode=1, render=args.render)
    rospy.loginfo(f'Final reward: {result["rew"]}, length: {result["len"]}')
    collector.close()