def test_noise():
    noise = GaussianNoise()
    size = (3, 4, 5)
    assert np.allclose(noise(size).shape, size)
    noise = OUNoise()
    noise.reset()
    assert np.allclose(noise(size).shape, size)
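# A minimal sketch of the noise interface the test above assumes: calling the
# noise object with a shape returns a NumPy array of that shape. This is an
# illustration of the contract, not Tianshou's actual implementation; the
# class name `SimpleGaussianNoise` is hypothetical.
import numpy as np


class SimpleGaussianNoise:
    """Zero-mean-by-default Gaussian noise with standard deviation sigma."""

    def __init__(self, mu: float = 0.0, sigma: float = 1.0):
        self._mu, self._sigma = mu, sigma

    def __call__(self, size):
        # sample i.i.d. Gaussian noise of the requested shape
        return np.random.normal(self._mu, self._sigma, size)


# usage: SimpleGaussianNoise(sigma=0.1)((3, 4, 5)).shape == (3, 4, 5)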
def test_td3(args=get_args()):
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # train_envs = gym.make(args.task)
    train_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net(args.layer_num, args.state_shape, device=args.device)
    actor = Actor(
        net, args.action_shape, args.max_action, args.device
    ).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    net = Net(args.layer_num, args.state_shape,
              args.action_shape, concat=True, device=args.device)
    critic1 = Critic(net, args.device).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(net, args.device).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
    policy = TD3Policy(
        actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
        args.tau, args.gamma, GaussianNoise(sigma=args.exploration_noise),
        args.policy_noise, args.update_actor_freq, args.noise_clip,
        [env.action_space.low[0], env.action_space.high[0]],
        reward_normalization=True, ignore_done=True)
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # train_collector.collect(n_step=args.buffer_size)
    # log
    writer = SummaryWriter(args.logdir + '/' + 'td3')

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, stop_fn=stop_fn, writer=writer)
    assert stop_fn(result['best_reward'])
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
def init_policy(args, env):
    actor = Actor(layer=None, state_shape=args.state_shape,
                  action_shape=args.action_shape,
                  action_range=args.action_range,
                  device=args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic1 = Critic(layer=None, state_shape=args.state_shape,
                     action_shape=args.action_shape,
                     device=args.device).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(layer=None, state_shape=args.state_shape,
                     action_shape=args.action_shape,
                     device=args.device).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
    policy = TD3Policy(
        actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
        args.tau, args.gamma, GaussianNoise(sigma=args.exploration_noise),
        args.policy_noise, args.update_actor_freq, args.noise_clip,
        args.action_range,
        reward_normalization=args.rew_norm,
        ignore_done=args.ignore_done,
        estimation_step=args.n_step)
    return policy
def __init__(
    self,
    actor: torch.nn.Module,
    actor_optim: torch.optim.Optimizer,
    critic1: torch.nn.Module,
    critic1_optim: torch.optim.Optimizer,
    critic2: torch.nn.Module,
    critic2_optim: torch.optim.Optimizer,
    tau: float = 0.005,
    gamma: float = 0.99,
    exploration_noise: Optional[BaseNoise] = GaussianNoise(sigma=0.1),
    policy_noise: float = 0.2,
    update_actor_freq: int = 2,
    noise_clip: float = 0.5,
    alpha: float = 2.5,
    reward_normalization: bool = False,
    estimation_step: int = 1,
    **kwargs: Any,
) -> None:
    super().__init__(
        actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
        tau, gamma, exploration_noise, policy_noise, update_actor_freq,
        noise_clip, reward_normalization, estimation_step, **kwargs
    )
    self._alpha = alpha
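# A hedged sketch of how the extra `alpha` hyperparameter above is typically
# used in TD3+BC (Fujimoto & Gu, 2021): the actor loss mixes the deterministic
# policy gradient with a behavior-cloning MSE term, weighted by a lambda
# normalized by the critic's Q magnitude. The function and tensor names here
# are illustrative, not the class's actual attributes or learn() step.
import torch
import torch.nn.functional as F


def td3_bc_actor_loss(critic1, actor, obs, act_data, alpha=2.5):
    # Q-value of the actor's own action on the batch observations.
    act_pi = actor(obs)
    q_value = critic1(obs, act_pi)
    # lambda keeps the RL term comparable in scale to the BC term.
    lmbda = alpha / q_value.abs().mean().detach()
    # maximize Q (policy gradient) while staying close to dataset actions.
    return -lmbda * q_value.mean() + F.mse_loss(act_pi, act_data)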
def __init__(
    self,
    actor: torch.nn.Module,
    actor_optim: torch.optim.Optimizer,
    critic1: torch.nn.Module,
    critic1_optim: torch.optim.Optimizer,
    critic2: torch.nn.Module,
    critic2_optim: torch.optim.Optimizer,
    tau: float = 0.005,
    gamma: float = 0.99,
    exploration_noise: Optional[BaseNoise] = GaussianNoise(sigma=0.1),
    policy_noise: float = 0.2,
    update_actor_freq: int = 2,
    noise_clip: float = 0.5,
    reward_normalization: bool = False,
    estimation_step: int = 1,
    **kwargs: Any,
) -> None:
    super().__init__(actor, actor_optim, None, None, tau, gamma,
                     exploration_noise, reward_normalization,
                     estimation_step, **kwargs)
    self.critic1, self.critic1_old = critic1, deepcopy(critic1)
    self.critic1_old.eval()
    self.critic1_optim = critic1_optim
    self.critic2, self.critic2_old = critic2, deepcopy(critic2)
    self.critic2_old.eval()
    self.critic2_optim = critic2_optim
    self._policy_noise = policy_noise
    self._freq = update_actor_freq
    self._noise_clip = noise_clip
    self._cnt = 0
    self._last = 0
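# A minimal sketch (not the class's actual learn step) of what the fields set
# above feed into: TD3's target computation with clipped target-policy
# smoothing noise and the minimum over the twin target critics. Assumes `done`
# is a float tensor of 0/1 flags; all names are illustrative.
import torch


def td3_target_q(actor_old, critic1_old, critic2_old, obs_next, rew, done,
                 gamma=0.99, policy_noise=0.2, noise_clip=0.5):
    with torch.no_grad():
        act_next = actor_old(obs_next)
        # clipped Gaussian noise smooths the target policy.
        noise = (torch.randn_like(act_next) * policy_noise).clamp(
            -noise_clip, noise_clip)
        act_next = act_next + noise
        # twin critics: take the minimum to curb Q overestimation.
        target_q = torch.min(
            critic1_old(obs_next, act_next),
            critic2_old(obs_next, act_next),
        )
        return rew + gamma * (1.0 - done) * target_q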
def __init__(
    self,
    actor: Optional[torch.nn.Module],
    actor_optim: Optional[torch.optim.Optimizer],
    critic: Optional[torch.nn.Module],
    critic_optim: Optional[torch.optim.Optimizer],
    simulator: Optional[torch.nn.Module],
    args,
    action_range: Tuple[float, float],
    tau: float = 0.005,
    gamma: float = 0.99,
    exploration_noise: Optional[BaseNoise] = GaussianNoise(sigma=0.1),
    reward_normalization: bool = False,
    ignore_done: bool = False,
    estimation_step: int = 1,
    **kwargs: Any,
) -> None:
    super().__init__(**kwargs)
    if actor is not None and actor_optim is not None:
        self.actor: torch.nn.Module = actor
        self.actor_old = deepcopy(actor)
        self.actor_old.eval()
        self.actor_optim: torch.optim.Optimizer = actor_optim
    if critic is not None and critic_optim is not None:
        self.critic: torch.nn.Module = critic
        self.critic_old = deepcopy(critic)
        self.critic_old.eval()
        self.critic_optim: torch.optim.Optimizer = critic_optim
    if simulator is not None:
        self.simulator = simulator
    self.args = args
    self.simulation_env = None
    self.simulator_loss_threshold = self.args.simulator_loss_threshold
    self.base_env = gym.make(args.task)
    assert 0.0 <= tau <= 1.0, "tau should be in [0, 1]"
    self._tau = tau
    assert 0.0 <= gamma <= 1.0, "gamma should be in [0, 1]"
    self._gamma = gamma
    self._noise = exploration_noise
    self._range = action_range
    self._action_bias = (action_range[0] + action_range[1]) / 2.0
    self._action_scale = (action_range[1] - action_range[0]) / 2.0
    # it is only a little difference to use GaussianNoise
    # self.noise = OUNoise()
    self._rm_done = ignore_done
    self._rew_norm = reward_normalization
    assert estimation_step > 0, "estimation_step should be greater than 0"
    self._n_step = estimation_step
    self.loss_history = []
    self.gbm_model = None
    self.update_step = self.args.max_update_step
    self.simulator_buffer = ReplayBuffer(size=self.args.buffer_size)
def __init__(
    self,
    actor: Optional[torch.nn.Module],
    actor_optim: Optional[torch.optim.Optimizer],
    critic: Optional[torch.nn.Module],
    critic_optim: Optional[torch.optim.Optimizer],
    action_range: Tuple[float, float],
    tau: float = 0.005,
    gamma: float = 0.99,
    exploration_noise: Optional[BaseNoise] = GaussianNoise(sigma=0.1),
    reward_normalization: bool = False,
    ignore_done: bool = False,
    estimation_step: int = 1,
    **kwargs: Any,
) -> None:
    super().__init__(**kwargs)
    if actor is not None and actor_optim is not None:
        self.actor: torch.nn.Module = actor
        self.actor_old = deepcopy(actor)
        self.actor_old.eval()
        self.actor_optim: torch.optim.Optimizer = actor_optim
    if critic is not None and critic_optim is not None:
        self.critic: torch.nn.Module = critic
        self.critic_old = deepcopy(critic)
        self.critic_old.eval()
        self.critic_optim: torch.optim.Optimizer = critic_optim
    assert 0.0 <= tau <= 1.0, "tau should be in [0, 1]"
    self._tau = tau
    assert 0.0 <= gamma <= 1.0, "gamma should be in [0, 1]"
    self._gamma = gamma
    self._noise = exploration_noise
    self._range = action_range
    self._action_bias = torch.tensor(
        (action_range[0] + action_range[1]) / 2.0)
    # self._action_bias = (action_range[0] + action_range[1]) / 2.0
    # force policy to center at init parameters
    # self._action_bias = torch.tensor(np.array([0.5, 1.57, 6, 20, 0.75, 1, 0.3]))
    self._action_scale = torch.tensor(
        (action_range[1] - action_range[0]) / 2.0)
    # it is only a little difference to use GaussianNoise
    # self.noise = OUNoise()
    self._rm_done = ignore_done
    self._rew_norm = reward_normalization
    assert estimation_step > 0, "estimation_step should be greater than 0"
    self._n_step = estimation_step
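# A small sketch of how the `_action_bias` and `_action_scale` computed above
# are typically applied: a raw actor output in [-1, 1] is mapped affinely into
# [low, high]. The function name and signature are illustrative, not the
# policy's actual method.
import torch


def scale_action(raw_act: torch.Tensor, low: float, high: float) -> torch.Tensor:
    bias = torch.tensor((low + high) / 2.0)
    scale = torch.tensor((high - low) / 2.0)
    return raw_act * scale + bias


# e.g. scale_action(torch.tensor([-1.0, 0.0, 1.0]), -2.0, 2.0) -> [-2., 0., 2.]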
def __init__(self,
             actor: torch.nn.Module,
             actor_optim: torch.optim.Optimizer,
             critic1: torch.nn.Module,
             critic1_optim: torch.optim.Optimizer,
             critic2: torch.nn.Module,
             critic2_optim: torch.optim.Optimizer,
             tau: float = 0.005,
             gamma: float = 0.99,
             exploration_noise: Optional[BaseNoise] = GaussianNoise(sigma=0.1),
             policy_noise: float = 0.2,
             update_actor_freq: int = 2,
             noise_clip: float = 0.5,
             action_range: Optional[Tuple[float, float]] = None,
             reward_normalization: bool = False,
             ignore_done: bool = False,
             estimation_step: int = 1,
             **kwargs) -> None:
    super().__init__(actor, actor_optim, None, None, tau, gamma,
                     exploration_noise, action_range, reward_normalization,
                     ignore_done, estimation_step, **kwargs)
    self.critic1, self.critic1_old = critic1, deepcopy(critic1)
    self.critic1_old.eval()
    self.critic1_optim = critic1_optim
    self.critic2, self.critic2_old = critic2, deepcopy(critic2)
    self.critic2_old.eval()
    self.critic2_optim = critic2_optim
    self.norm_func = kwargs.get('norm_func', None)
    self.process_tri = kwargs.get('process_tri', (lambda x, beta: x))
    self.disc, self.disc_optim = kwargs.get('discriminator', [None, None])
    self.beta, self.beta_optim = kwargs.get('beta', [None, None])
    self.norm_diff = kwargs.get('norm_diff', False)
    self.tor_diff = kwargs.get('tor_diff', 0.1)
    self.use_diff = kwargs.get('use_diff', True)
    self._policy_noise = policy_noise
    self._freq = update_actor_freq
    self._noise_clip = noise_clip
    self._cnt = 0
    self._last = 0
def __init__(
    self,
    actor: Optional[torch.nn.Module],
    actor_optim: Optional[torch.optim.Optimizer],
    critic: Optional[torch.nn.Module],
    critic_optim: Optional[torch.optim.Optimizer],
    tau: float = 0.005,
    gamma: float = 0.99,
    exploration_noise: Optional[BaseNoise] = GaussianNoise(sigma=0.1),
    reward_normalization: bool = False,
    estimation_step: int = 1,
    action_scaling: bool = True,
    action_bound_method: str = "clip",
    **kwargs: Any,
) -> None:
    super().__init__(
        action_scaling=action_scaling,
        action_bound_method=action_bound_method,
        **kwargs
    )
    assert action_bound_method != "tanh", "tanh mapping is not supported " \
        "in policies where action is used as input of critic, because " \
        "raw action in range (-inf, inf) will cause instability in training"
    if actor is not None and actor_optim is not None:
        self.actor: torch.nn.Module = actor
        self.actor_old = deepcopy(actor)
        self.actor_old.eval()
        self.actor_optim: torch.optim.Optimizer = actor_optim
    if critic is not None and critic_optim is not None:
        self.critic: torch.nn.Module = critic
        self.critic_old = deepcopy(critic)
        self.critic_old.eval()
        self.critic_optim: torch.optim.Optimizer = critic_optim
    assert 0.0 <= tau <= 1.0, "tau should be in [0, 1]"
    self.tau = tau
    assert 0.0 <= gamma <= 1.0, "gamma should be in [0, 1]"
    self._gamma = gamma
    self._noise = exploration_noise
    # it is only a little difference to use GaussianNoise
    # self.noise = OUNoise()
    self._rew_norm = reward_normalization
    self._n_step = estimation_step
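# The `tau` stored above drives a Polyak (soft) target-network update. A
# self-contained sketch of that update; the real library's helper (e.g. a
# `sync_weight`-style method) may differ in name and detail.
import torch


def soft_update(target: torch.nn.Module, source: torch.nn.Module,
                tau: float = 0.005) -> None:
    """target <- (1 - tau) * target + tau * source, parameter-wise."""
    with torch.no_grad():
        for t, s in zip(target.parameters(), source.parameters()):
            t.data.mul_(1.0 - tau).add_(tau * s.data)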
def __init__(self,
             actor: torch.nn.Module,
             actor_optim: torch.optim.Optimizer,
             critic: torch.nn.Module,
             critic_optim: torch.optim.Optimizer,
             tau: float = 0.005,
             gamma: float = 0.99,
             exploration_noise: Optional[BaseNoise] = GaussianNoise(sigma=0.1),
             action_range: Optional[Tuple[float, float]] = None,
             reward_normalization: bool = False,
             ignore_done: bool = False,
             estimation_step: int = 1,
             **kwargs) -> None:
    super().__init__(**kwargs)
    if actor is not None:
        self.actor, self.actor_old = actor, deepcopy(actor)
        self.actor_old.eval()
        self.actor_optim = actor_optim
    if critic is not None:
        self.critic, self.critic_old = critic, deepcopy(critic)
        self.critic_old.eval()
        self.critic_optim = critic_optim
    assert 0 <= tau <= 1, 'tau should be in [0, 1]'
    self._tau = tau
    assert 0 <= gamma <= 1, 'gamma should be in [0, 1]'
    self._gamma = gamma
    self._noise = exploration_noise
    assert action_range is not None
    self._range = action_range
    self._action_bias = (action_range[0] + action_range[1]) / 2
    self._action_scale = (action_range[1] - action_range[0]) / 2
    # it is only a little difference to use rand_normal
    # self.noise = OUNoise()
    self._rm_done = ignore_done
    self._rew_norm = reward_normalization
    assert estimation_step > 0, 'estimation_step should be greater than 0'
    self._n_step = estimation_step
def __init__(
    self,
    actor: Optional[torch.nn.Module],
    actor_optim: Optional[torch.optim.Optimizer],
    critic: Optional[torch.nn.Module],
    critic_optim: Optional[torch.optim.Optimizer],
    action_range: Tuple[float, float],
    tau: float = 0.005,
    gamma: float = 0.99,
    exploration_noise: Optional[BaseNoise] = GaussianNoise(sigma=0.1),
    reward_normalization: bool = False,
    estimation_step: int = 1,
    **kwargs: Any,
) -> None:
    super().__init__(**kwargs)
    if actor is not None and actor_optim is not None:
        self.actor: torch.nn.Module = actor
        self.actor_old = deepcopy(actor)
        self.actor_old.eval()
        self.actor_optim: torch.optim.Optimizer = actor_optim
    if critic is not None and critic_optim is not None:
        self.critic: torch.nn.Module = critic
        self.critic_old = deepcopy(critic)
        self.critic_old.eval()
        self.critic_optim: torch.optim.Optimizer = critic_optim
    assert 0.0 <= tau <= 1.0, "tau should be in [0, 1]"
    self._tau = tau
    assert 0.0 <= gamma <= 1.0, "gamma should be in [0, 1]"
    self._gamma = gamma
    self._noise = exploration_noise
    self._range = action_range
    self._action_bias = (action_range[0] + action_range[1]) / 2.0
    self._action_scale = (action_range[1] - action_range[0]) / 2.0
    # it is only a little difference to use GaussianNoise
    # self.noise = OUNoise()
    self._rew_norm = reward_normalization
    self._n_step = estimation_step
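# `estimation_step` above selects n-step TD targets. A minimal NumPy sketch,
# under the assumption of a single trajectory slice and a scalar bootstrap
# value: G = r_0 + gamma*r_1 + ... + gamma^(n-1)*r_{n-1} + gamma^n * V(s_n).
# The helper name is illustrative, not the library's API.
import numpy as np


def n_step_return(rews: np.ndarray, bootstrap: float,
                  gamma: float, n: int) -> float:
    """n-step return for the first step of a reward sequence."""
    n = min(n, len(rews))
    discounts = gamma ** np.arange(n)
    return float(np.sum(discounts * rews[:n]) + (gamma ** n) * bootstrap)


# e.g. n_step_return(np.array([1., 1., 1.]), bootstrap=10., gamma=0.99, n=3)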
def test_td3(args=get_args()):
    # initialize environment
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    actor = Actor(args.layer_num, args.state_shape, args.action_shape,
                  args.max_action, args.device,
                  hidden_layer_size=args.hidden_size).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic1 = Critic(args.layer_num, args.state_shape, args.action_shape,
                     args.device,
                     hidden_layer_size=args.hidden_size).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(args.layer_num, args.state_shape, args.action_shape,
                     args.device,
                     hidden_layer_size=args.hidden_size).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
    policy = TD3Policy(
        actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
        args.tau, args.gamma, GaussianNoise(sigma=args.exploration_noise),
        args.policy_noise, args.update_actor_freq, args.noise_clip,
        action_range=[env.action_space.low[0], env.action_space.high[0]],
        reward_normalization=args.rew_norm, ignore_done=False)
    # collector
    if args.training_num == 0:
        max_episode_steps = train_envs._max_episode_steps
    else:
        max_episode_steps = train_envs.envs[0]._max_episode_steps
    train_collector = Collector(
        policy, train_envs,
        ReplayBuffer(args.buffer_size, max_ep_len=max_episode_steps))
    test_collector = Collector(policy, test_envs, mode='test')
    # log
    log_path = os.path.join(args.logdir, args.task, 'td3', str(args.seed))
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    env.spec.reward_threshold = 100000

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = offpolicy_exact_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, stop_fn=stop_fn, save_fn=save_fn, writer=writer)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
def main(id, avg, applx):
    config = init_actor(id)
    env_config = config['env_config']
    if env_config['world_name'] != "sequential_applr_testbed.world":
        assert os.path.exists(join("/jackal_ws/src/jackal_helper/worlds",
                                   path_to_world(worlds[id])))
        env_config['world_name'] = path_to_world(worlds[id])
    wrapper_config = config['wrapper_config']
    training_config = config['training_config']
    wrapper_dict = jackal_navi_envs.jackal_env_wrapper.wrapper_dict
    env = wrapper_dict[wrapper_config['wrapper']](
        gym.make(config["env"], **env_config),
        **wrapper_config['wrapper_args'])
    state_shape = env.observation_space.shape or env.observation_space.n
    action_shape = env.action_space.shape or env.action_space.n
    # Load the model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    net = Net(training_config['num_layers'], state_shape, device=device,
              hidden_layer_size=training_config['hidden_size'])
    if config['section'] == 'SAC':
        actor = ActorProb(
            net, action_shape, 1, device,
            hidden_layer_size=training_config['hidden_size']
        ).to(device)
    else:
        actor = Actor(
            net, action_shape, 1, device,
            hidden_layer_size=training_config['hidden_size']
        ).to(device)
    actor_optim = torch.optim.Adam(actor.parameters(),
                                   lr=training_config['actor_lr'])
    net = Net(training_config['num_layers'], state_shape, action_shape,
              concat=True, device=device,
              hidden_layer_size=training_config['hidden_size'])
    critic1 = Critic(
        net, device,
        hidden_layer_size=training_config['hidden_size']).to(device)
    critic1_optim = torch.optim.Adam(critic1.parameters(),
                                     lr=training_config['critic_lr'])
    critic2 = Critic(
        net, device,
        hidden_layer_size=training_config['hidden_size']).to(device)
    critic2_optim = torch.optim.Adam(critic2.parameters(),
                                     lr=training_config['critic_lr'])
    if config['section'] == 'SAC':
        policy = SACPolicy(
            actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
            action_range=[env.action_space.low, env.action_space.high],
            tau=training_config['tau'], gamma=training_config['gamma'],
            reward_normalization=training_config['rew_norm'],
            ignore_done=training_config['ignore_done'],
            alpha=training_config['sac_alpha'],
            exploration_noise=None,
            estimation_step=training_config['n_step'])
    else:
        policy = TD3Policy(
            actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
            action_range=[env.action_space.low, env.action_space.high],
            tau=training_config['tau'], gamma=training_config['gamma'],
            exploration_noise=GaussianNoise(
                sigma=training_config['exploration_noise']),
            policy_noise=training_config['policy_noise'],
            update_actor_freq=training_config['update_actor_freq'],
            noise_clip=training_config['noise_clip'],
            reward_normalization=training_config['rew_norm'],
            ignore_done=training_config['ignore_done'],
            estimation_step=training_config['n_step'])
    print(env.action_space.low, env.action_space.high)
    print(">>>>>>>>>>>>>> Running on world_%d <<<<<<<<<<<<<<<<" % (worlds[id]))
    ep = 0
    for _ in range(avg):
        obs = env.reset()
        gp = env.gp
        scan = env.scan
        obs_batch = Batch(obs=[obs], info={})
        ep += 1
        traj = []
        done = False
        count = 0
        policy = load_model(policy)
        while not done:
            obs_x = [scan, gp]
            if not applx:
                actions = policy(obs_batch).act.cpu().detach().numpy().reshape(-1)
            else:
                actions = APPLX[applx](obs_x)
            obs_new, rew, done, info = env.step(actions)
            count += 1
            info["world"] = worlds[id]
            gp = info.pop("gp")
            scan = info.pop("scan")
            traj.append([obs, actions, rew, done,
                         {"world": worlds[id], "succeed": info["succeed"]}])
            obs_batch = Batch(obs=[obs_new], info={})
            obs = obs_new
            # print('count: %d, rew: %f' %(count, rew))
        write_buffer(traj, ep, id)
    env.close()
def main(id):
    config = init_actor(id)
    env_config = config['env_config']
    if env_config['world_name'] != "sequential_applr_testbed.world":
        env_config['world_name'] = \
            'Benchmarking/train/world_%d.world' % (benchmarking_train[id])
        assert os.path.exists(
            '/jackal_ws/src/jackal_helper/worlds/Benchmarking/train/world_%d.world'
            % (benchmarking_train[id]))
    wrapper_config = config['wrapper_config']
    training_config = config['training_config']
    wrapper_dict = jackal_navi_envs.jackal_env_wrapper.wrapper_dict
    if config['env'] == 'jackal':
        env = wrapper_dict[wrapper_config['wrapper']](
            gym.make('jackal_continuous-v0', **env_config),
            **wrapper_config['wrapper_args'])
    else:
        env = gym.make('Pendulum-v0')
    state_shape = env.observation_space.shape or env.observation_space.n
    action_shape = env.action_space.shape or env.action_space.n
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    net = Net(training_config['num_layers'], state_shape, device=device,
              hidden_layer_size=training_config['hidden_size'])
    actor = Actor(
        net, action_shape, 1, device,
        hidden_layer_size=training_config['hidden_size']
    ).to(device)
    actor_optim = torch.optim.Adam(actor.parameters(),
                                   lr=training_config['actor_lr'])
    net = Net(training_config['num_layers'], state_shape, action_shape,
              concat=True, device=device,
              hidden_layer_size=training_config['hidden_size'])
    critic1 = Critic(
        net, device,
        hidden_layer_size=training_config['hidden_size']).to(device)
    critic1_optim = torch.optim.Adam(critic1.parameters(),
                                     lr=training_config['critic_lr'])
    critic2 = Critic(
        net, device,
        hidden_layer_size=training_config['hidden_size']).to(device)
    critic2_optim = torch.optim.Adam(critic2.parameters(),
                                     lr=training_config['critic_lr'])
    policy = TD3Policy(
        actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
        action_range=[env.action_space.low, env.action_space.high],
        tau=training_config['tau'], gamma=training_config['gamma'],
        exploration_noise=GaussianNoise(
            sigma=training_config['exploration_noise']),
        policy_noise=training_config['policy_noise'],
        update_actor_freq=training_config['update_actor_freq'],
        noise_clip=training_config['noise_clip'],
        reward_normalization=training_config['rew_norm'],
        ignore_done=training_config['ignore_done'],
        estimation_step=training_config['n_step'])
    print(env.action_space.low, env.action_space.high)
    ep = 0
    while True:
        obs = env.reset()
        obs_batch = Batch(obs=[obs], info={})
        ep += 1
        traj = []
        done = False
        count = 0
        policy, eps = load_model(policy)
        policy.set_exp_noise(GaussianNoise(sigma=eps))
        while not done:
            time.sleep(0.01)
            p = random.random()
            obs = torch.tensor([obs]).float()
            actions = policy(obs_batch).act.cpu().detach().numpy()
            # actions = np.array([0.5, 1.57, 6, 20, 0.3])
            obs_new, rew, done, info = env.step(actions.reshape(-1))
            count += 1
            traj.append([obs, actions, rew, done, info])
            obs_batch = Batch(obs=[obs_new], info={})
            obs = obs_new
            # print('count: %d, rew: %f' %(count, rew))
        write_buffer(traj, ep, id)
def test_td3_bc():
    args = get_args()
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]  # float
    print("device:", args.device)
    print("Observations shape:", args.state_shape)
    print("Actions shape:", args.action_shape)
    print("Action range:", np.min(env.action_space.low),
          np.max(env.action_space.high))
    args.state_dim = args.state_shape[0]
    args.action_dim = args.action_shape[0]
    print("Max_action", args.max_action)
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    if args.norm_obs:
        test_envs = VectorEnvNormObs(test_envs, update_obs_rms=False)
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    test_envs.seed(args.seed)
    # model
    # actor network
    net_a = Net(
        args.state_shape,
        hidden_sizes=args.hidden_sizes,
        device=args.device,
    )
    actor = Actor(
        net_a,
        action_shape=args.action_shape,
        max_action=args.max_action,
        device=args.device,
    ).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    # critic network
    net_c1 = Net(
        args.state_shape,
        args.action_shape,
        hidden_sizes=args.hidden_sizes,
        concat=True,
        device=args.device,
    )
    net_c2 = Net(
        args.state_shape,
        args.action_shape,
        hidden_sizes=args.hidden_sizes,
        concat=True,
        device=args.device,
    )
    critic1 = Critic(net_c1, device=args.device).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(net_c2, device=args.device).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
    policy = TD3BCPolicy(
        actor,
        actor_optim,
        critic1,
        critic1_optim,
        critic2,
        critic2_optim,
        tau=args.tau,
        gamma=args.gamma,
        exploration_noise=GaussianNoise(sigma=args.exploration_noise),
        policy_noise=args.policy_noise,
        update_actor_freq=args.update_actor_freq,
        noise_clip=args.noise_clip,
        alpha=args.alpha,
        estimation_step=args.n_step,
        action_space=env.action_space,
    )
    # load a previous policy
    if args.resume_path:
        policy.load_state_dict(
            torch.load(args.resume_path, map_location=args.device))
        print("Loaded agent from: ", args.resume_path)
    # collector
    test_collector = Collector(policy, test_envs)
    # log
    now = datetime.datetime.now().strftime("%y%m%d-%H%M%S")
    args.algo_name = "td3_bc"
    log_name = os.path.join(args.task, args.algo_name, str(args.seed), now)
    log_path = os.path.join(args.logdir, log_name)
    # logger
    if args.logger == "wandb":
        logger = WandbLogger(
            save_interval=1,
            name=log_name.replace(os.path.sep, "__"),
            run_id=args.resume_id,
            config=args,
            project=args.wandb_project,
        )
    writer = SummaryWriter(log_path)
    writer.add_text("args", str(args))
    if args.logger == "tensorboard":
        logger = TensorboardLogger(writer)
    else:  # wandb
        logger.load(writer)

    def save_best_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, "policy.pth"))

    def watch():
        if args.resume_path is None:
            args.resume_path = os.path.join(log_path, "policy.pth")
        policy.load_state_dict(
            torch.load(args.resume_path, map_location=torch.device("cpu")))
        policy.eval()
        collector = Collector(policy, env)
        collector.collect(n_episode=1, render=1 / 35)

    if not args.watch:
        replay_buffer = load_buffer_d4rl(args.expert_data_task)
        if args.norm_obs:
            replay_buffer, obs_rms = normalize_all_obs_in_replay_buffer(
                replay_buffer)
            test_envs.set_obs_rms(obs_rms)
        # trainer
        result = offline_trainer(
            policy,
            replay_buffer,
            test_collector,
            args.epoch,
            args.step_per_epoch,
            args.test_num,
            args.batch_size,
            save_best_fn=save_best_fn,
            logger=logger,
        )
        pprint.pprint(result)
    else:
        watch()

    # Let's watch its performance!
    policy.eval()
    test_envs.seed(args.seed)
    test_collector.reset()
    result = test_collector.collect(n_episode=args.test_num,
                                    render=args.render)
    print(
        f"Final reward: {result['rews'].mean()}, length: {result['lens'].mean()}"
    )
def test_ddpg(args=get_args()):
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # train_envs = gym.make(args.task)
    train_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net(args.state_shape, hidden_sizes=args.hidden_sizes,
              device=args.device)
    actor = Actor(net, args.action_shape, max_action=args.max_action,
                  device=args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    net = Net(args.state_shape, args.action_shape,
              hidden_sizes=args.hidden_sizes, concat=True, device=args.device)
    critic = Critic(net, device=args.device).to(args.device)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    policy = DDPGPolicy(
        actor, actor_optim, critic, critic_optim,
        action_range=[env.action_space.low[0], env.action_space.high[0]],
        tau=args.tau, gamma=args.gamma,
        exploration_noise=GaussianNoise(sigma=args.exploration_noise),
        reward_normalization=True, ignore_done=True)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    writer = SummaryWriter(args.logdir + '/' + 'ddpg')

    def stop_fn(mean_rewards):
        return mean_rewards >= env.spec.reward_threshold

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, stop_fn=stop_fn, writer=writer)
    assert stop_fn(result['best_reward'])
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        policy.eval()
        test_envs.seed(args.seed)
        test_collector.reset()
        result = test_collector.collect(n_episode=[1] * args.test_num,
                                        render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
def test_sddpg(args=get_args()):
    t = time.time()
    torch.set_num_threads(1)  # we just need only one thread for NN
    env = gym.make(args.task)
    if args.task == 'Pendulum-v0':
        env.spec.reward_threshold = -250
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # you can also use tianshou.env.SubprocVectorEnv
    # train_envs = gym.make(args.task)
    train_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net(args.layer_num, args.state_shape, device=args.device)
    actor = Actor(net, args.action_shape, args.max_action,
                  args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    net = Net(args.layer_num, args.state_shape,
              args.action_shape, concat=True, device=args.device)
    critic = Critic(net, args.device).to(args.device)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    if args.model == 'ODEGBM':
        model = ODEGBM(args).to(args.device)
    elif args.model == 'PriorGBM':
        model = PriorGBM(args).to(args.device)
    elif args.model == 'NODAE':
        model = NODAE(args).to(args.device)
    else:
        assert args.model == 'ODENet'
        model = ODENet(args).to(args.device)
    policy = SDDPGPolicy(
        actor, actor_optim, critic, critic_optim, model, args,
        action_range=[env.action_space.low[0], env.action_space.high[0]],
        tau=args.tau, gamma=args.gamma,
        exploration_noise=GaussianNoise(sigma=args.exploration_noise),
        reward_normalization=args.rew_norm,
        ignore_done=args.ignore_done,
        estimation_step=args.n_step)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(
        policy, test_envs, action_noise=GaussianNoise(sigma=args.test_noise))
    # log
    log_path = os.path.join(args.logdir, args.task, 'sddpg')
    writer = SummaryWriter(log_path)

    def train_fn(x, global_step):
        loss_history = np.array(policy.loss_history)
        if len(loss_history) <= args.max_update_step:
            return None
        x = np.arange(len(loss_history))
        fig, ax = plt.subplots(figsize=(50, 40))
        ax.plot(x[:args.max_update_step],
                loss_history[:args.max_update_step, 0],
                label="Transition loss")
        ax.plot(x[:args.max_update_step],
                loss_history[:args.max_update_step, 1],
                label="Reward loss")
        ax.plot(x, loss_history[:, 2], label="Actor loss")
        ax.plot(x, loss_history[:, 3], label="Critic loss")
        ax.plot(x[args.max_update_step:],
                loss_history[args.max_update_step:, 4],
                label="Actor loss (simulation)")
        ax.plot(x[args.max_update_step:],
                loss_history[args.max_update_step:, 5],
                label="Critic loss (simulation)")
        ax.set_xlabel('Step')
        ax.set_ylabel('Loss')
        ax.legend(loc='best')
        plt.savefig(log_path + str(args.max_update_step) + "_" +
                    str(args.trans_relative_noise) + str(args.seed) + "_" +
                    str(time.time() - t) + ".pdf")
        plt.close()
        return None

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, train_fn=train_fn, stop_fn=stop_fn, save_fn=save_fn,
        writer=writer, verbose=False, update_per_step=1)
    assert stop_fn(result['best_reward'])
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        policy.eval()
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
def test_td3_bc(args=get_args()):
    if os.path.exists(args.load_buffer_name) and \
            os.path.isfile(args.load_buffer_name):
        if args.load_buffer_name.endswith(".hdf5"):
            buffer = VectorReplayBuffer.load_hdf5(args.load_buffer_name)
        else:
            buffer = pickle.load(open(args.load_buffer_name, "rb"))
    else:
        buffer = gather_data()
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]  # float
    if args.reward_threshold is None:
        # too low?
        default_reward_threshold = {"Pendulum-v0": -1200, "Pendulum-v1": -1200}
        args.reward_threshold = default_reward_threshold.get(
            args.task, env.spec.reward_threshold
        )
    args.state_dim = args.state_shape[0]
    args.action_dim = args.action_shape[0]
    # test_envs = gym.make(args.task)
    test_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)]
    )
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    test_envs.seed(args.seed)
    # model
    # actor network
    net_a = Net(
        args.state_shape,
        hidden_sizes=args.hidden_sizes,
        device=args.device,
    )
    actor = Actor(
        net_a,
        action_shape=args.action_shape,
        max_action=args.max_action,
        device=args.device,
    ).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    # critic network
    net_c1 = Net(
        args.state_shape,
        args.action_shape,
        hidden_sizes=args.hidden_sizes,
        concat=True,
        device=args.device,
    )
    net_c2 = Net(
        args.state_shape,
        args.action_shape,
        hidden_sizes=args.hidden_sizes,
        concat=True,
        device=args.device,
    )
    critic1 = Critic(net_c1, device=args.device).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(net_c2, device=args.device).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
    policy = TD3BCPolicy(
        actor,
        actor_optim,
        critic1,
        critic1_optim,
        critic2,
        critic2_optim,
        tau=args.tau,
        gamma=args.gamma,
        exploration_noise=GaussianNoise(sigma=args.exploration_noise),
        policy_noise=args.policy_noise,
        update_actor_freq=args.update_actor_freq,
        noise_clip=args.noise_clip,
        alpha=args.alpha,
        estimation_step=args.n_step,
        action_space=env.action_space,
    )
    # load a previous policy
    if args.resume_path:
        policy.load_state_dict(
            torch.load(args.resume_path, map_location=args.device))
        print("Loaded agent from: ", args.resume_path)
    # collector
    # buffer has been gathered
    # train_collector = Collector(policy, train_envs, buffer, exploration_noise=True)
    test_collector = Collector(policy, test_envs)
    # log
    t0 = datetime.datetime.now().strftime("%m%d_%H%M%S")
    log_file = f'seed_{args.seed}_{t0}-{args.task.replace("-", "_")}_td3_bc'
    log_path = os.path.join(args.logdir, args.task, 'td3_bc', log_file)
    writer = SummaryWriter(log_path)
    writer.add_text("args", str(args))
    logger = TensorboardLogger(writer)

    def save_best_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(mean_rewards):
        return mean_rewards >= args.reward_threshold

    def watch():
        policy.load_state_dict(
            torch.load(
                os.path.join(log_path, 'policy.pth'),
                map_location=torch.device('cpu')
            )
        )
        policy.eval()
        collector = Collector(policy, env)
        collector.collect(n_episode=1, render=1 / 35)

    # trainer
    trainer = OfflineTrainer(
        policy,
        buffer,
        test_collector,
        args.epoch,
        args.step_per_epoch,
        args.test_num,
        args.batch_size,
        save_best_fn=save_best_fn,
        stop_fn=stop_fn,
        logger=logger,
    )
    for epoch, epoch_stat, info in trainer:
        print(f"Epoch: {epoch}")
        print(epoch_stat)
        print(info)
    assert stop_fn(info["best_reward"])

    # Let's watch its performance!
    if __name__ == "__main__":
        pprint.pprint(info)
        env = gym.make(args.task)
        policy.eval()
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        rews, lens = result["rews"], result["lens"]
        print(f"Final reward: {rews.mean()}, length: {lens.mean()}")
def test_ddpg(args=get_args()):
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    args.exploration_noise = args.exploration_noise * args.max_action
    print("Observations shape:", args.state_shape)
    print("Actions shape:", args.action_shape)
    print("Action range:", np.min(env.action_space.low),
          np.max(env.action_space.high))
    # train_envs = gym.make(args.task)
    if args.training_num > 1:
        train_envs = SubprocVectorEnv(
            [lambda: gym.make(args.task) for _ in range(args.training_num)])
    else:
        train_envs = gym.make(args.task)
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net_a = Net(args.state_shape, hidden_sizes=args.hidden_sizes,
                device=args.device)
    actor = Actor(
        net_a, args.action_shape, max_action=args.max_action,
        device=args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    net_c = Net(args.state_shape, args.action_shape,
                hidden_sizes=args.hidden_sizes, concat=True,
                device=args.device)
    critic = Critic(net_c, device=args.device).to(args.device)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    policy = DDPGPolicy(
        actor, actor_optim, critic, critic_optim,
        action_range=[env.action_space.low[0], env.action_space.high[0]],
        tau=args.tau, gamma=args.gamma,
        exploration_noise=GaussianNoise(sigma=args.exploration_noise),
        estimation_step=args.n_step)
    # load a previous policy
    if args.resume_path:
        policy.load_state_dict(torch.load(
            args.resume_path, map_location=args.device
        ))
        print("Loaded agent from: ", args.resume_path)
    # collector
    if args.training_num > 1:
        buffer = VectorReplayBuffer(args.buffer_size, len(train_envs))
    else:
        buffer = ReplayBuffer(args.buffer_size)
    train_collector = Collector(policy, train_envs, buffer,
                                exploration_noise=True)
    test_collector = Collector(policy, test_envs)
    train_collector.collect(n_step=args.start_timesteps, random=True)
    # log
    log_path = os.path.join(
        args.logdir, args.task, 'ddpg',
        'seed_' + str(args.seed) + '_' +
        datetime.datetime.now().strftime('%m%d-%H%M%S'))
    writer = SummaryWriter(log_path)
    writer.add_text("args", str(args))
    logger = BasicLogger(writer)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.step_per_collect, args.test_num,
        args.batch_size, save_fn=save_fn, logger=logger,
        update_per_step=args.update_per_step, test_in_train=False)

    # Let's watch its performance!
    policy.eval()
    test_envs.seed(args.seed)
    test_collector.reset()
    result = test_collector.collect(n_episode=args.test_num,
                                    render=args.render)
    print(f'Final reward: {result["rews"].mean()}, length: {result["lens"].mean()}')
              args.action_shape, concat=True, device=args.device)
    critic1 = Critic(net, args.device).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(net, args.device).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
    policy = TD3Policy(
        actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
        args.tau, args.gamma, GaussianNoise(sigma=args.exploration_noise),
        args.policy_noise, args.update_actor_freq, args.noise_clip,
        [env.action_space.low[0], env.action_space.high[0]],
        reward_normalization=args.rew_norm,
        ignore_done=args.ignore_done,
        estimation_step=args.n_step)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # train_collector.collect(n_step=args.buffer_size)
    # log
    log_path = os.path.join(args.logdir, args.task, 'td3')
    writer = SummaryWriter(log_path)
def test_td3(args=get_args()):
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    if args.reward_threshold is None:
        default_reward_threshold = {"Pendulum-v0": -250, "Pendulum-v1": -250}
        args.reward_threshold = default_reward_threshold.get(
            args.task, env.spec.reward_threshold)
    # you can also use tianshou.env.SubprocVectorEnv
    # train_envs = gym.make(args.task)
    train_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net(args.state_shape, hidden_sizes=args.hidden_sizes,
              device=args.device)
    actor = Actor(net, args.action_shape, max_action=args.max_action,
                  device=args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    net_c1 = Net(args.state_shape, args.action_shape,
                 hidden_sizes=args.hidden_sizes, concat=True,
                 device=args.device)
    critic1 = Critic(net_c1, device=args.device).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    net_c2 = Net(args.state_shape, args.action_shape,
                 hidden_sizes=args.hidden_sizes, concat=True,
                 device=args.device)
    critic2 = Critic(net_c2, device=args.device).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
    policy = TD3Policy(
        actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
        tau=args.tau, gamma=args.gamma,
        exploration_noise=GaussianNoise(sigma=args.exploration_noise),
        policy_noise=args.policy_noise,
        update_actor_freq=args.update_actor_freq,
        noise_clip=args.noise_clip,
        reward_normalization=args.rew_norm,
        estimation_step=args.n_step,
        action_space=env.action_space)
    # collector
    train_collector = Collector(
        policy, train_envs,
        VectorReplayBuffer(args.buffer_size, len(train_envs)),
        exploration_noise=True)
    test_collector = Collector(policy, test_envs)
    # train_collector.collect(n_step=args.buffer_size)
    # log
    log_path = os.path.join(args.logdir, args.task, 'td3')
    writer = SummaryWriter(log_path)
    logger = TensorboardLogger(writer)

    def save_best_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(mean_rewards):
        return mean_rewards >= args.reward_threshold

    # Iterator trainer
    trainer = OffpolicyTrainer(
        policy,
        train_collector,
        test_collector,
        args.epoch,
        args.step_per_epoch,
        args.step_per_collect,
        args.test_num,
        args.batch_size,
        update_per_step=args.update_per_step,
        stop_fn=stop_fn,
        save_best_fn=save_best_fn,
        logger=logger,
    )
    for epoch, epoch_stat, info in trainer:
        print(f"Epoch: {epoch}")
        print(epoch_stat)
        print(info)
    assert stop_fn(info["best_reward"])
    if __name__ == "__main__":
        pprint.pprint(info)
        # Let's watch its performance!
        env = gym.make(args.task)
        policy.eval()
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        rews, lens = result["rews"], result["lens"]
        print(f"Final reward: {rews.mean()}, length: {lens.mean()}")
def main(id):
    config = init_actor(id)
    env_config = config['env_config']
    if env_config['world_name'] != "sequential_applr_testbed.world":
        assert os.path.exists(
            join("/jackal_ws/src/jackal_helper/worlds",
                 path_to_world(train_worlds[id])))
        env_config['world_name'] = path_to_world(train_worlds[id])
    wrapper_config = config['wrapper_config']
    training_config = config['training_config']
    wrapper_dict = jackal_navi_envs.jackal_env_wrapper.wrapper_dict
    env = wrapper_dict[wrapper_config['wrapper']](
        gym.make(config["env"], **env_config),
        **wrapper_config['wrapper_args'])
    state_shape = env.observation_space.shape or env.observation_space.n
    action_shape = env.action_space.shape or env.action_space.n
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    Net = CNN if training_config["cnn"] == True else MLP
    net = Net(training_config['num_layers'], state_shape, device=device,
              hidden_layer_size=training_config['hidden_size'])
    if config['section'] == 'SAC':
        actor = ActorProb(
            net, action_shape, 1, device,
            hidden_layer_size=training_config['hidden_size']).to(device)
    else:
        actor = Actor(
            net, action_shape, 1, device,
            hidden_layer_size=training_config['hidden_size']).to(device)
    actor_optim = torch.optim.Adam(actor.parameters(),
                                   lr=training_config['actor_lr'])
    net = Net(training_config['num_layers'], state_shape, action_shape,
              concat=True, device=device,
              hidden_layer_size=training_config['hidden_size'])
    critic1 = Critic(
        net, device,
        hidden_layer_size=training_config['hidden_size']).to(device)
    critic1_optim = torch.optim.Adam(critic1.parameters(),
                                     lr=training_config['critic_lr'])
    critic2 = Critic(
        net, device,
        hidden_layer_size=training_config['hidden_size']).to(device)
    critic2_optim = torch.optim.Adam(critic2.parameters(),
                                     lr=training_config['critic_lr'])
    if config['section'] == 'SAC':
        policy = SACPolicy(
            actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
            action_range=[env.action_space.low, env.action_space.high],
            tau=training_config['tau'], gamma=training_config['gamma'],
            reward_normalization=training_config['rew_norm'],
            ignore_done=training_config['ignore_done'],
            alpha=training_config['sac_alpha'],
            exploration_noise=None,
            estimation_step=training_config['n_step'])
    else:
        policy = TD3Policy(
            actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
            action_range=[env.action_space.low, env.action_space.high],
            tau=training_config['tau'], gamma=training_config['gamma'],
            exploration_noise=GaussianNoise(
                sigma=training_config['exploration_noise']),
            policy_noise=training_config['policy_noise'],
            update_actor_freq=training_config['update_actor_freq'],
            noise_clip=training_config['noise_clip'],
            reward_normalization=training_config['rew_norm'],
            ignore_done=training_config['ignore_done'],
            estimation_step=training_config['n_step'])
    print(env.action_space.low, env.action_space.high)
    print(">>>>>>>>>>>>>> Running on world_%d <<<<<<<<<<<<<<<<"
          % (train_worlds[id]))
    ep = 0
    while True:
        obs = env.reset()
        gp = env.gp
        scan = env.scan
        obs_batch = Batch(obs=[obs], info={})
        ep += 1
        traj = []
        ctcs = []
        done = False
        count = 0
        policy, eps = load_model(policy)
        try:
            policy.set_exp_noise(GaussianNoise(sigma=eps))
        except:
            pass
        while not done:
            time.sleep(0.01)
            p = random.random()
            obs = torch.tensor([obs]).float()
            # actions = np.array([0.5, 1.57, 6, 20, 0.8, 1, 0.3])
            #else:
            obs_x = [scan, gp]
            """
            if p < eps/3.:
                actions = APPLD_policy.forward(obs_x)
                print("APPLD", actions)
            elif p < 2*eps/3.:
                actions = APPLI_policy.forward(obs_x)
                print("APPLI", actions)
            elif p < eps:
                actions = APPLE_policy.forward(obs_x)
                print("APPLE", actions)
            else:
                actions = policy(obs_batch).act.cpu().detach().numpy().reshape(-1)
            if p < eps:
                if train_worlds[id] in [74, 271, 213, 283, 265, 273, 137, 209, 194]:
                    actions = APPLI_policy.forward(obs_x)
                elif train_worlds[id] in [293, 105, 153, 292, 254, 221, 245]:
                    actions = APPLD_policy.forward(obs_x)
            """
            if p < eps:
                actions = get_random_action()
                actions = np.array(actions)
            else:
                actions = policy(obs_batch).act.cpu().detach().numpy().reshape(-1)
            ctc = critic1(
                obs, torch.tensor([actions]).float()
            ).cpu().detach().numpy().reshape(-1)[0]
            ctcs.append(ctc)
            obs_new, rew, done, info = env.step(actions)
            count += 1
            gp = info.pop("gp")
            scan = info.pop("scan")
            info["world"] = train_worlds[id]
            traj.append([obs, actions, rew, done, info])
            obs_batch = Batch(obs=[obs_new], info={})
            obs = obs_new
            #print(rew, done, info)
        """
        # filter the traj that has lower discounted reward as it predicted by the critic
        if p < eps:
            def compute_discouted_rew(rew, gamma):
                return sum([r*(gamma**i) for i, r in enumerate(rew)])
            rews = [t[2] for t in traj]
            discounted_rew = [compute_discouted_rew(rews[i:], training_config["gamma"]) for i in range(len(rews))]
            assert len(ctcs) == len(discounted_rew)
            use = [r > c for r, c in zip(discounted_rew, ctcs)]
            traj_new = [t for u, t in zip(use, traj) if u]
        else:
            traj_new = traj
        """
        traj_new = traj
        if len(traj_new) > 0:
            write_buffer(traj_new, ep, id)
def test_td3(args=get_args()):
    torch.set_num_threads(1)  # we just need only one thread for NN
    env = gym.make(args.task)
    if args.task == 'Pendulum-v0':
        env.spec.reward_threshold = -250
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # you can also use tianshou.env.SubprocVectorEnv
    # train_envs = gym.make(args.task)
    train_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net(args.layer_num, args.state_shape, device=args.device)
    actor = Actor(net, args.action_shape, args.max_action,
                  args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    net = Net(args.layer_num, args.state_shape,
              args.action_shape, concat=True, device=args.device)
    critic1 = Critic(net, args.device).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(net, args.device).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
    policy = TD3Policy(
        actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
        args.tau, args.gamma, GaussianNoise(sigma=args.exploration_noise),
        args.policy_noise, args.update_actor_freq, args.noise_clip,
        [env.action_space.low[0], env.action_space.high[0]],
        reward_normalization=args.rew_norm,
        ignore_done=args.ignore_done,
        estimation_step=args.n_step)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # train_collector.collect(n_step=args.buffer_size)
    # log
    log_path = os.path.join(args.logdir, args.task, 'td3')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, stop_fn=stop_fn, save_fn=save_fn, writer=writer)
    assert stop_fn(result['best_reward'])
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        policy.eval()
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
    action_space_low = np.array([
        range_dict[pn][0] for pn in env_config['param_list']
    ]) if config['env'] == 'jackal' else np.array([-2])
    action_space_high = np.array([
        range_dict[pn][1] for pn in env_config['param_list']
    ]) if config['env'] == 'jackal' else np.array([2])
    policy = TD3Policy(actor, actor_optim, critic1, critic1_optim,
                       critic2, critic2_optim,
                       action_range=[action_space_low, action_space_high],
                       tau=training_config['tau'],
                       gamma=training_config['gamma'],
                       exploration_noise=GaussianNoise(
                           sigma=training_config['exploration_noise']),
                       policy_noise=training_config['policy_noise'],
                       update_actor_freq=training_config['update_actor_freq'],
                       noise_clip=training_config['noise_clip'],
                       reward_normalization=training_config['rew_norm'],
                       ignore_done=training_config['ignore_done'],
                       estimation_step=training_config['n_step'])
    if training_config['prioritized_replay']:
        buf = PrioritizedReplayBuffer(training_config['buffer_size'],
                                      alpha=training_config['alpha'],
                                      beta=training_config['beta'])
    else:
        buf = ReplayBuffer(training_config['buffer_size'])
    train_collector = Collector(policy, train_envs, buf)
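# A hedged sketch of the role of the `alpha` and `beta` passed to the
# prioritized buffer above (Schaul et al., 2016): alpha shapes sampling
# probabilities from priorities, beta controls the importance-sampling
# correction. This mirrors the math only, not the buffer's actual API.
import numpy as np


def per_sample(priorities: np.ndarray, alpha: float, beta: float):
    # sampling probability proportional to priority^alpha
    probs = priorities ** alpha
    probs /= probs.sum()
    idx = np.random.choice(len(probs), p=probs)
    # importance-sampling weight, normalized by the max for stability
    weights = (len(probs) * probs) ** (-beta)
    return idx, weights[idx] / weights.max()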
    if config['section'] == 'SAC':
        policy = SACPolicy(
            actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
            action_range=[env.action_space.low, env.action_space.high],
            tau=training_config['tau'], gamma=training_config['gamma'],
            reward_normalization=training_config['rew_norm'],
            ignore_done=training_config['ignore_done'],
            alpha=training_config['sac_alpha'],
            exploration_noise=None,
            estimation_step=training_config['n_step'])
    else:
        policy = TD3Policy(
            actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
            action_range=[env.action_space.low, env.action_space.high],
            tau=training_config['tau'], gamma=training_config['gamma'],
            exploration_noise=GaussianNoise(
                sigma=training_config['exploration_noise']),
            policy_noise=training_config['policy_noise'],
            update_actor_freq=training_config['update_actor_freq'],
            noise_clip=training_config['noise_clip'],
            reward_normalization=training_config['rew_norm'],
            ignore_done=training_config['ignore_done'],
            estimation_step=training_config['n_step'])
    print(training_config['hidden_size'])
    state_dict = torch.load(model_path)
    policy.load_state_dict(state_dict)
    if not noise:
        policy._noise = None
    print(env.action_space.low, env.action_space.high)
    for w in worlds:
def test_ddpg(args=get_args()):
    torch.set_num_threads(1)  # we just need only one thread for NN
    env = gym.make(args.task)
    if args.task == 'Pendulum-v0':
        env.spec.reward_threshold = -250
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # you can also use tianshou.env.SubprocVectorEnv
    # train_envs = gym.make(args.task)
    train_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net(args.state_shape, hidden_sizes=args.hidden_sizes,
              device=args.device)
    actor = Actor(net, args.action_shape, max_action=args.max_action,
                  device=args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    net = Net(args.state_shape, args.action_shape,
              hidden_sizes=args.hidden_sizes, concat=True, device=args.device)
    critic = Critic(net, device=args.device).to(args.device)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    policy = DDPGPolicy(
        actor, actor_optim, critic, critic_optim,
        tau=args.tau, gamma=args.gamma,
        exploration_noise=GaussianNoise(sigma=args.exploration_noise),
        reward_normalization=args.rew_norm,
        estimation_step=args.n_step,
        action_space=env.action_space)
    # collector
    train_collector = Collector(
        policy, train_envs,
        VectorReplayBuffer(args.buffer_size, len(train_envs)),
        exploration_noise=True)
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, 'ddpg')
    writer = SummaryWriter(log_path)
    logger = BasicLogger(writer)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(mean_rewards):
        return mean_rewards >= env.spec.reward_threshold

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.step_per_collect, args.test_num,
        args.batch_size, update_per_step=args.update_per_step,
        stop_fn=stop_fn, save_fn=save_fn, logger=logger)
    assert stop_fn(result['best_reward'])
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        policy.eval()
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        rews, lens = result["rews"], result["lens"]
        print(f"Final reward: {rews.mean()}, length: {lens.mean()}")
def test_ddpg(args=get_args()):
    env, train_envs, test_envs = make_mujoco_env(
        args.task, args.seed, args.training_num, args.test_num,
        obs_norm=False)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    args.exploration_noise = args.exploration_noise * args.max_action
    print("Observations shape:", args.state_shape)
    print("Actions shape:", args.action_shape)
    print("Action range:", np.min(env.action_space.low),
          np.max(env.action_space.high))
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # model
    net_a = Net(args.state_shape, hidden_sizes=args.hidden_sizes,
                device=args.device)
    actor = Actor(net_a, args.action_shape, max_action=args.max_action,
                  device=args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    net_c = Net(
        args.state_shape,
        args.action_shape,
        hidden_sizes=args.hidden_sizes,
        concat=True,
        device=args.device,
    )
    critic = Critic(net_c, device=args.device).to(args.device)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    policy = DDPGPolicy(
        actor,
        actor_optim,
        critic,
        critic_optim,
        tau=args.tau,
        gamma=args.gamma,
        exploration_noise=GaussianNoise(sigma=args.exploration_noise),
        estimation_step=args.n_step,
        action_space=env.action_space,
    )
    # load a previous policy
    if args.resume_path:
        policy.load_state_dict(
            torch.load(args.resume_path, map_location=args.device))
        print("Loaded agent from: ", args.resume_path)
    # collector
    if args.training_num > 1:
        buffer = VectorReplayBuffer(args.buffer_size, len(train_envs))
    else:
        buffer = ReplayBuffer(args.buffer_size)
    train_collector = Collector(policy, train_envs, buffer,
                                exploration_noise=True)
    test_collector = Collector(policy, test_envs)
    train_collector.collect(n_step=args.start_timesteps, random=True)
    # log
    now = datetime.datetime.now().strftime("%y%m%d-%H%M%S")
    args.algo_name = "ddpg"
    log_name = os.path.join(args.task, args.algo_name, str(args.seed), now)
    log_path = os.path.join(args.logdir, log_name)
    # logger
    if args.logger == "wandb":
        logger = WandbLogger(
            save_interval=1,
            name=log_name.replace(os.path.sep, "__"),
            run_id=args.resume_id,
            config=args,
            project=args.wandb_project,
        )
    writer = SummaryWriter(log_path)
    writer.add_text("args", str(args))
    if args.logger == "tensorboard":
        logger = TensorboardLogger(writer)
    else:  # wandb
        logger.load(writer)

    def save_best_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, "policy.pth"))

    if not args.watch:
        # trainer
        result = offpolicy_trainer(
            policy,
            train_collector,
            test_collector,
            args.epoch,
            args.step_per_epoch,
            args.step_per_collect,
            args.test_num,
            args.batch_size,
            save_best_fn=save_best_fn,
            logger=logger,
            update_per_step=args.update_per_step,
            test_in_train=False,
        )
        pprint.pprint(result)

    # Let's watch its performance!
    policy.eval()
    test_envs.seed(args.seed)
    test_collector.reset()
    result = test_collector.collect(n_episode=args.test_num,
                                    render=args.render)
    print(f'Final reward: {result["rews"].mean()}, '
          f'length: {result["lens"].mean()}')
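make_mujoco_env is a helper from the surrounding project, not defined in this excerpt. A minimal stand-in under stated assumptions: plain gym environments, a Tianshou version whose vector envs still expose .seed(), and no observation normalization (which the real helper presumably handles when obs_norm=True).

import gym
import numpy as np

from tianshou.env import SubprocVectorEnv


def make_mujoco_env(task, seed, training_num, test_num, obs_norm=False):
    """Hypothetical stand-in: build one probe env plus seeded train/test
    vectorized envs. Observation normalization is not implemented here."""
    assert not obs_norm, "obs_norm is not supported in this sketch"
    env = gym.make(task)
    train_envs = SubprocVectorEnv(
        [lambda: gym.make(task) for _ in range(training_num)])
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(task) for _ in range(test_num)])
    np.random.seed(seed)
    train_envs.seed(seed)
    test_envs.seed(seed)
    return env, train_envs, test_envs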
def test_td3(args=get_args()):
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    args.exploration_noise = args.exploration_noise * args.max_action
    args.policy_noise = args.policy_noise * args.max_action
    args.noise_clip = args.noise_clip * args.max_action
    print("Observations shape:", args.state_shape)
    print("Actions shape:", args.action_shape)
    print("Action range:", np.min(env.action_space.low),
          np.max(env.action_space.high))
    # train_envs = gym.make(args.task)
    if args.training_num > 1:
        train_envs = SubprocVectorEnv(
            [lambda: gym.make(args.task) for _ in range(args.training_num)]
        )
    else:
        train_envs = gym.make(args.task)
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)]
    )
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net_a = Net(args.state_shape, hidden_sizes=args.hidden_sizes,
                device=args.device)
    actor = Actor(
        net_a, args.action_shape, max_action=args.max_action,
        device=args.device
    ).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    net_c1 = Net(
        args.state_shape, args.action_shape,
        hidden_sizes=args.hidden_sizes, concat=True, device=args.device
    )
    net_c2 = Net(
        args.state_shape, args.action_shape,
        hidden_sizes=args.hidden_sizes, concat=True, device=args.device
    )
    critic1 = Critic(net_c1, device=args.device).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(net_c2, device=args.device).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
    policy = TD3Policy(
        actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
        tau=args.tau,
        gamma=args.gamma,
        exploration_noise=GaussianNoise(sigma=args.exploration_noise),
        policy_noise=args.policy_noise,
        update_actor_freq=args.update_actor_freq,
        noise_clip=args.noise_clip,
        estimation_step=args.n_step,
        action_space=env.action_space
    )
    # load a previous policy
    if args.resume_path:
        policy.load_state_dict(
            torch.load(args.resume_path, map_location=args.device))
        print("Loaded agent from: ", args.resume_path)
    # collector
    if args.training_num > 1:
        buffer = VectorReplayBuffer(args.buffer_size, len(train_envs))
    else:
        buffer = ReplayBuffer(args.buffer_size)
    train_collector = Collector(policy, train_envs, buffer,
                                exploration_noise=True)
    test_collector = Collector(policy, test_envs)
    train_collector.collect(n_step=args.start_timesteps, random=True)
    # log
    t0 = datetime.datetime.now().strftime("%m%d_%H%M%S")
    log_file = f'seed_{args.seed}_{t0}-{args.task.replace("-", "_")}_td3'
    log_path = os.path.join(args.logdir, args.task, 'td3', log_file)
    writer = SummaryWriter(log_path)
    writer.add_text("args", str(args))
    logger = TensorboardLogger(writer)

    def save_best_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    if not args.watch:
        # trainer
        result = offpolicy_trainer(
            policy, train_collector, test_collector, args.epoch,
            args.step_per_epoch, args.step_per_collect, args.test_num,
            args.batch_size, save_best_fn=save_best_fn, logger=logger,
            update_per_step=args.update_per_step, test_in_train=False
        )
        pprint.pprint(result)

    # Let's watch its performance!
    policy.eval()
    test_envs.seed(args.seed)
    test_collector.reset()
    result = test_collector.collect(n_episode=args.test_num,
                                    render=args.render)
    print(f'Final reward: {result["rews"].mean()}, '
          f'length: {result["lens"].mean()}')
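Note that this script specifies its three noise hyperparameters as fractions of the action magnitude and rescales them once at startup. A small worked example of that rescaling, assuming the common TD3 fractions of 0.1/0.2/0.5 and max_action=2.0 (the Pendulum-v0 bound; both values are chosen for illustration, not read from this script's get_args()):

# Illustration of the noise rescaling done at the top of test_td3 above.
max_action = 2.0                       # e.g. Pendulum-v0's action bound
exploration_noise = 0.1 * max_action   # stddev of Gaussian acting noise -> 0.2
policy_noise = 0.2 * max_action        # target-policy smoothing stddev  -> 0.4
noise_clip = 0.5 * max_action          # clip range for smoothing noise  -> 1.0
print(exploration_noise, policy_noise, noise_clip)  # 0.2 0.4 1.0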
def test_td3(args=get_args()):
    # initialize environment
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    actor = Actor(args.layer_num, args.state_shape, args.action_shape,
                  args.max_action, args.device,
                  hidden_layer_size=args.hidden_size).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic1 = Critic(args.layer_num, args.state_shape, args.action_shape,
                     args.device,
                     hidden_layer_size=args.hidden_size).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(args.layer_num, args.state_shape, args.action_shape,
                     args.device,
                     hidden_layer_size=args.hidden_size).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
    # energy-based discriminator
    disc = Critic(
        args.layer_num,
        np.prod(args.state_shape) + np.prod(args.action_shape),
        0,
        args.device,
        hidden_layer_size=args.hidden_size,
        output_dim=np.prod(args.state_shape) + 1,
    ).to(args.device)
    disc_optim = torch.optim.Adam(disc.parameters(), lr=3e-4)
    # tunable temperature
    beta = torch.ones(1, requires_grad=True, device=args.device)
    beta_optim = torch.optim.Adam([beta], lr=args.critic_lr)
    rng = np.random.RandomState(seed=args.seed)
    policy = TD3MUTRIRB2BPolicy(
        actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
        args.tau, args.gamma,
        GaussianNoise(sigma=args.exploration_noise),
        args.policy_noise, args.update_actor_freq, args.noise_clip,
        action_range=[env.action_space.low[0], env.action_space.high[0]],
        reward_normalization=False,
        ignore_done=False,
        norm_diff=False,
        use_diff=False,
        # continuous transition construction
        process_tri=(lambda x, beta: process_tri(x, rng=rng, beta=beta)),
        beta=(beta, beta_optim),  # the tunable temperature
        discriminator=(disc, disc_optim),  # the energy-based discriminator
        tor_diff=args.tor_diff  # the tolerance of distance
    )
    # collector
    if args.training_num == 0:
        max_episode_steps = train_envs._max_episode_steps
    else:
        max_episode_steps = train_envs.envs[0]._max_episode_steps
    train_collector = Collector(
        policy, train_envs,
        ReplayBufferTriple(args.buffer_size, max_ep_len=max_episode_steps))
    test_collector = Collector(policy, test_envs, mode='test')
    # log
    log_path = os.path.join(args.logdir, args.task, 'td3_ct', str(args.seed))
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    env.spec.reward_threshold = 100000

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = offpolicy_exact_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, stop_fn=stop_fn, save_fn=save_fn, writer=writer)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()