def __init__(self, config: TestConfig = None):
    if config is None:
        config = TestConfig()
    self.params = config
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    self.env = gym.make(self.params.env_name)
    self.obs_dim = self.env.observation_space.shape[0]
    self.act_dim = self.env.action_space.shape[0]
    self.act_limit = self.env.action_space.high

    # setup seed
    self.seed = self.params.seed
    self._seed()

    self.actor = ActorTD3(self.obs_dim, self.act_dim, self.act_limit)
    self.critic = CriticTD3(self.obs_dim, self.act_dim)
    self.replay = ReplayBuffer(self.obs_dim, self.act_dim, self.params.replay_size)
    self.policy = TD3Policy(
        replay_buffer=self.replay,
        actor=self.actor,
        critic=self.critic,
        actor_lr=self.params.pi_lr,
        critic_lr=self.params.q_lr,
        polyak=self.params.polyak,
        bsize=self.params.batch_size,
        policy_noise_std=self.params.target_noise,
        policy_noise_clip=self.params.noise_clip,
        discount=self.params.gamma,
        device=self.device,
    )

    self.path = Path(
        f'/tmp/experiments/unittest_hiro/{self.params.env_name}/s{self.seed}'
    )
    self.logger = VectorLogger(output_dir=str(self.path))
    # Set up model saving
    self.logger.setup_pytorch_saver(self.actor)
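    # `_seed()` is implemented elsewhere in this class. A hedged sketch of what such
    # a helper usually does (not necessarily the actual implementation) would be:
    #
    #     def _seed(self):
    #         random.seed(self.seed)
    #         np.random.seed(self.seed)
    #         torch.manual_seed(self.seed)
    #         self.env.seed(self.seed)
    #
    # i.e. seed every RNG that the rollouts and the TD3 updates depend on.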
def main(id):
    config = init_actor(id)
    env_config = config['env_config']
    if env_config['world_name'] != "sequential_applr_testbed.world":
        assert os.path.exists(
            join("/jackal_ws/src/jackal_helper/worlds",
                 path_to_world(train_worlds[id])))
        env_config['world_name'] = path_to_world(train_worlds[id])
    wrapper_config = config['wrapper_config']
    training_config = config['training_config']
    wrapper_dict = jackal_navi_envs.jackal_env_wrapper.wrapper_dict
    env = wrapper_dict[wrapper_config['wrapper']](
        gym.make(config["env"], **env_config),
        **wrapper_config['wrapper_args'])
    state_shape = env.observation_space.shape or env.observation_space.n
    action_shape = env.action_space.shape or env.action_space.n

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    Net = CNN if training_config["cnn"] else MLP
    net = Net(training_config['num_layers'], state_shape, device=device,
              hidden_layer_size=training_config['hidden_size'])
    if config['section'] == 'SAC':
        actor = ActorProb(
            net, action_shape, 1, device,
            hidden_layer_size=training_config['hidden_size']).to(device)
    else:
        actor = Actor(
            net, action_shape, 1, device,
            hidden_layer_size=training_config['hidden_size']).to(device)
    actor_optim = torch.optim.Adam(actor.parameters(),
                                   lr=training_config['actor_lr'])
    net = Net(training_config['num_layers'], state_shape, action_shape,
              concat=True, device=device,
              hidden_layer_size=training_config['hidden_size'])
    critic1 = Critic(
        net, device,
        hidden_layer_size=training_config['hidden_size']).to(device)
    critic1_optim = torch.optim.Adam(critic1.parameters(),
                                     lr=training_config['critic_lr'])
    critic2 = Critic(
        net, device,
        hidden_layer_size=training_config['hidden_size']).to(device)
    critic2_optim = torch.optim.Adam(critic2.parameters(),
                                     lr=training_config['critic_lr'])

    if config['section'] == 'SAC':
        policy = SACPolicy(
            actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
            action_range=[env.action_space.low, env.action_space.high],
            tau=training_config['tau'],
            gamma=training_config['gamma'],
            reward_normalization=training_config['rew_norm'],
            ignore_done=training_config['ignore_done'],
            alpha=training_config['sac_alpha'],
            exploration_noise=None,
            estimation_step=training_config['n_step'])
    else:
        policy = TD3Policy(
            actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
            action_range=[env.action_space.low, env.action_space.high],
            tau=training_config['tau'],
            gamma=training_config['gamma'],
            exploration_noise=GaussianNoise(
                sigma=training_config['exploration_noise']),
            policy_noise=training_config['policy_noise'],
            update_actor_freq=training_config['update_actor_freq'],
            noise_clip=training_config['noise_clip'],
            reward_normalization=training_config['rew_norm'],
            ignore_done=training_config['ignore_done'],
            estimation_step=training_config['n_step'])

    print(env.action_space.low, env.action_space.high)
    print(">>>>>>>>>>>>>> Running on world_%d <<<<<<<<<<<<<<<<" % (train_worlds[id]))

    ep = 0
    while True:
        obs = env.reset()
        gp = env.gp
        scan = env.scan
        obs_batch = Batch(obs=[obs], info={})
        ep += 1
        traj = []
        ctcs = []
        done = False
        count = 0
        # pull the latest policy weights and the current exploration rate
        policy, eps = load_model(policy)
        try:
            # not every policy class exposes set_exp_noise, hence the defensive try
            policy.set_exp_noise(GaussianNoise(sigma=eps))
        except:
            pass
        while not done:
            time.sleep(0.01)
            p = random.random()
            obs = torch.tensor([obs]).float()
            # actions = np.array([0.5, 1.57, 6, 20, 0.8, 1, 0.3])
            obs_x = [scan, gp]
            """
            if p < eps/3.:
                actions = APPLD_policy.forward(obs_x)
                print("APPLD", actions)
            elif p < 2*eps/3.:
                actions = APPLI_policy.forward(obs_x)
                print("APPLI", actions)
            elif p < eps:
                actions = APPLE_policy.forward(obs_x)
                print("APPLE", actions)
            else:
                actions = policy(obs_batch).act.cpu().detach().numpy().reshape(-1)

            if p < eps:
                if train_worlds[id] in [74, 271, 213, 283, 265, 273, 137, 209, 194]:
                    actions = APPLI_policy.forward(obs_x)
                elif train_worlds[id] in [293, 105, 153, 292, 254, 221, 245]:
                    actions = APPLD_policy.forward(obs_x)
            """
            # epsilon-mixed exploration: random parameters with probability eps,
            # otherwise follow the current policy
            if p < eps:
                actions = get_random_action()
                actions = np.array(actions)
            else:
                actions = policy(obs_batch).act.cpu().detach().numpy().reshape(-1)
            # critic estimate for the chosen action; only used by the (currently
            # disabled) trajectory filtering below
            ctc = critic1(
                obs, torch.tensor([actions]).float()
            ).cpu().detach().numpy().reshape(-1)[0]
            ctcs.append(ctc)
            obs_new, rew, done, info = env.step(actions)
            count += 1
            gp = info.pop("gp")
            scan = info.pop("scan")
            info["world"] = train_worlds[id]
            traj.append([obs, actions, rew, done, info])
            obs_batch = Batch(obs=[obs_new], info={})
            obs = obs_new
            # print(rew, done, info)
        """
        # filter out transitions whose discounted return falls below the critic's estimate
        if p < eps:
            def compute_discounted_rew(rew, gamma):
                return sum([r * (gamma ** i) for i, r in enumerate(rew)])

            rews = [t[2] for t in traj]
            discounted_rew = [compute_discounted_rew(rews[i:], training_config["gamma"])
                              for i in range(len(rews))]
            assert len(ctcs) == len(discounted_rew)
            use = [r > c for r, c in zip(discounted_rew, ctcs)]
            traj_new = [t for u, t in zip(use, traj) if u]
        else:
            traj_new = traj
        """
        traj_new = traj
        if len(traj_new) > 0:
            write_buffer(traj_new, ep, id)
def main(id, avg, applx):
    config = init_actor(id)
    env_config = config['env_config']
    if env_config['world_name'] != "sequential_applr_testbed.world":
        assert os.path.exists(
            join("/jackal_ws/src/jackal_helper/worlds", path_to_world(worlds[id])))
        env_config['world_name'] = path_to_world(worlds[id])
    wrapper_config = config['wrapper_config']
    training_config = config['training_config']
    wrapper_dict = jackal_navi_envs.jackal_env_wrapper.wrapper_dict
    env = wrapper_dict[wrapper_config['wrapper']](
        gym.make(config["env"], **env_config),
        **wrapper_config['wrapper_args'])
    state_shape = env.observation_space.shape or env.observation_space.n
    action_shape = env.action_space.shape or env.action_space.n

    # Load the model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    net = Net(training_config['num_layers'], state_shape, device=device,
              hidden_layer_size=training_config['hidden_size'])
    if config['section'] == 'SAC':
        actor = ActorProb(
            net, action_shape, 1, device,
            hidden_layer_size=training_config['hidden_size']).to(device)
    else:
        actor = Actor(
            net, action_shape, 1, device,
            hidden_layer_size=training_config['hidden_size']).to(device)
    actor_optim = torch.optim.Adam(actor.parameters(),
                                   lr=training_config['actor_lr'])
    net = Net(training_config['num_layers'], state_shape, action_shape,
              concat=True, device=device,
              hidden_layer_size=training_config['hidden_size'])
    critic1 = Critic(net, device,
                     hidden_layer_size=training_config['hidden_size']).to(device)
    critic1_optim = torch.optim.Adam(critic1.parameters(),
                                     lr=training_config['critic_lr'])
    critic2 = Critic(net, device,
                     hidden_layer_size=training_config['hidden_size']).to(device)
    critic2_optim = torch.optim.Adam(critic2.parameters(),
                                     lr=training_config['critic_lr'])

    if config['section'] == 'SAC':
        policy = SACPolicy(
            actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
            action_range=[env.action_space.low, env.action_space.high],
            tau=training_config['tau'],
            gamma=training_config['gamma'],
            reward_normalization=training_config['rew_norm'],
            ignore_done=training_config['ignore_done'],
            alpha=training_config['sac_alpha'],
            exploration_noise=None,
            estimation_step=training_config['n_step'])
    else:
        policy = TD3Policy(
            actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
            action_range=[env.action_space.low, env.action_space.high],
            tau=training_config['tau'],
            gamma=training_config['gamma'],
            exploration_noise=GaussianNoise(sigma=training_config['exploration_noise']),
            policy_noise=training_config['policy_noise'],
            update_actor_freq=training_config['update_actor_freq'],
            noise_clip=training_config['noise_clip'],
            reward_normalization=training_config['rew_norm'],
            ignore_done=training_config['ignore_done'],
            estimation_step=training_config['n_step'])

    print(env.action_space.low, env.action_space.high)
    print(">>>>>>>>>>>>>> Running on world_%d <<<<<<<<<<<<<<<<" % (worlds[id]))

    ep = 0
    for _ in range(avg):
        obs = env.reset()
        gp = env.gp
        scan = env.scan
        obs_batch = Batch(obs=[obs], info={})
        ep += 1
        traj = []
        done = False
        count = 0
        policy = load_model(policy)
        while not done:
            obs_x = [scan, gp]
            if not applx:
                actions = policy(obs_batch).act.cpu().detach().numpy().reshape(-1)
            else:
                actions = APPLX[applx](obs_x)
            obs_new, rew, done, info = env.step(actions)
            count += 1
            info["world"] = worlds[id]
            gp = info.pop("gp")
            scan = info.pop("scan")
            traj.append([obs, actions, rew, done,
                         {"world": worlds[id], "succeed": info["succeed"]}])
            obs_batch = Batch(obs=[obs_new], info={})
            obs = obs_new
            # print('count: %d, rew: %f' % (count, rew))
        write_buffer(traj, ep, id)

    env.close()
action_space_low = np.array(
    [range_dict[pn][0] for pn in env_config['param_list']]
) if config['env'] == 'jackal' else np.array([-2])
action_space_high = np.array(
    [range_dict[pn][1] for pn in env_config['param_list']]
) if config['env'] == 'jackal' else np.array([2])

policy = TD3Policy(
    actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
    action_range=[action_space_low, action_space_high],
    tau=training_config['tau'],
    gamma=training_config['gamma'],
    exploration_noise=GaussianNoise(sigma=training_config['exploration_noise']),
    policy_noise=training_config['policy_noise'],
    update_actor_freq=training_config['update_actor_freq'],
    noise_clip=training_config['noise_clip'],
    reward_normalization=training_config['rew_norm'],
    ignore_done=training_config['ignore_done'],
    estimation_step=training_config['n_step'])

if training_config['prioritized_replay']:
    buf = PrioritizedReplayBuffer(training_config['buffer_size'],
                                  alpha=training_config['alpha'],
                                  beta=training_config['beta'])
else:
    buf = ReplayBuffer(training_config['buffer_size'])
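# Note on the buffer choice above (general prioritized-replay background, not
# repo-specific): with PrioritizedReplayBuffer, `alpha` sets how strongly sampling
# favours high-TD-error transitions (alpha = 0 degenerates to uniform sampling) and
# `beta` sets the strength of the importance-sampling correction applied to those
# samples; the plain ReplayBuffer samples uniformly. Either way, `buf` is presumably
# filled with collected transitions and sampled from during the off-policy updates
# later in this training script.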
def __init__(self, params: HiroConfig):
    self.params = params
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if params.checkpoint:
        self.params = self.load_checkpoint(params.checkpoint, get_params_only=True)

    self.env = self.params.env
    self.seed = self.params.seed
    self.state_dim = self.params.state_dim
    self.goal_dim = self.params.goal_dim
    self.act_dim = self.params.action_dim
    self.action_scale = self.params.action_scale
    self.goal_scale = get_goal_scale(self.params.env_name, device=self.device)
    self.target = get_target_position(self.params.env_name, device=self.device)
    self.rew_scaling_lo = self.params.rew_scaling_lo
    self.rew_scaling_hi = self.params.rew_scaling_hi
    self.ep_len = self.params.episode_len

    # setup seed
    self._seed()

    # policy parameters
    actor_lr = self.params.actor_lr
    critic_lr = self.params.critic_lr
    polyak = self.params.polyak
    bsize = self.params.batch_size
    noise_std = self.params.policy_noise_std
    noise_clip = self.params.policy_noise_clip
    discount = self.params.discount
    buffer_size = self.params.replay_buffer_size

    self.actor_lo: ActorLow = ActorLow(self.state_dim, self.goal_dim,
                                       self.act_dim, self.action_scale)
    self.critic_lo: CriticLow = CriticLow(self.state_dim, self.goal_dim, self.act_dim)
    self.replay_lo = ReplayBufferLo(self.state_dim, self.goal_dim,
                                    self.act_dim, buffer_size)
    self.agent_lo = TD3Policy(
        replay_buffer=self.replay_lo,
        actor=self.actor_lo,
        critic=self.critic_lo,
        actor_lr=actor_lr,
        critic_lr=critic_lr,
        polyak=polyak,
        bsize=bsize,
        policy_noise_std=noise_std,
        policy_noise_clip=noise_clip,
        discount=discount,
        device=self.device,
    )

    self.actor_hi: ActorHigh = ActorHigh(self.state_dim, self.goal_dim, self.goal_scale)
    self.critic_hi: CriticHi = CriticHi(self.state_dim, self.goal_dim)
    self.replay_hi = ReplayBufferHi(self.state_dim, self.goal_dim, buffer_size)
    self.agent_hi = TD3Policy(
        replay_buffer=self.replay_hi,
        actor=self.actor_hi,
        critic=self.critic_hi,
        actor_lr=actor_lr,
        critic_lr=critic_lr,
        polyak=polyak,
        bsize=bsize,
        policy_noise_std=noise_std,
        policy_noise_clip=noise_clip,
        discount=discount,
        device=self.device,
    )

    # book keeping
    self.step = 0
    self.ep_idx = 0
    self.rollouts = dict(states=[], actions=[], goals=[], rewards=[])

    if params.checkpoint:
        self.load_checkpoint(params.checkpoint)

    root = f'hiro_{self.params.env_name}'
    if params.prefix:
        root = f'{root}_{params.prefix}'
    self.log_dir = Path('runs') / root / f's{self.seed}'
    self.logger = VectorLogger(output_dir=self.log_dir)
    self.logger.save_config(self.params.state_dict())
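    # Hedged note on the structure above (standard HIRO layout, inferred from the
    # names rather than stated in this file): `agent_lo` is a goal-conditioned TD3
    # policy acting in the raw action space, while `agent_hi` is a second TD3 policy
    # that proposes subgoals (bounded by `goal_scale`) for the low-level agent;
    # `rew_scaling_lo` / `rew_scaling_hi` scale the rewards seen by the two levels.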
def main(id, avg, default):
    config = init_actor(id)
    env_config = config['env_config']
    if env_config['world_name'] != "sequential_applr_testbed.world":
        env_config['world_name'] = 'Benchmarking/%s/world_%d.world' % (
            SET, benchmarking_test[id])
        assert os.path.exists(
            '/jackal_ws/src/jackal_helper/worlds/Benchmarking/%s/world_%d.world'
            % (SET, benchmarking_test[id]))
    wrapper_config = config['wrapper_config']
    training_config = config['training_config']
    wrapper_dict = jackal_navi_envs.jackal_env_wrapper.wrapper_dict
    if config['env'] == 'jackal':
        env = wrapper_dict[wrapper_config['wrapper']](
            gym.make('jackal_continuous-v0', **env_config),
            **wrapper_config['wrapper_args'])
    else:
        env = gym.make('CartPole-v1')
    state_shape = env.observation_space.shape or env.observation_space.n
    action_shape = env.action_space.shape or env.action_space.n

    # Load the model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    net = Net(training_config['num_layers'], state_shape, device=device,
              hidden_layer_size=training_config['hidden_size'])
    actor = Actor(net, action_shape, 1, device,
                  hidden_layer_size=training_config['hidden_size']).to(device)
    actor_optim = torch.optim.Adam(actor.parameters(),
                                   lr=training_config['actor_lr'])
    net = Net(training_config['num_layers'], state_shape, action_shape,
              concat=True, device=device,
              hidden_layer_size=training_config['hidden_size'])
    critic1 = Critic(net, device,
                     hidden_layer_size=training_config['hidden_size']).to(device)
    critic1_optim = torch.optim.Adam(critic1.parameters(),
                                     lr=training_config['critic_lr'])
    critic2 = Critic(net, device,
                     hidden_layer_size=training_config['hidden_size']).to(device)
    critic2_optim = torch.optim.Adam(critic2.parameters(),
                                     lr=training_config['critic_lr'])

    policy = TD3Policy(
        actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
        action_range=[env.action_space.low, env.action_space.high],
        tau=training_config['tau'],
        gamma=training_config['gamma'],
        exploration_noise=None,
        policy_noise=training_config['policy_noise'],
        update_actor_freq=training_config['update_actor_freq'],
        noise_clip=training_config['noise_clip'],
        reward_normalization=training_config['rew_norm'],
        ignore_done=training_config['ignore_done'],
        estimation_step=training_config['n_step'])

    print(env.action_space.low, env.action_space.high)

    ep = 0
    for _ in range(avg):
        obs = env.reset()
        obs_batch = Batch(obs=[obs], info={})
        ep += 1
        traj = []
        done = False
        count = 0
        policy = load_model(policy)
        while not done:
            if not default:
                actions = policy(obs_batch).act.cpu().detach().numpy().reshape(-1)
            else:
                # fall back to the fixed default parameter set
                actions = np.array([0.5, 1.57, 6, 20, 0.75, 1, 0.3])
            obs_new, rew, done, info = env.step(actions)
            count += 1
            traj.append([obs, actions, rew, done, info])
            obs_batch = Batch(obs=[obs_new], info={})
            obs = obs_new
            # print('count: %d, rew: %f' % (count, rew))
        write_buffer(traj, ep, id)

    env.close()