Example 1
    def __init__(self, config: TestConfig = None):

        if config is None:
            config = TestConfig()

        self.params = config
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = gym.make(self.params.env_name)
        self.obs_dim = self.env.observation_space.shape[0]
        self.act_dim = self.env.action_space.shape[0]
        self.act_limit = self.env.action_space.high

        # setup seed
        self.seed = self.params.seed
        self._seed()

        self.actor = ActorTD3(self.obs_dim, self.act_dim, self.act_limit)
        self.critic = CriticTD3(self.obs_dim, self.act_dim)
        self.replay = ReplayBuffer(self.obs_dim, self.act_dim,
                                   self.params.replay_size)
        self.policy = TD3Policy(
            replay_buffer=self.replay,
            actor=self.actor,
            critic=self.critic,
            actor_lr=self.params.pi_lr,
            critic_lr=self.params.q_lr,
            polyak=self.params.polyak,
            bsize=self.params.batch_size,
            policy_noise_std=self.params.target_noise,
            policy_noise_clip=self.params.noise_clip,
            discount=self.params.gamma,
            device=self.device,
        )

        self.path = Path(
            f'/tmp/experiments/unittest_hiro/{self.params.env_name}/s{self.seed}'
        )
        self.logger = VectorLogger(output_dir=str(self.path))

        # Set up model saving
        self.logger.setup_pytorch_saver(self.actor)
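
The polyak argument passed to TD3Policy above controls the soft target-network update used by TD3. As a reference, here is a minimal, self-contained sketch of that update; it shows the standard TD3 rule, not this project's own TD3Policy internals.

import torch
import torch.nn as nn

def polyak_update(net: nn.Module, target: nn.Module, polyak: float) -> None:
    """Soft-update: theta_target <- polyak * theta_target + (1 - polyak) * theta."""
    with torch.no_grad():
        for p, p_targ in zip(net.parameters(), target.parameters()):
            p_targ.data.mul_(polyak)
            p_targ.data.add_((1.0 - polyak) * p.data)

# usage sketch with throwaway networks
net, target = nn.Linear(4, 2), nn.Linear(4, 2)
target.load_state_dict(net.state_dict())  # targets usually start as a copy of the online net
polyak_update(net, target, polyak=0.995)

Conventions differ: the sketch treats polyak as the fraction of the target weights that is kept, while the tianshou-style examples below pass tau, conventionally the fraction that is replaced. Which convention this project's config assumes is worth checking.
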
Example 2
def main(id):

    config = init_actor(id)
    env_config = config['env_config']
    if env_config['world_name'] != "sequential_applr_testbed.world":
        assert os.path.exists(
            join("/jackal_ws/src/jackal_helper/worlds",
                 path_to_world(train_worlds[id])))
        env_config['world_name'] = path_to_world(train_worlds[id])
    wrapper_config = config['wrapper_config']
    training_config = config['training_config']
    wrapper_dict = jackal_navi_envs.jackal_env_wrapper.wrapper_dict
    env = wrapper_dict[wrapper_config['wrapper']](gym.make(
        config["env"], **env_config), **wrapper_config['wrapper_args'])
    state_shape = env.observation_space.shape or env.observation_space.n
    action_shape = env.action_space.shape or env.action_space.n

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    Net = CNN if training_config["cnn"] else MLP
    net = Net(training_config['num_layers'],
              state_shape,
              device=device,
              hidden_layer_size=training_config['hidden_size'])
    if config['section'] == 'SAC':
        actor = ActorProb(
            net,
            action_shape,
            1,
            device,
            hidden_layer_size=training_config['hidden_size']).to(device)
    else:
        actor = Actor(
            net,
            action_shape,
            1,
            device,
            hidden_layer_size=training_config['hidden_size']).to(device)
    actor_optim = torch.optim.Adam(actor.parameters(),
                                   lr=training_config['actor_lr'])
    net = Net(training_config['num_layers'],
              state_shape,
              action_shape,
              concat=True,
              device=device,
              hidden_layer_size=training_config['hidden_size'])
    critic1 = Critic(
        net, device,
        hidden_layer_size=training_config['hidden_size']).to(device)
    critic1_optim = torch.optim.Adam(critic1.parameters(),
                                     lr=training_config['critic_lr'])
    critic2 = Critic(
        net, device,
        hidden_layer_size=training_config['hidden_size']).to(device)
    critic2_optim = torch.optim.Adam(critic2.parameters(),
                                     lr=training_config['critic_lr'])

    if config['section'] == 'SAC':
        policy = SACPolicy(
            actor,
            actor_optim,
            critic1,
            critic1_optim,
            critic2,
            critic2_optim,
            action_range=[env.action_space.low, env.action_space.high],
            tau=training_config['tau'],
            gamma=training_config['gamma'],
            reward_normalization=training_config['rew_norm'],
            ignore_done=training_config['ignore_done'],
            alpha=training_config['sac_alpha'],
            exploration_noise=None,
            estimation_step=training_config['n_step'])
    else:
        policy = TD3Policy(
            actor,
            actor_optim,
            critic1,
            critic1_optim,
            critic2,
            critic2_optim,
            action_range=[env.action_space.low, env.action_space.high],
            tau=training_config['tau'],
            gamma=training_config['gamma'],
            exploration_noise=GaussianNoise(
                sigma=training_config['exploration_noise']),
            policy_noise=training_config['policy_noise'],
            update_actor_freq=training_config['update_actor_freq'],
            noise_clip=training_config['noise_clip'],
            reward_normalization=training_config['rew_norm'],
            ignore_done=training_config['ignore_done'],
            estimation_step=training_config['n_step'])

    print(env.action_space.low, env.action_space.high)
    print(">>>>>>>>>>>>>> Running on world_%d <<<<<<<<<<<<<<<<" %
          (train_worlds[id]))
    ep = 0
    while True:
        obs = env.reset()
        gp = env.gp
        scan = env.scan
        obs_batch = Batch(obs=[obs], info={})
        ep += 1
        traj = []
        ctcs = []
        done = False
        count = 0
        policy, eps = load_model(policy)
        try:
            policy.set_exp_noise(GaussianNoise(sigma=eps))
        except Exception:
            # not every policy class exposes set_exp_noise; fall back silently
            pass
        while not done:
            time.sleep(0.01)
            p = random.random()
            obs = torch.tensor([obs]).float()
            # actions = np.array([0.5, 1.57, 6, 20, 0.8, 1, 0.3])
            obs_x = [scan, gp]
            """
            if p < eps/3.:
                actions = APPLD_policy.forward(obs_x)
                print("APPLD", actions)
            elif p < 2*eps/3.:
                actions = APPLI_policy.forward(obs_x)
                print("APPLI", actions)
            elif p < eps:
                actions = APPLE_policy.forward(obs_x)
                print("APPLE", actions)
            else:
                actions = policy(obs_batch).act.cpu().detach().numpy().reshape(-1)
            if p < eps:
                if train_worlds[id] in [74, 271, 213, 283, 265, 273, 137, 209, 194]:
                    actions = APPLI_policy.forward(obs_x)
                elif train_worlds[id] in [293, 105, 153, 292, 254, 221, 245]:
                    actions = APPLD_policy.forward(obs_x) 
            """
            if p < eps:
                actions = get_random_action()
                actions = np.array(actions)
            else:
                actions = policy(obs_batch).act.cpu().detach().numpy().reshape(
                    -1)
            ctc = critic1(obs,
                          torch.tensor([
                              actions
                          ]).float()).cpu().detach().numpy().reshape(-1)[0]
            ctcs.append(ctc)
            obs_new, rew, done, info = env.step(actions)
            count += 1
            gp = info.pop("gp")
            scan = info.pop("scan")
            info["world"] = train_worlds[id]
            traj.append([obs, actions, rew, done, info])
            obs_batch = Batch(obs=[obs_new], info={})
            obs = obs_new
            #print(rew, done, info)
        """
        # filter out trajectory steps whose discounted return falls below the critic's estimate
        if p < eps:
            def compute_discounted_rew(rew, gamma):
                return sum([r*(gamma**i) for i, r in enumerate(rew)])
            rews = [t[2] for t in traj]
            discounted_rew = [compute_discounted_rew(rews[i:], training_config["gamma"]) for i in range(len(rews))]
            assert len(ctcs) == len(discounted_rew)
            use = [r > c for r, c in zip(discounted_rew, ctcs)]
            traj_new = [t for u, t in zip(use, traj) if u]
        else:
            traj_new = traj
        """
        traj_new = traj
        if len(traj_new) > 0:
            write_buffer(traj_new, ep, id)
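
The disabled block near the end of this example filters out demonstration steps whose Monte Carlo return falls below the critic's estimate. A standalone sketch of that filter, using only plain Python (ctcs stands in for the critic values collected inside the loop), might look like this:

def discounted_return(rews, gamma):
    """Sum of gamma**i * r_i over the remaining rewards."""
    return sum(r * (gamma ** i) for i, r in enumerate(rews))

def filter_by_critic(traj, ctcs, gamma):
    """Keep only the steps whose observed return beats the critic's estimate."""
    rews = [t[2] for t in traj]
    returns = [discounted_return(rews[i:], gamma) for i in range(len(rews))]
    assert len(ctcs) == len(returns)
    return [t for t, r, c in zip(traj, returns, ctcs) if r > c]

# usage sketch: traj entries are [obs, action, rew, done, info], ctcs are critic outputs
traj = [[None, None, 1.0, False, {}], [None, None, 0.5, True, {}]]
print(len(filter_by_critic(traj, ctcs=[0.9, 0.6], gamma=0.95)))  # -> 1
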
Example 3
def main(id, avg, applx):

    config = init_actor(id)
    env_config = config['env_config']
    if env_config['world_name'] != "sequential_applr_testbed.world":
        assert os.path.exists(join("/jackal_ws/src/jackal_helper/worlds", path_to_world(worlds[id])))
        env_config['world_name'] = path_to_world(worlds[id])
    wrapper_config = config['wrapper_config']
    training_config = config['training_config']
    wrapper_dict = jackal_navi_envs.jackal_env_wrapper.wrapper_dict
    env = wrapper_dict[wrapper_config['wrapper']](gym.make(config["env"], **env_config), **wrapper_config['wrapper_args'])
    state_shape = env.observation_space.shape or env.observation_space.n
    action_shape = env.action_space.shape or env.action_space.n

    # Load the model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    net = Net(training_config['num_layers'], state_shape, device=device, hidden_layer_size=training_config['hidden_size'])
    if config['section'] == 'SAC':
        actor = ActorProb(
            net, action_shape,
            1, device, hidden_layer_size=training_config['hidden_size']
        ).to(device)
    else:
        actor = Actor(
            net, action_shape,
            1, device, hidden_layer_size=training_config['hidden_size']
        ).to(device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=training_config['actor_lr'])
    net = Net(training_config['num_layers'], state_shape,
              action_shape, concat=True, device=device, hidden_layer_size=training_config['hidden_size'])
    critic1 = Critic(net, device, hidden_layer_size=training_config['hidden_size']).to(device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=training_config['critic_lr'])
    critic2 = Critic(net, device, hidden_layer_size=training_config['hidden_size']).to(device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=training_config['critic_lr'])

    if config['section'] == 'SAC':
        policy = SACPolicy(
            actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
            action_range=[env.action_space.low, env.action_space.high],
            tau=training_config['tau'], gamma=training_config['gamma'],
            reward_normalization=training_config['rew_norm'],
            ignore_done=training_config['ignore_done'],
            alpha=training_config['sac_alpha'],
            exploration_noise=None,
            estimation_step=training_config['n_step'])
    else:
        policy = TD3Policy(
            actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
            action_range=[env.action_space.low, env.action_space.high],
            tau=training_config['tau'], gamma=training_config['gamma'],
            exploration_noise=GaussianNoise(sigma=training_config['exploration_noise']),
            policy_noise=training_config['policy_noise'],
            update_actor_freq=training_config['update_actor_freq'],
            noise_clip=training_config['noise_clip'],
            reward_normalization=training_config['rew_norm'],
            ignore_done=training_config['ignore_done'],
            estimation_step=training_config['n_step'])
    print(env.action_space.low, env.action_space.high)
    print(">>>>>>>>>>>>>> Running on world_%d <<<<<<<<<<<<<<<<" %(worlds[id]))
    ep = 0
    for _ in range(avg):
        obs = env.reset()
        gp = env.gp
        scan = env.scan
        obs_batch = Batch(obs=[obs], info={})
        ep += 1
        traj = []
        done = False
        count = 0
        policy = load_model(policy)
        while not done:
            obs_x = [scan, gp]
            if not applx:
                actions = policy(obs_batch).act.cpu().detach().numpy().reshape(-1)
            else:
                actions = APPLX[applx](obs_x)
            obs_new, rew, done, info = env.step(actions)
            count += 1
            info["world"] = worlds[id]
            gp = info.pop("gp")
            scan = info.pop("scan")
            traj.append([obs, actions, rew, done, {"world": worlds[id], "succeed": info["succeed"]}])
            obs_batch = Batch(obs=[obs_new], info={})
            obs = obs_new
        # print('count: %d, rew: %f' %(count, rew))
        write_buffer(traj, ep, id)
    env.close()
Example 4
action_space_low = np.array([
    range_dict[pn][0] for pn in env_config['param_list']
]) if config['env'] == 'jackal' else np.array([-2])
action_space_high = np.array([
    range_dict[pn][1] for pn in env_config['param_list']
]) if config['env'] == 'jackal' else np.array([2])
policy = TD3Policy(actor,
                   actor_optim,
                   critic1,
                   critic1_optim,
                   critic2,
                   critic2_optim,
                   action_range=[action_space_low, action_space_high],
                   tau=training_config['tau'],
                   gamma=training_config['gamma'],
                   exploration_noise=GaussianNoise(
                       sigma=training_config['exploration_noise']),
                   policy_noise=training_config['policy_noise'],
                   update_actor_freq=training_config['update_actor_freq'],
                   noise_clip=training_config['noise_clip'],
                   reward_normalization=training_config['rew_norm'],
                   ignore_done=training_config['ignore_done'],
                   estimation_step=training_config['n_step'])

if training_config['prioritized_replay']:
    buf = PrioritizedReplayBuffer(training_config['buffer_size'],
                                  alpha=training_config['alpha'],
                                  beta=training_config['beta'])
else:
    buf = ReplayBuffer(training_config['buffer_size'])
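
The policy_noise, noise_clip, and action_range arguments above implement TD3's target-policy smoothing: clipped Gaussian noise is added to the target action before it is clamped back into the action bounds. A small numpy sketch of that step follows; it shows the generic TD3 rule, not this fork's exact implementation.

import numpy as np

def smoothed_target_action(target_act, low, high, policy_noise=0.2, noise_clip=0.5, rng=None):
    """Add clipped Gaussian noise to a target action, then clamp it to the action range."""
    rng = rng or np.random.default_rng()
    noise = np.clip(rng.normal(0.0, policy_noise, size=target_act.shape),
                    -noise_clip, noise_clip)
    return np.clip(target_act + noise, low, high)

# usage sketch with the non-jackal bounds built above ([-2], [2])
low, high = np.array([-2.0]), np.array([2.0])
print(smoothed_target_action(np.array([1.9]), low, high))
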
Example 5
    def __init__(self, params: HiroConfig):
        self.params = params
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        if params.checkpoint:
            self.params = self.load_checkpoint(params.checkpoint, get_params_only=True)


        self.env = self.params.env
        self.seed = self.params.seed
        self.state_dim = self.params.state_dim
        self.goal_dim = self.params.goal_dim
        self.act_dim = self.params.action_dim
        self.action_scale = self.params.action_scale
        self.goal_scale = get_goal_scale(self.params.env_name, device=self.device)
        self.target = get_target_position(self.params.env_name, device=self.device)
        self.rew_scaling_lo = self.params.rew_scaling_lo
        self.rew_scaling_hi = self.params.rew_scaling_hi

        self.ep_len = self.params.episode_len

        # setup seed
        self._seed()

        # policy parameters
        actor_lr = self.params.actor_lr
        critic_lr = self.params.critic_lr
        polyak = self.params.polyak
        bsize = self.params.batch_size
        noise_std = self.params.policy_noise_std
        noise_clip = self.params.policy_noise_clip
        discount = self.params.discount
        buffer_size = self.params.replay_buffer_size

        self.actor_lo: ActorLow = ActorLow(self.state_dim, self.goal_dim, self.act_dim,
                                           self.action_scale)
        self.critic_lo: CriticLow = CriticLow(self.state_dim, self.goal_dim, self.act_dim)
        self.replay_lo = ReplayBufferLo(self.state_dim, self.goal_dim, self.act_dim, buffer_size)
        self.agent_lo = TD3Policy(
            replay_buffer=self.replay_lo,
            actor=self.actor_lo,
            critic=self.critic_lo,
            actor_lr=actor_lr,
            critic_lr=critic_lr,
            polyak=polyak,
            bsize=bsize,
            policy_noise_std=noise_std,
            policy_noise_clip=noise_clip,
            discount=discount,
            device=self.device,
        )

        self.actor_hi: ActorHigh = ActorHigh(self.state_dim, self.goal_dim, self.goal_scale)
        self.critic_hi: CriticHi = CriticHi(self.state_dim, self.goal_dim)
        self.replay_hi = ReplayBufferHi(self.state_dim, self.goal_dim, buffer_size)
        self.agent_hi = TD3Policy(
            replay_buffer=self.replay_hi,
            actor=self.actor_hi,
            critic=self.critic_hi,
            actor_lr=actor_lr,
            critic_lr=critic_lr,
            polyak=polyak,
            bsize=bsize,
            policy_noise_std=noise_std,
            policy_noise_clip=noise_clip,
            discount=discount,
            device=self.device
        )

        # book keeping
        self.step = 0
        self.ep_idx = 0
        self.rollouts = dict(states=[], actions=[], goals=[], rewards=[])

        if params.checkpoint:
            self.load_checkpoint(params.checkpoint)

        root = f'hiro_{self.params.env_name}'
        if params.prefix:
            root = f'{root}_{params.prefix}'
        self.log_dir = Path('runs') / root / f's{self.seed}'
        self.logger = VectorLogger(output_dir=self.log_dir)
        self.logger.save_config(self.params.state_dict())
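
This example wires two TD3 agents into a HIRO-style hierarchy, with rew_scaling_lo and rew_scaling_hi scaling the two reward streams. The snippet does not show how the low-level reward is computed; as an assumption based on the usual HIRO formulation, it is typically the negative distance between the goal proposed by the high-level policy and the transition actually achieved, roughly as sketched here (intrinsic_reward_lo is a hypothetical helper, not part of this codebase):

import torch

def intrinsic_reward_lo(state, next_state, goal, scale=1.0):
    """Negative L2 distance between the relabelled goal and the achieved transition."""
    # goal is expressed relative to state, so state + goal is the desired next state
    return -scale * torch.norm(state + goal - next_state, p=2, dim=-1)

s = torch.zeros(3)
print(intrinsic_reward_lo(s, next_state=torch.tensor([0.5, 0.0, 0.0]),
                          goal=torch.tensor([1.0, 0.0, 0.0])))  # -> tensor(-0.5000)
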
Example 6
def main(id, avg, default):

    config = init_actor(id)
    env_config = config['env_config']
    if env_config['world_name'] != "sequential_applr_testbed.world":
        env_config['world_name'] = 'Benchmarking/%s/world_%d.world' % (
            SET, benchmarking_test[id])
        assert os.path.exists(
            '/jackal_ws/src/jackal_helper/worlds/Benchmarking/%s/world_%d.world'
            % (SET, benchmarking_test[id]))
    wrapper_config = config['wrapper_config']
    training_config = config['training_config']
    wrapper_dict = jackal_navi_envs.jackal_env_wrapper.wrapper_dict
    if config['env'] == 'jackal':
        env = wrapper_dict[wrapper_config['wrapper']](gym.make(
            'jackal_continuous-v0',
            **env_config), **wrapper_config['wrapper_args'])
    else:
        env = gym.make('CartPole-v1')
    state_shape = env.observation_space.shape or env.observation_space.n
    action_shape = env.action_space.shape or env.action_space.n

    # Load the model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    net = Net(training_config['num_layers'],
              state_shape,
              device=device,
              hidden_layer_size=training_config['hidden_size'])
    actor = Actor(net,
                  action_shape,
                  1,
                  device,
                  hidden_layer_size=training_config['hidden_size']).to(device)
    actor_optim = torch.optim.Adam(actor.parameters(),
                                   lr=training_config['actor_lr'])
    net = Net(training_config['num_layers'],
              state_shape,
              action_shape,
              concat=True,
              device=device,
              hidden_layer_size=training_config['hidden_size'])
    critic1 = Critic(
        net, device,
        hidden_layer_size=training_config['hidden_size']).to(device)
    critic1_optim = torch.optim.Adam(critic1.parameters(),
                                     lr=training_config['critic_lr'])
    critic2 = Critic(
        net, device,
        hidden_layer_size=training_config['hidden_size']).to(device)
    critic2_optim = torch.optim.Adam(critic2.parameters(),
                                     lr=training_config['critic_lr'])
    policy = TD3Policy(
        actor,
        actor_optim,
        critic1,
        critic1_optim,
        critic2,
        critic2_optim,
        action_range=[env.action_space.low, env.action_space.high],
        tau=training_config['tau'],
        gamma=training_config['gamma'],
        exploration_noise=None,
        policy_noise=training_config['policy_noise'],
        update_actor_freq=training_config['update_actor_freq'],
        noise_clip=training_config['noise_clip'],
        reward_normalization=training_config['rew_norm'],
        ignore_done=training_config['ignore_done'],
        estimation_step=training_config['n_step'])
    print(env.action_space.low, env.action_space.high)
    ep = 0
    for _ in range(avg):
        obs = env.reset()
        obs_batch = Batch(obs=[obs], info={})
        ep += 1
        traj = []
        done = False
        count = 0
        policy = load_model(policy)
        while not done:
            if not default:
                actions = policy(obs_batch).act.cpu().detach().numpy().reshape(
                    -1)
            else:
                actions = np.array([0.5, 1.57, 6, 20, 0.75, 1, 0.3])
            obs_new, rew, done, info = env.step(actions)
            count += 1
            traj.append([obs, actions, rew, done, info])
            obs_batch = Batch(obs=[obs_new], info={})
            obs = obs_new
        # print('count: %d, rew: %f' %(count, rew))
        write_buffer(traj, ep, id)
    env.close()