Example #1
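    # Builds a DeterministicPolicy from a copy of the test policy model and
    # wraps a DDPG test agent with TimeFeature.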
    def test_agent(self):
        policy = DeterministicPolicy(
            copy.deepcopy(self.policy_model),
            None,
            self.action_space,
        )
        return TimeFeature(DDPGTestAgent(policy))
Example #2
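# Command-line entry point: loads a saved continuous-control agent, wraps it
# with TimeFeature, and watches it play in the requested environment.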
def main():
    parser = argparse.ArgumentParser(description="Watch a continuous agent.")
    parser.add_argument("env", help="ID of the Environment")
    parser.add_argument("dir",
                        help="Directory where the agent's model was saved.")
    parser.add_argument(
        "--device",
        default="cpu",
        help="The name of the device to run the agent on (e.g. cpu, cuda, cuda:0)",
    )
    parser.add_argument(
        "--fps",
        default=120,
        type=int,  # without this, a value passed on the command line would be a string
        help="Playback speed",
    )
    args = parser.parse_args()

    if args.env in ENVS:
        env_id = ENVS[args.env]
    else:
        env_id = args.env

    env = GymEnvironment(env_id, device=args.device)
    agent = TimeFeature(GreedyAgent.load(args.dir, env))
    watch(agent, env, fps=args.fps)
Example #3
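    # Builds a PPO agent with a shared feature network, value head, and
    # Gaussian policy, then wraps it with TimeFeature.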
    def _ppo(envs, writer=DummyWriter()):
        final_anneal_step = last_frame * epochs * minibatches / (n_steps * n_envs)
        env = envs[0]

        feature_model, value_model, policy_model = fc_actor_critic(env)
        feature_model.to(device)
        value_model.to(device)
        policy_model.to(device)

        feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
        value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)

        features = FeatureNetwork(feature_model,
                                  feature_optimizer,
                                  clip_grad=clip_grad,
                                  scheduler=CosineAnnealingLR(
                                      feature_optimizer, final_anneal_step),
                                  writer=writer)
        v = VNetwork(
            value_model,
            value_optimizer,
            loss_scaling=value_loss_scaling,
            clip_grad=clip_grad,
            writer=writer,
            scheduler=CosineAnnealingLR(value_optimizer, final_anneal_step),
        )
        policy = GaussianPolicy(
            policy_model,
            policy_optimizer,
            env.action_space,
            clip_grad=clip_grad,
            writer=writer,
            scheduler=CosineAnnealingLR(policy_optimizer, final_anneal_step),
        )

        return TimeFeature(
            PPO(
                features,
                v,
                policy,
                epsilon=LinearScheduler(clip_initial,
                                        clip_final,
                                        0,
                                        final_anneal_step,
                                        name='clip',
                                        writer=writer),
                epochs=epochs,
                minibatches=minibatches,
                n_envs=n_envs,
                n_steps=n_steps,
                discount_factor=discount_factor,
                lam=lam,
                entropy_loss_scaling=entropy_loss_scaling,
                writer=writer,
            ))
Example #4
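    # Builds a PPO agent from stored hyperparameters (identity features, value
    # network, Gaussian policy) and wraps it with TimeFeature.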
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        n_updates = (train_steps * self.hyperparameters['epochs'] *
                     self.hyperparameters['minibatches'] /
                     (self.hyperparameters['n_steps'] * self.hyperparameters['n_envs']))

        value_optimizer = Adam(self.value_model.parameters(),
                               lr=self.hyperparameters['lr'],
                               eps=self.hyperparameters['eps'])
        policy_optimizer = Adam(self.policy_model.parameters(),
                                lr=self.hyperparameters['lr'],
                                eps=self.hyperparameters['eps'])

        features = Identity(self.device)

        v = VNetwork(
            self.value_model,
            value_optimizer,
            loss_scaling=self.hyperparameters['value_loss_scaling'],
            clip_grad=self.hyperparameters['clip_grad'],
            writer=writer,
            scheduler=CosineAnnealingLR(value_optimizer, n_updates),
        )

        policy = GaussianPolicy(
            self.policy_model,
            policy_optimizer,
            self.action_space,
            clip_grad=self.hyperparameters['clip_grad'],
            writer=writer,
            scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
        )

        return TimeFeature(
            PPO(
                features,
                v,
                policy,
                epsilon=LinearScheduler(self.hyperparameters['clip_initial'],
                                        self.hyperparameters['clip_final'],
                                        0,
                                        n_updates,
                                        name='clip',
                                        writer=writer),
                epochs=self.hyperparameters['epochs'],
                minibatches=self.hyperparameters['minibatches'],
                n_envs=self.hyperparameters['n_envs'],
                n_steps=self.hyperparameters['n_steps'],
                discount_factor=self.hyperparameters['discount_factor'],
                lam=self.hyperparameters['lam'],
                entropy_loss_scaling=self.hyperparameters['entropy_loss_scaling'],
                writer=writer,
            ))
Example #5
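    # Builds a DDPG agent from stored hyperparameters (Q-network, deterministic
    # policy, replay buffer) and wraps it with TimeFeature.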
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        n_updates = ((train_steps - self.hyperparameters["replay_start_size"]) /
                     self.hyperparameters["update_frequency"])

        q_optimizer = Adam(self.q_model.parameters(),
                           lr=self.hyperparameters["lr_q"])

        q = QContinuous(self.q_model,
                        q_optimizer,
                        target=PolyakTarget(
                            self.hyperparameters["polyak_rate"]),
                        scheduler=CosineAnnealingLR(q_optimizer, n_updates),
                        writer=writer)

        policy_optimizer = Adam(self.policy_model.parameters(),
                                lr=self.hyperparameters["lr_pi"])
        policy = DeterministicPolicy(
            self.policy_model,
            policy_optimizer,
            self.action_space,
            target=PolyakTarget(self.hyperparameters["polyak_rate"]),
            scheduler=CosineAnnealingLR(policy_optimizer, n_updates),
            writer=writer)

        replay_buffer = ExperienceReplayBuffer(
            self.hyperparameters["replay_buffer_size"], device=self.device)

        return TimeFeature(
            DDPG(
                q,
                policy,
                replay_buffer,
                self.action_space,
                noise=self.hyperparameters["noise"],
                replay_start_size=self.hyperparameters["replay_start_size"],
                discount_factor=self.hyperparameters["discount_factor"],
                update_frequency=self.hyperparameters["update_frequency"],
                minibatch_size=self.hyperparameters["minibatch_size"],
            ))
Example #6
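    # Builds a DDPG agent directly from the environment (Q-network, deterministic
    # policy, replay buffer) and wraps it with TimeFeature.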
    def _ddpg(env, writer=DummyWriter()):
        final_anneal_step = (last_frame - replay_start_size) // update_frequency

        q_model = fc_q(env).to(device)
        q_optimizer = Adam(q_model.parameters(), lr=lr_q)
        q = QContinuous(q_model,
                        q_optimizer,
                        target=PolyakTarget(polyak_rate),
                        scheduler=CosineAnnealingLR(q_optimizer,
                                                    final_anneal_step),
                        writer=writer)

        policy_model = fc_deterministic_policy(env).to(device)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi)
        policy = DeterministicPolicy(policy_model,
                                     policy_optimizer,
                                     env.action_space,
                                     target=PolyakTarget(polyak_rate),
                                     scheduler=CosineAnnealingLR(
                                         policy_optimizer, final_anneal_step),
                                     writer=writer)

        replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                               device=device)

        return TimeFeature(
            DDPG(
                q,
                policy,
                replay_buffer,
                env.action_space,
                noise=noise,
                replay_start_size=replay_start_size,
                discount_factor=discount_factor,
                update_frequency=update_frequency,
                minibatch_size=minibatch_size,
            ))
Example #7
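    # Test fixture: seeds PyTorch and wraps a stub TestAgent with TimeFeature.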
    def setUp(self):
        torch.manual_seed(2)
        self.test_agent = TestAgent()
        self.agent = TimeFeature(self.test_agent)
Example #8
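# Unit tests for TimeFeature: a scaled time index is appended to each observation,
# incremented on every step, and reset per environment when an episode ends.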
class TimeFeatureTest(unittest.TestCase):
    def setUp(self):
        torch.manual_seed(2)
        self.test_agent = TestAgent()
        self.agent = TimeFeature(self.test_agent)

    def test_init(self):
        state = State(torch.randn(4))
        self.agent.act(state)
        tt.assert_allclose(self.test_agent.last_state.observation, torch.tensor(
            [0.3923, -0.2236, -0.3195, -1.2050, 0.0000]), atol=1e-04)

    def test_single_env(self):
        state = State(torch.randn(4))
        self.agent.act(state)
        tt.assert_allclose(self.test_agent.last_state.observation, torch.tensor(
            [0.3923, -0.2236, -0.3195, -1.2050, 0.]), atol=1e-04)
        self.agent.act(state)
        tt.assert_allclose(self.test_agent.last_state.observation, torch.tensor(
            [0.3923, -0.2236, -0.3195, -1.2050, 1e-3]), atol=1e-04)
        self.agent.act(state)
        tt.assert_allclose(self.test_agent.last_state.observation, torch.tensor(
            [0.3923, -0.2236, -0.3195, -1.2050, 2e-3]), atol=1e-04)

    def test_reset(self):
        state = State(torch.randn(4))
        self.agent.act(state)
        tt.assert_allclose(self.test_agent.last_state.observation, torch.tensor(
            [0.3923, -0.2236, -0.3195, -1.2050, 0.0000]), atol=1e-04)
        self.agent.act(state)
        tt.assert_allclose(self.test_agent.last_state.observation, torch.tensor(
            [0.3923, -0.2236, -0.3195, -1.2050, 1e-3]), atol=1e-04)
        self.agent.act(State(state.observation, done=True))
        tt.assert_allclose(self.test_agent.last_state.observation, torch.tensor(
            [0.3923, -0.2236, -0.3195, -1.2050, 2e-3]), atol=1e-04)
        self.agent.act(State(state.observation))
        tt.assert_allclose(self.test_agent.last_state.observation, torch.tensor(
            [0.3923, -0.2236, -0.3195, -1.2050, 0.0000]), atol=1e-04)
        self.agent.act(state)
        tt.assert_allclose(self.test_agent.last_state.observation, torch.tensor(
            [0.3923, -0.2236, -0.3195, -1.2050, 1e-3]), atol=1e-04)

    def test_multi_env(self):
        state = StateArray(torch.randn(2, 2), (2,))
        self.agent.act(state)
        tt.assert_allclose(self.test_agent.last_state.observation, torch.tensor(
            [[0.3923, -0.2236, 0.], [-0.3195, -1.2050, 0.]]), atol=1e-04)
        self.agent.act(state)
        tt.assert_allclose(self.test_agent.last_state.observation, torch.tensor(
            [[0.3923, -0.2236, 1e-3], [-0.3195, -1.2050, 1e-3]]), atol=1e-04)
        self.agent.act(StateArray(state.observation, (2,), done=torch.tensor([False, True])))
        tt.assert_allclose(self.test_agent.last_state.observation, torch.tensor(
            [[0.3923, -0.2236, 2e-3], [-0.3195, -1.2050, 2e-3]]), atol=1e-04)
        self.agent.act(state)
        tt.assert_allclose(self.test_agent.last_state.observation, torch.tensor(
            [[0.3923, -0.2236, 3e-3], [-0.3195, -1.2050, 0.]]), atol=1e-04)
        self.agent.act(state)
        tt.assert_allclose(self.test_agent.last_state.observation, torch.tensor(
            [[0.3923, -0.2236, 4e-3], [-0.3195, -1.2050, 1e-3]]), atol=1e-04)
Example #9
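    # Builds a SAC agent with twin Q-networks, a V-network with a Polyak target,
    # and a soft deterministic policy, then wraps it with TimeFeature.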
    def _sac(env, writer=DummyWriter()):
        final_anneal_step = (last_frame - replay_start_size) // update_frequency

        q_1_model = fc_q(env).to(device)
        q_1_optimizer = Adam(q_1_model.parameters(), lr=lr_q)
        q_1 = QContinuous(q_1_model,
                          q_1_optimizer,
                          scheduler=CosineAnnealingLR(q_1_optimizer,
                                                      final_anneal_step),
                          writer=writer,
                          name='q_1')

        q_2_model = fc_q(env).to(device)
        q_2_optimizer = Adam(q_2_model.parameters(), lr=lr_q)
        q_2 = QContinuous(q_2_model,
                          q_2_optimizer,
                          scheduler=CosineAnnealingLR(q_2_optimizer,
                                                      final_anneal_step),
                          writer=writer,
                          name='q_2')

        v_model = fc_v(env).to(device)
        v_optimizer = Adam(v_model.parameters(), lr=lr_v)
        v = VNetwork(
            v_model,
            v_optimizer,
            scheduler=CosineAnnealingLR(v_optimizer, final_anneal_step),
            target=PolyakTarget(polyak_rate),
            writer=writer,
            name='v',
        )

        policy_model = fc_soft_policy(env).to(device)
        policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi)
        policy = SoftDeterministicPolicy(policy_model,
                                         policy_optimizer,
                                         env.action_space,
                                         scheduler=CosineAnnealingLR(
                                             policy_optimizer,
                                             final_anneal_step),
                                         writer=writer)

        replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                               device=device)

        return TimeFeature(
            SAC(policy,
                q_1,
                q_2,
                v,
                replay_buffer,
                temperature_initial=temperature_initial,
                entropy_target=(-env.action_space.shape[0] *
                                entropy_target_scaling),
                lr_temperature=lr_temperature,
                replay_start_size=replay_start_size,
                discount_factor=discount_factor,
                update_frequency=update_frequency,
                minibatch_size=minibatch_size,
                writer=writer))
Example #10
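    # Builds a SAC agent from stored hyperparameters and wraps it with TimeFeature.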
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        n_updates = ((train_steps - self.hyperparameters["replay_start_size"]) /
                     self.hyperparameters["update_frequency"])

        q_1_optimizer = Adam(self.q_1_model.parameters(),
                             lr=self.hyperparameters["lr_q"])
        q_1 = QContinuous(self.q_1_model,
                          q_1_optimizer,
                          scheduler=CosineAnnealingLR(q_1_optimizer,
                                                      n_updates),
                          writer=writer,
                          name='q_1')

        q_2_optimizer = Adam(self.q_2_model.parameters(),
                             lr=self.hyperparameters["lr_q"])
        q_2 = QContinuous(self.q_2_model,
                          q_2_optimizer,
                          scheduler=CosineAnnealingLR(q_2_optimizer,
                                                      n_updates),
                          writer=writer,
                          name='q_2')

        v_optimizer = Adam(self.v_model.parameters(),
                           lr=self.hyperparameters["lr_v"])
        v = VNetwork(
            self.v_model,
            v_optimizer,
            scheduler=CosineAnnealingLR(v_optimizer, n_updates),
            target=PolyakTarget(self.hyperparameters["polyak_rate"]),
            writer=writer,
            name='v',
        )

        policy_optimizer = Adam(self.policy_model.parameters(),
                                lr=self.hyperparameters["lr_pi"])
        policy = SoftDeterministicPolicy(self.policy_model,
                                         policy_optimizer,
                                         self.action_space,
                                         scheduler=CosineAnnealingLR(
                                             policy_optimizer, n_updates),
                                         writer=writer)

        replay_buffer = ExperienceReplayBuffer(
            self.hyperparameters["replay_buffer_size"], device=self.device)

        return TimeFeature(
            SAC(policy,
                q_1,
                q_2,
                v,
                replay_buffer,
                temperature_initial=self.hyperparameters["temperature_initial"],
                entropy_target=(
                    -self.action_space.shape[0] *
                    self.hyperparameters["entropy_target_scaling"]),
                lr_temperature=self.hyperparameters["lr_temperature"],
                replay_start_size=self.hyperparameters["replay_start_size"],
                discount_factor=self.hyperparameters["discount_factor"],
                update_frequency=self.hyperparameters["update_frequency"],
                minibatch_size=self.hyperparameters["minibatch_size"],
                writer=writer))
Example #11
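    # Builds a SoftDeterministicPolicy from a copy of the test policy model and
    # wraps a SAC test agent with TimeFeature.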
    def test_agent(self):
        policy = SoftDeterministicPolicy(copy.deepcopy(self.policy_model),
                                         space=self.action_space)
        return TimeFeature(SACTestAgent(policy))
Example #12
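# A variant of the TimeFeature unit tests that exercises the two-argument
# act(state, reward) call and the state.features attribute.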
class TimeFeatureTest(unittest.TestCase):
    def setUp(self):
        torch.manual_seed(2)
        self.test_agent = TestAgent()
        self.agent = TimeFeature(self.test_agent)

    def test_init(self):
        state = State(torch.randn(1, 4))
        self.agent.act(state, 0)
        tt.assert_allclose(self.test_agent.last_state.features,
                           torch.tensor(
                               [[0.3923, -0.2236, -0.3195, -1.2050, 0.0000]]),
                           atol=1e-04)

    def test_single_env(self):
        state = State(torch.randn(1, 4))
        self.agent.act(state, 0)
        tt.assert_allclose(self.test_agent.last_state.features,
                           torch.tensor(
                               [[0.3923, -0.2236, -0.3195, -1.2050, 0.]]),
                           atol=1e-04)
        self.agent.act(state, 0)
        tt.assert_allclose(self.test_agent.last_state.features,
                           torch.tensor(
                               [[0.3923, -0.2236, -0.3195, -1.2050, 1e-3]]),
                           atol=1e-04)
        self.agent.act(state, 0)
        tt.assert_allclose(self.test_agent.last_state.features,
                           torch.tensor(
                               [[0.3923, -0.2236, -0.3195, -1.2050, 2e-3]]),
                           atol=1e-04)

    def test_reset(self):
        state = State(torch.randn(1, 4))
        self.agent.act(state, 0)
        tt.assert_allclose(self.test_agent.last_state.features,
                           torch.tensor(
                               [[0.3923, -0.2236, -0.3195, -1.2050, 0.0000]]),
                           atol=1e-04)
        self.agent.act(state, 0)
        tt.assert_allclose(self.test_agent.last_state.features,
                           torch.tensor(
                               [[0.3923, -0.2236, -0.3195, -1.2050, 1e-3]]),
                           atol=1e-04)
        self.agent.act(State(state.features, DONE), 0)
        tt.assert_allclose(self.test_agent.last_state.features,
                           torch.tensor(
                               [[0.3923, -0.2236, -0.3195, -1.2050, 2e-3]]),
                           atol=1e-04)
        self.agent.act(State(state.features), 0)
        tt.assert_allclose(self.test_agent.last_state.features,
                           torch.tensor(
                               [[0.3923, -0.2236, -0.3195, -1.2050, 0.0000]]),
                           atol=1e-04)
        self.agent.act(state, 0)
        tt.assert_allclose(self.test_agent.last_state.features,
                           torch.tensor(
                               [[0.3923, -0.2236, -0.3195, -1.2050, 1e-3]]),
                           atol=1e-04)

    def test_multi_env(self):
        state = State(torch.randn(2, 2))
        self.agent.act(state, 0)
        tt.assert_allclose(self.test_agent.last_state.features,
                           torch.tensor([[0.3923, -0.2236, 0.],
                                         [-0.3195, -1.2050, 0.]]),
                           atol=1e-04)
        self.agent.act(state, 0)
        tt.assert_allclose(self.test_agent.last_state.features,
                           torch.tensor([[0.3923, -0.2236, 1e-3],
                                         [-0.3195, -1.2050, 1e-3]]),
                           atol=1e-04)
        self.agent.act(State(state.features, torch.tensor([1., 0.])), 0)
        tt.assert_allclose(self.test_agent.last_state.features,
                           torch.tensor([[0.3923, -0.2236, 2e-3],
                                         [-0.3195, -1.2050, 2e-3]]),
                           atol=1e-04)
        self.agent.act(state, 0)
        tt.assert_allclose(self.test_agent.last_state.features,
                           torch.tensor([[0.3923, -0.2236, 3e-3],
                                         [-0.3195, -1.2050, 0.]]),
                           atol=1e-04)
        self.agent.act(state, 0)
        tt.assert_allclose(self.test_agent.last_state.features,
                           torch.tensor([[0.3923, -0.2236, 4e-3],
                                         [-0.3195, -1.2050, 1e-3]]),
                           atol=1e-04)
Example #13
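    # Builds a GaussianPolicy from a copy of the test policy model and wraps a
    # PPO test agent with TimeFeature.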
    def test_agent(self):
        policy = GaussianPolicy(copy.deepcopy(self.policy_model),
                                space=self.action_space)
        return TimeFeature(PPOTestAgent(Identity(self.device), policy))
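
    # Builds a SAC agent using the CtrlRep network wrappers, optionally loading
    # pretrained models, and wraps it with TimeFeature.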
    def _sac(env, writer=DummyWriter()):
        final_anneal_step = (last_frame - replay_start_size) // update_frequency

        v_model = v_model_constructor(env).to(device)
        q_1_model = q1_model_constructor(env).to(device)
        q_2_model = q2_model_constructor(env).to(device)
        # Quick and dirty implementation of freezing/unfreezing the parallel branch.
        policy_model = policy_model_constructor(
            env=env, train_parallel=train_parallel).to(device)

        if pretrained_models is not None:
            q_1_model = pretrained_models.q_1.model.to(device)
            q_2_model = pretrained_models.q_2.model.to(device)
            v_model = pretrained_models.v.model.to(device)
            policy_model = pretrained_models.policy.model.to(device)

        q_1_optimizer = Adam(q_1_model.parameters(), lr=lr_q)
        q_1 = QContinuousCtrlRep(q_1_model,
                                 q_1_optimizer,
                                 scheduler=CosineAnnealingLR(
                                     q_1_optimizer, final_anneal_step),
                                 target=FixedTarget(1000),
                                 writer=writer,
                                 name='q_1')

        q_2_optimizer = Adam(q_2_model.parameters(), lr=lr_q)
        q_2 = QContinuousCtrlRep(q_2_model,
                                 q_2_optimizer,
                                 scheduler=CosineAnnealingLR(
                                     q_2_optimizer, final_anneal_step),
                                 target=FixedTarget(1000),
                                 writer=writer,
                                 name='q_2')

        v_optimizer = Adam(v_model.parameters(), lr=lr_v)
        v = VNetworkCtrlRep(
            v_model,
            v_optimizer,
            scheduler=CosineAnnealingLR(v_optimizer, final_anneal_step),
            target=PolyakTarget(polyak_rate),
            writer=writer,
            name='v',
        )

        policy_optimizer = Adam(filter(lambda p: p.requires_grad,
                                       policy_model.parameters()),
                                lr=lr_pi)
        policy = SoftDeterministicPolicyCtrlRep(policy_model,
                                                policy_optimizer,
                                                env.action_space,
                                                scheduler=CosineAnnealingLR(
                                                    policy_optimizer,
                                                    final_anneal_step),
                                                target=FixedTarget(1000),
                                                writer=writer)

        replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                               device=device)

        return TimeFeature(
            SACCtrlRep(policy=policy,
                       q_1=q_1,
                       q_2=q_2,
                       v=v,
                       replay_buffer=replay_buffer,
                       temperature_initial=temperature_initial,
                       entropy_target=(-env.action_space.shape[0] *
                                       entropy_target_scaling),
                       lr_temperature=lr_temperature,
                       replay_start_size=replay_start_size,
                       discount_factor=discount_factor,
                       update_frequency=update_frequency,
                       minibatch_size=minibatch_size,
                       writer=writer))