Example #1
 def _dqn(env, writer=DummyWriter()):
     _model = nature_dqn(env).to(device)
     _optimizer = Adam(_model.parameters(), lr=lr, eps=eps)
     q = QNetwork(_model,
                  _optimizer,
                  env.action_space.n,
                  target=FixedTarget(target_update_frequency),
                  loss=smooth_l1_loss,
                  writer=writer)
     policy = GreedyPolicy(q,
                           env.action_space.n,
                           epsilon=LinearScheduler(initial_exploration,
                                                   final_exploration,
                                                   replay_start_size,
                                                   final_exploration_frame,
                                                   name="epsilon",
                                                   writer=writer))
     replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                            device=device)
     return DeepmindAtariBody(
         DQN(
             q,
             policy,
             replay_buffer,
             discount_factor=discount_factor,
             minibatch_size=minibatch_size,
             replay_start_size=replay_start_size,
             update_frequency=update_frequency,
         ), )
Example #2
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        optimizer = Adam(self.model.parameters(),
                         lr=self.hyperparameters['lr'])

        q = QDist(
            self.model,
            optimizer,
            self.n_actions,
            self.hyperparameters['atoms'],
            v_min=self.hyperparameters['v_min'],
            v_max=self.hyperparameters['v_max'],
            target=FixedTarget(
                self.hyperparameters['target_update_frequency']),
            writer=writer,
        )

        replay_buffer = ExperienceReplayBuffer(
            self.hyperparameters['replay_buffer_size'], device=self.device)

        return C51(q,
                   replay_buffer,
                   exploration=LinearScheduler(
                       self.hyperparameters['initial_exploration'],
                       self.hyperparameters['final_exploration'],
                       0,
                       self.hyperparameters["final_exploration_step"] -
                       self.hyperparameters["replay_start_size"],
                       name="epsilon",
                       writer=writer,
                   ),
                   discount_factor=self.hyperparameters["discount_factor"],
                   minibatch_size=self.hyperparameters["minibatch_size"],
                   replay_start_size=self.hyperparameters["replay_start_size"],
                   update_frequency=self.hyperparameters["update_frequency"],
                   writer=writer)
Example #3
    def _ddqn(env, writer=DummyWriter()):
        action_repeat = 1
        last_timestep = last_frame / action_repeat
        last_update = (last_timestep - replay_start_size) / update_frequency
        final_exploration_step = final_exploration_frame / action_repeat

        model = model_constructor(env).to(device)
        optimizer = Adam(model.parameters(), lr=lr, eps=eps)
        q = QNetwork(model,
                     optimizer,
                     scheduler=CosineAnnealingLR(optimizer, last_update),
                     target=FixedTarget(target_update_frequency),
                     writer=writer)
        policy = SharedAutonomyPolicy(q,
                                      env.action_space.n,
                                      epsilon=0,
                                      pilot_tol=pilot_tol)

        if prioritized_replay:
            replay_buffer = PrioritizedReplayBuffer(replay_buffer_size,
                                                    alpha=alpha,
                                                    beta=beta,
                                                    device=device)
        else:
            replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                                   device=device)

        return co_DDQN(q,
                       policy,
                       replay_buffer,
                       loss=weighted_smooth_l1_loss,
                       discount_factor=discount_factor,
                       minibatch_size=minibatch_size,
                       replay_start_size=replay_start_size,
                       update_frequency=update_frequency)
Example #4
    def test_target(self):
        self.policy = DeterministicPolicy(self.model,
                                          self.optimizer,
                                          self.space,
                                          target=FixedTarget(3))
        state = State(torch.ones(1, STATE_DIM))

        # run update step, make sure target network doesn't change
        self.policy(state).sum().backward()
        self.policy.step()
        tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

        # again...
        self.policy(state).sum().backward()
        self.policy.step()
        tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

        # third time, target should be updated
        self.policy(state).sum().backward()
        self.policy.step()
        tt.assert_allclose(
            self.policy.target(state),
            torch.tensor([[-0.574482, -0.574482, -0.574482]]),
            atol=1e-4,
        )
Example #5
    def test_target(self):
        self.policy = DeterministicPolicy(
            self.model,
            self.optimizer,
            self.space,
            target=FixedTarget(3)
        )

        # choose initial action
        state = State(torch.ones(1, STATE_DIM))
        action = self.policy(state)
        tt.assert_equal(action, torch.zeros(1, ACTION_DIM))

        # run update step, make sure target network doesn't change
        action.sum().backward(retain_graph=True)
        self.policy.step()
        tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

        # again...
        action.sum().backward(retain_graph=True)
        self.policy.step()
        tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM))

        # third time, target should be updated
        action.sum().backward(retain_graph=True)
        self.policy.step()
        tt.assert_allclose(
            self.policy.eval(state),
            torch.tensor([[-0.595883, -0.595883, -0.595883]]),
            atol=1e-4,
        )
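
The two test_target examples above rely on a setUp fixture that the snippets do not include. The sketch below is a hypothetical reconstruction, not the original fixture: STATE_DIM, the seed, the layer shape, and the Box bounds are assumptions (only ACTION_DIM = 3 is implied by the asserted tensors), so it will not reproduce the exact values in the assert_allclose calls. It does show why the initial action and target outputs are expected to be exactly zero: the final linear layer is zero-initialized.

import unittest

import torch
from torch import nn
from gym.spaces import Box

STATE_DIM = 2       # assumed; only ACTION_DIM = 3 is implied by the asserted tensors
ACTION_DIM = 3


class TestDeterministicPolicy(unittest.TestCase):
    def setUp(self):
        torch.manual_seed(2)                              # assumed seed
        linear = nn.Linear(STATE_DIM, ACTION_DIM)
        nn.init.zeros_(linear.weight)                     # zero init is what makes the
        nn.init.zeros_(linear.bias)                       # first action/target all zeros
        self.model = nn.Sequential(linear)
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.1)
        self.space = Box(-1.0, 1.0, (ACTION_DIM,))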
Example #6
 def _ddqn(env, writer=DummyWriter()):
     model = model_constructor(env).to(device)
     optimizer = Adam(model.parameters(), lr=lr)
     q = QNetwork(model,
                  optimizer,
                  target=FixedTarget(target_update_frequency),
                  writer=writer)
     policy = GreedyPolicy(q,
                           env.action_space.n,
                           epsilon=LinearScheduler(initial_exploration,
                                                   final_exploration,
                                                   replay_start_size,
                                                   final_exploration_frame,
                                                   name="epsilon",
                                                   writer=writer))
     replay_buffer = PrioritizedReplayBuffer(replay_buffer_size,
                                             alpha=alpha,
                                             beta=beta,
                                             device=device)
     return DDQN(q,
                 policy,
                 replay_buffer,
                 discount_factor=discount_factor,
                 replay_start_size=replay_start_size,
                 update_frequency=update_frequency,
                 minibatch_size=minibatch_size)
Example #7
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        optimizer = Adam(self.model.parameters(),
                         lr=self.hyperparameters['lr'])

        q = QNetwork(self.model,
                     optimizer,
                     target=FixedTarget(
                         self.hyperparameters['target_update_frequency']),
                     writer=writer)

        policy = GreedyPolicy(
            q,
            self.n_actions,
            epsilon=LinearScheduler(
                self.hyperparameters['initial_exploration'],
                self.hyperparameters['final_exploration'],
                self.hyperparameters['replay_start_size'],
                self.hyperparameters['final_exploration_step'] -
                self.hyperparameters['replay_start_size'],
                name="exploration",
                writer=writer))

        replay_buffer = ExperienceReplayBuffer(
            self.hyperparameters['replay_buffer_size'], device=self.device)

        return DQN(
            q,
            policy,
            replay_buffer,
            discount_factor=self.hyperparameters['discount_factor'],
            minibatch_size=self.hyperparameters['minibatch_size'],
            replay_start_size=self.hyperparameters['replay_start_size'],
            update_frequency=self.hyperparameters['update_frequency'],
        )
Example #8
 def _dqn(env, writer=DummyWriter()):
     model = fc_relu_q(env).to(device)
     optimizer = Adam(model.parameters(), lr=lr)
     q = QNetwork(model,
                  optimizer,
                  env.action_space.n,
                  target=FixedTarget(target_update_frequency),
                  loss=mse_loss,
                  writer=writer)
     policy = GreedyPolicy(q,
                           env.action_space.n,
                           epsilon=LinearScheduler(initial_exploration,
                                                   final_exploration,
                                                   replay_start_size,
                                                   final_exploration_frame,
                                                   name="epsilon",
                                                   writer=writer))
     replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                            device=device)
     return DQN(q,
                policy,
                replay_buffer,
                discount_factor=discount_factor,
                replay_start_size=replay_start_size,
                update_frequency=update_frequency,
                minibatch_size=minibatch_size)
Example #9
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        n_updates = (train_steps - self.hyperparameters['replay_start_size']) / self.hyperparameters['update_frequency']

        optimizer = Adam(
            self.model.parameters(),
            lr=self.hyperparameters['lr'],
            eps=self.hyperparameters['eps']
        )

        q_dist = QDist(
            self.model,
            optimizer,
            self.n_actions,
            self.hyperparameters['atoms'],
            scheduler=CosineAnnealingLR(optimizer, n_updates),
            v_min=self.hyperparameters['v_min'],
            v_max=self.hyperparameters['v_max'],
            target=FixedTarget(self.hyperparameters['target_update_frequency']),
            writer=writer,
        )

        replay_buffer = NStepReplayBuffer(
            self.hyperparameters['n_steps'],
            self.hyperparameters['discount_factor'],
            PrioritizedReplayBuffer(
                self.hyperparameters['replay_buffer_size'],
                alpha=self.hyperparameters['alpha'],
                beta=self.hyperparameters['beta'],
                device=self.device
            )
        )
        def agent_constructor(writer):
            return DeepmindAtariBody(
                Rainbow(
                    q_dist,
                    replay_buffer,
                    exploration=LinearScheduler(
                        self.hyperparameters['initial_exploration'],
                        self.hyperparameters['final_exploration'],
                        0,
                        train_steps - self.hyperparameters['replay_start_size'],
                        name="exploration",
                        writer=writer
                    ),
                    discount_factor=self.hyperparameters['discount_factor'] ** self.hyperparameters["n_steps"],
                    minibatch_size=self.hyperparameters['minibatch_size'],
                    replay_start_size=self.hyperparameters['replay_start_size'],
                    update_frequency=self.hyperparameters['update_frequency'],
                    writer=writer,
                ),
                lazy_frames=True,
                episodic_lives=True
            )

        return agent_constructor(writer)
Example #10
    def _dqn(env, writer=DummyWriter()):
        action_repeat = 4
        last_timestep = last_frame / action_repeat
        last_update = (last_timestep - replay_start_size) / update_frequency
        final_exploration_step = final_exploration_frame / action_repeat

        model = nature_dqn(env).to(device)

        optimizer = Adam(
            model.parameters(),
            lr=lr,
            eps=eps
        )

        q = QNetwork(
            model,
            optimizer,
            scheduler=CosineAnnealingLR(optimizer, last_update),
            target=FixedTarget(target_update_frequency),
            writer=writer
        )

        policy = GreedyPolicy(
            q,
            env.action_space.n,
            epsilon=LinearScheduler(
                initial_exploration,
                final_exploration,
                replay_start_size,
                final_exploration_step - replay_start_size,
                name="epsilon",
                writer=writer
            )
        )

        replay_buffer = ExperienceReplayBuffer(
            replay_buffer_size,
            device=device
        )

        return DeepmindAtariBody(
            DQN(
                q,
                policy,
                replay_buffer,
                discount_factor=discount_factor,
                loss=smooth_l1_loss,
                minibatch_size=minibatch_size,
                replay_start_size=replay_start_size,
                update_frequency=update_frequency,
            ),
            lazy_frames=True
        )
Example #11
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        n_updates = (train_steps - self.hyperparameters['replay_start_size']) / self.hyperparameters['update_frequency']

        optimizer = Adam(
            self.model.parameters(),
            lr=self.hyperparameters['lr'],
            eps=self.hyperparameters['eps']
        )

        q = QNetwork(
            self.model,
            optimizer,
            scheduler=CosineAnnealingLR(optimizer, n_updates),
            target=FixedTarget(self.hyperparameters['target_update_frequency']),
            writer=writer
        )

        policy = GreedyPolicy(
            q,
            self.n_actions,
            epsilon=LinearScheduler(
                self.hyperparameters['initial_exploration'],
                self.hyperparameters['final_exploration'],
                self.hyperparameters['replay_start_size'],
                self.hyperparameters['final_exploration_step'] - self.hyperparameters['replay_start_size'],
                name="exploration",
                writer=writer
            )
        )

        replay_buffer = ExperienceReplayBuffer(
            self.hyperparameters['replay_buffer_size'],
            device=self.device
        )

        return DeepmindAtariBody(
            DQN(
                q,
                policy,
                replay_buffer,
                discount_factor=self.hyperparameters['discount_factor'],
                loss=smooth_l1_loss,
                minibatch_size=self.hyperparameters['minibatch_size'],
                replay_start_size=self.hyperparameters['replay_start_size'],
                update_frequency=self.hyperparameters['update_frequency'],
            ),
            lazy_frames=True
        )
Example #12
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        n_updates = (train_steps - self.hyperparameters['replay_start_size']) / self.hyperparameters['update_frequency']

        optimizer = Adam(
            self.model.parameters(),
            lr=self.hyperparameters['lr'],
            eps=self.hyperparameters['eps']
        )

        q = QDist(
            self.model,
            optimizer,
            self.n_actions,
            self.hyperparameters['atoms'],
            v_min=self.hyperparameters['v_min'],
            v_max=self.hyperparameters['v_max'],
            target=FixedTarget(self.hyperparameters['target_update_frequency']),
            scheduler=CosineAnnealingLR(optimizer, n_updates),
            writer=writer,
        )

        replay_buffer = ExperienceReplayBuffer(
            self.hyperparameters['replay_buffer_size'],
            device=self.device
        )

        return DeepmindAtariBody(
            C51(
                q,
                replay_buffer,
                exploration=LinearScheduler(
                    self.hyperparameters['initial_exploration'],
                    self.hyperparameters['final_exploration'],
                    0,
                    self.hyperparameters["final_exploration_step"] - self.hyperparameters["replay_start_size"],
                    name="epsilon",
                    writer=writer,
                ),
                discount_factor=self.hyperparameters["discount_factor"],
                minibatch_size=self.hyperparameters["minibatch_size"],
                replay_start_size=self.hyperparameters["replay_start_size"],
                update_frequency=self.hyperparameters["update_frequency"],
                writer=writer
            ),
            lazy_frames=True,
            episodic_lives=True
        )
Example #13
    def _c51(env, writer=DummyWriter()):
        action_repeat = 4
        last_timestep = last_frame / action_repeat
        last_update = (last_timestep - replay_start_size) / update_frequency

        model = nature_c51(env, atoms=atoms).to(device)
        optimizer = Adam(
            model.parameters(),
            lr=lr,
            eps=eps
        )
        q = QDist(
            model,
            optimizer,
            env.action_space.n,
            atoms,
            v_min=v_min,
            v_max=v_max,
            target=FixedTarget(target_update_frequency),
            scheduler=CosineAnnealingLR(optimizer, last_update),
            writer=writer,
        )
        replay_buffer = ExperienceReplayBuffer(
            replay_buffer_size,
            device=device
        )
        return DeepmindAtariBody(
            C51(
                q,
                replay_buffer,
                exploration=LinearScheduler(
                    initial_exploration,
                    final_exploration,
                    0,
                    last_timestep,
                    name="epsilon",
                    writer=writer,
                ),
                discount_factor=discount_factor,
                minibatch_size=minibatch_size,
                replay_start_size=replay_start_size,
                update_frequency=update_frequency,
                writer=writer
            ),
            lazy_frames=True
        )
Example #14
    def agent(self, writer=DummyWriter(), train_steps=float('inf')):
        optimizer = Adam(
            self.model.parameters(),
            lr=self.hyperparameters['lr'],
            eps=self.hyperparameters['eps']
        )

        q_dist = QDist(
            self.model,
            optimizer,
            self.n_actions,
            self.hyperparameters['atoms'],
            v_min=self.hyperparameters['v_min'],
            v_max=self.hyperparameters['v_max'],
            target=FixedTarget(self.hyperparameters['target_update_frequency']),
            writer=writer,
        )

        replay_buffer = NStepReplayBuffer(
            self.hyperparameters['n_steps'],
            self.hyperparameters['discount_factor'],
            PrioritizedReplayBuffer(
                self.hyperparameters['replay_buffer_size'],
                alpha=self.hyperparameters['alpha'],
                beta=self.hyperparameters['beta'],
                device=self.device
            )
        )

        return Rainbow(
            q_dist,
            replay_buffer,
            exploration=LinearScheduler(
                self.hyperparameters['initial_exploration'],
                self.hyperparameters['final_exploration'],
                0,
                train_steps - self.hyperparameters['replay_start_size'],
                name="exploration",
                writer=writer
            ),
            discount_factor=self.hyperparameters['discount_factor'] ** self.hyperparameters["n_steps"],
            minibatch_size=self.hyperparameters['minibatch_size'],
            replay_start_size=self.hyperparameters['replay_start_size'],
            update_frequency=self.hyperparameters['update_frequency'],
            writer=writer,
        )
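
A side note on the discount_factor ** n_steps argument used with the NStepReplayBuffer in Examples #9 and #14: the buffer hands back transitions whose stored reward already sums the discounted rewards over the n intermediate steps, so the value bootstrapped n steps ahead must be scaled by gamma ** n rather than gamma. A small arithmetic sketch with illustrative numbers (gamma and the rewards below are not taken from the presets):

gamma, n_steps = 0.99, 3
rewards = [1.0, 0.0, 2.0]                    # r_t, r_{t+1}, r_{t+2} from the buffer
bootstrap = 5.0                              # hypothetical value estimate at t + 3

n_step_return = sum(gamma ** i * r for i, r in enumerate(rewards))
target = n_step_return + gamma ** n_steps * bootstrap
print(target)                                # 1.0 + 0.99**2 * 2.0 + 0.99**3 * 5.0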
Example #15
    def __init__(self, policy, logger, out_dim, device="cpu"):
        self.hyperparameters = hyperparameters = default_hyperparameters
        self.policy = policy
        self.model = policy.model
        self.device = device
        self.logger = logger
        self.discount_factor = hyperparameters['discount_factor']
        self.out_dim = out_dim
        writer = DummyWriter()
        optimizer = Adam(self.model.parameters(),
                         lr=self.hyperparameters['lr'])

        self.q = q = QNetwork(
            self.model,
            optimizer,
            target=FixedTarget(
                self.hyperparameters['target_update_frequency']),
            writer=writer)
Example #16
    def _rainbow(env, writer=DummyWriter()):
        action_repeat = 4
        last_timestep = last_frame / action_repeat
        last_update = (last_timestep - replay_start_size) / update_frequency

        model = model_constructor(env, atoms=atoms, sigma=sigma).to(device)
        optimizer = Adam(model.parameters(), lr=lr, eps=eps)
        q = QDist(
            model,
            optimizer,
            env.action_space.n,
            atoms,
            scheduler=CosineAnnealingLR(optimizer, last_update),
            v_min=v_min,
            v_max=v_max,
            target=FixedTarget(target_update_frequency),
            writer=writer,
        )
        replay_buffer = PrioritizedReplayBuffer(replay_buffer_size,
                                                alpha=alpha,
                                                beta=beta,
                                                device=device)
        replay_buffer = NStepReplayBuffer(n_steps, discount_factor,
                                          replay_buffer)

        agent = Rainbow(
            q,
            replay_buffer,
            exploration=LinearScheduler(initial_exploration,
                                        final_exploration,
                                        0,
                                        last_timestep,
                                        name='exploration',
                                        writer=writer),
            discount_factor=discount_factor**n_steps,
            minibatch_size=minibatch_size,
            replay_start_size=replay_start_size,
            update_frequency=update_frequency,
            writer=writer,
        )
        return DeepmindAtariBody(agent, lazy_frames=True, episodic_lives=True)
Example #17
    def test_target_net(self):
        torch.manual_seed(2)
        model = nn.Sequential(nn.Linear(1, 1))
        optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
        q = QNetwork(model,
                     optimizer,
                     1,
                     loss=smooth_l1_loss,
                     target=FixedTarget(3))
        inputs = State(torch.tensor([1.]))
        errors = torch.tensor([-1.])

        policy_value = q(inputs).item()
        target_value = q.target(inputs).item()
        np.testing.assert_equal(policy_value, -0.008584141731262207)
        np.testing.assert_equal(target_value, -0.008584141731262207)

        q.reinforce(errors)
        policy_value = q(inputs).item()
        target_value = q.target(inputs).item()
        np.testing.assert_equal(policy_value, -0.20858412981033325)
        np.testing.assert_equal(target_value, -0.008584141731262207)

        q.reinforce(errors)
        policy_value = q(inputs).item()
        target_value = q.target(inputs).item()
        np.testing.assert_equal(policy_value, -0.4085841178894043)
        np.testing.assert_equal(target_value, -0.008584141731262207)

        q.reinforce(errors)
        policy_value = q(inputs).item()
        target_value = q.target(inputs).item()
        np.testing.assert_equal(policy_value, -0.6085841655731201)
        np.testing.assert_equal(target_value, -0.6085841655731201)

        q.reinforce(errors)
        policy_value = q(inputs).item()
        target_value = q.target(inputs).item()
        np.testing.assert_equal(policy_value, -0.8085841536521912)
        np.testing.assert_equal(target_value, -0.6085841655731201)
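
The test above shows the core behavior: with FixedTarget(3), the target copy is only refreshed from the online weights on every third update. The sketch below isolates that cadence using the newer Approximation API (step() rather than reinforce(errors)); it assumes a recent release of the autonomous-learning-library, where QNetwork no longer takes the action count (as in Examples #6 and #7) and State is imported from all.core (older releases had it under all.environments).

import torch
from torch import nn
from torch.optim import SGD

from all.approximation import FixedTarget, QNetwork
from all.core import State

model = nn.Linear(2, 3)                       # toy Q model: 2 features, 3 actions
optimizer = SGD(model.parameters(), lr=0.1)
q = QNetwork(model, optimizer, target=FixedTarget(2))

state = State(torch.ones(1, 2))
before = q.target(state)                      # snapshot of the target output

q(state).sum().backward()
q.step()                                      # 1st update: target still unchanged
assert torch.allclose(q.target(state), before)

q(state).sum().backward()
q.step()                                      # 2nd update: target synced to online net
assert not torch.allclose(q.target(state), before)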
Example #18
 def _c51(env, writer=DummyWriter()):
     model = nature_c51(env, atoms=51).to(device)
     optimizer = Adam(
         model.parameters(),
         lr=lr,
         eps=eps
     )
     q = QDist(
         model,
         optimizer,
         env.action_space.n,
         atoms,
         v_min=v_min,
         v_max=v_max,
         target=FixedTarget(target_update_frequency),
         writer=writer,
     )
     replay_buffer = ExperienceReplayBuffer(
         replay_buffer_size,
         device=device
     )
     return DeepmindAtariBody(
         C51(
             q,
             replay_buffer,
             exploration=LinearScheduler(
                 initial_exploration,
                 final_exploration,
                 replay_start_size,
                 final_exploration_frame,
                 name="epsilon",
                 writer=writer,
             ),
             discount_factor=discount_factor,
             minibatch_size=minibatch_size,
             replay_start_size=replay_start_size,
             update_frequency=update_frequency,
             writer=writer
         )
     )
Example #19
    def _dqn(env, writers=None):
        action_repeat = 4
        last_timestep = last_frame / action_repeat
        last_update = (last_timestep - replay_start_size) / update_frequency
        final_exploration_step = final_exploration_frame / action_repeat

        n_agents = len(env.agents)
        n_actions = env.action_spaces['first_0'].n

        model = model_constructor(env).to(device)

        optimizer = Adam(
            model.parameters(),
            lr=lr,
            eps=eps
        )

        q = Approximation(
            model,
            optimizer,
            scheduler=CosineAnnealingLR(optimizer, last_update),
            target=FixedTarget(target_update_frequency),
            writer=writers['first_0']
        )

        replay_buffer = ExperienceReplayBuffer(
            replay_buffer_size,
            store_device=device,
            device=device
        )

        def agent_constructor(writer):
            policy = GreedyPolicy(
                q,
                n_actions,
                epsilon=LinearScheduler(
                    initial_exploration,
                    final_exploration,
                    replay_start_size,
                    final_exploration_step - replay_start_size,
                    name="epsilon",
                    writer=writer
                )
            )

            return DeepmindAtariBody(
                DQN(
                    q,
                    policy,
                    replay_buffer,
                    discount_factor=discount_factor,
                    loss=smooth_l1_loss,
                    minibatch_size=minibatch_size,
                    replay_start_size=replay_start_size,
                    update_frequency=update_frequency,
                ),
                lazy_frames=True
            )

        return MultiagentEncoder(IndependentMultiagent({
            agent : agent_constructor(writers[agent])
            for agent in env.agents
        }), env.agents, device)
Example #20
    def _sac(env, writer=DummyWriter()):
        final_anneal_step = (last_frame -
                             replay_start_size) // update_frequency

        v_model = v_model_constructor(env).to(device)
        q_1_model = q1_model_constructor(env).to(device)
        q_2_model = q2_model_constructor(env).to(device)
        #quick and dirty implementation of parallel branch un/freeze
        policy_model = policy_model_constructor(
            env=env, train_parallel=train_parallel).to(device)

        if pretrained_models is not None:
            q_1_model = pretrained_models.q_1.model.to(device)
            q_2_model = pretrained_models.q_2.model.to(device)
            v_model = pretrained_models.v.model.to(device)
            policy_model = pretrained_models.policy.model.to(device)

        q_1_optimizer = Adam(q_1_model.parameters(), lr=lr_q)
        q_1 = QContinuousCtrlRep(q_1_model,
                                 q_1_optimizer,
                                 scheduler=CosineAnnealingLR(
                                     q_1_optimizer, final_anneal_step),
                                 target=FixedTarget(1000),
                                 writer=writer,
                                 name='q_1')

        q_2_optimizer = Adam(q_2_model.parameters(), lr=lr_q)
        q_2 = QContinuousCtrlRep(q_2_model,
                                 q_2_optimizer,
                                 scheduler=CosineAnnealingLR(
                                     q_2_optimizer, final_anneal_step),
                                 target=FixedTarget(1000),
                                 writer=writer,
                                 name='q_2')

        v_optimizer = Adam(v_model.parameters(), lr=lr_v)
        v = VNetworkCtrlRep(
            v_model,
            v_optimizer,
            scheduler=CosineAnnealingLR(v_optimizer, final_anneal_step),
            target=PolyakTarget(polyak_rate),
            writer=writer,
            name='v',
        )

        policy_optimizer = Adam(filter(lambda p: p.requires_grad,
                                       policy_model.parameters()),
                                lr=lr_pi)
        policy = SoftDeterministicPolicyCtrlRep(policy_model,
                                                policy_optimizer,
                                                env.action_space,
                                                scheduler=CosineAnnealingLR(
                                                    policy_optimizer,
                                                    final_anneal_step),
                                                target=FixedTarget(1000),
                                                writer=writer)

        replay_buffer = ExperienceReplayBuffer(replay_buffer_size,
                                               device=device)

        return TimeFeature(
            SACCtrlRep(policy=policy,
                       q_1=q_1,
                       q_2=q_2,
                       v=v,
                       replay_buffer=replay_buffer,
                       temperature_initial=temperature_initial,
                       entropy_target=(-env.action_space.shape[0] *
                                       entropy_target_scaling),
                       lr_temperature=lr_temperature,
                       replay_start_size=replay_start_size,
                       discount_factor=discount_factor,
                       update_frequency=update_frequency,
                       minibatch_size=minibatch_size,
                       writer=writer))
Example #21
    def setUp(self):
        self.model = Identity('cpu', target=FixedTarget(10))
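
FixedTarget is not the only target strategy in these examples: the SAC snippet in Example #20 pairs FixedTarget(1000) for its Q functions with PolyakTarget(polyak_rate) for its V function. The construction sketch below contrasts the two; the learning rates and the 0.005 rate are illustrative only, and the import path assumes a recent release of the library.

from torch import nn
from torch.optim import SGD

from all.approximation import FixedTarget, PolyakTarget, VNetwork

# Hard updates: the target copy jumps to the online weights every 1000 steps.
v_model_hard = nn.Linear(4, 1)
v_hard = VNetwork(v_model_hard, SGD(v_model_hard.parameters(), lr=1e-3),
                  target=FixedTarget(1000))

# Soft updates: on every step the target moves a small fraction of the way
# toward the online weights (an exponential moving average of them).
v_model_soft = nn.Linear(4, 1)
v_soft = VNetwork(v_model_soft, SGD(v_model_soft.parameters(), lr=1e-3),
                  target=PolyakTarget(0.005))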