Example #1
# Imports needed by this snippet; ActorNetwork, CriticNetwork and MemoryBuffer
# are defined elsewhere in the source project.
from functools import reduce

from mxnet import nd, init, gluon, autograd
from mxnet.gluon import loss as gloss

class DDPG:
    def __init__(self, action_dim, action_bound, actor_learning_rate,
                 critic_learning_rate, batch_size, memory_size, gamma, tau,
                 explore_steps, explore_noise, noise_clip, ctx):
        self.action_dim = action_dim
        self.action_bound = nd.array(action_bound, ctx=ctx)
        self.actor_learning_rate = actor_learning_rate
        self.critic_learning_rate = critic_learning_rate
        self.batch_size = batch_size
        self.memory_size = memory_size
        self.gamma = gamma
        self.tau = tau
        self.explore_steps = explore_steps
        self.explore_noise = explore_noise
        self.noise_clip = noise_clip  # accepted but never used in this DDPG implementation
        self.ctx = ctx
        self.total_steps = 0

        self.memory_buffer = MemoryBuffer(self.memory_size, ctx=ctx)

        self.target_actor_network = ActorNetwork(self.action_dim,
                                                 self.action_bound)
        self.main_actor_network = ActorNetwork(self.action_dim,
                                               self.action_bound)
        self.target_critic_network = CriticNetwork()
        self.main_critic_network = CriticNetwork()

        self.target_actor_network.collect_params().initialize(
            init=init.Xavier(), ctx=ctx)
        self.target_critic_network.collect_params().initialize(
            init=init.Xavier(), ctx=ctx)
        self.main_actor_network.collect_params().initialize(init=init.Xavier(),
                                                            ctx=ctx)
        self.main_critic_network.collect_params().initialize(
            init=init.Xavier(), ctx=ctx)

        self.actor_optimizer = gluon.Trainer(
            self.main_actor_network.collect_params(), 'adam',
            {'learning_rate': self.actor_learning_rate})
        self.critic_optimizer = gluon.Trainer(
            self.main_critic_network.collect_params(), 'adam',
            {'learning_rate': self.critic_learning_rate})

    def choose_action_train(self, state):
        state = nd.array([state], ctx=self.ctx)
        action = self.main_actor_network(state)
        # add Gaussian exploration noise (not clipped)
        noise = nd.normal(loc=0,
                          scale=self.explore_noise,
                          shape=action.shape,
                          ctx=self.ctx)
        action += noise
        clipped_action = self.action_clip(action)
        return clipped_action

    def choose_action_evaluate(self, state):
        state = nd.array([state], ctx=self.ctx)
        action = self.main_actor_network(state)
        return action

    def action_clip(self, action):
        low_bound = [
            float(self.action_bound[i][0].asnumpy())
            for i in range(self.action_dim)
        ]
        high_bound = [
            float(self.action_bound[i][1].asnumpy())
            for i in range(self.action_dim)
        ]
        bound = list(zip(low_bound, high_bound))
        # clip and reshape
        action_list = [
            nd.clip(action[:, i], bound[i][0], bound[i][1]).reshape(-1, 1)
            for i in range(self.action_dim)
        ]
        # concat
        clipped_action = reduce(nd.concat, action_list)
        return clipped_action.squeeze()

    def soft_update(self, target_network, main_network):
        # Polyak averaging: target <- (1 - tau) * target + tau * main
        target_parameters = target_network.collect_params().keys()
        main_parameters = main_network.collect_params().keys()
        d = zip(target_parameters, main_parameters)
        for x, y in d:
            target_network.collect_params()[x].data()[:] = \
                target_network.collect_params()[x].data() * \
                (1 - self.tau) + main_network.collect_params()[y].data() * self.tau

    def update(self):
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory_buffer.sample(
            self.batch_size)

        # ---------------optimize critic------------------
        with autograd.record():
            next_action_batch = self.target_actor_network(next_state_batch)
            next_q = self.target_critic_network(next_state_batch,
                                                next_action_batch).squeeze()
            target_q = reward_batch + (1 - done_batch) * self.gamma * next_q

            current_q = self.main_critic_network(state_batch, action_batch)
            loss = gloss.L2Loss()
            value_loss = loss(current_q, target_q.detach())
        self.main_critic_network.collect_params().zero_grad()
        value_loss.backward()
        self.critic_optimizer.step(self.batch_size)

        # ---------------optimize actor-------------------
        with autograd.record():
            pred_action_batch = self.main_actor_network(state_batch)
            actor_loss = -nd.mean(
                self.main_critic_network(state_batch, pred_action_batch))
        self.main_actor_network.collect_params().zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step(1)

        self.soft_update(self.target_actor_network, self.main_actor_network)
        self.soft_update(self.target_critic_network, self.main_critic_network)

    def save(self):
        self.main_actor_network.save_parameters(
            'DDPG Pendulum Main Actor.params')
        self.target_actor_network.save_parameters(
            'DDPG Pendulum Target Actor.params')
        self.main_critic_network.save_parameters(
            'DDPG Pendulum Main Critic.params')
        self.target_critic_network.save_parameters(
            'DDPG Pendulum Target Critic.params')

    def load(self):
        self.main_actor_network.load_parameters(
            'DDPG Pendulum Main Actor.params')
        self.target_actor_network.load_parameters(
            'DDPG Pendulum Target Actor.params')
        self.main_critic_network.load_parameters(
            'DDPG Pendulum Main Critic.params')
        self.target_critic_network.load_parameters(
            'DDPG Pendulum Target Critic.params')
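
A minimal training-loop sketch showing how the DDPG class above could be driven. The gym environment id, the hyperparameter values, and the buffer's store_transition method are illustrative assumptions (the snippet itself only defines the agent); the classic gym API (reset returns the observation, step returns a 4-tuple) is assumed as well.

import gym
import mxnet as mx

env = gym.make('Pendulum-v1')                        # or 'Pendulum-v0' on older gym versions
agent = DDPG(action_dim=env.action_space.shape[0],
             action_bound=[[-2.0, 2.0]],             # one [low, high] pair per action dimension
             actor_learning_rate=1e-4, critic_learning_rate=1e-3,
             batch_size=64, memory_size=100000, gamma=0.99, tau=0.005,
             explore_steps=1000, explore_noise=0.1, noise_clip=0.5,
             ctx=mx.cpu())

state = env.reset()
for _ in range(50000):
    if agent.total_steps < agent.explore_steps:
        action = env.action_space.sample()           # random warm-up before using the actor
    else:
        action = agent.choose_action_train(state).asnumpy()
    next_state, reward, done, _ = env.step(action)
    # store_transition is a placeholder for however the project's MemoryBuffer
    # records (s, a, r, s', done) tuples.
    agent.memory_buffer.store_transition(state, action, reward, next_state, done)
    agent.total_steps += 1
    if agent.total_steps > agent.explore_steps:
        agent.update()
    state = env.reset() if done else next_state
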
Example #2
# Imports needed by this snippet; DoubleQNetwork and MemoryBuffer are defined
# elsewhere in the source project.
import random

from mxnet import nd, init, gluon, autograd
from mxnet.gluon import loss as gloss

class DoubleDQN:
    def __init__(self, n_action, init_epsilon, final_epsilon, gamma,
                 buffer_size, batch_size, replace_iter, annealing,
                 learning_rate, ctx):
        self.n_action = n_action
        self.epsilon = init_epsilon
        self.init_epsilon = init_epsilon
        self.final_epsilon = final_epsilon
        # discount factor
        self.gamma = gamma
        # memory buffer size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        # replace the parameters of the target network every T time steps
        self.replace_iter = replace_iter
        # number of steps over which epsilon is linearly annealed to its minimum value
        self.annealing = annealing
        self.learning_rate = learning_rate
        self.ctx = ctx

        self.total_steps = 0
        self.replay_buffer = MemoryBuffer(self.buffer_size, ctx)  # use deque

        # build the network
        self.target_network = DoubleQNetwork(n_action)
        self.main_network = DoubleQNetwork(n_action)
        self.target_network.collect_params().initialize(
            init.Xavier(), ctx=ctx)  # initialize the params
        self.main_network.collect_params().initialize(init.Xavier(), ctx=ctx)

        # optimize the main network
        self.optimizer = gluon.Trainer(self.main_network.collect_params(),
                                       'adam',
                                       {'learning_rate': self.learning_rate})

    def choose_action(self, state):
        state = nd.array([state], ctx=self.ctx)
        if nd.random.uniform(0, 1) > self.epsilon:
            # choose the best action
            q_value = self.main_network(state)
            action = int(nd.argmax(q_value, axis=1).asnumpy())
        else:
            # random choice
            action = random.choice(range(self.n_action))
        # anneal
        self.epsilon = max(
            self.final_epsilon, self.epsilon -
            (self.init_epsilon - self.final_epsilon) / self.annealing)
        self.total_steps += 1
        return action

    def update(self):
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.replay_buffer.sample(
            self.batch_size)
        with autograd.record():
            # get the Q(s,a)
            all_current_q_value = self.main_network(state_batch)
            main_q_value = nd.pick(all_current_q_value, action_batch)

            # different from DQN:
            # select the greedy next action with the main network, then
            # evaluate it with the target network
            all_next_q_value = self.target_network(
                next_state_batch).detach()  # gradients flow only through the main network
            max_action = nd.argmax(self.main_network(next_state_batch), axis=1)
            target_q_value = nd.pick(all_next_q_value, max_action).detach()

            target_q_value = reward_batch + (
                1 - done_batch) * self.gamma * target_q_value

            # record loss
            loss = gloss.L2Loss()
            value_loss = loss(main_q_value, target_q_value)
        self.main_network.collect_params().zero_grad()
        value_loss.backward()
        self.optimizer.step(batch_size=self.batch_size)

    def replace_parameters(self):
        self.main_network.save_parameters('Double_DQN_temp_params')
        self.target_network.load_parameters('Double_DQN_temp_params')
        print('Double_DQN parameters replaced')

    def save_parameters(self):
        self.target_network.save_parameters(
            'Double_DQN_target_network_parameters')
        self.main_network.save_parameters('Double_DQN_main_network_parameters')

    def load_parameters(self):
        self.target_network.load_parameters(
            'Double_DQN_target_network_parameters')
        self.main_network.load_parameters('Double_DQN_main_network_parameters')
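
To make the "different from DQN" step concrete in isolation: the greedy next action is selected by the main network on the next states and then evaluated by the target network. A standalone sketch with toy numbers (values are illustrative only):

from mxnet import nd

q_main_next = nd.array([[1.0, 5.0, 2.0],             # main network Q(s', .) for a batch of 2
                        [0.5, 0.2, 0.9]])
q_target_next = nd.array([[0.8, 4.0, 3.0],           # target network Q(s', .)
                          [1.5, 0.1, 0.7]])

greedy_action = nd.argmax(q_main_next, axis=1)       # [1., 2.]
double_q = nd.pick(q_target_next, greedy_action)     # [4.0, 0.7]

reward = nd.array([1.0, 0.0])
done = nd.array([0.0, 1.0])
gamma = 0.99
target = reward + (1 - done) * gamma * double_q      # [4.96, 0.0]
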
Example #3
# Imports needed by this snippet; Actor, Critic and MemoryBuffer are defined
# elsewhere in the source project.
from functools import reduce

from mxnet import nd, init, gluon, autograd
from mxnet.gluon import loss as gloss

class TD3:
    def __init__(self, action_dim, action_bound, actor_learning_rate,
                 critic_learning_rate, batch_size, memory_size, gamma, tau,
                 explore_steps, policy_update, policy_noise, explore_noise,
                 noise_clip, ctx):
        self.action_dim = action_dim
        self.action_bound = nd.array(action_bound, ctx=ctx)

        self.actor_learning_rate = actor_learning_rate
        self.critic_learning_rate = critic_learning_rate
        self.batch_size = batch_size
        self.memory_size = memory_size
        self.gamma = gamma
        self.tau = tau
        self.explore_steps = explore_steps
        self.policy_update = policy_update
        self.policy_noise = policy_noise
        self.explore_noise = explore_noise
        self.noise_clip = noise_clip
        self.ctx = ctx

        self.main_actor_network = Actor(action_dim, self.action_bound)
        self.target_actor_network = Actor(action_dim, self.action_bound)
        self.main_critic_network1 = Critic()
        self.target_critic_network1 = Critic()
        self.main_critic_network2 = Critic()
        self.target_critic_network2 = Critic()

        self.main_actor_network.collect_params().initialize(init=init.Xavier(),
                                                            ctx=ctx)
        self.target_actor_network.collect_params().initialize(
            init=init.Xavier(), ctx=ctx)
        self.main_critic_network1.collect_params().initialize(
            init=init.Xavier(), ctx=ctx)
        self.target_critic_network1.collect_params().initialize(
            init=init.Xavier(), ctx=ctx)
        self.main_critic_network2.collect_params().initialize(
            init=init.Xavier(), ctx=ctx)
        self.target_critic_network2.collect_params().initialize(
            init=init.Xavier(), ctx=ctx)

        self.actor_optimizer = gluon.Trainer(
            self.main_actor_network.collect_params(), 'adam',
            {'learning_rate': self.actor_learning_rate})
        self.critic1_optimizer = gluon.Trainer(
            self.main_critic_network1.collect_params(), 'adam',
            {'learning_rate': self.critic_learning_rate})
        self.critic2_optimizer = gluon.Trainer(
            self.main_critic_network2.collect_params(), 'adam',
            {'learning_rate': self.critic_learning_rate})

        self.total_steps = 0
        self.total_train_steps = 0

        self.memory_buffer = MemoryBuffer(buffer_size=self.memory_size,
                                          ctx=ctx)

    def choose_action_train(self, state):
        state = nd.array([state], ctx=self.ctx)
        action = self.main_actor_network(state)
        # exploration noise is not clipped (only the target policy smoothing noise in update() is)
        noise = nd.normal(loc=0,
                          scale=self.explore_noise,
                          shape=action.shape,
                          ctx=self.ctx)
        action += noise
        clipped_action = self.action_clip(action)
        return clipped_action

    # use this to choose actions when evaluating the agent (no exploration noise).
    def choose_action_evaluate(self, state):
        state = nd.array([state], ctx=self.ctx)
        action = self.main_actor_network(state)
        return action

    # After adding noise to an action, clip it so it stays within the valid action bounds.
    # (A simpler, vectorized alternative is sketched after this class.)
    def action_clip(self, action):
        low_bound = [
            float(self.action_bound[i][0].asnumpy())
            for i in range(self.action_dim)
        ]
        high_bound = [
            float(self.action_bound[i][1].asnumpy())
            for i in range(self.action_dim)
        ]
        bound = list(zip(low_bound, high_bound))
        # clip and reshape
        action_list = [
            nd.clip(action[:, i], bound[i][0], bound[i][1]).reshape(-1, 1)
            for i in range(self.action_dim)
        ]
        # concat
        clipped_action = reduce(nd.concat, action_list)
        return clipped_action.squeeze()

    def soft_update(self, target_network, main_network):
        target_parameters = target_network.collect_params().keys()
        main_parameters = main_network.collect_params().keys()
        d = zip(target_parameters, main_parameters)
        for x, y in d:
            target_network.collect_params()[x].data()[:] = \
                target_network.collect_params()[x].data() * \
                (1 - self.tau) + main_network.collect_params()[y].data() * self.tau

    def update(self):
        self.total_train_steps += 1
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory_buffer.sample(
            self.batch_size)

        # --------------optimize the critic network--------------------
        with autograd.record():
            # choose next action according to target policy network
            next_action_batch = self.target_actor_network(next_state_batch)
            noise = nd.normal(loc=0,
                              scale=self.policy_noise,
                              shape=next_action_batch.shape,
                              ctx=self.ctx)
            # with noise clip
            noise = nd.clip(noise,
                            a_min=-self.noise_clip,
                            a_max=self.noise_clip)
            next_action_batch = next_action_batch + noise
            clipped_action = self.action_clip(next_action_batch)

            # get target q value
            target_q_value1 = self.target_critic_network1(
                next_state_batch, clipped_action)
            target_q_value2 = self.target_critic_network2(
                next_state_batch, clipped_action)
            target_q_value = nd.minimum(target_q_value1,
                                        target_q_value2).squeeze()
            target_q_value = reward_batch + (1.0 - done_batch) * (
                self.gamma * target_q_value)

            # get current q value
            current_q_value1 = self.main_critic_network1(
                state_batch, action_batch)
            current_q_value2 = self.main_critic_network2(
                state_batch, action_batch)
            loss = gloss.L2Loss()

            value_loss1 = loss(current_q_value1, target_q_value.detach())
            value_loss2 = loss(current_q_value2, target_q_value.detach())

        self.main_critic_network1.collect_params().zero_grad()
        value_loss1.backward()
        self.critic1_optimizer.step(self.batch_size)

        self.main_critic_network2.collect_params().zero_grad()
        value_loss2.backward()
        self.critic2_optimizer.step(self.batch_size)

        # ---------------optimize the actor network-------------------------
        if self.total_train_steps % self.policy_update == 0:
            with autograd.record():
                pred_action_batch = self.main_actor_network(state_batch)
                actor_loss = -nd.mean(
                    self.main_critic_network1(state_batch, pred_action_batch))

            self.main_actor_network.collect_params().zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step(1)

            self.soft_update(self.target_actor_network,
                             self.main_actor_network)
            self.soft_update(self.target_critic_network1,
                             self.main_critic_network1)
            self.soft_update(self.target_critic_network2,
                             self.main_critic_network2)

    def save(self):
        self.main_actor_network.save_parameters(
            'TD3 LunarLander main actor network.params')
        self.target_actor_network.save_parameters(
            'TD3 LunarLander target actor network.params')
        self.main_critic_network1.save_parameters(
            'TD3 LunarLander main critic network 1.params')
        self.main_critic_network2.save_parameters(
            'TD3 LunarLander main critic network 2.params')
        self.target_critic_network1.save_parameters(
            'TD3 LunarLander target critic network 1.params')
        self.target_critic_network2.save_parameters(
            'TD3 LunarLander target critic network 2.params')

    def load(self):
        self.main_actor_network.load_parameters(
            'TD3 LunarLander main actor network.params')
        self.target_actor_network.load_parameters(
            'TD3 LunarLander target actor network.params')
        self.main_critic_network1.load_parameters(
            'TD3 LunarLander main critic network 1.params')
        self.main_critic_network2.load_parameters(
            'TD3 LunarLander main critic network 2.params')
        self.target_critic_network1.load_parameters(
            'TD3 LunarLander target critic network 1.params')
        self.target_critic_network2.load_parameters(
            'TD3 LunarLander target critic network 2.params')
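
As the comment in action_clip above suggests, the per-dimension clipping can be written more compactly with broadcasting. A sketch of an equivalent alternative, assuming the same (action_dim, 2) action_bound NDArray that the classes above store; it is an illustration, not the original author's code:

from mxnet import nd

def action_clip_vectorized(action, action_bound):
    # action: (batch, action_dim); action_bound: (action_dim, 2) rows of [low, high]
    low = action_bound[:, 0].reshape(1, -1)              # (1, action_dim)
    high = action_bound[:, 1].reshape(1, -1)             # (1, action_dim)
    clipped = nd.minimum(nd.maximum(action, low), high)  # broadcast element-wise clip
    return clipped.squeeze()

# e.g. clipped = action_clip_vectorized(noisy_action, agent.action_bound)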