Code Example #1
File: DQN.py Project: durgaharish1993/RL_Course
    def update(self, state, action, reward, new_state, done):

        self.experience_replay.append((state, action, reward, new_state,
                                       done))  # add new transition to dataset

        # anneal the exploration rate linearly, never below 0
        self.epsilon = max(
            self.epsilon -
            (self.initial_epsilon - self.final_epsilon) / self.epsilon_decay,
            0)

        if len(self.experience_replay) >= self.observation:  # start training once enough transitions have been collected

            # minibatch = np.array(random.sample(self.experience_replay, self.batch_size))
            # states, actions, rewards, new_states, dones = tuple(minibatch[:, k] for k in range(5))

            mini_batch = random.sample(self.experience_replay, self.batch_size)
            states = torch.cat([
                mini_batch[k][0].unsqueeze(0) for k in range(self.batch_size)
            ])
            actions = [mini_batch[k][1] for k in range(self.batch_size)]
            rewards = [mini_batch[k][2] for k in range(self.batch_size)]
            new_states = torch.cat([
                mini_batch[k][3].unsqueeze(0) for k in range(self.batch_size)
            ])
            dones = [mini_batch[k][4] for k in range(self.batch_size)]

            new_states = to_variable(new_states)

            # Q-values for the next states, used only to build the targets
            q_prime = to_numpy(self.net.forward(new_states))

            states = to_variable(states)
            out = self.net.forward(states)

            # Perform Gradient Descent
            action_input = to_variable(actions, dtype='long')
            y_label = to_variable([
                rewards[i] if dones[i] else rewards[i] +
                self.gamma * np.max(q_prime[i]) for i in range(self.batch_size)
            ])

            # Q-values of the actions that were actually taken
            y_out = out.gather(1, action_input.view(-1, 1))

            self.optimizer.zero_grad()
            loss = self.loss(y_out, y_label)
            loss.backward()
            self.optimizer.step()
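Note: every snippet on this page calls to_variable and to_numpy helpers that are defined elsewhere in the project and not shown here. The sketch below is an assumption of what they likely do, written against the pre-0.4 "Variable" PyTorch API these examples clearly target (the argument names dtype, isCuda and volatile are taken from the call sites; the bodies are guesses, not the project's actual code):

import numpy as np
import torch
from torch.autograd import Variable

isGPU = torch.cuda.is_available()  # assumed module-level flag, mirroring its use in the snippets


def to_variable(x, dtype='float', isCuda=True, volatile=False):
    """Wrap a tensor, ndarray or list in an autograd Variable (pre-0.4 PyTorch). Assumed helper."""
    if not torch.is_tensor(x):
        x = torch.from_numpy(
            np.asarray(x, dtype=np.int64 if dtype == 'long' else np.float32))
    x = x.long() if dtype == 'long' else x.float()
    if isCuda and isGPU:
        x = x.cuda()
    return Variable(x, volatile=volatile)


def to_numpy(x):
    """Move a Variable back to a CPU numpy array. Assumed helper."""
    return x.data.cpu().numpy()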
Code Example #2
File: DDPG.py Project: durgaharish1993/RL_Course
    def update(self, state, action, reward, new_state, done):

        self.experience_replay.append((state, action, reward, new_state, done))

        if len(self.experience_replay) >= self.observation:  # start training once enough transitions have been collected
            # Sample batch from memory replay

            mini_batch = random.sample(self.experience_replay, self.batch_size)
            state_batch = torch.cat([
                mini_batch[k][0].unsqueeze(0) for k in range(self.batch_size)
            ])
            action_batch = [mini_batch[k][1] for k in range(self.batch_size)]
            reward_batch = [mini_batch[k][2] for k in range(self.batch_size)]
            next_state_batch = torch.cat([
                mini_batch[k][3].unsqueeze(0) for k in range(self.batch_size)
            ])
            terminal_batch = [mini_batch[k][4] for k in range(self.batch_size)]

            action_tensor = to_variable(np.vstack(action_batch))

            # Prepare for the target q batch
            value = self.actor_target.forward(
                to_variable(next_state_batch, volatile=True))
            next_q_values = self.critic_target.forward(
                [to_variable(next_state_batch, volatile=True), value])

            next_q_values.volatile = False

            # Bellman target: (1 - done) masks out the bootstrap term for terminal transitions
            not_done = to_variable([0. if t else 1. for t in terminal_batch])
            y_batch = to_variable(reward_batch) + self.discount * \
                not_done * next_q_values

            # Critic update
            self.critic.zero_grad()

            q_batch = self.critic.forward(
                [to_variable(state_batch), action_tensor])

            value_loss = self.loss(q_batch, y_batch)
            value_loss.backward()
            self.critic_optim.step()

            # Actor update
            self.actor.zero_grad()

            value = self.actor.forward(to_variable(state_batch))
            policy_loss = -self.critic.forward(
                [to_variable(state_batch), value])

            policy_loss = policy_loss.mean()
            policy_loss.backward()

            self.actor_optim.step()

            # Target update
            soft_update(self.actor_target, self.actor, self.tau)
            soft_update(self.critic_target, self.critic, self.tau)
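Note: soft_update, used for the target-network updates above, is another helper not shown on this page. It is presumably the standard Polyak averaging step, theta_target <- (1 - tau) * theta_target + tau * theta; a minimal sketch under that assumption, with the argument order matching the calls above (target network first, live network second):

def soft_update(target_net, source_net, tau):
    """Polyak-average the live network's parameters into the target network. Assumed helper."""
    for target_param, param in zip(target_net.parameters(),
                                   source_net.parameters()):
        target_param.data.copy_(
            target_param.data * (1.0 - tau) + param.data * tau)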
Code Example #3
    def update(self, state, action, reward, new_state, done):
        if self.config['use_memory']:
            self.experience_replay.append(
                new_state.numpy(), action.tolist(), reward, done)  # add new transition to dataset
        else:
            self.experience_replay.append((state, action.tolist(), reward, new_state, done))

        if done:
            self.random_process.reset_states()

        self.epsilon -= self.depsilon

        if len(self.experience_replay) >= self.observation:  # start training once enough transitions have been collected
            # Sample batch from memory replay

            if self.config['use_memory']:
                state_batch, action_batch, reward_batch, \
                next_state_batch, terminal_batch = self.experience_replay.sample_and_split(self.batch_size)
                state_batch = state_batch.reshape(-1, 4, 80, 80)
                next_state_batch = next_state_batch.reshape(-1, 4, 80, 80)

            else:
                mini_batch = random.sample(self.experience_replay, self.batch_size)
                state_batch = torch.cat([mini_batch[k][0].unsqueeze(0) for k in range(self.batch_size)])
                action_batch = [mini_batch[k][1] for k in range(self.batch_size)]
                reward_batch = [mini_batch[k][2] for k in range(self.batch_size)]
                next_state_batch = torch.cat([mini_batch[k][3].unsqueeze(0) for k in range(self.batch_size)])
                # stored as a (1 - done) mask so the Bellman target below bootstraps only for non-terminal states
                terminal_batch = [0. if mini_batch[k][4] else 1.
                                  for k in range(self.batch_size)]

            # Prepare for the target q batch
            value_c, _ = self.actor_target.forward(to_variable(next_state_batch, volatile=True))
            next_q_values = self.critic_target.forward([to_variable(next_state_batch, volatile=True), value_c])
            next_q_values.volatile = False

            y_batch = to_variable(reward_batch) + self.discount * \
                to_variable(terminal_batch) * next_q_values

            # Critic update
            self.critic.zero_grad()

            q_batch = self.critic.forward([to_variable(state_batch), to_variable(action_batch)])

            value_loss = self.loss(q_batch, y_batch)
            value_loss.backward()
            self.critic_optim.step()

            # Actor update
            self.actor.zero_grad()

            value_c, _ = self.actor.forward(to_variable(state_batch))
            policy_loss = -self.critic.forward([to_variable(state_batch), value_c])

            policy_loss = policy_loss.mean()
            policy_loss.backward()
            self.actor_optim.step()

            # Target update
            soft_update(self.actor_target, self.actor, self.tau)
            soft_update(self.critic_target, self.critic, self.tau)
Code Example #4
File: DDPG.py Project: durgaharish1993/RL_Course
    def select_action(self, state, test=False):
        value = to_numpy(self.actor.forward(to_variable(state, volatile=True)))

        cur_episode = len(self.experience_replay)

        action = np.clip(value[0] + self.noise.generate(cur_episode), -1, 1)

        return action
Code Example #5
    def __init__(self, shared_net, num_actions, in_channels, use_conv, config):
        super(A3C, self).__init__()
        net = A3CNet(config, in_channels, num_actions, use_conv)

        self.net = net.cuda() if isGPU else net
        self.config = config

        if config['name'] == 'LSTM':
            self.cx = to_variable(torch.zeros(1, config['hidden_size']))
            self.hx = to_variable(torch.zeros(1, config['hidden_size']))

        self.shared_net = shared_net

        self.optimizer = torch.optim.Adam(self.shared_net.parameters(),
                                          lr=config['lr'])
        self.gamma = config['gamma']
        self.tau = config['tau']
        self.clip_norm = config['clip_norm']
        self.entropy_beta = config['entropy_beta']
Code Example #6
    def select_action(self, state, test=False):
        value_c, value_d = self.actor.forward(to_variable(state, volatile=True))

        action_d = F.softmax(value_d)
        action_d = to_numpy(action_d.multinomial())  # sample a discrete action index from the softmax distribution

        action_c = to_numpy(value_c)
        action_c += (max(self.epsilon, 0) * self.random_process.sample()) if not test else 0
        action_c = action_c[0]
        return action_c, action_d
Code Example #7
    def select_action(self, state, test=False):

        on_state = to_variable(state, volatile=True)

        greedy = np.random.rand()
        if greedy < self.epsilon and not test:  # explore
            action = np.random.randint(self.action_num)
        else:  # exploit
            action = np.argmax(to_numpy(self.net.forward(on_state)))

        return action
Code Example #8
    def compute_expect(self, state, value, network, volatile=False):

        if self.use_expect:
            # expected Q-value: evaluate the critic for every discrete action
            # and weight each result by the action probabilities in `value`
            actions = [
                to_variable(i * torch.ones(value.size(0), 1))
                for i in range(self.nb_actions)
            ]

            next_q_values = torch.cat([
                network.forward([to_variable(state, volatile=volatile), a])
                for a in actions
            ], 1)

            next_q_values = torch.cat([
                value[i, :].dot(next_q_values[i, :])
                for i in range(self.batch_size)
            ], 0)
        else:
            next_q_values = network.forward(
                [to_variable(state, volatile=volatile), value])

        return next_q_values
Code Example #9
    def update(self, values, log_probs, rewards, entropies, done, state=None):

        if done:
            if self.config['name'] == 'LSTM':
                self.cx = to_variable(
                    torch.zeros(1, self.config['hidden_size']))
                self.hx = to_variable(
                    torch.zeros(1, self.config['hidden_size']))
            R = to_variable(torch.zeros(1, 1))
        else:
            if self.config['name'] == 'LSTM':
                self.cx = to_variable(self.cx.data)
                self.hx = to_variable(self.hx.data)
            _, value, _, _ = self.select_action(state)
            R = to_variable(value.data)

        values.append(R)
        # print(len(values), len(log_probs), len(rewards), len(entropies))

        policy_loss = 0.
        value_loss = 0.

        gae = torch.zeros(1, 1)
        gae = gae.cuda() if isGPU else gae
        for i in reversed(range(len(rewards))):
            try:
                R = self.gamma * R + rewards[i]
            except TypeError:
                pass

            # Generalized Advantage Estimation
            delta_t = rewards[i] + self.gamma * values[i + 1].data - values[i].data
            gae = gae * self.gamma * self.tau + delta_t

            policy_loss -= log_probs[i] * to_variable(
                gae) + self.entropy_beta * entropies[i]
            value_loss += 0.5 * (R - values[i])**2

        # Perform asynchronous update

        final_loss = policy_loss + 0.5 * value_loss

        self.optimizer.zero_grad()

        final_loss.backward()

        nn.utils.clip_grad_norm(self.net.parameters(), self.clip_norm)

        ensure_shared_grads(self.net, self.shared_net)

        self.optimizer.step()
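Note: ensure_shared_grads, called just before the optimizer step, is the usual A3C helper that hands the worker's gradients to the shared model. Its definition is not shown on this page; the body below is the commonly used implementation rather than this project's exact code:

def ensure_shared_grads(model, shared_model):
    """Point the shared model's gradients at the local worker's gradients. Assumed helper."""
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None:
            # another worker already attached gradients for this step
            return
        shared_param._grad = param.grad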
Code Example #10
    def select_action(self, state, test=False):
        state = to_variable(state, volatile=test)

        if self.config['name'] == 'LSTM':
            on_state = state, (self.hx, self.cx)
            value, logit, (hx, cx) = self.net.forward(on_state)
        else:
            on_state = state
            # print(state.size())
            value, logit = self.net.forward(on_state)

        prob = F.softmax(logit)
        log_prob = F.log_softmax(logit)
        entropy = -(log_prob * prob).sum(1)

        action = to_numpy(prob.multinomial())
        log_prob = log_prob.gather(1, to_variable(action, dtype='long'))

        action = action[0, 0]

        if self.config['name'] == 'LSTM':
            self.hx, self.cx = hx, cx

        return action, value, log_prob, entropy
Code Example #11
    def select_action(self, state, test=False):
        value = to_numpy(self.actor.forward(to_variable(state, volatile=True)))
        # print(value)

        cur_episode = len(self.experience_replay)

        if self.action_type == 'continuous':
            action = np.clip(value[0] + self.noise.generate(cur_episode), -1,
                             1)
        else:
            action = self.noise.generate(value[0], cur_episode)
            if isinstance(action, int):
                # one-hot encode the discrete action index
                action = np.array([1., 0.] if action == 0 else [0., 1.])
            # action = np.clip(action, 0.4, 0.6)

        return action
Code Example #12
    def select_action(self, state, test=False):
        state = to_variable(state, isCuda=False, volatile=test)

        if self.config['name'] == 'LSTM':
            on_state = state, (self.hx, self.cx)
            value, logit, (hx, cx) = self.net.forward(on_state)
        else:
            on_state = state
            value, logit = self.net.forward(on_state)

        prob = F.softmax(logit)
        log_prob = F.log_softmax(logit)
        entropy = -(log_prob * prob).sum(1)

        action = prob.multinomial().data
        log_prob = log_prob.gather(1, Variable(action))

        action = action.numpy()[0, 0]

        if self.config['name'] == 'LSTM':
            self.hx, self.cx = hx, cx

        return action, value, log_prob, entropy
Code Example #13
    def update(self, state, action, reward, new_state, done):

        self.experience_replay.append((state, action, reward, new_state, done))

        if len(self.experience_replay) >= self.observation:  # start training once enough transitions have been collected
            # Sample batch from memory replay

            mini_batch = random.sample(self.experience_replay, self.batch_size)
            state_batch = torch.cat([mini_batch[k][0].unsqueeze(0)
                                     for k in range(self.batch_size)])
            action_batch = [mini_batch[k][1] for k in range(self.batch_size)]
            reward_batch = [mini_batch[k][2] for k in range(self.batch_size)]
            next_state_batch = torch.cat([mini_batch[k][3].unsqueeze(0)
                                          for k in range(self.batch_size)])
            terminal_batch = [mini_batch[k][4] for k in range(self.batch_size)]

            action_tensor = to_variable(np.vstack(action_batch))

            # Prepare for the target q batch
            value = self.actor_target.forward(
                to_variable(next_state_batch, volatile=True))
            next_q_values = self.compute_expect(next_state_batch,
                                                value,
                                                self.critic_target,
                                                volatile=True)

            next_q_values.volatile = False

            # Bellman target: (1 - done) masks out the bootstrap term for terminal transitions
            not_done = to_variable([0. if t else 1. for t in terminal_batch])
            y_batch = to_variable(reward_batch) + self.discount * \
                not_done * next_q_values

            # Critic update
            self.critic.zero_grad()

            q_batch = self.compute_expect(state_batch, action_tensor,
                                          self.critic)
            # q_batch = self.critic.forward([to_variable(state_batch), action_tensor])

            value_loss = self.loss(q_batch, y_batch)
            value_loss.backward()
            self.critic_optim.step()

            # Actor update
            self.actor.zero_grad()

            value = self.actor.forward(to_variable(state_batch))
            policy_loss = -self.compute_expect(state_batch, value, self.critic)
            # policy_loss = -self.critic.forward([to_variable(state_batch), value])

            policy_loss = policy_loss.mean()
            policy_loss.backward()

            # torch.nn.utils.clip_grad_norm(self.actor.parameters(), 1.)

            list_params = list(self.actor.parameters())
            # print(list_params[-1].grad[0])

            # # invert gradients
            # for i, p in enumerate(list_params):
            #     if i == len(list_params)-1:
            #         for j in range(self.nb_actions):
            #             # print("gradient", p.grad.data[j])
            #             if p.grad.data[j] > 0:  # suggest increasing p
            #                 # print("current p", (self.pmax - p.data[j]) / (self.pmax - self.pmin))
            #                 p.grad.data[j] *= abs(self.pmax - p.data[j])/(self.pmax - self.pmin)
            #             else:
            #                 # print("current p", (p.data[j] - self.pmin) / (self.pmax - self.pmin))
            #                 p.grad.data[j] *= abs(p.data[j] - self.pmin)/(self.pmax - self.pmin)

            # for p in list_params:
            #     self.invert_gradient(p.data, p.grad.data)

            # print(list_params[-1].grad[0])

            self.actor_optim.step()

            # Target update
            soft_update(self.actor_target, self.actor, self.tau)
            soft_update(self.critic_target, self.critic, self.tau)