Code example #1
File: main.py  Project: wuzy38/DQN
 def dqnTrain(self, double=True):
     step = 0
     memory = ReplayMemory(self.MEMORY_CAPACITY_N)
     eval_Qnetwork = QNetwork(self.env.action_space.n, self.LEARNING_RATE)
     target_Qnetwork = QNetwork(self.env.action_space.n, self.LEARNING_RATE)
     eval_Qnetwork.set_weights(self.Qnetwork.get_weights())
     target_Qnetwork.set_weights(eval_Qnetwork.get_weights())
     reward_list = self.reward_list
     time_start = time.time()
     for episode in range(1, self.episode_M + 1):
         episode_reward = 0
         state = self.env.reset()
         while True:
             step += 1
             action = self.selectAction(eval_Qnetwork, state)
             next_state, reward, done, _ = self.env.step(action)
             episode_reward += reward
             memory.add((state, action, reward, next_state, done))
             state = next_state
             if len(memory) > self.BATCH_SIZE:
                 sample_batch = memory.sample(self.BATCH_SIZE)
                 self.updateQNetwork(eval_Qnetwork, target_Qnetwork,
                                     sample_batch, double)
                 # self.EPS = self.EPS*self.EPS_DECAY if self.EPS > self.EPS_MIN else self.EPS_MIN
                 eps_fraction = min(
                     float(step) / self.schedule_timesteps, self.eps_init)
                 self.eps = self.eps_init + eps_fraction * (self.eps_final -
                                                            self.eps_init)
             if step % self.TARGET_UPDATE_C == 0:
                 target_Qnetwork.set_weights(eval_Qnetwork.get_weights())
             if done:
                 break
         reward_list.append(episode_reward)
         print(
             "episode: {}, reward: {}, tot_step: {}, {}min. eps: {}".format(
                 episode, episode_reward, step,
                 (time.time() - time_start) / 60, self.eps))
         if episode % 5 == 0:
             print(
                 "episode {}. recent 5 episode_reward:{}. using {} min. total step: {}. "
                 .format(episode, self.reward_list[-5:],
                         (time.time() - time_start) / 60, step))
         if episode % 50 == 0:
             self.save(target_Qnetwork, reward_list)
     self.Qnetwork.set_weights(target_Qnetwork.get_weights())
     self.reward_list = reward_list
     return target_Qnetwork, reward_list
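
For reference, the exploration schedule inside the training loop above is a linear interpolation between an initial and a final epsilon (the loop clamps the interpolation fraction at self.eps_init, which matches the usual schedule when eps_init is 1.0). A minimal standalone sketch with hypothetical default values:

# Illustrative sketch (not part of the project): the linear epsilon schedule
# from the loop above, written as a standalone function with made-up defaults.
def linear_epsilon(step, eps_init=1.0, eps_final=0.05, schedule_timesteps=10000):
    # Fraction of the schedule completed, clamped to 1.0 once the schedule ends.
    fraction = min(float(step) / schedule_timesteps, 1.0)
    return eps_init + fraction * (eps_final - eps_init)

# Epsilon decays from 1.0 towards 0.05 over the first 10000 steps, then stays there.
print(linear_epsilon(0), linear_epsilon(5000), linear_epsilon(20000))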
Code example #2
class Agent(AgentConfig, EnvConfig):
    def __init__(self):
        self.env = gym.make(self.env_name)
        self.action_size = self.env.action_space.n  # 2 for cartpole
        self.memory = ReplayMemory(memory_size=self.memory_size, action_size=self.action_size, per=self.per)
        if self.train_cartpole:
            self.policy_network = MlpPolicy(action_size=self.action_size).to(device)
            self.target_network = MlpPolicy(action_size=self.action_size).to(device)
        self.optimizer = optim.Adam(self.policy_network.parameters(), lr=self.learning_rate)
        self.loss = 0
        self.criterion = nn.MSELoss()

    def new_random_game(self):
        self.env.reset()
        action = self.env.action_space.sample()
        screen, reward, terminal, info = self.env.step(action)
        return screen, reward, action, terminal

    def train(self):
        episode = 0
        step = 0
        reward_history = []

        if not os.path.exists("./GIF/"):
            os.makedirs("./GIF/")

        # A new episode
        while step < self.max_step:
            start_step = step
            episode += 1
            episode_length = 0
            total_episode_reward = 0
            frames_for_gif = []

            self.gif = episode % self.gif_every == 0

            # Get initial state
            state, reward, action, terminal = self.new_random_game()
            current_state = state
            # current_state = np.stack((state, state, state, state))

            # A step in an episode
            while episode_length < self.max_episode_length:
                step += 1
                episode_length += 1

                # Choose action
                action = random.randrange(self.action_size) if np.random.rand() < self.epsilon else \
                    torch.argmax(self.policy_network(torch.FloatTensor(current_state).to(device))).item()

                # print(current_state)
                # print(self.policy_network(torch.FloatTensor(current_state).to(device)))

                # Act
                state, reward, terminal, _ = self.env.step(action)
                new_state = state
                # new_state = np.concatenate((current_state[1:], [state]))

                reward = -1 if terminal else reward

                if self.gif:
                    frames_for_gif.append(new_state)

                self.memory.add(current_state, reward, action, terminal, new_state)

                current_state = new_state
                total_episode_reward += reward

                self.epsilon_decay()

                if step > self.start_learning and step % self.train_freq == 0:
                    self.minibatch_learning()

                if terminal:
                    last_episode_reward = total_episode_reward
                    last_episode_length = step - start_step
                    reward_history.append(last_episode_reward)

                    print('episode: %.2f, total step: %.2f, last_episode length: %.2f, last_episode_reward: %.2f, '
                          'loss: %.4f, eps = %.2f' % (episode, step, last_episode_length, last_episode_reward,
                                                      self.loss, self.epsilon))

                    self.env.reset()

                    if self.gif:
                        generate_gif(last_episode_length, frames_for_gif, total_episode_reward, "./GIF/", episode)

                    break

            if episode % self.reset_step == 0:
                self.target_network.load_state_dict(self.policy_network.state_dict())

            if episode % self.plot_every == 0:
                plot_graph(reward_history)

            # self.env.render()

        self.env.close()

    def minibatch_learning(self):
        state_batch, reward_batch, action_batch, terminal_batch, next_state_batch = self.memory.sample(self.batch_size)

        y_batch = torch.FloatTensor()
        for i in range(self.batch_size):
            if terminal_batch[i]:
                y_batch = torch.cat((y_batch, torch.FloatTensor([reward_batch[i]])), 0)
            else:
                next_state_q = torch.max(self.target_network(torch.FloatTensor(next_state_batch[i]).to(device)))
                y = torch.FloatTensor([reward_batch[i] + self.gamma * next_state_q])
                y_batch = torch.cat((y_batch, y), 0)

        current_state_q = torch.max(self.policy_network(torch.FloatTensor(state_batch).to(device)), dim=1)[0]

        self.loss = self.criterion(current_state_q, y_batch).mean()

        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()

    def epsilon_decay(self):
        self.epsilon *= self.epsilon_decay_rate
        self.epsilon = max(self.epsilon, self.epsilon_minimum)
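
As an aside, the per-sample loop in minibatch_learning builds the standard DQN target: y = r for terminal transitions and y = r + gamma * max_a Q_target(s', a) otherwise. A vectorized sketch of the same target, using random tensors as stand-ins for a sampled batch:

import torch

# Illustrative sketch (random tensors stand in for a sampled batch).
batch_size, n_actions, gamma = 32, 2, 0.99
reward_batch = torch.rand(batch_size)
terminal_batch = torch.randint(0, 2, (batch_size,)).bool()
next_q = torch.rand(batch_size, n_actions)        # stand-in for target_network(next_state_batch)

# y = r for terminal transitions, y = r + gamma * max_a Q'(s', a) otherwise.
y_batch = reward_batch + gamma * next_q.max(dim=1).values * (~terminal_batch).float()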
Code example #3
File: agent.py  Project: pluebcke/dqn_experiments
class Agent:
    def __init__(self, action_spec: dm_env.specs.DiscreteArray,
                 observation_spec: dm_env.specs.Array, device: torch.device,
                 settings: dict) -> None:
        """
        Initializes the agent, constructs the qnet and the q_target, initializes the optimizer and ReplayMemory.
        Args:
            action_spec(dm_env.specs.DiscreteArray): description of the action space of the environment
            observation_spec(dm_env.specs.Array): description of observations from the environment
            device(torch.device): device on which tensors are placed, e.g. "cuda" or "cpu"
            settings(dict): dictionary with settings
        """
        self.device = device
        action_size = action_spec.num_values
        state_size = np.prod(observation_spec.shape)
        self.action_size = action_size
        self.state_size = state_size
        self.batch_size = settings['batch_size']
        self.noisy_nets = settings['qnet_settings']['noisy_nets']
        self.distributional = settings["qnet_settings"]["distributional"]

        if self.distributional:
            # Currently the distributional agent always uses Dueling DQN
            self.qnet = DistributionalDuelDQN(state_size, action_size,
                                              settings['qnet_settings'],
                                              device).to(device)
            self.q_target = DistributionalDuelDQN(state_size, action_size,
                                                  settings['qnet_settings'],
                                                  device).to(device)
            vmin, vmax = settings["qnet_settings"]["vmin"], settings[
                "qnet_settings"]["vmax"]
            number_atoms = settings["qnet_settings"]["number_atoms"]
            self.distribution_updater = DistributionUpdater(
                vmin, vmax, number_atoms)
        else:
            if settings["duelling_dqn"]:
                self.qnet = DuelDQN(state_size, action_size,
                                    settings['qnet_settings']).to(device)
                self.q_target = DuelDQN(state_size, action_size,
                                        settings['qnet_settings']).to(device)
            else:
                self.qnet = Dqn(state_size, action_size,
                                settings['qnet_settings']).to(device)
                self.q_target = Dqn(state_size, action_size,
                                    settings['qnet_settings']).to(device)

        self.q_target.load_state_dict(self.qnet.state_dict())
        self.optimizer = optim.Adam(self.qnet.parameters(), lr=settings['lr'])

        self.epsilon = settings["epsilon_start"]
        self.decay = settings["epsilon_decay"]
        self.epsilon_min = settings["epsilon_min"]
        self.gamma = settings['gamma']

        self.start_optimization = settings["start_optimization"]
        self.update_qnet_every = settings["update_qnet_every"]
        self.update_target_every = settings["update_target_every"]
        self.number_steps = 0
        self.ddqn = settings["ddqn"]

        # Initialize replay memory
        self.prioritized_replay = settings["prioritized_buffer"]
        if self.prioritized_replay:
            self.memory = PrioritizedReplayMemory(
                device, settings["buffer_size"], self.gamma,
                settings["n_steps"], settings["alpha"], settings["beta0"],
                settings["beta_increment"])
        else:
            self.memory = ReplayMemory(device, settings["buffer_size"],
                                       self.gamma, settings["n_steps"])
        return

    def policy(self, timestep: dm_env.TimeStep) -> int:
        """
        Returns an action following an epsilon-greedy policy.
        Args:
            timestep(dm_env.TimeStep): An observation from the environment

        Returns:
            int: The chosen action.
        """
        observation = np.array(timestep.observation).flatten()
        observation = torch.from_numpy(observation).float().to(self.device)
        self.number_steps += 1

        if not self.noisy_nets:
            self.update_epsilon()

        if np.random.rand() < self.epsilon:
            return np.random.choice(self.action_size)
        else:
            return int(self.qnet.get_max_action(observation))

    def update_epsilon(self) -> None:
        """
        Decays epsilon until self.epsilon_min
        Returns:
            None
        """
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.decay

    @staticmethod
    def calc_loss(
            q_observed: torch.Tensor, q_target: torch.Tensor,
            weights: torch.Tensor) -> typing.Tuple[torch.Tensor, np.float64]:
        """
        Returns the mean weighted MSE loss and the loss for each sample
        Args:
            q_observed(torch.Tensor): calculated q_value
            q_target(torch.Tensor):   target q-value
            weights: weights of the batch samples

        Returns:
            tuple(torch.Tensor, np.float64): mean squared error loss, loss for each individual sample
        """
        losses = functional.mse_loss(q_observed, q_target, reduction='none')
        loss = (weights * losses).sum() / weights.sum()
        return loss, losses.cpu().detach().numpy() + 1e-8

    @staticmethod
    def calc_distributional_loss(
        dist: torch.Tensor,
        proj_dist: torch.Tensor,
        weights: torch.Tensor,
    ) -> typing.Tuple[torch.Tensor, np.float64]:
        """
        Calculates the distributional loss metric.
        Args:
            dist(torch.Tensor): The observed distribution
            proj_dist: The projected target distribution
            weights: weights of the batch samples

        Returns:
            tuple(torch.Tensor, np.float64): mean weighted cross-entropy loss, loss for each individual sample
        """
        losses = -functional.log_softmax(dist, dim=1) * proj_dist
        losses = weights * losses.sum(dim=1)
        return losses.mean(), losses.cpu().detach().numpy() + 1e-8

    def update(self, step: dm_env.TimeStep, action: int,
               next_step: dm_env.TimeStep) -> None:
        """
        Adds experience to the replay memory, performs an optimization_step and updates the q_target neural network.
        Args:
            step(dm_env.TimeStep): Current observation from the environment
            action(int): The action that was performed by the agent.
            next_step(dm_env.TimeStep): Next observation from the environment
        Returns:
            None
        """

        observation = np.array(step.observation).flatten()
        next_observation = np.array(next_step.observation).flatten()
        done = next_step.last()
        exp = Experience(observation, action, next_step.reward,
                         next_step.discount, next_observation, 0, done)
        self.memory.add(exp)

        if self.memory.number_samples() < self.start_optimization:
            return

        if self.number_steps % self.update_qnet_every == 0:
            s0, a0, n_step_reward, discount, s1, _, dones, indices, weights = self.memory.sample_batch(
                self.batch_size)
            if not self.distributional:
                self.optimization_step(s0, a0, n_step_reward, discount, s1,
                                       indices, weights)
            else:
                self.distributional_optimization_step(s0, a0, n_step_reward,
                                                      discount, s1, dones,
                                                      indices, weights)

        if self.number_steps % self.update_target_every == 0:
            self.q_target.load_state_dict(self.qnet.state_dict())
        return

    def optimization_step(self, s0: torch.Tensor, a0: torch.Tensor,
                          n_step_reward: torch.Tensor, discount: torch.Tensor,
                          s1: torch.Tensor,
                          indices: typing.Optional[torch.Tensor],
                          weights: typing.Optional[torch.Tensor]) -> None:
        """
        Calculates the Bellman update and updates the qnet.
        Args:
            s0(torch.Tensor): current state
            a0(torch.Tensor): current action
            n_step_reward(torch.Tensor): n-step reward
            discount(torch.Tensor): discount factor
            s1(torch.Tensor): next state
            indices(torch.Tensor): batch indices, needed for prioritized replay. Not used yet.
            weights(torch.Tensor): weights needed for prioritized replay

        Returns:
            None
        """

        with torch.no_grad():
            if self.noisy_nets:
                self.q_target.reset_noise()
                self.qnet.reset_noise()

            # Calculating the target values
            next_q_vals = self.q_target(s1)
            if self.ddqn:
                a1 = torch.argmax(self.qnet(s1), dim=1).unsqueeze(-1)
                next_q_val = next_q_vals.gather(1, a1).squeeze()
            else:
                next_q_val = torch.max(next_q_vals, dim=1).values
            q_target = n_step_reward.squeeze(
            ) + self.gamma * discount.squeeze() * next_q_val

        # Getting the observed q-values
        if self.noisy_nets:
            self.qnet.reset_noise()
        q_observed = self.qnet(s0).gather(1, a0.long()).squeeze()

        # Calculating the losses
        if not self.prioritized_replay:
            weights = torch.ones(self.batch_size)
        critic_loss, batch_loss = self.calc_loss(q_observed, q_target, weights)

        # Backpropagation of the gradients
        self.optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.qnet.parameters(), 5)
        self.optimizer.step()

        # Update replay memory
        self.memory.update_priorities(indices, batch_loss)
        return

    def distributional_optimization_step(
            self, s0: torch.Tensor, a0: torch.Tensor,
            n_step_reward: torch.Tensor, discount: torch.Tensor,
            s1: torch.Tensor, dones: torch.Tensor,
            indices: typing.Optional[torch.Tensor],
            weights: typing.Optional[torch.Tensor]) -> None:
        """
        Calculates the Bellman update and updates the qnet for the distributional agent.
        Args:
            s0(torch.Tensor): current state
            a0(torch.Tensor): current action
            n_step_reward(torch.Tensor): n-step reward
            discount(torch.Tensor): discount factor
            s1(torch.Tensor): next state
            dones(torch.Tensor): done
            indices(torch.Tensor): batch indices, needed for prioritized replay. Not used yet.
            weights(torch.Tensor): weights needed for prioritized replay

        Returns:
            None
        """

        with torch.no_grad():
            gamma = self.gamma * discount
            if self.noisy_nets:
                self.q_target.reset_noise()
                self.qnet.reset_noise()

            # Calculating the target distributions
            next_dists, next_q_vals = self.q_target.calc(s1)
            if self.ddqn:
                a1 = self.qnet.get_max_action(s1)
            else:
                a1 = torch.argmax(next_q_vals, dim=1)
            distributions = next_dists[range(self.batch_size), a1]
            distributions = functional.softmax(distributions, dim=1)
            q_target = self.distribution_updater.update_distribution(
                distributions.cpu().detach().numpy(),
                n_step_reward.cpu().detach().numpy(),
                dones.cpu().detach().numpy(),
                gamma.cpu().detach().numpy())
            q_target = torch.tensor(q_target).to(self.device)

        # Getting the observed q-value distributions
        if self.noisy_nets:
            self.qnet.reset_noise()
        q_observed = self.qnet(s0)
        q_observed = q_observed[range(self.batch_size), a0.squeeze().long()]

        # Calculating the losses
        if not self.prioritized_replay:
            weights = torch.ones(self.batch_size)
        critic_loss, batch_loss = self.calc_distributional_loss(
            q_observed, q_target, weights)

        # Backpropagation of the gradients
        self.optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.qnet.parameters(), 5)
        self.optimizer.step()

        # Update replay memory
        self.memory.update_priorities(indices, batch_loss)
        return
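
The non-distributional optimization_step above uses the Double DQN target: the online network chooses the next action and the target network evaluates it. A self-contained sketch with random tensors standing in for the batch and the two networks' outputs:

import torch

# Illustrative sketch (random tensors as stand-ins, not the project's API).
batch_size, n_actions, gamma = 8, 4, 0.99
q_online_s1 = torch.rand(batch_size, n_actions)   # stand-in for qnet(s1)
q_target_s1 = torch.rand(batch_size, n_actions)   # stand-in for q_target(s1)
reward = torch.rand(batch_size)
discount = torch.ones(batch_size)                 # 0 where the episode ended

a1 = torch.argmax(q_online_s1, dim=1, keepdim=True)   # action selection (online net)
next_q = q_target_s1.gather(1, a1).squeeze(1)          # action evaluation (target net)
q_target_val = reward + gamma * discount * next_q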
Code example #4
File: agent.py  Project: hknozturk/Lunarlander
class Agent():
    def __init__(self, args, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.per = args.per
        self.dueling = args.dueling
        self.buffer_size = args.buffer_size
        self.batch_size = args.batch_size
        self.gamma = args.gamma
        self.tau = args.tau
        self.lr = args.learning_rate
        self.update_freq = args.update_every
        # Q-Network
        if self.dueling:
            self.local_qnet = DuelingQNet(state_size, action_size,
                                          seed).to(device)
            self.target_qnet = DuelingQNet(state_size, action_size,
                                           seed).to(device)
        else:
            self.local_qnet = QNet(state_size, action_size, seed).to(device)
            self.target_qnet = QNet(state_size, action_size, seed).to(device)

        self.optimizer = optim.Adam(self.local_qnet.parameters(), lr=self.lr)

        # Replay Memory
        if self.per:
            self.memory = PrioritizedReplayMemory(args, self.buffer_size)
        else:
            self.memory = ReplayMemory(action_size, self.buffer_size,
                                       self.batch_size, seed)
        self.t_step = 0  # init time step for updating every UPDATE_EVERY steps

    def step(self, state, action, reward, next_state, done):
        if self.per:
            self.memory.append(state, action, reward, next_state, done)
        else:
            self.memory.add(state, action, reward, next_state,
                            done)  # save experience to replay memory.
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_freq
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                if self.dueling:
                    self.learn_DDQN(self.gamma)
                else:
                    self.learn(self.gamma)

    def act(self, state, eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.local_qnet.eval()
        with torch.no_grad():
            action_values = self.local_qnet(state)
        self.local_qnet.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, gamma):
        if self.per:
            idxs, states, actions, rewards, next_states, dones, weights = self.memory.sample(
                self.batch_size)
        else:
            states, actions, rewards, next_states, dones = self.memory.sample()
        # Get max predicted Q values for next states from target model
        Q_targets_next = self.target_qnet(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.local_qnet(states).gather(1, actions)

        # Compute loss - element-wise mean squared error
        # Now loss is a Tensor of shape (1,)
        # loss.item() gets the scalar value held in the loss.
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize loss
        self.optimizer.zero_grad()
        if self.per:
            (weights * loss).mean().backward(
            )  # Backpropagate importance-weighted minibatch loss
        else:
            loss.backward()
        self.optimizer.step()

        if self.per:
            errors = np.abs((Q_expected - Q_targets).detach().cpu().numpy())
            self.memory.update_priorities(idxs, errors)
        # Update target network
        self.soft_update(self.local_qnet, self.target_qnet, self.tau)

    def learn_DDQN(self, gamma):
        if self.per:
            idxs, states, actions, rewards, next_states, dones, weights = self.memory.sample(
                self.batch_size)
        else:
            states, actions, rewards, next_states, dones = self.memory.sample()
        # Get index of maximum value for next state from Q_expected
        Q_argmax = self.local_qnet(next_states).detach()
        _, a_prime = Q_argmax.max(1)
        # Get max predicted Q values for next states from target model
        Q_targets_next = self.target_qnet(next_states).detach().gather(
            1, a_prime.unsqueeze(1))
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from local model
        Q_expected = self.local_qnet(states).gather(1, actions)

        # Compute loss
        # Now loss is a Tensor of shape (1,)
        # loss.item() gets the scalar value held in the loss.
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize loss
        self.optimizer.zero_grad()
        if self.per:
            (weights * loss).mean().backward(
            )  # Backpropagate importance-weighted minibatch loss
        else:
            loss.backward()
        self.optimizer.step()

        if self.per:
            errors = np.abs((Q_expected - Q_targets).detach().cpu().numpy())
            self.memory.update_priorities(idxs, errors)
        # Update target network
        self.soft_update(self.local_qnet, self.target_qnet, self.tau)

    def soft_update(self, local_model, target_model, tau):
        # θ_target = τ*θ_local + (1 - τ)*θ_target
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
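
soft_update implements the Polyak averaging rule θ_target = τ*θ_local + (1 - τ)*θ_target. A minimal standalone sketch of the same rule on two small stand-in networks:

import torch
import torch.nn as nn

# Illustrative sketch: the soft update rule from soft_update() above,
# applied to two tiny stand-in networks.
local_net = nn.Linear(4, 2)
target_net = nn.Linear(4, 2)
tau = 1e-3

with torch.no_grad():
    for t_param, l_param in zip(target_net.parameters(), local_net.parameters()):
        t_param.copy_(tau * l_param + (1.0 - tau) * t_param)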
Code example #5
File: agents.py  Project: wh-forker/rainbow-3
class DDQAgent(DQAgent):
    """
    Double DeepQ Agent with q_network and target network
    """
    def __init__(self, q_network, target_network, environment, name='ddqn'):

        self.q_network = q_network
        self.target_network = target_network
        self.replay_memory = None
        self.environment = environment

        # book keeping
        self.name = name
        self.current_step = 0
        self.save_path = os.path.join('checkpoints', name + '.pkl')
        self.logdir = os.path.join('runs', name)

    def learn(self,
              num_steps,
              batch_size=32,
              capacity=500000,
              lr=2.5e-4,
              epsilon_max=0.9,
              epsilon_min=0.05,
              decay_rate=1e-5,
              checkpoint_interval=50000,
              initial_memory=50000,
              sync_interval=1000,
              gamma=0.99):
        cudnn.benchmark = True
        self.replay_memory = ReplayMemory(capacity)

        if len(self.replay_memory) < initial_memory:
            print('populating replay memory...')
            self.prime_replay_memory(initial_memory)

        writer = SummaryWriter(self.logdir)
        optimizer = Adam(self.q_network.parameters(), lr=lr)
        criterion = nn.SmoothL1Loss()
        steps = 0
        pbar = tqdm(total=num_steps)

        while steps <= num_steps:
            state = self.environment.reset()
            total_reward = 0
            while True:
                epsilon = self.calculate_epsilon(epsilon_max, epsilon_min,
                                                 decay_rate)
                action = self.select_action(state,
                                            epsilon)  # select an action
                next_state, reward, done, info = self.environment.step(
                    action)  # carry out action/observe reward
                self.replay_memory.add(state, action, reward, next_state, done)

                states, actions, rewards, next_states, done_mask = self.replay_memory.sample(
                    batch_size)

                # prepare batch
                states = Variable(states).cuda()
                next_states = Variable(next_states).cuda()
                rewards = Variable(rewards).cuda()
                done_mask = Variable(done_mask).cuda()

                q_values = self.q_network(states)[
                    range(len(actions)),
                    actions]  # select only Q values for actions we took

                target_actions = self.q_network(next_states).max(dim=1)[1]
                next_q_values = self.target_network(
                    next_states)[range(len(target_actions)),
                                 target_actions].detach() * done_mask
                # calculate targets = rewards + (gamma * next_Q_values)
                targets = rewards + (gamma * next_q_values)

                loss = criterion(q_values, targets)
                optimizer.zero_grad()
                loss.backward()

                # gradient clipping
                for param in self.q_network.parameters():
                    param.grad.data.clamp_(-1, 1)
                optimizer.step()

                writer.add_scalar('epsilon', epsilon, self.current_step)

                steps += 1
                total_reward += reward
                self.current_step += 1
                state = next_state  # move to next state

                if steps % sync_interval == 0:
                    dqn_params = self.q_network.state_dict()
                    self.target_network.load_state_dict(dqn_params)

                if steps % checkpoint_interval == 0:
                    self.save_checkpoint()

                pbar.update()
                if done:
                    writer.add_scalar('reward', total_reward,
                                      self.current_step)
                    pbar.set_description(
                        "last episode reward: {}".format(total_reward))
                    break

        self.environment.close()
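
The inner update above combines a Huber (SmoothL1) loss with per-parameter gradient clamping to [-1, 1]. A self-contained sketch of that optimization step with a stand-in network and random data:

import torch
import torch.nn as nn

# Illustrative sketch (stand-in network and random data, not the project's models).
q_network = nn.Linear(4, 2)
criterion = nn.SmoothL1Loss()
optimizer = torch.optim.Adam(q_network.parameters(), lr=2.5e-4)

states = torch.rand(32, 4)
targets = torch.rand(32, 2)

loss = criterion(q_network(states), targets)
optimizer.zero_grad()
loss.backward()
# Clamp each gradient element to [-1, 1] before the optimizer step.
for param in q_network.parameters():
    param.grad.data.clamp_(-1, 1)
optimizer.step()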
Code example #6
File: agents.py  Project: wh-forker/rainbow-3
class DQAgent:
    """
    DeepQ Agent without bells and whistles. Uses a single Q network and replay memory to interact with the environment.
    """
    def __init__(self, q_network, environment, name='ddqn'):
        self.q_network = q_network
        self.environment = environment
        self.replay_memory = None

        # book keeping
        self.name = name
        self.current_step = 0
        self.save_path = os.path.join('checkpoints', name + '.pkl')
        self.logdir = os.path.join('runs', name)

    def calculate_epsilon(self, epsilon_max, epsilon_min, decay_rate):
        """
        Calculates the epsilon value from the number of steps taken and the decay rate.
        """
        epsilon = epsilon_min + (epsilon_max - epsilon_min) * \
                  math.exp(-decay_rate * self.current_step)
        return epsilon

    def select_action(self, state, epsilon):
        """
        Epsilon-greedy policy.
        Selects the action corresponding to the maximum predicted Q value,
        otherwise selects a random action with probability epsilon.
        Args:
            state: current state of the environment (4 stack of image frames)
            epsilon: probability of random action (1.0 - 0.0)

        Returns: action
        """
        if epsilon > random.random():
            return self.environment.action_space.sample()
        state = Variable(process_state(state), volatile=True).cuda()
        return int(self.q_network(state).data.max(1)[1])

    def learn(self,
              num_steps,
              batch_size=32,
              capacity=500000,
              lr=2.5e-4,
              epsilon_max=0.9,
              epsilon_min=0.05,
              decay_rate=1e-5,
              checkpoint_interval=50000,
              initial_memory=50000,
              gamma=0.99):
        cudnn.benchmark = True
        self.replay_memory = ReplayMemory(capacity)

        if len(self.replay_memory) < initial_memory:
            print('populating replay memory...')
            self.prime_replay_memory(initial_memory)

        writer = SummaryWriter(self.logdir)
        optimizer = Adam(self.q_network.parameters(), lr=lr)
        criterion = nn.SmoothL1Loss()
        steps = 0
        pbar = tqdm(total=num_steps)

        while steps <= num_steps:
            state = self.environment.reset()
            total_reward = 0
            while True:
                epsilon = self.calculate_epsilon(epsilon_max, epsilon_min,
                                                 decay_rate)
                action = self.select_action(state,
                                            epsilon)  # select an action
                next_state, reward, done, info = self.environment.step(
                    action)  # carry out action/observe reward
                self.replay_memory.add(state, action, reward, next_state, done)

                states, actions, rewards, next_states, done_mask = self.replay_memory.sample(
                    batch_size)

                # prepare batch
                states = Variable(states).cuda()
                next_states = Variable(next_states).cuda()
                rewards = Variable(rewards).cuda()
                done_mask = Variable(done_mask).cuda()

                q_values = self.q_network(states)[
                    range(len(actions)),
                    actions]  # select only Q values for actions we took

                # find next Q values and set Q values for done states to 0
                next_q_values = self.q_network(next_states).max(
                    dim=1)[0].detach() * done_mask
                # calculate targets = rewards + (gamma * next_Q_values)
                targets = rewards + (gamma * next_q_values)

                loss = criterion(q_values, targets)
                optimizer.zero_grad()
                loss.backward()

                # gradient clipping
                for param in self.q_network.parameters():
                    param.grad.data.clamp_(-1, 1)
                optimizer.step()

                writer.add_scalar('epsilon', epsilon, self.current_step)

                steps += 1
                total_reward += reward
                self.current_step += 1
                state = next_state  # move to next state
                if steps % checkpoint_interval == 0:
                    self.save_checkpoint()

                pbar.update()
                if done:
                    writer.add_scalar('reward', total_reward,
                                      self.current_step)
                    pbar.set_description(
                        "last episode reward: {}".format(total_reward))
                    break

        self.environment.close()

    def play(self, num_episodes, epsilon=0.05, render=True):
        for _ in tqdm(range(num_episodes)):
            total_reward = 0
            state = self.environment.reset()
            while True:
                if render:
                    self.environment.render()
                action = self.select_action(state,
                                            epsilon)  # select an action
                next_state, reward, done, info = self.environment.step(
                    action)  # carry out action/observe reward
                total_reward += reward
                state = next_state  # move to next state
                if done:
                    break
        self.environment.close()

    def prime_replay_memory(self, steps):
        """
        populates replay memory with transitions generated by random actions
        """
        while len(self.replay_memory) <= steps:
            state = self.environment.reset()
            while True:
                action = self.environment.action_space.sample()
                next_state, reward, done, info = self.environment.step(
                    action)  # carry out action/observe reward
                self.replay_memory.add(state, action, reward, next_state, done)
                state = next_state  # move to next state
                if done:
                    break

    def load_agent(self, name):
        checkpoint_path = os.path.join('checkpoints', name + '.pkl')
        checkpoint = torch.load(checkpoint_path)
        self.q_network.load_state_dict(checkpoint['weights'])
        self.current_step = checkpoint['current_step']

    def save_checkpoint(self):
        checkpoint = dict(weights=self.q_network.state_dict(),
                          current_step=self.current_step)
        torch.save(checkpoint, self.save_path)
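
calculate_epsilon anneals exploration exponentially, epsilon = eps_min + (eps_max - eps_min) * exp(-decay_rate * step). A standalone sketch using the same default hyperparameters as learn():

import math

# Illustrative sketch: the exponential epsilon decay from calculate_epsilon() above.
def calculate_epsilon(step, eps_max=0.9, eps_min=0.05, decay_rate=1e-5):
    return eps_min + (eps_max - eps_min) * math.exp(-decay_rate * step)

# Epsilon shrinks smoothly from 0.9 towards 0.05 as the step count grows.
print(calculate_epsilon(0), calculate_epsilon(100000), calculate_epsilon(1000000))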
Code example #7
class Agent():

    def __init__(self, config, session, num_actions):
        self.config = config
        self.sess = session
        self.num_actions = num_actions

        self.gamma = config['gamma']
        self.learning_rate = config['learning_rate']

        self.exp_replay = ReplayMemory(self.config)
        self.game_state = np.zeros((1, config['screen_width'], config['screen_height'], config['history_length']), dtype=np.uint8)

        self.update_thread = threading.Thread(target=lambda: 0)
        self.update_thread.start()

        self.step_count = 0
        self.episode = 0
        self.isTesting = False

        self.reset_game()
        self.timeout_option = tf.RunOptions(timeout_in_ms=5000)

        # build the net
        with tf.device(config['device']):
            # Create all variables
            self.state_ph = tf.placeholder(tf.float32, [None, config['screen_width'], config['screen_height'], config['history_length']], name='state_ph')
            self.stateT_ph = tf.placeholder(tf.float32, [None, config['screen_width'], config['screen_height'], config['history_length']], name='stateT_ph')
            self.action_ph = tf.placeholder(tf.int64, [None], name='action_ph')
            self.reward_ph = tf.placeholder(tf.float32, [None], name='reward_ph')
            self.terminal_ph = tf.placeholder(tf.float32, [None], name='terminal_ph')

            # Define training network
            with tf.variable_scope('Q') as scope:
                self.Q = self.Q_network(self.state_ph, config, 'Normal')
                # *** Double Q-Learning ***
                scope.reuse_variables()
                self.DoubleQT = self.Q_network(self.stateT_ph, config, 'DoubleQ')
            # Define Target network
            with tf.variable_scope('QT'):
                self.QT = self.Q_network(self.stateT_ph, config, 'Target')

            # Define training operation
            self.train_op = self.train_op(self.Q, self.QT, self.action_ph, self.reward_ph, self.terminal_ph, config, 'Normal')

            # Define operation to copy parameters from training to target net.
            with tf.variable_scope('Copy_parameters'):
                self.sync_QT_op = []
                for W_pair in zip(tf.get_collection('Target_weights'),tf.get_collection('Normal_weights')):
                    self.sync_QT_op.append(W_pair[0].assign(W_pair[1]))

            # Define the summary ops
            self.Q_summary_op = tf.merge_summary(tf.get_collection('Normal_summaries'))

        self.summary_writter = tf.train.SummaryWriter(config['log_dir'], self.sess.graph, flush_secs=20)

    def update(self):
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch, _ = self.exp_replay.sample_transition_batch()

        feed_dict={self.state_ph: state_batch,
                    self.stateT_ph: next_state_batch,
                    self.action_ph: action_batch,
                    self.reward_ph: reward_batch,
                    self.terminal_ph: terminal_batch}
        if self.step_count % self.config['update_summary_rate'] == 0:
            _, Q_summary_str = self.sess.run([self.train_op, self.Q_summary_op], feed_dict, options=self.timeout_option)
            self.summary_writter.add_summary(Q_summary_str, self.step_count)
        else:
            _ = self.sess.run(self.train_op, feed_dict, options=self.timeout_option)

        if self.step_count % self.config['sync_rate'] == 0:
            self.sess.run(self.sync_QT_op)

    def Q_network(self, input_state, config, Collection=None):
        conv_stack_shape=[(32,8,4),
                    (64,4,2),
                    (64,3,1)]

        head = tf.div(input_state,256., name='normalized_input')
        head = cops.conv_stack(head, conv_stack_shape, Collection)
        head = cops.flatten(head)
        head = cops.add_relu_layer(head, size=512, Collection=Collection)
        Q = cops.add_linear_layer(head, self.num_actions, Collection, layer_name="Q")

        return Q

    def train_op(self, Q, QT, action, reward, terminal, config, Collection):
        with tf.name_scope('Loss'):
            action_one_hot = tf.one_hot(action, self.num_actions, 1., 0., name='action_one_hot')
            acted_Q = tf.reduce_sum(Q * action_one_hot, reduction_indices=1, name='DQN_acted')

            # *** Double Q-Learning ***
            target_action = tf.argmax(self.DoubleQT, dimension=1)
            target_action_one_hot = tf.one_hot(target_action, self.num_actions, 1., 0., name='target_action_one_hot')
            DoubleQT_acted = tf.reduce_sum(self.QT * target_action_one_hot, reduction_indices=1, name='DoubleQT')
            Y = reward + self.gamma * DoubleQT_acted * (1 - terminal)
            # *** Double Q-Learning ***
            Y = tf.stop_gradient(Y)

            loss_batch = cops.clipped_l2(Y, acted_Q)
            loss = tf.reduce_sum(loss_batch, name='loss')

            tf.scalar_summary('losses/loss', loss, collections=[Collection + '_summaries'])
            tf.scalar_summary('losses/loss_0', loss_batch[0],collections=[Collection + '_summaries'])
            tf.scalar_summary('losses/loss_max', tf.reduce_max(loss_batch),collections=[Collection + '_summaries'])
            tf.scalar_summary('main/Y_0', Y[0], collections=[Collection + '_summaries'])
            tf.scalar_summary('main/Y_max', tf.reduce_max(Y), collections=[Collection + '_summaries'])
            tf.scalar_summary('main/acted_Q_0', acted_Q[0], collections=[Collection + '_summaries'])
            tf.scalar_summary('main/acted_Q_max', tf.reduce_max(acted_Q), collections=[Collection + '_summaries'])
            tf.scalar_summary('main/reward_max', tf.reduce_max(reward), collections=[Collection + '_summaries'])

        train_op, grads = cops.graves_rmsprop_optimizer(loss, self.learning_rate, 0.95, 0.01, 1)

        return train_op

    def testing(self, t=True):
        self.isTesting = t

    def reset_game(self):
        self.episode_begining = True
        self.game_state.fill(0)

    def epsilon(self):
        if self.step_count < self.config['exploration_steps']:
            return self.config['ep_start'] - ((self.config['ep_start'] - self.config['ep_min']) / self.config['exploration_steps']) * self.step_count
        else:
            return self.config['ep_min']

    def e_greedy_action(self, epsilon):
        if np.random.uniform() < epsilon:
            action = random.randint(0, self.num_actions - 1)
        else:
            action = np.argmax(self.sess.run(self.Q, feed_dict={self.state_ph: self.game_state})[0])
        return action

    def done(self):
        if not self.isTesting:
            self.exp_replay.add(self.game_state[:, :, :, -1],self.game_action, self.game_reward, True)
        self.reset_game()

    def observe(self, x, r):
        self.game_reward = r
        x_ = cv2.resize(x, (self.config['screen_width'], self.config['screen_height']))
        x_ = cv2.cvtColor(x_, cv2.COLOR_RGB2GRAY)
        self.game_state = np.roll(self.game_state, -1, axis=3)
        self.game_state[0, :, :, -1] = x_

    def step(self, x, r):
        r = max(self.config['min_reward'], min(self.config['max_reward'], r))
        if not self.isTesting:
            if not self.episode_begining:
                self.exp_replay.add(self.game_state[:, :, :, -1], self.game_action, self.game_reward, False)
            else:
                for i in range(self.config['history_length'] - 1):
                    # add the frames of the freshly reset (zeroed) buffer
                    self.exp_replay.add(self.game_state[:, :, :, i], 0, 0, False)
                self.episode_begining = False
            self.observe(x, r)
            self.game_action = self.e_greedy_action(self.epsilon())
            if self.step_count > self.config['steps_before_training']:
                self.update_thread.join()
                self.update_thread = threading.Thread(target=self.update)
                self.update_thread.start()
            self.step_count += 1
        else:
            self.observe(x, r)
            self.game_action = self.e_greedy_action(0.01)
        return self.game_action
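
observe keeps a rolling stack of the last history_length grayscale frames by shifting the history axis and overwriting the last slot. A NumPy-only sketch of that update (hypothetical 84x84 frame size, no OpenCV preprocessing):

import numpy as np

# Illustrative sketch of the rolling frame-stack update in observe():
# the oldest frame is shifted out and the newest frame fills the last slot.
screen_width, screen_height, history_length = 84, 84, 4
game_state = np.zeros((1, screen_width, screen_height, history_length), dtype=np.uint8)

new_frame = np.random.randint(0, 256, (screen_width, screen_height), dtype=np.uint8)
game_state = np.roll(game_state, -1, axis=3)
game_state[0, :, :, -1] = new_frame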
Code example #8
class DeepQ_agent:
    """
    Represents the DQN agent.
    """
    def __init__(self, env, hidden_units = None, network_LR=0.01, batch_size=1024, update_every=5, gamma=0.95):
        """
        Creates a DQN agent.

        :param env: game environment.
        :type env: Class Snake_Env().
        :param hidden_units: number of neurons in each layer.
        :type hidden_units: tuple with dimension (1, 3).
        :param network_LR: learning rate of the action-value neural network.
        :type network_LR: float.
        :param batch_size: size of the minibatch taken from the replay buffer.
        :type batch_size: int.
        :param update_every: number of iterations for updating the target qnetwork. 
        :type update_every: int
        :param gamma: discount factor.
        :type gamma: float.
        """
        self.env = env
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma          
        self.NETWORK_LR = network_LR
        self.MEMORY_CAPACITY = int(1e5)   
        self.ACTION_SIZE = env.ACTION_SPACE           
        self.HIDDEN_UNITS = hidden_units
        self.UPDATE_EVERY = update_every
       
        self.qnetwork_local = QNetwork(input_shape = self.env.STATE_SPACE,
                                        hidden_units = self.HIDDEN_UNITS,
                                        output_size = self.ACTION_SIZE,
                                        learning_rate = self.NETWORK_LR)
        
        self.qnetwork_target = QNetwork(input_shape = self.env.STATE_SPACE,
                                        hidden_units = self.HIDDEN_UNITS,
                                        output_size = self.ACTION_SIZE,
                                        learning_rate = self.NETWORK_LR)

        self.memory = ReplayMemory(self.MEMORY_CAPACITY, self.BATCH_SIZE) 

        #Temp variable
        self.t = 0


    def learn(self):
        """
        Learn from memorized experience.
        """
        if len(self.memory) > self.BATCH_SIZE:
            states, actions, rewards, next_states, dones = self.memory.sample(self.env.STATE_SPACE)
            
            #Calculating action-values using local network
            target = self.qnetwork_local.predict(states, self.BATCH_SIZE)
            
            #Future action-values using target network
            target_val = self.qnetwork_target.predict(next_states, self.BATCH_SIZE)
            
            #Future action-values using local network
            target_next = self.qnetwork_local.predict(next_states, self.BATCH_SIZE)
        
            max_action_values = np.argmax(target_next, axis=1)   #action selection
            
            for i in range(self.BATCH_SIZE):
                if dones[i]:
                    target[i][actions[i]] = rewards[i]
                else:
                    target[i][actions[i]] = rewards[i] + self.GAMMA*target_val[i][max_action_values[i]]   #action evaluation
            
            self.qnetwork_local.train(states, target, batch_size = self.BATCH_SIZE)

            if self.t == self.UPDATE_EVERY:
                self.update_target_weights()
                self.t = 0
            else:
                self.t += 1


    def act(self, state, epsilon=0.0):
        """
        Chooses an action using an epsilon-greedy policy.
        
        :param state: current state.
        :type state: NumPy array with dimension (1, 18).
        :param epsilon: epsilon used in epsilon-greedy policy.
        :type epsilon: float
        :return action: action chosen by the agent.
        :rtype: int
        """    
        state = state.reshape((1,)+state.shape)
        action_values = self.qnetwork_local.predict(state)    #returns a vector of size = self.ACTION_SIZE
        if random() > epsilon:
            action = np.argmax(action_values)                 #choose best action - Exploitation
        else:
            action = randint(0, self.ACTION_SIZE-1)           #choose random action - Exploration
        return action


    def add_experience(self, state, action, reward, next_state, done):
        """
        Add experience to agent's memory.
        """
        self.memory.add(state, action, reward, next_state, done)

    
    def update_target_weights(self):
        """
        Updates values of the Target network.
        """
        self.qnetwork_target.model.set_weights(self.qnetwork_local.model.get_weights())
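
learn builds a Double DQN target in NumPy: actions are selected with the local network's predictions on the next states and evaluated with the target network's predictions. A sketch of that target construction with random arrays standing in for the three predict calls:

import numpy as np

# Illustrative sketch (random arrays stand in for the network predictions).
batch_size, n_actions, gamma = 4, 3, 0.95
target = np.random.rand(batch_size, n_actions)       # local net on current states
target_val = np.random.rand(batch_size, n_actions)   # target net on next states
target_next = np.random.rand(batch_size, n_actions)  # local net on next states
actions = np.random.randint(0, n_actions, batch_size)
rewards = np.random.rand(batch_size)
dones = np.array([False, True, False, False])

best_next = np.argmax(target_next, axis=1)            # action selection
for i in range(batch_size):
    if dones[i]:
        target[i][actions[i]] = rewards[i]
    else:
        target[i][actions[i]] = rewards[i] + gamma * target_val[i][best_next[i]]  # action evaluation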
Code example #9
class T3DAgent:
    def __init__(self, env, brain, brain_name, device, settings):
        self.env = env
        self.brain_name = brain_name
        self.device = device
        action_size = brain.vector_action_space_size
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        state_size = states.shape[1]
        self.action_size = action_size
        self.state_size = state_size
        self.batch_size = settings['batch_size']

        # Initialize actor local and target networks
        self.actor_local = Actor(state_size, action_size, settings['actor_settings']).to(device)
        self.actor_target = Actor(state_size, action_size, settings['actor_settings']).to(device)
        self.actor_target.load_state_dict(self.actor_local.state_dict())
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=settings['lr_actor'])

        # Initialize critic networks
        self.critic_local = Critic(state_size, action_size, settings['critic_settings']).to(device)
        self.critic_target = Critic(state_size, action_size, settings['critic_settings']).to(device)
        self.critic_target.load_state_dict(self.critic_local.state_dict())
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=settings['lr_critic'])

        # Save some of the settings into class member variables
        self.pretrain_steps = settings['pretrain_steps']
        self.gamma = settings['gamma']
        self.tau = settings['tau']

        self.action_noise = settings['action_noise']
        self.action_clip = settings['action_clip']
        self.target_action_noise = settings['target_action_noise']
        self.target_noise_clip = settings['target_noise_clip']
        self.optimize_every = settings['optimize_critic_every']

        # Initialize replay memory and episode generator
        self.memory = ReplayMemory(device, settings['buffer_size'])
        self.generator = self.play_episode()

        self.number_steps = 0
        return

    def get_action_noise(self):
        return self.action_noise

    def set_action_noise(self, std):
        self.action_noise = std
        return

    def pretrain(self):
        # The idea of using a pretrain phase before starting regular episodes
        # is from https://github.com/whiterabbitobj/Continuous_Control/
        print("Random sampling of " + str(self.pretrain_steps) + " steps")
        env = self.env
        brain_name = self.brain_name
        env_info = env.reset(train_mode=True)[brain_name]
        number_agents = env_info.vector_observations.shape[0]
        for _ in range(self.pretrain_steps):
            actions = []
            states = env_info.vector_observations
            for _ in range(number_agents):
                actions.append(np.random.uniform(-1, 1, self.action_size))
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
                self.memory.add(Experience(state, action, reward, next_state, done))
            if np.any(dones):
                env_info = env.reset(train_mode=True)[brain_name]

    def play_episode(self, train_mode = True):
        # The idea of generating episodes in an "experience generator" is from
        # "Deep Reinforcement Learning Hands-On" by Maxim Lapan

        print("Starting episode generator")
        # Initialize the environment
        env = self.env
        brain_name = self.brain_name
        env_info = env.reset(train_mode=train_mode)[brain_name]
        # Initialize episode_rewards and get the first state
        episode_rewards = []
        # Run episode step by step
        while True:
            states = env_info.vector_observations
            with torch.no_grad():
                actions = self.actor_local.forward(
                    torch.from_numpy(states).type(torch.FloatTensor).to(self.device)).cpu().detach().numpy()
                actions += self.action_noise * np.random.normal(size=actions.shape)
                actions = np.clip(actions, -self.action_clip, self.action_clip)

            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            episode_rewards.append(rewards)

            for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
                self.memory.add(Experience(state, action, reward, next_state, done))
            if np.any(dones):
                agent_reward = np.sum(episode_rewards, axis=0)
                std_reward = np.std(agent_reward)
                mean_reward = np.mean(agent_reward)
                episode_rewards = []
                env_info = env.reset(train_mode=True)[brain_name]
                yield mean_reward, std_reward
            else:
                yield -1, -1

    def take_step(self, train_mode = True):
        return next(self.generator, train_mode)

    def learn(self):
        self.number_steps += 1
        if self.memory.number_samples() <= self.batch_size:
            return
        # states, actions, rewards, next states, done
        s0, a0, r, s1, d = self.memory.sample_batch(self.batch_size)
        critic_loss_a, critic_loss_b = self.optimize_critic(s0, a0, r, s1, d)
        actor_loss = self.optimize_actor(s0)

        return actor_loss, critic_loss_a, critic_loss_b

    def optimize_actor(self, s0):
        # Calc policy loss
        if self.number_steps % self.optimize_every == 0:
            a0_pred = self.actor_local(s0)
            actor_loss = -self.critic_local.get_qa(s0, a0_pred).mean()
            # Update actor nn
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()
            # slow update
            self.slow_update(self.tau)
            return -actor_loss.cpu().detach().numpy()
        return 0

    def optimize_critic(self, s0, a0, r, s1, d):
        # The ideas of adding noise to the next action a1, and of a critic loss that takes q1_expected and
        # q2_expected as arguments at the same time, are from the reference implementation by the TD3 authors
        # at https://github.com/sfujim/TD3/
        with torch.no_grad():
            # calc critic loss
            noise = torch.randn_like(a0).to(self.device)
            noise = noise * torch.tensor(self.target_action_noise).expand_as(noise).to(self.device)
            noise = noise.clamp(-self.target_noise_clip, self.target_noise_clip)
            a1 = (self.actor_target(s1) + noise).clamp(-self.action_clip, self.action_clip)
            qa_target, qb_target = self.critic_target(s1, a1)
            q_target = torch.min(qa_target, qb_target)
            q_target = r + self.gamma * (1.0 - d) * q_target
        qa_expected, qb_expected = self.critic_local(s0, a0)
        critic_loss_a = functional.mse_loss(qa_expected, q_target)
        critic_loss_b = functional.mse_loss(qb_expected, q_target)
        critic_loss = critic_loss_a + critic_loss_b
        # Update critic nn
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()
        return critic_loss_a.cpu().detach().numpy(), critic_loss_b.cpu().detach().numpy()

    def slow_update(self, tau):
        for target_par, local_par in zip(self.actor_target.parameters(), self.actor_local.parameters()):
            target_par.data.copy_(tau * local_par.data + (1.0 - tau) * target_par.data)
        for target_par, local_par in zip(self.critic_target.parameters(), self.critic_local.parameters()):
            target_par.data.copy_(tau * local_par.data + (1.0 - tau) * target_par.data)
        return

    def load_nets(self, actor_file_path, critic_file_path):
        self.actor_local.load_state_dict(torch.load(actor_file_path))
        self.actor_local.eval()
        self.critic_local.load_state_dict(torch.load(critic_file_path))
        self.critic_local.eval()
        return

    def save_nets(self, model_save_path):
        actor_path = model_save_path + "_actor_net.pt"
        torch.save(self.actor_local.state_dict(), actor_path)
        critic_path = model_save_path + "_critic_net.pt"
        torch.save(self.critic_local.state_dict(), critic_path)
        return
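
For reference, a minimal standalone sketch of the clipped target-policy-smoothing target that optimize_critic above computes. The function name, argument names and the default hyper-parameters are illustrative, not part of the project; the critic is assumed to return a pair of q-estimates, as in the listing.

import torch

def td3_critic_target(actor_target, critic_target, s1, r, d,
                      gamma=0.99, action_noise=0.2, noise_clip=0.5, action_clip=1.0):
    # y = r + gamma * (1 - d) * min(Qa'(s1, a1), Qb'(s1, a1)),
    # where a1 = clip(mu'(s1) + clipped Gaussian noise)
    with torch.no_grad():
        a1 = actor_target(s1)
        noise = (torch.randn_like(a1) * action_noise).clamp(-noise_clip, noise_clip)
        a1 = (a1 + noise).clamp(-action_clip, action_clip)
        qa, qb = critic_target(s1, a1)
        return r + gamma * (1.0 - d) * torch.min(qa, qb)
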
Code example #10
File: agents.py  Project: nicoring/rec-maddpg
class MADDPGAgent(Agent):
    def __init__(self, index, name, env, actor, critic, params):
        self.index = index
        self.name = name
        self.env = env

        self.actor = actor.to(DEVICE)
        self.critic = critic.to(DEVICE)
        self.actor_target = actor.clone().to(DEVICE)
        self.critic_target = critic.clone().to(DEVICE)
        self.actor_optim = torch.optim.Adam(self.actor.parameters(),
                                            lr=params.lr_actor)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(),
                                             lr=params.lr_critic)
        self.memory = ReplayMemory(params.memory_size, params.max_episode_len,
                                   self.actor.n_outputs, self.actor.n_inputs)
        self.mse = torch.nn.MSELoss()

        # params
        self.batch_size = params.batch_size
        self.tau = params.tau
        self.gamma = params.gamma
        self.clip_grads = True

        # flags
        # local obs/actions means only the obs/actions of this agent are available
        # if obs and actions are local this is equivalent to DDPG
        self.local_obs = params.local_obs
        self.local_actions = params.local_actions or params.local_obs

        # agent modeling
        self.use_agent_models = params.use_agent_models
        self.agent_models = {}
        self.model_optims = {}
        self.model_lr = params.modeling_lr
        self.entropy_weight = 1e-3
        self.max_past = params.max_past
        self.modeling_train_steps = params.modeling_train_steps
        self.modeling_batch_size = params.modeling_batch_size
        self.model_class = Actor

        # action and observation noise
        self.obfuscate_others = (params.sigma_noise
                                 is not None) or (params.temp_noise
                                                  is not None)
        self.sigma_noise = params.sigma_noise
        self.temp_noise = params.temp_noise

    def init_agent_models(self, agents):
        for agent in agents:
            if agent is self:
                continue
            agent_model = self.model_class.from_actor(agent.actor).to(DEVICE)
            self.agent_models[agent.index] = agent_model
            optim = torch.optim.Adam(agent_model.parameters(),
                                     lr=self.model_lr)
            self.model_optims[agent.index] = optim

    def update_params(self, target, source):
        zipped = zip(target.parameters(), source.parameters())
        for target_param, source_param in zipped:
            updated_param = target_param.data * (1.0 - self.tau) + \
                source_param.data * self.tau
            target_param.data.copy_(updated_param)

    def act(self, obs, explore=True):
        obs = torch.tensor(obs, dtype=torch.float,
                           requires_grad=False).to(DEVICE)
        actions = self.actor.select_action(obs, explore=explore).detach()
        return actions.to('cpu').numpy()

    def experience(self, episode_count, obs, action, reward, new_obs, done):
        self.memory.add(episode_count, obs, action, reward, new_obs,
                        float(done))

    def train_actor(self, batch):
        ### forward pass ###
        pred_actions = self.actor.select_action(batch.observations[self.index])
        actions = list(batch.actions)
        actions[self.index] = pred_actions
        q_obs = [batch.observations[self.index]
                 ] if self.local_obs else batch.observations
        q_actions = [actions[self.index]] if self.local_actions else actions
        pred_q = self.critic(q_obs, q_actions)

        ### backward pass ###
        p_reg = torch.mean(
            self.actor.forward(batch.observations[self.index])**2)
        loss = -pred_q.mean() + 1e-3 * p_reg
        self.actor_optim.zero_grad()
        loss.backward()
        if self.clip_grads:
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 0.5)
        self.actor_optim.step()
        return loss

    def train_critic(self, batch, agents):
        """Train critic with TD-target."""
        ### forward pass ###
        # (a_1', ..., a_n') = (mu'_1(o_1'), ..., mu'_n(o_n'))
        self_obs = batch.next_observations[self.index]
        self_action = self.actor_target.select_action(self_obs).detach()
        if self.local_actions:
            pred_next_actions = [self_action]
        elif self.use_agent_models:
            pred_next_actions = [
                m.select_action(batch.next_observations[idx]).detach()
                for idx, m in self.agent_models.items()
            ]
            pred_next_actions.insert(self.index, self_action)
        else:
            pred_next_actions = [
                a.actor_target.select_action(o).detach()
                for o, a in zip(batch.next_observations, agents)
            ]

        q_next_obs = [batch.next_observations[self.index]
                      ] if self.local_obs else batch.next_observations
        q_next = self.critic_target(q_next_obs, pred_next_actions)
        reward = batch.rewards[self.index]
        done = batch.dones[self.index]

        # if not done: y = r + gamma * Q(o_1, ..., o_n, a_1', ..., a_n')
        # if done:     y = r
        q_target = reward + (1.0 - done) * self.gamma * q_next

        ### backward pass ###
        # loss(params) = mse(y, Q(o_1, ..., o_n, a_1, ..., a_n))
        q_obs = [batch.observations[self.index]
                 ] if self.local_obs else batch.observations
        q_actions = [batch.actions[self.index]
                     ] if self.local_actions else batch.actions
        loss = self.mse(self.critic(q_obs, q_actions), q_target.detach())

        self.critic_optim.zero_grad()
        loss.backward()
        if self.clip_grads:
            torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5)
        self.critic_optim.step()
        return loss

    def train_models(self, batch, agents):
        loss = None
        for idx, model in self.agent_models.items():
            obs = batch.observations[idx]
            actions = batch.actions[idx]
            distributions = model.prob_dists(obs)
            split_actions = torch.split(actions,
                                        agents[idx].actor.action_split,
                                        dim=-1)
            self.model_optims[idx].zero_grad()
            losses = torch.zeros(len(distributions))
            for i, (acts, dist) in enumerate(zip(split_actions, distributions)):
                entropy = dist.base_dist._categorical.entropy()
                losses[i] = (dist.log_prob(acts).mean() +
                             self.entropy_weight * entropy).mean()
            # maximize log-likelihood of the observed actions plus an entropy bonus
            loss = -torch.mean(losses)
            loss.backward()
            self.model_optims[idx].step()
        return loss

    def compare_models(self, agents, batch):
        kls = []
        for idx, model in self.agent_models.items():
            kls.append([])
            obs = batch.observations[idx]
            modelled_distributions = model.prob_dists(obs)
            agent_distributions = agents[idx].actor.prob_dists(obs)
            for model_dist, agent_dist in zip(modelled_distributions,
                                              agent_distributions):
                kl_div = torch.distributions.kl.kl_divergence(
                    agent_dist, model_dist).data
                kls[-1].append(kl_div.mean())
        return zip(self.agent_models.keys(), kls)

    def add_noise_(self, batch):
        for i in range(len(batch.actions)):
            if i == self.index:
                continue
            # get observations and actions for agent i
            obs = batch.observations[i]
            actions = batch.actions[i]
            # create noise tensors, same shape and on same device
            if self.sigma_noise is not None:
                obs = obs + torch.randn_like(obs) * self.sigma_noise
            if self.temp_noise is not None:
                temp = torch.tensor(self.temp_noise,
                                    dtype=torch.float,
                                    device=actions.device)
                # avoid zero probs which lead to nan samples
                probs = actions + 1e-45
                actions = RelaxedOneHotCategorical(temp, probs=probs).sample()
            # add noise
            batch.observations[i] = obs
            batch.actions[i] = actions

    def update(self, agents):
        # collect transition memories from all agents
        memories = [a.memory for a in agents]

        # train model networks
        if self.use_agent_models:
            model_losses = []
            for _ in range(self.modeling_train_steps):
                batch = self.memory.sample_transitions_from(
                    memories, self.modeling_batch_size, max_past=self.max_past)
                if self.obfuscate_others:
                    self.add_noise_(batch)
                model_losses.append(self.train_models(batch, agents).data)
            model_loss = np.mean(model_losses)
            model_kls = self.compare_models(agents, batch)
        else:
            model_loss = None
            model_kls = None

        # sample minibatch
        batch = self.memory.sample_transitions_from(memories, self.batch_size)
        if self.obfuscate_others:
            self.add_noise_(batch)
        # train actor and critic network
        actor_loss = self.train_actor(batch)
        critic_loss = self.train_critic(batch, agents)

        # update target network params
        self.update_params(self.actor_target, self.actor)
        self.update_params(self.critic_target, self.critic)

        return actor_loss, critic_loss, model_loss, model_kls

    def get_state(self):
        if self.agent_models:
            models = {i: m.state_dict() for i, m in self.agent_models.items()}
            optims = {i: o.state_dict() for i, o in self.model_optims.items()}
            model_pair = (models, optims)
        else:
            model_pair = None
        return {
            'actor': self.actor.state_dict(),
            'actor_target': self.actor_target.state_dict(),
            'actor_optim': self.actor_optim.state_dict(),
            'critic': self.critic.state_dict(),
            'critic_target': self.critic_target.state_dict(),
            'critic_optim': self.critic_optim.state_dict(),
        }, model_pair

    def load_state(self, state):
        for key, value in state['state_dicts'].items():
            getattr(self, key).load_state_dict(value)
        if 'models' in state:
            models, optims = state['models']
            for i, m in models.items():
                self.agent_models[i].load_state_dict(m)
            for i, o in optims.items():
                self.model_optims[i].load_state_dict(o)
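
Both update_params here and slow_update in the previous listing implement the same soft (Polyak) target update, theta_target <- tau * theta_source + (1 - tau) * theta_target. A minimal sketch, assuming two torch.nn modules with parameters in matching order; the function name is illustrative.

import torch

def polyak_update(target_net, source_net, tau):
    # blend every target parameter towards its source counterpart
    with torch.no_grad():
        for t_par, s_par in zip(target_net.parameters(), source_net.parameters()):
            t_par.copy_(tau * s_par + (1.0 - tau) * t_par)
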
Code example #11
File: agent.py  Project: nrsimonelli/deep-rl-trex
class Agent:
    def __init__(self,
                 environment,
                 optimizer,
                 memory_length,
                 dueling=True,
                 loss='mse',
                 noisy_net=False,
                 egreedy=False,
                 save_memory=None,
                 save_weights=None,
                 verbose_action=False,
                 ):

        self.environment = environment
        self._optimizer = optimizer
        self._loss = loss
        self.dueling = dueling
        self.egreedy = egreedy
        self.noisy_net = noisy_net

        # Initialize discount and exploration rate, etc
        self.total_steps = 0
        self.gamma = 0.99
        self.epsilon = 1
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.00005
        self.tau = 0.05
        self.pretraining_steps = 0

        # Build networks
        self.q_network = self._build_compile_model()
        self.target_network = self._build_compile_model()
        self.align_target_model(how='hard')

        self.memory = ReplayMemory(memory_length)

        self.save_weights_fp = save_weights
        self.save_memory_fp = save_memory
        self.start_time = datetime.datetime.now()
        self.verbose_action = verbose_action

    def load_memory(self, fp):
        with open(fp, 'rb') as f:
            self.memory.load_memory(pickle.load(f))
            print(f'loading {self.memory.length} memories...')

    def save_memory(self, fp):
        if fp:
            with open(fp, 'wb') as f:
                print('saving replay memory...')
                pickle.dump(self.memory.get_memory(), f)

    def load_weights(self, weights_fp):
        if weights_fp:
            print('loading weights...')
            self.q_network.load_weights(weights_fp)
            self.align_target_model(how='hard')

    def save_weights(self, weights_fp):
        if weights_fp:
            self.q_network.save_weights(weights_fp)

    def set_epsilon_decay_schedule(self, epsilon, epsilon_min, annealed_steps):
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = math.log(self.epsilon / self.epsilon_min) / annealed_steps

    def set_beta_schedule(self, beta_start, beta_max, annealed_samplings):
        self.memory.beta = beta_start
        self.memory.beta_max = beta_max
        self.memory.beta_increment_per_sampling = (self.memory.beta_max - self.memory.beta) / annealed_samplings

    def predict(self, state, use_target=False):
        if use_target:
            return self.target_network.predict(state)
        else:
            return self.q_network.predict(state)

    def _decay_epsilon(self):
        self.epsilon = self.epsilon * np.exp(-self.epsilon_decay)

    def store(self, state, action, reward, next_state, terminated):
        self.memory.add((state, action, reward, next_state, terminated))
        self.total_steps += 1

        if not self.egreedy:
            if (self.epsilon > self.epsilon_min) and (self.memory.length > self.pretraining_steps):
                self._decay_epsilon()

    def batch_store(self, batch_load):
        batch_load[-2][2] = -0.1  # custom reward altering
        for row in batch_load:
            self.store(*row)

    def _build_compile_model(self):
        inputs = tf.keras.layers.Input(shape=(32, 290, 4))
        conv1 = tf.keras.layers.Conv2D(32, (8, 8), strides=4, padding='same', activation='relu')(inputs)
        conv2 = tf.keras.layers.Conv2D(64, (4, 4), strides=2, padding='same', activation='relu')(conv1)
        conv3 = tf.keras.layers.Conv2D(64, (3, 3), strides=1, padding='same', activation='relu')(conv2)
        conv3 = tf.keras.layers.Flatten()(conv3)

        if self.noisy_net:
            advt = NoisyNetDense(256, activation='relu')(conv3)
            final = NoisyNetDense(2)(advt)
        else:
            advt = tf.keras.layers.Dense(256, activation='relu')(conv3)
            final = tf.keras.layers.Dense(2)(advt)

        if self.dueling:
            if self.noisy_net:
                value = NoisyNetDense(256, activation='relu')(conv3)
                value = NoisyNetDense(1)(value)
            else:
                value = tf.keras.layers.Dense(256, activation='relu')(conv3)
                value = tf.keras.layers.Dense(1)(value)

            advt = tf.keras.layers.Lambda(lambda x: x - tf.reduce_mean(x, axis=1, keepdims=True))(final)
            final = tf.keras.layers.Add()([value, advt])

        model = tf.keras.models.Model(inputs=inputs, outputs=final)
        model.compile(optimizer=self._optimizer,
                      loss=self._loss,
                      metrics=['accuracy'])
        return model

    def align_target_model(self, how):
        assert how in ('hard', 'soft'), '"how" must be either "hard" or "soft"'

        if how == 'hard':
            self.target_network.set_weights(self.q_network.get_weights())

        elif how == 'soft':
            for t, e in zip(self.target_network.trainable_variables, self.q_network.trainable_variables):
                t.assign(t * (1 - self.tau) + (e * self.tau))

    def choose_action(self, state):
        if not self.egreedy:
            if np.random.rand() <= self.epsilon:
                action = self.environment.action_space.sample()
                if self.verbose_action:
                    print(f'action: {action}, q: random')
                return action

        q_values = self.predict(state, use_target=False)
        action = np.argmax(q_values[0])
        if self.verbose_action:
            print(f'action: {action}, q: {q_values}')
        return action

    def train(self, batch, is_weights):

        td_errors = np.zeros(len(batch))
        states = np.zeros((len(batch), 32, 290, 4))
        targets = np.zeros((len(batch), 2))

        for i, (state, action, reward, next_state, terminated) in enumerate(batch):
            target, td_error = self._get_target(state, action, reward, next_state, terminated)
            states[i] = state.reshape(32, 290, 4)
            targets[i] = target
            td_errors[i] = td_error

        self.q_network.fit(states, targets, sample_weight=is_weights, batch_size=32, epochs=1, verbose=0)
        self.align_target_model(how='soft')

        return td_errors

    def replay(self, batch_size, epoch_steps=None):

        num_batches = 1
        if epoch_steps:
            num_batches = int(np.max([np.floor(epoch_steps / 4), 1]))

        bar = progressbar.ProgressBar(maxval=num_batches,
                                      widgets=[f'training - ', progressbar.widgets.Counter(), f'/{num_batches} ',
                                               progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
        bar.start()

        for i in range(num_batches):
            leaf_idx, batch, is_weights = self.memory.get_batch(batch_size)  # prioritized experience replay
            td_errors = self.train(batch, is_weights)
            self.memory.update_sum_tree(leaf_idx, td_errors)

            bar.update(i + 1)

        bar.finish()
        self.save_weights(self.save_weights_fp)

    def _get_target(self, state, action, reward, next_state, terminated):
        target = self.predict(state, use_target=False)
        prev_target = target[0][action]

        if terminated:
            target[0][action] = reward
        else:
            a = np.argmax(self.predict(next_state, use_target=False)[0])
            target[0][action] = reward + (self.gamma * self.predict(next_state, use_target=True)[0][a])  # double Q Network

        td_error = abs(prev_target - target[0][action])

        return target, td_error
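
The epsilon schedule above is built so that applying eps *= exp(-epsilon_decay) once per stored transition moves epsilon from its start value down to epsilon_min over exactly annealed_steps steps, since epsilon_decay = ln(epsilon / epsilon_min) / annealed_steps. A short standalone check with illustrative values:

import math

eps, eps_min, annealed_steps = 1.0, 0.01, 100_000
decay = math.log(eps / eps_min) / annealed_steps
for _ in range(annealed_steps):
    eps *= math.exp(-decay)
print(round(eps, 4))  # 0.01
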
Code example #12
class DDPG:
    def __init__(self,
                 env,
                 actor_model,
                 critic_model,
                 memory=10000,
                 batch_size=64,
                 gamma=0.99,
                 tau=0.001,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 critic_decay=1e-2,
                 ou_theta=0.15,
                 ou_sigma=0.2,
                 render=None,
                 evaluate=None,
                 save_path=None,
                 save_every=10,
                 render_every=10,
                 train_per_step=True):
        self.env = env
        self.actor = actor_model
        self.actor_target = actor_model.clone()
        self.critic = critic_model
        self.critic_target = critic_model.clone()
        if use_cuda:
            for net in [
                    self.actor, self.actor_target, self.critic,
                    self.critic_target
            ]:
                net.cuda()
        self.memory = ReplayMemory(memory)
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.random_process = OrnsteinUhlenbeckProcess(
            env.action_space.shape[0], theta=ou_theta, sigma=ou_sigma)
        self.optim_critic = optim.Adam(self.critic.parameters(),
                                       lr=critic_lr,
                                       weight_decay=critic_decay)
        self.optim_actor = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.render = render
        self.render_every = render_every
        self.evaluate = evaluate
        self.save_path = save_path
        self.save_every = save_every
        self.train_per_step = train_per_step

    def update(self, target, source):
        zipped = zip(target.parameters(), source.parameters())
        for target_param, source_param in zipped:
            updated_param = target_param.data * (1 - self.tau) + \
                source_param.data * self.tau
            target_param.data.copy_(updated_param)

    def train_models(self):
        if len(self.memory) < self.batch_size:
            return None, None
        mini_batch = self.memory.sample_batch(self.batch_size)
        critic_loss = self.train_critic(mini_batch)
        actor_loss = self.train_actor(mini_batch)
        self.update(self.actor_target, self.actor)
        self.update(self.critic_target, self.critic)
        return critic_loss.item(), actor_loss.item()

    def mse(self, inputs, targets):
        return torch.mean((inputs - targets)**2)

    def train_critic(self, batch):
        # forward pass
        pred_actions = self.actor_target(batch.next_states)
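        # note: batch.done is assumed here to hold the non-terminal mask (1 - done), so the
        # bootstrapped term below vanishes on terminal transitions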
        target_q = batch.rewards + batch.done * self.critic_target(
            [batch.next_states, pred_actions]) * self.gamma
        pred_q = self.critic([batch.states, batch.actions])
        # backward pass
        loss = self.mse(pred_q, target_q)
        self.optim_critic.zero_grad()
        loss.backward(retain_graph=True)
        for param in self.critic.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optim_critic.step()
        return loss

    def train_actor(self, batch):
        # forward pass
        pred_mu = self.actor(batch.states)
        pred_q = self.critic([batch.states, pred_mu])
        # backward pass
        loss = -pred_q.mean()
        self.optim_actor.zero_grad()
        loss.backward()
        #         for param in self.actor.parameters():
        #             param.grad.data.clamp_(-1, 1)
        self.optim_actor.step()
        return loss

    def prep_state(self, s):
        return Variable(torch.from_numpy(s).float().unsqueeze(0))

    def select_action(self, state, exploration=True):
        if use_cuda:
            state = state.cuda()
        self.actor.eval()
        action = self.actor(state)
        self.actor.train()
        if exploration:
            noise = Variable(
                torch.from_numpy(self.random_process.sample()).float())
            if use_cuda:
                noise = noise.cuda()
            action = action + noise
        return action

    def step(self, action):
        next_state, reward, done, _ = self.env.step(
            action.data.cpu().numpy()[0])
        next_state = self.prep_state(next_state)
        reward = FloatTensor([reward])
        return next_state, reward, done

    def warmup(self, num_steps):
        overall_step = 0
        while overall_step <= num_steps:
            done = False
            state = self.prep_state(self.env.reset())
            self.random_process.reset()
            while not done:
                overall_step += 1
                action = self.select_action(state)
                next_state, reward, done = self.step(action)
                self.memory.add(state, action, reward, next_state, done)
                state = next_state

    def train(self, num_steps):
        running_reward = None
        reward_sums = []
        losses = []
        overall_step = 0
        episode_number = 0

        while overall_step <= num_steps:
            episode_number += 1
            done = False
            state = self.prep_state(self.env.reset())
            reward_sum = 0
            self.random_process.reset()

            while not done:
                overall_step += 1
                action = self.select_action(state)
                next_state, reward, done = self.step(action)
                self.memory.add(state, action, reward, next_state, done)
                state = next_state
                reward_sum += reward[0]
                if self.train_per_step:
                    losses.append(self.train_models())
            if not self.train_per_step:
                losses.append(self.train_models())

            render_this_episode = self.render and (episode_number %
                                                   self.render_every == 0)
            evaluation_reward = self.run(render=render_this_episode)
            reward_sums.append((reward_sum, evaluation_reward))

            if self.save_path is not None and (episode_number % self.save_every
                                               == 0):
                self.save_models(self.save_path)
                self.save_results(self.save_path, losses, reward_sums)

            running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
            print(
                'episode: {}  steps: {}  running train reward: {:.4f}  eval reward: {:.4f}'
                .format(episode_number, overall_step, running_reward,
                        evaluation_reward))

        if self.save_path is not None:
            self.save_models(self.save_path)
            self.save_results(self.save_path, losses, reward_sums)
        return reward_sums, losses

    def run(self, render=True):
        state = self.env.reset()
        done = False
        reward_sum = 0
        while not done:
            if render:
                self.env.render()
            action = self.select_action(self.prep_state(state),
                                        exploration=False)
            state, reward, done, _ = self.env.step(
                action.data.cpu().numpy()[0])
            reward_sum += reward
        return reward_sum

    def save_models(self, path):
        self.actor.save(path)
        self.critic.save(path)

    def save_results(self, path, losses, rewards):
        losses = np.array([l for l in losses if l[0] is not None])
        rewards = np.array(rewards)
        np.savetxt(os.path.join(path, 'losses.csv'),
                   losses,
                   delimiter=',',
                   header='critic,actor',
                   comments='')
        np.savetxt(os.path.join(path, 'rewards.csv'),
                   rewards,
                   delimiter=',',
                   header='train,evaluation',
                   comments='')
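
train_critic above multiplies the bootstrap term by batch.done, which matches the usual Bellman target only if the replay memory stores a non-terminal mask (1 - done) rather than the raw done flag. A minimal sketch with the mask written out explicitly; the helper name is illustrative and the list-style critic call simply mirrors the listing.

import torch

def ddpg_critic_target(actor_target, critic_target, rewards, next_states, dones, gamma=0.99):
    # y = r + gamma * (1 - done) * Q'(s', mu'(s'))
    with torch.no_grad():
        next_actions = actor_target(next_states)
        next_q = critic_target([next_states, next_actions])
        return rewards + gamma * (1.0 - dones) * next_q
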
Code example #13
class Agent:
    def __init__(self, env, sess, horizon, epsilon, learning_rate_policy,
                 learning_rate_value, gamma, lam, logger):
        self.env = env
        self.sess = sess
        self.horizon = horizon
        self.epsilon = epsilon
        self.learning_rate_policy = learning_rate_policy
        self.learning_rate_value = learning_rate_value
        self.gamma = gamma
        self.lam = lam
        self.logger = logger

        self.observation_space = env.observation_space.shape[0]
        self.action_space = env.action_space.shape[0]

        self.policy = Policy(self.observation_space, self.action_space,
                             self.epsilon, self.learning_rate_policy)
        self.value_function = Value_function(self.observation_space,
                                             self.learning_rate_value)

        self.replay_memory = ReplayMemory(self.horizon, self.observation_space,
                                          self.action_space)

    def learn(self):
        """
         Learning process that loops forever if not stopped
        """
        while True:
            #Fill replay memory with one trajectory
            self.run_trajectory()
            adv, vtarget = self.gae()
            self.sess.run(self.policy.network.copy_to(self.policy.network_old))
            #Train policy and value function on minibatch
            bg = BatchGenerator((self.replay_memory.observations,
                                 self.replay_memory.actions, adv), 1000)
            for _ in range(20):
                for ms, ma, madv in bg.iterate_once():
                    self.sess.run(
                        self.policy.optimizer, {
                            self.policy.network.input_pl: ms,
                            self.policy.network_old.input_pl: ms,
                            self.policy.action_pl: ma,
                            self.policy.adv_pl: madv
                        })
            bg = BatchGenerator((self.replay_memory.observations, vtarget),
                                250)
            for _ in range(10):
                for ms, mvpred in bg.iterate_once():
                    self.sess.run(
                        self.value_function.optimizer, {
                            self.value_function.network.input_pl: ms,
                            self.value_function.value_pl: mvpred
                        })

    def run_trajectory(self):
        """
         Runs for one trajectory and fills the replay memory
        Returns:
         Nothing, data is stored in replay memory for later use
        """
        self.replay_memory.clear()
        observation = self.env.reset()
        episode_reward = 0
        for _ in range(self.horizon):
            observation = np.array([observation])
            action = self.sess.run(
                self.policy.network.sample,
                {self.policy.network.input_pl: observation})[0]
            new_observation, reward, done, info = self.env.step(action)
            episode_reward += reward
            self.replay_memory.add(observation, action, reward,
                                   new_observation, done)
            if done:
                #Log episode reward and reset
                self.logger.add_reward(episode_reward)
                episode_reward = 0
                observation = self.env.reset()
            else:
                observation = new_observation

    def gae(self):
        """
         Takes the data in replay memory and computes the generalized advantage estimate (GAE)
        Returns:
         gae: normalized generalized advantage estimates
         vtarget: value-function regression targets (gae + current value predictions)
        """
        v = self.sess.run(self.value_function.network.predict, {
            self.value_function.network.input_pl:
            self.replay_memory.observations
        })
        v1 = self.sess.run(
            self.value_function.network.predict, {
                self.value_function.network.input_pl:
                self.replay_memory.new_observations
            })
        tds = self.replay_memory.rewards + self.gamma * v1 * (
            1 - self.replay_memory.done) - v
        gae = scipy.signal.lfilter([1.0], [1.0, -self.gamma * self.lam],
                                   tds[::-1])[::-1]
        vtarget = gae + v
        gae = (gae - gae.mean()) / (gae.std() + 1e-6)
        return gae, vtarget
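
The scipy.signal.lfilter call in gae above is a vectorised way of evaluating the backward recursion A_t = delta_t + gamma * lam * A_{t+1} over the TD residuals. An equivalent explicit loop, shown only for reference; like the listing, it does not reset the accumulator at episode boundaries inside the horizon.

import numpy as np

def gae_reference(tds, gamma, lam):
    # accumulate the discounted TD residuals backwards over the trajectory
    adv = np.zeros_like(tds, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(tds))):
        running = tds[t] + gamma * lam * running
        adv[t] = running
    return adv
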
Code example #14
class DeepQ_agent:
    def __init__(self,
                 env,
                 hidden_units=None,
                 network_LR=0.001,
                 batch_size=64,
                 update_every=4,
                 gamma=1.0):
        self.env = env
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma
        self.NETWORK_LR = network_LR
        self.MEMORY_CAPACITY = int(1e5)  # replay buffer capacity

        self.nA = env.ACTION_SPACE  #number of actions agent can perform
        self.HIDDEN_UNITS = hidden_units
        self.UPDATE_EVERY = update_every

        #let's give it some brains
        self.qnetwork_local = QNetwork(input_shape=self.env.STATE_SPACE,
                                       hidden_units=self.HIDDEN_UNITS,
                                       output_size=self.nA,
                                       learning_rate=self.NETWORK_LR)
        print(self.qnetwork_local.model.summary())

        # the target network is a periodically synced copy of the local network,
        # used to compute stable action-value targets
        self.qnetwork_target = QNetwork(input_shape=self.env.STATE_SPACE,
                                        hidden_units=self.HIDDEN_UNITS,
                                        output_size=self.nA,
                                        learning_rate=self.NETWORK_LR)

        #and the memory of course
        self.memory = ReplayMemory(self.MEMORY_CAPACITY, self.BATCH_SIZE)

        #handy temp variable
        self.t = 0

#----------------------Learn from experience-----------------------------------#

    def learn(self):
        '''
        Sample a minibatch from replay memory and perform a Double DQN update
        on the local network.
        '''

        if len(self.memory) > self.BATCH_SIZE:
            states, actions, rewards, next_states, dones = self.memory.sample(
                self.env.STATE_SPACE)

            #calculating action-values using local network
            target = self.qnetwork_local.predict(states, self.BATCH_SIZE)

            #future action-values using target network
            target_val = self.qnetwork_target.predict(next_states,
                                                      self.BATCH_SIZE)

            #future action-values using local network
            target_next = self.qnetwork_local.predict(next_states,
                                                      self.BATCH_SIZE)

            # The main point of Double DQN: the greedy action is selected with the local
            # network, while its value is evaluated with the target network
            max_actions = np.argmax(target_next, axis=1)  # action selection

            for i in range(self.BATCH_SIZE):
                if dones[i]:
                    target[i][actions[i]] = rewards[i]
                else:
                    target[i][actions[i]] = rewards[i] + self.GAMMA * \
                        target_val[i][max_actions[i]]  # action evaluation

            self.qnetwork_local.train(states,
                                      target,
                                      batch_size=self.BATCH_SIZE)

            if self.t == self.UPDATE_EVERY:
                self.update_target_weights()
                self.t = 0
            else:
                self.t += 1

#-----------------------Time to act-----------------------------------------------#

    def act(self, state, epsilon=0):  #set to NO exploration by default
        state = state.reshape((1, ) + state.shape)
        action_values = self.qnetwork_local.predict(
            state)  #returns a vector of size = self.nA
        if random.random() > epsilon:
            action = np.argmax(
                action_values)  #choose best action - Exploitation
        else:
            action = random.randint(0, self.nA -
                                    1)  #choose random action - Exploration

        return action

#-----------------------------Add experience to agent's memory------------------------#

    def add_experience(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)

#----------------------Updates values of Target network----------------------------#

    def update_target_weights(self):
        # hard update: copy the local network's weights directly (a soft/Polyak update is an alternative)
        self.qnetwork_target.model.set_weights(
            self.qnetwork_local.model.get_weights())

#---------------------helpful save function-------------------------------------#

    def save(self, model_num, directory):
        self.qnetwork_local.model.save(
            f'{directory}/snake_dqn_{model_num}_{time.asctime()}.h5')