Example #1
class Agent(object):
    """Implements an agent that follows DDPG algorithm."""
    def __init__(self,
                 state_shape,
                 num_actions,
                 action_scale=2.0,
                 discount=0.99,
                 tau=0.01,
                 actor_lrate=0.001,
                 critic_lrate=0.01,
                 l2_decay=1e-3,
                 batch_size=64,
                 q_update_iter=1,
                 capacity=1000000):

        if not isinstance(state_shape, tuple):
            raise TypeError('state_shape must be of type <tuple>.')
        elif len(state_shape) == 0:
            raise ValueError('No state space dimensions provided.')
        elif num_actions <= 0:
            raise ValueError('Number of actions must be > 0.')
        elif capacity < batch_size:
            raise ValueError('Replay capacity must be >= batch_size.')

        self.batch_size = batch_size
        self.q_update_iter = q_update_iter
        self.replay_buffer = ReplayBuffer(capacity, state_shape, num_actions)
        self.actor = Actor(state_shape, num_actions, action_scale, actor_lrate,
                           tau)
        self.critic = Critic(state_shape, num_actions, discount, critic_lrate,
                             tau, l2_decay)
        self.step = 0

    def choose_action(self, state):
        """Returns an action for the agent to perform in the environment."""
        return self.actor.predict(state).flatten()

    def update_buffer(self, s0, a, r, s1, terminal):
        """Updates memory replay buffer with new experience."""
        self.replay_buffer.update(s0, a, r, s1, terminal)

    def update_policy(self):
        """Updates Q-networks using replay memory data + performing SGD"""

        mb = self.replay_buffer.sample(self.batch_size)

        # To update the critic, we need an action prediction from the target policy
        target_a = self.actor.predict_target(mb[3])
        self.critic.train_fn(mb[0], mb[1], mb[3], target_a, mb[2], mb[4])

        # Updating the actor requires gradients from critic
        action = self.actor.predict(mb[0])
        grads = self.critic.get_action_grads(mb[0], action)
        self.actor.train_fn(mb[0], grads)

        # Every few steps in an episode we update target network weights
        if self.step == self.q_update_iter:
            self.actor.update_target()
            self.critic.update_target()
        self.step = self.step + 1 if self.step != self.q_update_iter else 0
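For context, a minimal interaction-loop sketch for this Agent follows. It assumes a Gym-style continuous-control environment and that the Actor, Critic, and ReplayBuffer classes referenced in __init__ are importable; the environment id, episode count, and warm-up logic are illustrative assumptions, not part of the original example.

# Hypothetical usage sketch; everything other than the Agent API above is assumed.
import gym
import numpy as np

env = gym.make('Pendulum-v1')            # placeholder environment id
agent = Agent(state_shape=env.observation_space.shape,
              num_actions=env.action_space.shape[0])

steps = 0
for episode in range(200):               # illustrative episode count
    s0 = env.reset()
    done = False
    while not done:
        a = agent.choose_action(s0[np.newaxis])   # add a batch axis (assumed by actor.predict)
        s1, r, done, _ = env.step(a)
        agent.update_buffer(s0, a, r, s1, done)
        steps += 1
        if steps > agent.batch_size:              # wait until the buffer can fill a batch
            agent.update_policy()
        s0 = s1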
Example #2
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 random_seed,
                 device,
                 lr_actor,
                 lr_critic,
                 weight_decay_critic,
                 batch_size,
                 buffer_size,
                 gamma,
                 tau,
                 update_every,
                 n_updates,
                 eps_start,
                 eps_end,
                 eps_decay):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        random.seed(random_seed)
        self.seed = random_seed
        self.t_step = 0
        self.device = device
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay_critic = weight_decay_critic
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.gamma = gamma
        self.tau = tau
        self.update_every = update_every
        self.n_updates = n_updates
        self.eps = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(self.device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(self.device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic, weight_decay=self.weight_decay_critic)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, random_seed, self.device)

    def step(self, state, action, reward, next_state, done, agent_number):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.t_step += 1
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory and at interval settings
        if len(self.memory) > self.batch_size:
            if self.t_step % self.update_every == 0:
                for _ in range(self.n_updates):
                    experiences = self.memory.sample()
                    self.learn(experiences, self.gamma, agent_number)

    def act(self, states, add_noise):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(self.device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for agent_num, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_num, :] = action
        self.actor_local.train()
        if add_noise:
            actions += self.eps * self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, agent_number):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)

        if agent_number == 0:
            actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
        else:
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)

        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)

        if agent_number == 0:
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        else:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)

        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        # Update epsilon noise value
        self.eps = max(self.eps_end, self.eps_decay*self.eps)
        # self.eps = self.eps - (1/self.eps_decay)
        # if self.eps < self.eps_end:
        #     self.eps = self.eps_end

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
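This agent depends on an OUNoise helper that is not shown in the example. A minimal Ornstein-Uhlenbeck sketch compatible with the (num_agents, action_size) size argument and the reset()/sample() calls above could look like the following; the theta and sigma values are conventional defaults, not taken from the source.

import copy
import random

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck noise process (illustrative sketch, not the original)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new state as noise."""
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = self.state + dx
        return self.state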
Example #3
def train(env, estimator, target_network, num_episodes=1000,
                    replay_memory_size=500000,
                    frame_history_len=4,
                    save_every=10,
                    update_every=1000,
                    discount=0.99, epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=50000,
                    batch_size=32, record_every=50):
    """
    deep q learning algorithm
    :param env: openAI gym environment
    :param estimator: estimator model for predicting values
    :param target_network:
    :param num_episodes: number of episodes to run
    :param replay_memory_size: size of replay memory
    :param update_every: copy params from estimator into target estimator after this many steps
    :param discount: discount factor
    :param epsilon_start: starting epsilon value
    :param epsilon_end: ending epsilon value
    :param batch_size: 32 lol
    :param record_every: record a video every N episodes
    :return:
    """

    # Load previous state here
    replay_memory = ReplayBuffer(replay_memory_size, frame_history_len)

    # epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    loss_func = nn.SmoothL1Loss()
    optimizer = torch.optim.Adam(estimator.parameters())

    policy = make_epsilon_greedy_policy(estimator, len(VALID_ACTIONS))

    env = Monitor(env, directory="./monitor",
                  resume=True,
                  video_callable=lambda count: count % record_every == 0)

    total_t = 0
    pbar = tqdm(range(num_episodes))
    pbar.set_description("ep: %d, er: %.2f, et: %d, tt: %d, exp_size: %d" % (0, 0.0, 0, 0, 0))

    for ep in pbar:

        state = env.reset()  # 210 x 160 x 4
        state = process_state(state)  # 94 x 94 x 3
        episode_loss = 0
        episode_reward = 0
        episode_t = 0

        for t in itertools.count():
            epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]

            last_idx = replay_memory.store_frame(state)

            recent_observations = replay_memory.encode_recent_observation()

            action_dist = policy(recent_observations, epsilon)
            action_dist = action_dist.squeeze(0).numpy()
            action = np.random.choice(np.arange(len(action_dist)), p=action_dist)

            next_state, reward, done, _ = env.step(action)
            reward = max(-1.0, min(reward, 1.0))

            episode_reward += reward

            replay_memory.store_effect(last_idx, action, reward, done)
            next_state = process_state(next_state)

            state = next_state

            if replay_memory.can_sample(batch_size):
                obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_memory.sample(batch_size)
                obs_batch = torch.from_numpy(obs_batch).float().to(device) / 255.0
                act_batch = torch.from_numpy(act_batch).long().to(device)
                rew_batch = torch.from_numpy(rew_batch).to(device)
                next_obs_batch = torch.from_numpy(next_obs_batch).float().to(device) / 255.0
                not_done_mask = torch.from_numpy(1 - done_mask).float().to(device)

                state_values = estimator(obs_batch)  # b x VALID_ACTIONS
                state_action_values = torch.gather(state_values, 1, act_batch.unsqueeze(1))  # b x 1

                next_state_values_max = target_network(next_obs_batch).detach().max(dim=1)[0]
                next_state_values = not_done_mask * next_state_values_max

                expected_q_value = (next_state_values * discount) + rew_batch

                # bellman_error = expected_q_value - state_action_values.squeeze(1)
                #
                # clipped_bellman_error = bellman_error.clamp(-1, 1)
                #
                # d_error = clipped_bellman_error * -1.0

                loss = loss_func(state_action_values, expected_q_value.unsqueeze(1))
                episode_loss += loss.item()

                # state_action_values.backward(d_error.data.unsqueeze(1))

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            total_t += 1
            episode_t = t

            # Periodically copy the online network parameters into the target network
            if total_t % update_every == 0:
                copy_model_params(estimator, target_network)

            if done:
                break

        pbar.set_description("ep: %d, el: %.5f, er: %.2f, et: %d, tt: %d, exp_size: %d" % (ep, episode_loss, episode_reward, episode_t, total_t, replay_memory.num_in_buffer))

        # save checkpoint
        if ep % save_every == 0:
            torch.save(estimator.state_dict(), './checkpoints/checkpoint.pt')

    env.close()
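train() relies on make_epsilon_greedy_policy and copy_model_params, neither of which appears in this excerpt. The sketches below show one plausible shape for them under the assumptions visible in the loop (a CPU probability tensor that can be squeezed to length len(VALID_ACTIONS), a module-level device, and a hard parameter copy); they are illustrations, not the original helpers.

import torch


def make_epsilon_greedy_policy(estimator, num_actions):
    """Return a function mapping (observation, epsilon) to a 1 x num_actions
    action-probability tensor (illustrative sketch)."""
    def policy_fn(observation, epsilon):
        # Uniform epsilon mass over all actions, extra mass on the greedy one.
        probs = torch.ones(1, num_actions) * epsilon / num_actions
        # `device` is assumed to be a module-level torch.device, as in the loop above.
        obs = torch.from_numpy(observation).float().unsqueeze(0).to(device) / 255.0
        with torch.no_grad():
            q_values = estimator(obs)
        best_action = q_values.argmax(dim=1).item()
        probs[0, best_action] += 1.0 - epsilon
        return probs
    return policy_fn


def copy_model_params(source, target):
    """Hard-copy the online network's parameters into the target network."""
    target.load_state_dict(source.state_dict())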
Example #4
def challenger_round():
    challengers = []
    leaders = []
    leader_checkpoints = os.listdir(LEADER_DIR)
    # Need to share the same schedule with all challengers, so they all anneal
    # at the same rate
    epsilon_schedule = LinearSchedule(EPS_START, EPS_END, TRAIN_FRAMES)
    for i in xrange(NUM_LEADERS):
        challenger = try_gpu(
            DQNAgent(6,
                     epsilon_schedule,
                     OBSERVATION_MODE,
                     lr=LR,
                     max_grad_norm=GRAD_CLIP_NORM))
        if i < len(leader_checkpoints):
            leader = try_gpu(
                DQNAgent(6, LinearSchedule(0.1, 0.1, 500000),
                         OBSERVATION_MODE))
            leader_path = os.path.join(LEADER_DIR, leader_checkpoints[i])
            print "LOADING CHECKPOINT: {}".format(leader_path)
            challenger.load_state_dict(
                torch.load(leader_path,
                           map_location=lambda storage, loc: storage))
            leader.load_state_dict(
                torch.load(leader_path,
                           map_location=lambda storage, loc: storage))
        else:
            leader = RandomAgent(6)
            print "INITIALIZING NEW CHALLENGER AND LEADER"
        challengers.append(challenger)
        leaders.append(leader)

    if CHALLENGER_DIR is not None:
        challengers = []
        # Load in all of the leaders
        for checkpoint in os.listdir(CHALLENGER_DIR):
            path = os.path.join(CHALLENGER_DIR, checkpoint)
            print "LOADING FROM CHALLENGER_DIR: {}".format(path)
            challenger = try_gpu(
                DQNAgent(6,
                         LinearSchedule(0.05, 0.05, 1),
                         CHALLENGER_OBSERVATION_MODE,
                         lr=LR,
                         max_grad_norm=GRAD_CLIP_NORM,
                         name=checkpoint))
            challenger.load_state_dict(
                torch.load(path, map_location=lambda storage, loc: storage))
            challengers.append(challenger)

    challenger = EnsembleDQNAgent(challengers)
    leader = EnsembleDQNAgent(leaders)
    if OPPONENT is not None or HUMAN:
        leader = NoOpAgent()
    replay_buffer = ReplayBuffer(1000000)
    rewards = collections.deque(maxlen=1000)
    frames = 0  # number of training frames seen
    episodes = 0  # number of training episodes that have been played
    with tqdm(total=TRAIN_FRAMES) as progress:
        # Each loop completes a single episode
        while frames < TRAIN_FRAMES:
            states = env.reset()
            challenger.reset()
            leader.reset()
            episode_reward = 0.
            episode_frames = 0
            # Each loop completes a single step, duplicates _evaluate() to
            # update at the appropriate frame #s
            for _ in xrange(MAX_EPISODE_LENGTH):
                frames += 1
                episode_frames += 1
                action1 = challenger.act(states[0])
                action2 = leader.act(states[1])
                next_states, reward, done = env.step(action1, action2)
                episode_reward += reward

                # NOTE: state and next_state are LazyFrames and must be
                # converted to np.arrays
                replay_buffer.add(
                    Experience(states[0], action1._action_index, reward,
                               next_states[0], done))
                states = next_states

                if len(replay_buffer) > 50000 and \
                        frames % 4 == 0:
                    experiences = replay_buffer.sample(32)
                    challenger.update_from_experiences(experiences)

                if frames % 10000 == 0:
                    challenger.sync_target()

                if frames % SAVE_FREQ == 0:
                    # TODO: Don't access internals
                    for agent in challenger._agents:
                        path = os.path.join(LEADER_DIR,
                                            agent.name + "-{}".format(frames))
                        print "SAVING CHECKPOINT TO: {}".format(path)
                        torch.save(agent.state_dict(), path)
                    #path = os.path.join(
                    #        LEADER_DIR, challenger.name + "-{}".format(frames))
                    #torch.save(challenger.state_dict(), path)

                if frames >= TRAIN_FRAMES:
                    break

                if done:
                    break

            if episodes % 300 == 0:
                print "Evaluation: {}".format(
                    evaluate(challenger, leader, EPISODES_EVALUATE_TRAIN))
            print "Episode reward: {}".format(episode_reward)
            episodes += 1
            rewards.append(episode_reward)
            stats = challenger.stats
            stats["Avg Episode Reward"] = float(sum(rewards)) / len(rewards)
            stats["Num Episodes"] = episodes
            stats["Replay Buffer Size"] = len(replay_buffer)
            progress.set_postfix(stats, refresh=False)
            progress.update(episode_frames)
            episode_frames = 0
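The LinearSchedule used when constructing the DQNAgents is not included here; a minimal sketch consistent with its (start, end, num_steps) constructor might be the following. The value(t) method name is an assumption about how the agent consumes the schedule.

class LinearSchedule(object):
    """Linearly anneal a value from eps_start to eps_end over num_steps
    steps (illustrative sketch; the original class may differ)."""

    def __init__(self, eps_start, eps_end, num_steps):
        self._start = eps_start
        self._end = eps_end
        self._num_steps = num_steps

    def value(self, t):
        """Return the annealed value at step t, clamped after num_steps."""
        fraction = min(float(t) / self._num_steps, 1.0)
        return self._start + fraction * (self._end - self._start)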
Example #5
class DDPG:
    def __init__(self, config):
        self.config = config
        self.state_size = config.state_size
        self.action_size = config.action_size

        self.actor_local = Actor(self.state_size, self.action_size,
                                 2).to(device)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  2).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=config.LR_ACTOR)

        self.critic_local = Critic(self.state_size, self.action_size,
                                   2).to(device)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    2).to(device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=config.LR_CRITIC,
        )

        self.memory = ReplayBuffer(config.random_seed, config.BUFFER_SIZE)
        self.noise = OUNoise(self.action_size, config.random_seed)

        self.t_step = 0

        # Hard-copy the local weights into the target networks (tau = 1)
        self.soft_update(self.critic_local, self.critic_target, 1)
        self.soft_update(self.actor_local, self.actor_target, 1)

    def step(self, states, actions, rewards, next_states, dones):

        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % self.config.UPDATE_EVERY

        if len(self.memory) > self.config.BATCH_SIZE and (self.t_step == 0):

            for i in range(self.config.EPOCH):
                experiences = self.memory.sample(self.config.BATCH_SIZE)
                self.learn(experiences)

    def reset(self):
        self.noise.reset()

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        Q_targets_next = self.critic_target(next_states,
                                            self.actor_target(next_states))
        Q_targets = rewards + (self.config.GAMMA * Q_targets_next *
                               (1 - dones))
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        actor_loss = -self.critic_local(states,
                                        self.actor_local(states)).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target,
                         self.config.TAU)
        self.soft_update(self.actor_local, self.actor_target, self.config.TAU)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
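A hedged sketch of the config object this DDPG class expects; the attribute names are taken from the references in the code above, while the concrete values are placeholders.

from dataclasses import dataclass


@dataclass
class Config:
    # Attribute names mirror those read in DDPG.__init__/step/learn;
    # the values are illustrative placeholders only.
    state_size: int = 33
    action_size: int = 4
    random_seed: int = 0
    LR_ACTOR: float = 1e-4
    LR_CRITIC: float = 1e-3
    BUFFER_SIZE: int = int(1e6)
    BATCH_SIZE: int = 128
    UPDATE_EVERY: int = 20
    EPOCH: int = 10
    GAMMA: float = 0.99
    TAU: float = 1e-3


agent = DDPG(Config())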
Example #6
class Agent(object):
    """DDPG Agent that interacts and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 device,
                 actor_args={},
                 critic_args={}):
        """Initializes the DQN agent.

        Args:
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            device (torch.device): Device to use for calculations
            actor_args (dict): Arguments describing the actor network
            critic_args (dict): Arguments describing the critic network
        """
        self.state_size = state_size
        """Dimension of each state"""

        self.action_size = action_size
        """Dimension of each action"""

        self.device = device
        """Device to use for calculations"""

        self.t_step = 0
        """Timestep between training updates"""

        # Parameters

        # Actor network
        self.actor_local = Actor(state_size, action_size,
                                 **actor_args).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  **actor_args).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic network

        self.critic_local = Critic(state_size, action_size,
                                   **critic_args).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    **critic_args).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process for exploration
        self.noise = OUNoise(action_size, sigma=NOISE_SD)

        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, self.device)

    def reset(self):
        """Reset state of agent."""
        self.noise.reset()

    def save_weights(self, path):
        """Save local network weights.

        Args:
            path (string): File to save to"""
        torch.save(
            {
                'actor_local': self.actor_local.state_dict(),
                'actor_target': self.actor_target.state_dict(),
                'critic_local': self.critic_local.state_dict(),
                'critic_target': self.critic_target.state_dict()
            }, path)

    def load_weights(self, path):
        """Load local network weights.

        Args:
            path (string): File to load weights from"""
        checkpoint = torch.load(path)
        self.actor_local.load_state_dict(checkpoint['actor_local'])
        self.actor_target.load_state_dict(checkpoint['actor_target'])
        self.critic_local.load_state_dict(checkpoint['critic_local'])
        self.critic_target.load_state_dict(checkpoint['critic_target'])

    def act(self, state, add_noise=True):
        """Returns action for given state according to the current policy
            
        Args:
            state (np.ndarray): Current state

        Returns:
            action (np.ndarray): Action tuple
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # Temporarily set evaluation mode (no dropout &c) & turn off autograd
        self.actor_local.eval()

        with torch.no_grad():
            action = self.actor_local(state).cpu().detach().numpy()

        # Resume training mode
        self.actor_local.train()

        # Add noise if exploring
        if add_noise:
            action += self.noise.sample()
            # The noise might take us out of range
            action = np.clip(action, -1, 1)

        return action

    def step(self, state, action, reward, next_state, done):
        """Save experience and learn if due.
        Args:
            state (Tensor): Current state
            action (int): Chosen action
            reward (float): Resulting reward
            next_state (Tensor): State after action
            done (bool): True if terminal state
        """
        self.memory.add(state, action, reward, next_state, done)

        # Learn as soon as we have enough stored experiences
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
            for _ in range(NUM_UPDATES):
                experiences = self.memory.sample()
                self.learn(experiences)

    def learn(self, experiences):
        """Learn from batch of experiences."""
        states, actions, rewards, next_states, dones = experiences

        # region Update Critic
        actions_next = self.actor_target(next_states)
        q_targets_next = self.critic_target(next_states, actions_next)

        q_targets = rewards + (GAMMA * q_targets_next * (1 - dones))

        q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(q_expected, q_targets)

        # Minimize loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1.0)
        self.critic_optimizer.step()
        # endregion

        # region Update Actor
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        # endregion

        # Update target networks
        soft_update(self.critic_local, self.critic_target, TAU)
        soft_update(self.actor_local, self.actor_target, TAU)
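learn() above calls a module-level soft_update helper that is not part of this excerpt. A sketch consistent with the in-class soft updates shown in the earlier examples would be:

def soft_update(local_model, target_model, tau):
    """Blend local parameters into the target network:
    theta_target = tau*theta_local + (1 - tau)*theta_target
    (sketch of the assumed module-level helper)."""
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)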
Example #7
class Agent(object):
    """DQN Agent that interacts and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 device,
                 replay_buffer_size=int(1e5),
                 batch_size=64,
                 discount_factor=0.99,
                 soft_update=1e-3,
                 learning_rate=5e-4,
                 update_every=4,
                 **kwargs):
        """Initializes the DQN agent.

        Args:
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            device (torch.device): Device to use for calculations
            replay_buffer_size (int): Size of replay buffer
            batch_size (int): Size of experience batches during training
            discount_factor (float): Discount factor (gamma)
            soft_update (float): Soft update coefficient (tau)
            learning_rate (float): Learning rate (alpha)
            update_every (int): Steps between updating the network
            **kwargs: Arguments describing the QNetwork
        """
        self.state_size = state_size
        """Dimension of each state"""

        self.action_size = action_size
        """Dimension of each action"""

        self.device = device
        """Device to use for calculations"""

        # Parameters
        self.batch_size = batch_size
        """Size of experience batches during training"""

        self.discount_factor = discount_factor
        """Discount factor (gamma)"""

        self.soft_update = soft_update
        """Soft update coefficient (tau)"""

        self.update_every = update_every
        """Steps between updating the network"""

        # Q Networks
        self.target_network = QNetwork(state_size, action_size, **kwargs) \
            .to(device)
        """Target Q-Network"""

        self.local_network = QNetwork(state_size, action_size, **kwargs) \
            .to(device)
        """Local Q-Network"""

        self.optimizer = optim.Adam(self.local_network.parameters(),
                                    lr=learning_rate)
        """Optimizer used when training the Q-network."""

        # Memory
        self.memory = ReplayBuffer(replay_buffer_size, batch_size, device)

        # Time step
        self.t_step = 0
        """Current time step"""

    def save_weights(self, path):
        """Save local network weights.

        Args:
            path (string): File to save to"""
        self.local_network.save_weights(path)

    def load_weights(self, path):
        """Load local network weights.

        Args:
            path (string): File to load weights from"""
        self.local_network.load_weights(path)

    def act(self, state, eps=0.):
        """Returns action for given state according to the current policy
            
        Args:
            state (np.ndarray): Current state
            eps (float): Probability of selecting random action (epsilon)
            
        Returns:
            int: Epsilon-greedily selected action
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # Temporarily set evaluation mode (no dropout &c) & turn off autograd
        self.local_network.eval()
        with torch.no_grad():
            action_values = self.local_network(state)
        self.local_network.train()

        # Select action epsilon-greedily
        if random.random() > eps:
            return np.argmax(action_values.cpu().detach().numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def step(self, state, action, reward, next_state, done):
        """Save experience and learn if due.

        Args:
            state (Tensor): Current state
            action (int): Chosen action
            reward (float): Resulting reward
            next_state (Tensor): State after action
            done (bool): True if terminal state
        """
        self.memory.add(state, action, reward, next_state, done)

        # Learn if at update_every steps
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # Check that we have enough stored experiences
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences)

    def learn(self, experiences):
        """Update Q-network using given experiences

        Args:
            experiences (Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]:
                SARS'+done tuple
        """
        states, actions, rewards, next_states, dones = experiences

        # Predicted Q values from target model for next states
        # (NB: torch.max returns a (values, indices) tuple)
        q_target_next = self.target_network(next_states).max(dim=1,
                                                             keepdim=True)[0]

        # Computed target Q values for current state
        q_target = rewards + self.discount_factor * q_target_next * (1 - dones)

        # Predicted Q values from local model for current state
        q_local = self.local_network(states).gather(dim=1, index=actions)

        loss = F.mse_loss(q_local, q_target)

        # Update local network weights
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        soft_update(self.local_network, self.target_network, self.soft_update)
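A minimal driver loop for this DQN agent, assuming a Gym-style discrete-action environment and that QNetwork, ReplayBuffer, and the module-level soft_update helper exist alongside the class; the environment id, episode count, and epsilon schedule below are illustrative.

# Hypothetical usage sketch; names other than Agent are assumptions.
import gym
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
env = gym.make('LunarLander-v2')          # placeholder environment id
agent = Agent(env.observation_space.shape[0], env.action_space.n, device)

eps = 1.0
for episode in range(500):                # illustrative episode count
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
    eps = max(0.01, 0.995 * eps)          # simple exponential epsilon decay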
Example #8
            # Add to buffer.
            instruction_data_cuda = [
                torch.tensor(t, dtype=torch.float, device=device)
                for t in instruction_data
            ]
            replay_buffer.append(instruction_data_cuda)

            # Check for minimum replay size.
            if len(replay_buffer) < REPLAY_MIN:
                print('Waiting for minimum buffer size ... {}/{}'.format(
                    len(replay_buffer), REPLAY_MIN))
                continue

            # Sample training mini-batch.
            sampled_evaluations = replay_buffer.sample(REPLAY_SAMPLE_SIZE)
            sampled_contexts = torch.stack([t[0] for t in sampled_evaluations])
            sampled_states = torch.stack([t[1] for t in sampled_evaluations])
            sampled_params = torch.stack([t[2] for t in sampled_evaluations])
            sampled_values = torch.stack([t[3] for t in sampled_evaluations])

            # Update critic: maximize the mean log-likelihood of the sampled
            # values under the critic's predicted Normal distribution
            # (the negated log-likelihood below is minimized).
            critic_loss = torch.distributions.Normal(*critic_model(sampled_contexts, sampled_states, sampled_params)) \
                    .log_prob(sampled_values).mean(dim=-1)
            critic_model_optimizer.zero_grad()
            gen_model_optimizer.zero_grad()
            (-critic_loss).backward()
            torch.nn.utils.clip_grad_norm_(critic_model.parameters(), 1.0)
            critic_model_optimizer.step()

            # Update params model.
Example #9
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # Score tracker and learning parameters
        self.best_score = -np.inf

    def reset_episode(self):
        self.total_reward = 0.0
        self.count = 0

        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Save reward
        self.total_reward += reward
        self.count += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

        if done:
            # Keeping track of the score
            self.score = self.total_reward / float(
                self.count) if self.count else 0.0
            if self.score > self.best_score:
                self.best_score = self.score

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = (self.tau * local_weights +
                       (1 - self.tau) * target_weights)
        target_model.set_weights(new_weights)
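Finally, a hedged sketch of the episode loop this Keras-based agent appears designed for; the task object is assumed to expose reset() and step(action) returning (next_state, reward, done) in addition to the attributes read in __init__, and the episode count is a placeholder.

# Illustrative driver; `task` stands in for whatever environment class
# supplies the attributes referenced in DDPG.__init__.
num_episodes = 500                        # placeholder value

agent = DDPG(task)
for episode in range(1, num_episodes + 1):
    state = agent.reset_episode()
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
        state = next_state
    print('Episode {:4d}  score: {:7.3f}  best: {:7.3f}'.format(
        episode, agent.score, agent.best_score))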