Example #1
def test(env: gym.Env, agent: AgentBase, settings: TestSettings):
    # Initialize variables for logging.
    # agent.load(settings.directory)
    scores = ContiguousRingBuffer(capacity=128)
    eps = ConstantEpsilon(0.01)
    for i_episode in tqdm(range(settings.num_episodes)):
        # Initialize episode
        state = env.reset()
        total_reward = 0

        # Interact with the environment until done.
        done = False
        step = 0
        while not done:
            action = agent.select_action(state, eps(i_episode))
            if settings.render:
                env.render()
            next_state, reward, done, _ = env.step(action)
            total_reward += reward
            state = next_state
            time.sleep(1.0 / settings.fps)
            logger.debug('{}:{}'.format(step, action))
            step += 1

        # Save the final score.
        scores.append(total_reward)
    return scores
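
For context, eps(i_episode) above is the exploration rate handed to agent.select_action. A minimal sketch of the epsilon-greedy rule it presumably feeds (the helper below is illustrative only and not part of this repo):

import numpy as np

def epsilon_greedy(q_values: np.ndarray, eps: float,
                   rng: np.random.RandomState) -> int:
    """Illustrative epsilon-greedy selection over a vector of Q-values."""
    if rng.rand() < eps:
        # Explore: pick a uniformly random action.
        return int(rng.randint(len(q_values)))
    # Exploit: pick the greedy (argmax) action.
    return int(np.argmax(q_values))
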
Example #2
    def __init__(self, state_size, action_size, buffer_size, batch_size, seed):
        """Initialize a ReplayBuffer object.

        Params
        ======
            state_size (int or tuple): dimension of state
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
        """
        self.action_size = action_size

        # NOTE(yycho0108): "done" is technically boolean, but it is mapped to uint8
        # here to support torch conversion (bool is not supported in certain versions).
        self.dtype = np.dtype([('state', np.float32, state_size),
                               ('action', np.int32), ('reward', np.float32),
                               ('next_state', np.float32, state_size),
                               ('done', np.uint8)])
        # self.memory = {name : ContiguousRingBuffer(capacity=buffer_size, dtype=self.dtype.fields[name][0]) for name in  self.dtype.names}
        self.memory = ContiguousRingBuffer(capacity=buffer_size,
                                           dtype=self.dtype)
        self.batch_size = batch_size

        # Manipulate random engine.
        self.rng = np.random.RandomState(seed)
        self.nadd = 0
        self.nquery = 0
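
A quick illustration of why the NOTE above maps done to uint8 (assumption: the sampled batch is later converted with torch.from_numpy, which rejected numpy bool arrays in some older torch versions):

import numpy as np
import torch

done = np.array([0, 1, 0], dtype=np.uint8)   # stored as uint8, not bool
mask = torch.from_numpy(done).float()        # converts cleanly on old and new torch
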
Example #3
class PrioritizedReplayBuffer(object):
    def __init__(self,
                 state_size,
                 action_size,
                 buffer_size,
                 batch_size,
                 seed,
                 alpha=0.6):
        """Initialize a ReplayBuffer object.

        Params
        ======
            state_size (int or tuple): dimension of state
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
        """
        self.action_size = action_size

        # NOTE(yycho0108): "done" is technically boolean, but it is mapped to uint8
        # here to support torch conversion (bool is not supported in certain versions).
        self.dtype = np.dtype([
            ('state', np.float32, state_size),
            ('action', np.int32),
            ('reward', np.float32),
            ('next_state', np.float32, state_size),
            ('done', np.uint8),
            ('priority', np.float32),
        ])
        self.memory = ContiguousRingBuffer(capacity=buffer_size,
                                           dtype=self.dtype)
        self.batch_size = batch_size
        self.max_priority = 1.0
        self.alpha = alpha
        self.fields = ['state', 'action', 'reward', 'next_state', 'done']

        # Manipulate random engine.
        self.rng = np.random.RandomState(seed)
        self.nadd = 0
        self.nquery = 0

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        # Store the alpha-exponentiated priority, consistent with
        # extend() and update_priorities() below.
        entry = np.array(
            (state, action, reward, next_state, done,
             self.max_priority**self.alpha),
            dtype=self.dtype)
        self.memory.append(entry)
        self.nadd += 1

    def extend(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        entry = np.empty(len(state), dtype=self.dtype)
        entry['state'] = state
        entry['action'] = action
        entry['reward'] = reward
        entry['next_state'] = next_state
        entry['done'] = done
        entry['priority'] = np.full(len(state), self.max_priority**self.alpha)
        self.memory.extend(entry)
        self.nadd += len(state)

    def sample(self, indices: np.ndarray = None):
        """Randomly sample a batch of experiences from memory."""
        if indices is None:
            # FIXME(yycho0108): using max_priority here may not be accurate;
            # maintaining a heap of priorities might provide better results.
            # In practice, however, memory.max() == max_priority, since
            # Q-values tend to grow over time. Whether this leads to
            # unexpected sampling artifacts remains to be seen.
            indices = resample_wheel(self.memory['priority'], self.batch_size,
                                     self.max_priority**self.alpha)
            # indices = resample_wheel(self.memory['priority'], self.batch_size)
        out = [(self.memory[name][indices]) for name in self.fields]
        self.nquery += 1
        return out, indices

    def update_priorities(self, indices: np.ndarray, priorities: np.ndarray):
        """ Update priority """
        # Update alpha factor to priorities to control
        # Preference to uniform vs. prioritized sampling.
        # (Assume `priorities` is not pre-exponentiated with `alpha`.)
        self.max_priority = max(self.max_priority, priorities.max())
        priorities = priorities**self.alpha
        self.memory['priority'][indices] = priorities

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)
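
The alpha exponent above follows the standard prioritized-replay formulation: a stored priority p is sampled with probability proportional to p**alpha, so alpha = 0 degenerates to uniform sampling and alpha = 1 uses the raw priorities. A standalone sketch of the resulting normalized probabilities (not part of the repo):

import numpy as np

p = np.array([1.0, 2.0, 4.0], dtype=np.float32)  # raw, un-exponentiated priorities
for alpha in (0.0, 0.6, 1.0):
    w = p**alpha
    print(alpha, w / w.sum())
# alpha = 0.0 -> [0.333 0.333 0.333]   (uniform)
# alpha = 0.6 -> [0.21  0.31  0.48]    (approximately)
# alpha = 1.0 -> [0.143 0.286 0.571]   (fully prioritized)
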
Example #4
class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""
    def __init__(self, state_size, action_size, buffer_size, batch_size, seed):
        """Initialize a ReplayBuffer object.

        Params
        ======
            state_size (int or tuple): dimension of state
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
        """
        self.action_size = action_size

        # NOTE(yycho0108): "done" is technically boolean, but it is mapped to uint8
        # here to support torch conversion (bool is not supported in certain versions).
        self.dtype = np.dtype([('state', np.float32, state_size),
                               ('action', np.int32), ('reward', np.float32),
                               ('next_state', np.float32, state_size),
                               ('done', np.uint8)])
        # self.memory = {name : ContiguousRingBuffer(capacity=buffer_size, dtype=self.dtype.fields[name][0]) for name in  self.dtype.names}
        self.memory = ContiguousRingBuffer(capacity=buffer_size,
                                           dtype=self.dtype)
        self.batch_size = batch_size

        # Manipulate random engine.
        self.rng = np.random.RandomState(seed)
        self.nadd = 0
        self.nquery = 0

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        entry = np.array((state, action, reward, next_state, done),
                         dtype=self.dtype)
        self.memory.append(entry)
        self.nadd += 1

    def extend(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        entry = np.empty(len(state), dtype=self.dtype)
        entry['state'] = state
        entry['action'] = action
        entry['reward'] = reward
        entry['next_state'] = next_state
        entry['done'] = done
        self.memory.extend(entry)
        self.nadd += len(state)

    def sample(self, indices=None):
        """Randomly sample a batch of experiences from memory."""
        if indices is None:
            indices = self.rng.randint(len(self.memory), size=self.batch_size)

        # NOTE(yycho0108): it is much more favorable to index by field name first,
        # to prevent the creation of multiple copies, since the output must
        # ultimately be contiguous. Indexing by the field name merely creates a
        # view, so applying the selection indices last creates the final
        # contiguous copy without an intermediate full-record copy.
        # print(self.memory['state'].base is self.memory.data_) # True
        # print(self.memory[indices].base is self.memory.data_) # False
        # print(self.memory['state'][indices].base is self.memory.data_) # False

        out = [(self.memory[name][indices]) for name in self.dtype.fields]
        self.nquery += 1
        return out, indices

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)
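
The view-vs-copy behavior described in the NOTE inside sample() can be verified on a plain NumPy structured array (standalone sketch, independent of ContiguousRingBuffer):

import numpy as np

dtype = np.dtype([('state', np.float32, 4), ('action', np.int32)])
buf = np.zeros(8, dtype=dtype)
idx = np.array([1, 3, 5])

print(np.shares_memory(buf['state'], buf))       # True  -> field access is a view
print(np.shares_memory(buf[idx], buf))           # False -> fancy indexing copies every field
print(np.shares_memory(buf['state'][idx], buf))  # False -> the copy happens last, on one field only
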
Example #5
def train_multi(env: gym.Env, agent: AgentBase, settings: TrainSettings):
    # Initialize variables for logging.
    scores = ContiguousRingBuffer(capacity=128)
    max_avg_score = -np.inf

    # Ensure settings.directory exists for logging / saving.
    os.makedirs(settings.directory, exist_ok=True)
    # Optionally load from existing checkpoint.
    if settings.load:
        agent.load(settings.load)

    # Instantiate vectorized environment.
    if isinstance(env, SubprocVecEnv):
        # No further action is required.
        pass
    elif isinstance(env, gym.Env):
        # Cannot broadcast a single (non-vectorized) environment across workers.
        logger.error("Unable to broadcast single environment {}".format(env))
    else:
        # Assume that env is a constructor function.
        env = SubprocVecEnv(
            [functools.partial(env, i) for i in range(settings.num_env)])

    # Initialize handlers for data collection.
    total_rewards = np.zeros(settings.num_env, dtype=np.float32)
    dones = np.zeros(settings.num_env, dtype=np.uint8)
    states = env.reset()
    # FIXME(yycho0108): EPS should be configurable.
    # eps = LinearEpsilon(0.8 * settings.num_episodes)
    eps = ExponentialEpsilon(0.99, 0.05, 0.8 * settings.num_episodes, True)

    i_episode = 0
    pbar = tqdm(total=settings.num_episodes)
    while i_episode < settings.num_episodes:
        # Reset the environments that are done, so that at each moment
        # the agent is always dealing with a live state.
        # SubprocVecEnv.reset() does not allow granular control.
        for s, d, e in zip(states, dones, env.remotes):
            if not d:
                continue
            e.send(('reset', None))
            # FIXME(yycho0108): applying a reshape here since e.recv()
            # was seen to return a list for whatever reason.
            # This may silently allow an error to pass through.
            s[:] = np.reshape(e.recv(), s.shape)
        scores.extend(total_rewards[dones.astype(bool)])
        total_rewards[dones.astype(bool)] = 0.0
        num_done = dones.sum()
        dones[:] = False

        # Process each state and interact with each env.
        actions = agent.select_action(states, eps(i_episode))
        next_states, rewards, dones, _ = env.step(actions)
        agent.step(states, actions, rewards, next_states, dones)
        total_rewards += rewards
        states = next_states

        # Increment episode counts accordingly.
        pbar.set_postfix(score=np.mean(scores.array))

        # Optionally enable printing episode statistics.
        # The logging happens at each crossing of the discretized log-period boundary.
        if count_boundaries(i_episode, num_done, settings.log_period) > 0:
            # Compute statistics.
            avg_score = np.mean(scores.array)
            if avg_score > max_avg_score:
                max_avg_score = avg_score

            # Print statistics.
            logger.info(
                "Episode {}/{} | Max Avg: {:.2f} | Eps : {:.2f}".format(
                    i_episode, settings.num_episodes, max_avg_score,
                    eps(i_episode)))
            if isinstance(agent.memory, PrioritizedReplayBuffer):
                logger.info('mp : {} vs {}'.format(
                    agent.memory.max_priority,
                    agent.memory.memory.array['priority'].max()))

        # Save agent checkpoint as well.
        if count_boundaries(i_episode, num_done, settings.save_period) > 0:
            agent.save(settings.directory, i_episode + num_done)

        i_episode += num_done
        pbar.update(num_done)
    pbar.close()

    # Save results and return.
    agent.save(settings.directory)
    return scores
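
The implementation of count_boundaries is not shown here; judging from its use above, it returns how many multiples of period are crossed when the episode counter advances from index to index + count. A plausible stand-in with that behavior (an assumption, not the repo's definition):

def count_boundaries(index: int, count: int, period: int) -> int:
    """Multiples of `period` crossed when advancing from `index` to `index + count`."""
    return (index + count) // period - index // period

# e.g. count_boundaries(98, 5, 50) == 1, since episode 100 is crossed.
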
Example #6
def train_single(env: gym.Env, agent: AgentBase, settings: TrainSettings):
    # Initialize variables for logging.
    scores = ContiguousRingBuffer(capacity=128)
    max_avg_score = -np.inf

    # Ensure settings.directory exists for logging / saving.
    os.makedirs(settings.directory, exist_ok=True)
    # Optionally load from existing checkpoint.
    if settings.load:
        agent.load(settings.load)

    # Instantiate vectorized environment.
    if isinstance(env, gym.Env):
        # No further action is required.
        pass
    else:
        # Assume that env is a constructor function.
        env = env()

    # FIXME(yycho0108): EPS should be configurable.
    # eps = LinearEpsilon(0.8 * settings.num_episodes)
    eps = ExponentialEpsilon(0.99, 0.05, 0.8 * settings.num_episodes, True)

    if is_notebook():
        t = tnrange(settings.num_episodes)
    else:
        t = tqdm(range(settings.num_episodes))

    for i_episode in t:
        # Initialize episode
        state = env.reset()
        total_reward = 0

        # Interact with the environment until done.
        done = False
        while not done:
            action = agent.select_action(state, eps(i_episode))
            next_state, reward, done, _ = env.step(action)
            # NOTE(yycho0108): agent.step() trains the agent.
            # FIXME(yycho0108): rename?
            agent.step(state, action, reward, next_state, done)
            total_reward += reward
            state = next_state

        # Save the final score.
        scores.append(total_reward)

        t.set_postfix(score=np.mean(scores.array))
        # Optionally enable printing episode stats.
        if i_episode % settings.log_period == 0:
            # Compute statistics.
            avg_score = np.mean(scores.array)
            if avg_score > max_avg_score:
                max_avg_score = avg_score

            # Print statistics.
            logger.info(
                "Episode {}/{} | Max Avg: {:.2f} | Eps : {:.2f}".format(
                    i_episode, settings.num_episodes, max_avg_score,
                    eps(i_episode)))
            # sys.stdout.flush()

        if i_episode % settings.save_period == 0:
            agent.save(settings.directory, i_episode)

    # Save results and return.
    agent.save(settings.directory)
    return scores
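
Note that scores = ContiguousRingBuffer(capacity=128) retains only the most recent 128 episode returns, so np.mean(scores.array) is a 128-episode moving average. A minimal standard-library stand-in for that bookkeeping (illustrative only; the repo's buffer additionally exposes a contiguous .array view):

import numpy as np
from collections import deque

scores = deque(maxlen=128)     # drops the oldest return once 128 are stored
for ret in (1.0, 2.0, 3.0):
    scores.append(ret)
print(np.mean(scores))         # 2.0 -> moving average over the retained window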