Example #1
class A2CRunner():
    def __init__(self,
                 agent,
                 envs,
                 summary_writer=None,
                 train=True,
                 n_steps=8,
                 discount=0.99,
                 event_based=False):
        """
        Args:
          agent: A2CAgent instance.
          envs: SubprocVecEnv instance.
          summary_writer: summary writer to log episode scores.
          train: whether to train the agent.
          n_steps: number of agent steps for collecting rollouts.
          discount: future reward discount.
          event_based: if True, replace environment rewards with rewards
            drawn from the runner's event memory.
        """
        self.agent = agent
        self.envs = envs
        self.summary_writer = summary_writer
        self.train = train
        self.n_steps = n_steps
        self.discount = discount
        self.preproc = Preprocessor(self.envs.observation_spec()[0])
        self.episode_counter = 0
        self.cumulative_score = 0.0
        self.event_based = event_based
        self.event_memory = EventMemory(504, 1000)

    def reset(self):
        obs_raw = self.envs.reset()
        self.last_obs = self.preproc.preprocess_obs(obs_raw)

    def get_mean_score(self):
        return self.cumulative_score / self.episode_counter

    def _summarize_episode(self, timestep):
        score = timestep.observation["score_cumulative"][0]
        if self.summary_writer is not None:
            summary = tf.Summary()
            summary.value.add(tag='sc2/episode_score', simple_value=score)
            self.summary_writer.add_summary(summary, self.episode_counter)

        print("episode %d: score = %f" % (self.episode_counter, score))
        self.episode_counter += 1
        return score

    def run_batch(self, train_summary=False):
        """Collect trajectories for a single batch and train (if self.train).

        Args:
          train_summary: return a Summary of the training step (losses, etc.).

        Returns:
          result: None (if not self.train) or the return value of agent.train.
        """
        shapes = (self.n_steps, self.envs.n_envs)
        values = np.zeros(shapes, dtype=np.float32)
        rewards = np.zeros(shapes, dtype=np.float32)
        dones = np.zeros(shapes, dtype=np.float32)

        all_obs = []
        all_actions = []
        all_scores = []

        last_obs = self.last_obs

        if self.event_based:
            event_states = [EventState(obs) for obs in last_obs]

        for n in range(self.n_steps):
            actions, value_estimate = self.agent.step(last_obs)
            actions = mask_unused_argument_samples(actions)
            size = last_obs['screen'].shape[1:3]

            values[n, :] = value_estimate
            all_obs.append(last_obs)
            all_actions.append(actions)

            pysc2_actions = actions_to_pysc2(actions, size)
            obs_raw = self.envs.step(pysc2_actions)
            last_obs = self.preproc.preprocess_obs(obs_raw)
            rewards[n, :] = [t.reward for t in obs_raw]
            dones[n, :] = [t.last() for t in obs_raw]

            if self.event_based:
                for i in range(len(obs_raw)):
                    if not dones[n][i]:
                        events = event_states[i].update(obs_raw[i])
                        event_reward = self.event_memory.get_event_rewards(
                            events)
                        rewards[n][i] = event_reward
                    else:
                        self.event_memory.record_events(
                            event_states[i].get_events())
                        event_states[i].reset()

            for t in obs_raw:
                if t.last():
                    score = self._summarize_episode(t)
                    self.cumulative_score += score

        self.last_obs = last_obs

        next_values = self.agent.get_value(last_obs)

        returns, advs = compute_returns_advantages(rewards, dones, values,
                                                   next_values, self.discount)

        actions = stack_and_flatten_actions(all_actions)
        obs = flatten_first_dims_dict(stack_ndarray_dicts(all_obs))
        returns = flatten_first_dims(returns)
        advs = flatten_first_dims(advs)

        if self.train:
            return self.agent.train(obs,
                                    actions,
                                    returns,
                                    advs,
                                    summary=train_summary)

        return None
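
A minimal driver sketch for the runner above, assuming an A2CAgent and a SubprocVecEnv are constructed elsewhere in the surrounding repository. The function name train_loop and the num_updates default are illustrative only, not part of the original code.

def train_loop(agent, envs, num_updates=10000):
    # agent: A2CAgent, envs: SubprocVecEnv (both assumed to come from the repo)
    runner = A2CRunner(agent, envs, summary_writer=None, train=True,
                       n_steps=8, discount=0.99, event_based=False)
    runner.reset()
    for _ in range(num_updates):
        runner.run_batch(train_summary=False)
    return runner.get_mean_score()
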
Example #2
class A2CRunner():
    def __init__(self,
                 agent,
                 envs,
                 summary_writer=None,
                 train=True,
                 n_steps=8,
                 discount=0.99,
                 temporal=False):
        """
        Args:
          agent: A2CAgent instance.
          envs: SubprocVecEnv instance.
          summary_writer: summary writer to log episode scores.
          train: whether to train the agent.
          n_steps: number of agent steps for collecting rollouts.
          discount: future reward discount.
          temporal: whether the agent is recurrent and carries state between
            steps.
        """
        self.agent = agent
        self.envs = envs
        self.summary_writer = summary_writer
        self.train = train
        self.n_steps = n_steps
        self.discount = discount
        self.preproc = Preprocessor(self.envs.observation_spec()[0])
        self.episode_counter = 0
        self.cumulative_score = 0.0
        self.temporal = temporal

    def reset(self):
        obs_raw = self.envs.reset()
        self.last_obs = self.preproc.preprocess_obs(obs_raw)
        self.size = self.last_obs['screen'].shape[1:3]
        self.last_state = [
            np.zeros((self.envs.n_envs, ) + self.size + (96, ))
            for _ in range(2)
        ]

    def get_mean_score(self):
        return self.cumulative_score / self.episode_counter

    def _summarize_episode(self, timestep):
        score = timestep.observation["score_cumulative"][0]
        if self.summary_writer is not None:
            summary = tf.Summary()
            summary.value.add(tag='sc2/episode_score', simple_value=score)
            self.summary_writer.add_summary(summary, self.episode_counter)

        print("episode %d: score = %f" % (self.episode_counter, score))
        self.episode_counter += 1
        return score

    def run_batch(self, train_summary=False):
        """Collect trajectories for a single batch and train (if self.train).

        Args:
          train_summary: return a Summary of the training step (losses, etc.).

        Returns:
          result: None (if not self.train) or the return value of agent.train.
        """
        shapes = (self.n_steps, self.envs.n_envs)
        values = np.zeros(shapes, dtype=np.float32)
        rewards = np.zeros(shapes, dtype=np.float32)
        dones = np.zeros(shapes, dtype=np.float32)
        all_obs = []
        all_actions = []
        all_scores = []
        all_states = []

        last_obs = self.last_obs
        last_state = self.last_state
        all_states.append(last_state)

        for n in range(self.n_steps):
            if self.temporal:
                actions, value_estimate, last_state = self.agent.step(
                    last_obs, last_state)
                for item in last_state:
                    # zero the recurrent state of envs whose episode ended on
                    # the previous step (dones is still all-zero before step 0)
                    item[dones[n - 1, :].nonzero()] = 0.
                all_states.append(last_state)
            else:
                actions, value_estimate = self.agent.step(last_obs)
            #show(policy[1])
            actions = mask_unused_argument_samples(actions)
            #size = last_obs['screen'].shape[1:3]

            values[n, :] = value_estimate
            all_obs.append(last_obs)
            all_actions.append(actions)

            pysc2_actions = actions_to_pysc2(actions, self.size)
            obs_raw = self.envs.step(pysc2_actions)
            last_obs = self.preproc.preprocess_obs(obs_raw)
            rewards[n, :] = [t.reward for t in obs_raw]
            dones[n, :] = [t.last() for t in obs_raw]

            for t in obs_raw:
                if t.last():
                    score = self._summarize_episode(t)
                    self.cumulative_score += score

        self.last_obs = last_obs
        self.last_state = last_state

        next_values = self.agent.get_value(last_obs, last_state)

        returns, advs = compute_returns_advantages(rewards, dones, values,
                                                   next_values, self.discount)

        actions = stack_and_flatten_actions(all_actions)
        obs = flatten_first_dims_dict(stack_ndarray_dicts(all_obs))
        returns = flatten_first_dims(returns)
        advs = flatten_first_dims(advs)
        all_states = all_states[:self.n_steps]
        states = [np.stack(item) for item in zip(*all_states)]
        states = [flatten_first_dims(item) for item in states]

        if self.train:
            return self.agent.train(obs,
                                    actions,
                                    returns,
                                    advs,
                                    summary=train_summary,
                                    states=states)

        return None
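
The temporal branch above zeroes the recurrent state only for environments whose previous step ended an episode, using the nonzero() index trick. A small self-contained illustration of that masking (shapes and values are made up for the example):

import numpy as np

n_envs, hidden = 4, 3
state = np.ones((n_envs, hidden), dtype=np.float32)
dones_prev = np.array([0., 1., 0., 1.], dtype=np.float32)

# zero only the rows whose episode finished on the previous step
state[dones_prev.nonzero()] = 0.
print(state)  # rows 1 and 3 are now all zeros
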
Example #3
class A2CRunner():
    def __init__(self,
                 agent,
                 envs,
                 slack,
                 summary_writer=None,
                 train=True,
                 n_steps=8,
                 discount=0.99):
        """
        Args:
          agent: A2CAgent instance.
          envs: SubprocVecEnv instance.
          slack: slack handle stored on the runner (not used in this snippet).
          summary_writer: summary writer to log episode scores.
          train: whether to train the agent.
          n_steps: number of agent steps for collecting rollouts.
          discount: future reward discount.
        """
        self.agent = agent
        self.envs = envs
        self.slack = slack
        self.summary_writer = summary_writer
        if self.summary_writer is not None:
            self.summary = tf.Summary()

        self.train = train
        self.n_steps = n_steps
        self.discount = discount
        self.preproc = Preprocessor(self.envs.observation_spec()[0])
        self.episode_counter = 0
        self.best_n_mean_counter = 0
        self.mean_score = 0.0
        self.cumulative_score = 0.0

    def reset(self, nenvs, res):
        obs_raw = self.envs.reset()
        self.last_obs = self.preproc.preprocess_obs(obs_raw)
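        # allocate zero-initialized LSTM cell and hidden states,
        # one (res, res, 75) array per environment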
        lstm_state_shape = (nenvs, res, res, 75)
        cell_state = np.zeros(shape=lstm_state_shape, dtype=np.float32)
        hidden_state = np.zeros(shape=lstm_state_shape, dtype=np.float32)
        self.lstm_states = (cell_state, hidden_state)
        self.episode_last = [None for i in range(self.envs.n_envs)]
        self.worker_scores = [0 for i in range(self.envs.n_envs)]

    def get_mean_score(self):
        return self.cumulative_score / self.episode_counter

    def _print_agent_actions(self, actions):
        fn_ids, arg_ids = actions
        id_str = " ".join(map(str, fn_ids))
        print("episode %d | Sampled action IDs: " % (self.episode_counter) +
              id_str)

    def _summarize_episode(self, timestep, total_frames, worker_id=None):
        score = timestep.observation["score_cumulative"][0]
        self.worker_scores[worker_id] = score

        if self.summary_writer is not None:
            summary = tf.Summary()
            summary.value.add(tag='sc2/episode_score', simple_value=score)
            self.summary_writer.add_summary(summary, self.episode_counter)

        print("[Worker ID: %d] episode %d: score = %f" %
              (worker_id, self.episode_counter, score))
        self.episode_counter += 1
        return score

    def _summarize_best_and_mean(self, total_frames):
        mean_score = self.mean_score / self.envs.n_envs
        best_score = max(self.worker_scores)

        if self.summary_writer is not None:
            summary = tf.Summary()
            summary.value.add(tag='sc2/mean_score', simple_value=mean_score)
            summary.value.add(tag='sc2/best_score', simple_value=best_score)
            self.summary_writer.add_summary(summary, self.best_n_mean_counter)

            if total_frames >= 0:
                summary = tf.Summary()
                summary.value.add(tag='sc2/mean_score_per_frames',
                                  simple_value=mean_score)
                summary.value.add(tag='sc2/best_score_per_frames',
                                  simple_value=best_score)
                self.summary_writer.add_summary(summary, total_frames)

        print("step %d: MEAN SCORE = %f" %
              (self.best_n_mean_counter, mean_score))
        print("step %d: BEST SCORE = %f" %
              (self.best_n_mean_counter, best_score))
        self.episode_last = [None for i in range(self.envs.n_envs)]
        self.mean_score = 0
        self.best_n_mean_counter += 1

    def run_batch(self, total_frames, train_summary=False, lstm=False):
        """Collect trajectories for a single batch and train (if self.train).

        Args:
          total_frames: running frame counter, incremented once per agent step.
          train_summary: return a Summary of the training step (losses, etc.).
          lstm: whether to thread LSTM states through agent.step.

        Returns:
          result: the return value of agent.train (if self.train), otherwise
            (None, total_frames).
        """
        shapes = (self.n_steps, self.envs.n_envs)
        values = np.zeros(shapes, dtype=np.float32)
        rewards = np.zeros(shapes, dtype=np.float32)
        dones = np.zeros(shapes, dtype=np.float32)
        all_obs = []
        all_actions = []
        all_scores = []  # TODO: Unused local var?

        last_obs = self.last_obs
        lstm_states = self.lstm_states if lstm else None  # XXX reset?

        for n in range(self.n_steps):
            actions, value_estimate, lstm_states = self.agent.step(
                last_obs, lstm_states)
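            # masked_actions are the samples adjusted for action availability
            # in the current observation; unused argument samples are zeroed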
            actions, masked_actions = mask_unavailable_samples(
                actions, last_obs)
            actions = mask_unused_argument_samples(actions)
            size = last_obs['screen'].shape[1:3]

            values[n, :] = value_estimate
            all_obs.append(last_obs)
            all_actions.append(actions)

            pysc2_actions = actions_to_pysc2(masked_actions,
                                             size)  # XXX Use masked samples
            obs_raw = self.envs.step(pysc2_actions)
            last_obs = self.preproc.preprocess_obs(obs_raw)
            rewards[n, :] = [t.reward for t in obs_raw]
            dones[n, :] = [t.last() for t in obs_raw]

            # episode summary
            for i, t in enumerate(obs_raw):
                if t.last():
                    score = self._summarize_episode(t,
                                                    total_frames,
                                                    worker_id=i)
                    self.cumulative_score += score
                    self.mean_score += score
                    self.episode_last[i] = t.last()

            # mean and best scores summary
            if all(self.episode_last):
                self._summarize_best_and_mean(total_frames)
            total_frames += 1

        self.last_obs = last_obs
        self.lstm_states = lstm_states
        next_values = self.agent.get_value(last_obs, lstm_states)

        returns, advs = compute_returns_advantages(rewards, dones, values,
                                                   next_values, self.discount)

        actions = stack_and_flatten_actions(all_actions)
        obs = flatten_first_dims_dict(stack_ndarray_dicts(all_obs))
        returns = flatten_first_dims(returns)
        advs = flatten_first_dims(advs)

        if self.train:
            return self.agent.train(obs,
                                    actions,
                                    returns,
                                    advs,
                                    total_frames,
                                    summary=train_summary,
                                    lstm_states=lstm_states)

        return None, total_frames
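
A driver sketch for this variant, assuming the agent, envs, and slack handle come from the surrounding repository. The function name evaluate, the res=32 resolution, and the num_updates bound are hypothetical; train=False is used so the (None, total_frames) return path shown above applies.

def evaluate(agent, envs, slack, num_updates=100, res=32):
    # agent: A2CAgent, envs: SubprocVecEnv, slack: slack handle (all assumed)
    runner = A2CRunner(agent, envs, slack, summary_writer=None, train=False)
    runner.reset(nenvs=envs.n_envs, res=res)
    total_frames = 0
    for _ in range(num_updates):
        _, total_frames = runner.run_batch(total_frames, lstm=True)
    return runner.get_mean_score(), total_frames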