Example #1
    def test_simple(self):
        a = ActionProcesser(
            dim=40,
            rect_delta=5,
        )

        action_ids = [2, 331, 0, 1]
        coords = ((15, 4), (22, 33), (1, 1), (1, 1))

        actions = a.process(action_ids, coords)

        expected = [
            FunctionCall(function=2, arguments=[[0], (4, 15)]),
            FunctionCall(function=331, arguments=[[0], (33, 22)]),
            FunctionCall(function=0, arguments=[]),
            FunctionCall(function=1, arguments=[(1, 1)])
        ]

        assert actions == expected
Example #2
    def test_rectangle(self):
        dim = 48
        a = ActionProcesser(
            dim=dim,
            rect_delta=7,
        )

        action_ids = [3, 3, 3, 3]
        coords = ((15, 4), (22, 33), (1, 1), (45, 10))

        actions = a.process(action_ids, coords)

        expected = [
            FunctionCall(function=3, arguments=[[0], [8, 0], [22, 11]]),
            FunctionCall(function=3, arguments=[[0], [15, 26], [29, 40]]),
            FunctionCall(function=3, arguments=[[0], [0, 0], [8, 8]]),
            FunctionCall(function=3, arguments=[[0], [38, 3], [47, 17]])
        ]

        assert actions == expected
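
Taken together, the two tests pin down the coordinate handling: for the point-style actions the input pair is reversed in the emitted FunctionCall (e.g. (15, 4) becomes (4, 15)), while for the rectangle action the two corners are built by offsetting the raw input by rect_delta and clamping each component to [0, dim - 1]. The helpers below are a minimal sketch that reproduces the expected arguments above; point_arg and rect_args are hypothetical names, not part of the tested ActionProcesser API.

def point_arg(coord):
    # The point tests expect the input pair reversed, e.g. (15, 4) -> (4, 15).
    p0, p1 = coord
    return (p1, p0)

def rect_args(coord, dim, rect_delta):
    # The rectangle test expects two corners offset by rect_delta from the raw
    # input, with each component clamped to the screen bounds [0, dim - 1].
    p0, p1 = coord
    clamp = lambda v: max(0, min(v, dim - 1))
    return ([clamp(p0 - rect_delta), clamp(p1 - rect_delta)],
            [clamp(p0 + rect_delta), clamp(p1 + rect_delta)])

# Reproduces two of the cases from the tests above:
assert point_arg((15, 4)) == (4, 15)
assert rect_args((45, 10), dim=48, rect_delta=7) == ([38, 3], [47, 17])
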
Example #3
class Runner(object):
    def __init__(self,
                 envs,
                 agent: ActorCriticAgent,
                 n_steps=5,
                 discount=0.99,
                 do_training=True,
                 ppo_par: PPORunParams = None,
                 episodes=3):
        self.envs = envs
        self.agent = agent  # the actor-critic agent supplied by the caller
        self.obs_processer = ObsProcesser()
        self.action_processer = ActionProcesser(dim=flags.FLAGS.resolution)
        self.n_steps = n_steps
        self.discount = discount
        self.do_training = do_training
        self.ppo_par = ppo_par
        self.batch_counter = 0
        self.episode_counter = 0
        self.episodes = episodes

    def reset(self):
        obs = self.envs.reset()
        self.latest_obs = self.obs_processer.process(obs)

    def _log_score_to_tb(self, score):
        summary = tf.Summary()
        summary.value.add(tag='sc2/episode_score', simple_value=score)
        self.agent.summary_writer.add_summary(summary, self.episode_counter)

    def _handle_episode_end(self, timestep):
        score = timestep.observation["score_cumulative"][0]
        print("episode %d ended. Score %f" % (self.episode_counter, score))
        self._log_score_to_tb(score)
        self.episode_counter += 1

    def run_batch(self):
        mb_actions = []
        mb_obs = []
        mb_values = np.zeros((self.envs.n_envs, self.n_steps + 1),
                             dtype=np.float32)
        mb_rewards = np.zeros((self.envs.n_envs, self.n_steps),
                              dtype=np.float32)

        latest_obs = self.latest_obs  # state(t0)
        rnn_state = self.agent.state_init
        for n in range(self.n_steps):
            action_ids, spatial_action_2ds, value_estimate, rnn_state = self.agent.step(
                latest_obs, rnn_state)
            #print('step: ', n, action_ids, spatial_action_2ds, value_estimate)  # for debugging

            # Store the actions and value estimates for this step (across all parallel envs)
            mb_values[:, n] = value_estimate
            mb_obs.append(latest_obs)
            mb_actions.append((action_ids, spatial_action_2ds))
            # Send the actions to the environments, get the new observations and rewards, and store the rewards
            actions_pp = self.action_processer.process(action_ids,
                                                       spatial_action_2ds)
            obs_raw = self.envs.step(actions_pp)
            latest_obs = self.obs_processer.process(obs_raw)  # state(t+1)
            mb_rewards[:, n] = [t.reward for t in obs_raw]

            # Check each environment's timestep for episode end
            for t in obs_raw:
                if t.last():
                    self._handle_episode_end(t)
        # Get V(t+1) and use it to bootstrap from state(t+1), since the actual future return is unknown (bootstrapping with the net's predictions)
        mb_values[:, -1] = self.agent.get_value(latest_obs, rnn_state)

        n_step_advantage = general_n_step_advantage(mb_rewards,
                                                    mb_values,
                                                    self.discount,
                                                    lambda_par=1.0)

        full_input = {
            # these are transposed because action/obs
            # processers return [time, env, ...] shaped arrays
            FEATURE_KEYS.advantage:
            n_step_advantage.transpose(),
            FEATURE_KEYS.value_target:
            (n_step_advantage + mb_values[:, :-1]).transpose()
        }
        # We combine all experiences from every env
        full_input.update(self.action_processer.combine_batch(mb_actions))
        full_input.update(self.obs_processer.combine_batch(mb_obs))
        full_input = {
            k: combine_first_dimensions(v)
            for k, v in full_input.items()
        }

        if self.do_training:
            self.agent.train(
                full_input, rnn_state
            )  # You might want to reset the state between the forward and backward pass

        self.latest_obs = latest_obs  #state(t) = state(t+1)
        self.batch_counter += 1
        print('Batch %d finished' % self.batch_counter)
        sys.stdout.flush()

    def run_trained_batch(self):
        # Evaluate a previously trained model (weights already restored) by running it for self.episodes episodes.
        latest_obs = self.latest_obs  # state(t0)
        rnn_state = self.agent.state_init
        while self.episode_counter < self.episodes:
            action_ids, spatial_action_2ds, value_estimate, rnn_state = self.agent.step(
                latest_obs,
                rnn_state)  # feed the current observation (and RNN state) through the network
            #print('action: ', actions.FUNCTIONS[action_ids[0]].name, 'on', 'x=', spatial_action_2ds[0][0], 'y=', spatial_action_2ds[0][1], 'Value=', value_estimate[0]) # for debugging
            actions_pp = self.action_processer.process(action_ids,
                                                       spatial_action_2ds)
            obs_raw = self.envs.step(actions_pp)
            latest_obs = self.obs_processer.process(obs_raw)

            # Check each environment's timestep for episode end
            for t in obs_raw:
                if t.last():
                    self._handle_episode_end(t)

        self.latest_obs = latest_obs  # state(t) = state(t+1)
        self.batch_counter += 1
        #print('Batch %d finished' % self.batch_counter)
        sys.stdout.flush()
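
In run_batch above, the last column of mb_values is filled with the bootstrapped value of state(t+1) and general_n_step_advantage is then called with lambda_par=1.0. That helper is not shown in these examples, so the function below is only a hedged numpy sketch of the generalized-advantage recursion it appears to perform; with lambda_par=1.0 it reduces to the discounted n-step return minus the value baseline, which is what value_target = advantage + mb_values[:, :-1] relies on. The name n_step_advantage_sketch and the exact signature are assumptions (Example #4's variant takes an extra rewards argument).

import numpy as np

def n_step_advantage_sketch(rewards, values, discount, lambda_par=1.0):
    # rewards: [n_envs, n_steps]; values: [n_envs, n_steps + 1], where the last
    # column holds the bootstrapped V(state(t+1)) estimate.
    n_envs, n_steps = rewards.shape
    deltas = rewards + discount * values[:, 1:] - values[:, :-1]  # TD errors
    advantage = np.zeros_like(rewards)
    gae = np.zeros(n_envs, dtype=rewards.dtype)
    for t in reversed(range(n_steps)):
        gae = deltas[:, t] + discount * lambda_par * gae
        advantage[:, t] = gae
    return advantage
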
Example #4
class Runner(object):
    def __init__(self,
                 envs,
                 agent: ActorCriticAgent,
                 n_steps=5,
                 discount=0.99,
                 do_training=True,
                 ppo_par: PPORunParams = None):
        self.envs = envs
        self.agent = agent
        self.obs_processer = ObsProcesser()
        self.action_processer = ActionProcesser(dim=flags.FLAGS.resolution)
        self.n_steps = n_steps
        self.discount = discount
        self.do_training = do_training
        self.ppo_par = ppo_par
        self.batch_counter = 0
        self.episode_counter = 0
        assert self.agent.mode in [ACMode.PPO, ACMode.A2C]
        self.is_ppo = self.agent.mode == ACMode.PPO
        if self.is_ppo:
            assert ppo_par is not None
            assert n_steps * envs.n_envs % ppo_par.batch_size == 0
            assert n_steps * envs.n_envs >= ppo_par.batch_size
            self.ppo_par = ppo_par

    def reset(self):
        obs = self.envs.reset()
        #print(min_distance_to_enemy(obs[0], minimap=True))
        self.last_min_dist_to_enemy = min_distance_to_enemy(obs[0],
                                                            minimap=True)
        #print(count_units(obs[0], minimap=False))
        self.units_in_frame = count_units(obs[0], minimap=False)
        self.latest_obs = self.obs_processer.process(obs)

    def _log_score_to_tb(self, score):
        summary = tf.Summary()
        summary.value.add(tag='sc2/episode_score', simple_value=score)
        self.agent.summary_writer.add_summary(summary, self.episode_counter)

    def _log_modified_to_tb(self, score):
        summary = tf.Summary()
        summary.value.add(tag='sc2/episode_score_modified', simple_value=score)
        self.agent.summary_writer.add_summary(summary, self.episode_counter)

    def _handle_episode_end(self, timestep):
        score = timestep.observation["score_cumulative"][0]
        print("episode %d ended. Score %f" % (self.episode_counter, score))
        self._log_score_to_tb(score)
        self.episode_counter += 1

    def _train_ppo_epoch(self, full_input):
        total_obs = self.n_steps * self.envs.n_envs
        shuffle_idx = np.random.permutation(total_obs)
        batches = dict_of_lists_to_list_of_dicst({
            k: np.split(v[shuffle_idx], total_obs // self.ppo_par.batch_size)
            for k, v in full_input.items()
        })
        for b in batches:
            self.agent.train(b)

    def run_batch(self):
        mb_actions = []
        mb_obs = []
        mb_values = np.zeros((self.envs.n_envs, self.n_steps + 1),
                             dtype=np.float32)
        mb_rewards = np.zeros((self.envs.n_envs, self.n_steps),
                              dtype=np.float32)
        mb_rewards_modified = np.zeros((self.envs.n_envs, self.n_steps),
                                       dtype=np.float32)

        latest_obs = self.latest_obs

        for n in range(self.n_steps):
            # The value estimate could be recomputed from obs at training time,
            # but saving it here makes the n-step reward calculation a bit easier
            action_ids, spatial_action_2ds, value_estimate = self.agent.step(
                latest_obs)

            mb_values[:, n] = value_estimate
            mb_obs.append(latest_obs)
            mb_actions.append((action_ids, spatial_action_2ds))

            actions_pp = self.action_processer.process(action_ids,
                                                       spatial_action_2ds)
            obs_raw = self.envs.step(actions_pp)
            latest_obs = self.obs_processer.process(obs_raw)
            mb_rewards[:, n] = [t.reward for t in obs_raw]
            # Reward shaping: add a bonus for reducing the minimap distance to the nearest enemy
            last_dist = self.last_min_dist_to_enemy
            #print(last_dist)
            curr_dist = min_distance_to_enemy(obs_raw[0], minimap=True)
            #print(curr_dist)
            if last_dist < INF and curr_dist < INF:
                mb_rewards_modified[:, n] = [
                    t.reward + (last_dist - curr_dist) / 20 for t in obs_raw
                ]
            self.last_min_dist_to_enemy = curr_dist
            # end of reward shaping
            for t in obs_raw:
                if t.last():
                    self._handle_episode_end(t)

        mb_values[:, -1] = self.agent.get_value(latest_obs)

        n_step_advantage = general_n_step_advantage(
            mb_rewards,
            mb_rewards_modified,
            mb_values,
            self.discount,
            lambda_par=self.ppo_par.lambda_par if self.is_ppo else 1.0)

        full_input = {
            # these are transposed because action/obs
            # processers return [time, env, ...] shaped arrays
            FEATURE_KEYS.advantage:
            n_step_advantage.transpose(),
            FEATURE_KEYS.value_target:
            (n_step_advantage + mb_values[:, :-1]).transpose()
        }
        full_input.update(self.action_processer.combine_batch(mb_actions))
        full_input.update(self.obs_processer.combine_batch(mb_obs))
        full_input = {
            k: combine_first_dimensions(v)
            for k, v in full_input.items()
        }

        if not self.do_training:
            pass
        elif self.agent.mode == ACMode.A2C:
            self.agent.train(full_input)
        elif self.agent.mode == ACMode.PPO:
            for epoch in range(self.ppo_par.n_epochs):
                self._train_ppo_epoch(full_input)
            self.agent.update_theta()

        self.latest_obs = latest_obs
        self.batch_counter += 1
        sys.stdout.flush()
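
Two helpers used above are not shown in these examples: combine_first_dimensions, which merges the leading [time, env] axes of each array into a single batch axis, and dict_of_lists_to_list_of_dicst, which _train_ppo_epoch uses to turn the per-key minibatch splits into a list of per-minibatch feed dicts. The snippet below is a hedged sketch of what such helpers might look like; the _sketch names are hypothetical and the project's real implementations may differ.

import numpy as np

def combine_first_dimensions_sketch(x):
    # Merge the leading [time, env] axes into one batch axis: [time * env, ...].
    return x.reshape(x.shape[0] * x.shape[1], *x.shape[2:])

def dict_of_lists_to_list_of_dicts_sketch(d):
    # {'a': [a0, a1], 'b': [b0, b1]} -> [{'a': a0, 'b': b0}, {'a': a1, 'b': b1}]
    keys = list(d)
    n_batches = len(d[keys[0]])
    return [{k: d[k][i] for k in keys} for i in range(n_batches)]
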
Example #5
class Runner(object):
    def __init__(
            self,
            envs,
            agent: A2CAgent,
            n_steps=5,
            discount=0.99,
            do_training=True
    ):
        self.envs = envs
        self.agent = agent
        self.obs_processer = ObsProcesser()
        self.action_processer = ActionProcesser(dim=flags.FLAGS.resolution)
        self.n_steps = n_steps
        self.discount = discount
        self.do_training = do_training
        self.batch_counter = 0
        self.episode_counter = 0

    def reset(self):
        obs = self.envs.reset()
        self.latest_obs = self.obs_processer.process(obs)

    def _log_score_to_tb(self, score):
        summary = tf.Summary()
        summary.value.add(tag='sc2/episode_score', simple_value=score)
        self.agent.summary_writer.add_summary(summary, self.episode_counter)

    def _handle_episode_end(self, timestep):
        score = timestep.observation["score_cumulative"][0]
        print("episode %d ended. Score %f" % (self.episode_counter, score))
        self._log_score_to_tb(score)
        self.episode_counter += 1

    def run_batch(self):
        dim = (self.envs.n_envs, self.n_steps)
        mb_rewards = np.zeros(dim, dtype=np.float32)
        mb_actions = []
        mb_obs = []

        latest_obs = self.latest_obs

        for n in range(self.n_steps):
            # the value estimate is computed by the agent but not needed for anything here
            action_ids, spatial_action_2ds = self.agent.step(latest_obs)

            mb_obs.append(latest_obs)
            mb_actions.append((action_ids, spatial_action_2ds))

            actions_pp = self.action_processer.process(action_ids, spatial_action_2ds)
            obs_raw = self.envs.step(actions_pp)
            latest_obs = self.obs_processer.process(obs_raw)
            mb_rewards[:, n] = [t.reward for t in obs_raw]

            for t in obs_raw:
                if t.last():
                    self._handle_episode_end(t)

        last_state_values = self.agent.get_value(latest_obs)

        n_step_rewards = calculate_n_step_reward(mb_rewards, self.discount, last_state_values)

        mb_actions_combined = self.action_processer.combine_batch(mb_actions)
        mb_obs_combined = self.obs_processer.combine_batch(mb_obs)

        if self.do_training:
            self.agent.train(
                n_step_rewards.transpose(),
                mb_obs_combined=mb_obs_combined,
                mb_actions_combined=mb_actions_combined
            )

        self.latest_obs = latest_obs
        self.batch_counter += 1
        sys.stdout.flush()
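
This A2C runner only needs discounted n-step returns, bootstrapped with the critic's value of the final state. calculate_n_step_reward is not shown in these examples, so the function below is a hedged numpy sketch of the recurrence it presumably implements, R(t) = r(t) + discount * R(t+1) with R(n) = V(state(n)); the name n_step_reward_sketch is an assumption.

import numpy as np

def n_step_reward_sketch(rewards, discount, last_state_values):
    # rewards: [n_envs, n_steps]; last_state_values: [n_envs], the bootstrap value V(state(n)).
    n_steps = rewards.shape[1]
    returns = np.zeros_like(rewards)
    future = np.asarray(last_state_values, dtype=rewards.dtype)
    for t in reversed(range(n_steps)):
        future = rewards[:, t] + discount * future
        returns[:, t] = future
    return returns
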