def test_simple(self):
    a = ActionProcesser(dim=40, rect_delta=5)
    action_ids = [2, 331, 0, 1]
    coords = ((15, 4), (22, 33), (1, 1), (1, 1))
    actions = a.process(action_ids, coords)
    expected = [
        FunctionCall(function=2, arguments=[[0], (4, 15)]),
        FunctionCall(function=331, arguments=[[0], (33, 22)]),
        FunctionCall(function=0, arguments=[]),
        FunctionCall(function=1, arguments=[(1, 1)])
    ]
    assert actions == expected
def test_rectangle(self):
    dim = 48
    a = ActionProcesser(dim=dim, rect_delta=7)
    action_ids = [3, 3, 3, 3]
    coords = ((15, 4), (22, 33), (1, 1), (45, 10))
    actions = a.process(action_ids, coords)
    expected = [
        FunctionCall(function=3, arguments=[[0], [8, 0], [22, 11]]),
        FunctionCall(function=3, arguments=[[0], [15, 26], [29, 40]]),
        FunctionCall(function=3, arguments=[[0], [0, 0], [8, 8]]),
        FunctionCall(function=3, arguments=[[0], [38, 3], [47, 17]])
    ]
    assert actions == expected
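# A minimal sketch, reconstructed from the expected values in test_rectangle above,
# of how the rectangle corners appear to be derived. This is an illustrative
# assumption, not the repo's verified ActionProcesser code: the selected point is
# expanded by rect_delta in each direction and clipped to the bounds [0, dim - 1].
import numpy as np

def _rect_corners(point, rect_delta, dim):
    y, x = point

    def clip(v):
        return int(np.clip(v, 0, dim - 1))

    top_left = [clip(y - rect_delta), clip(x - rect_delta)]
    bottom_right = [clip(y + rect_delta), clip(x + rect_delta)]
    return top_left, bottom_right

# e.g. _rect_corners((45, 10), rect_delta=7, dim=48) -> ([38, 3], [47, 17]),
# matching the last expected FunctionCall in test_rectangle.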
class Runner(object):
    def __init__(self,
                 envs,
                 agent: ActorCriticAgent,
                 n_steps=5,
                 discount=0.99,
                 do_training=True,
                 ppo_par: PPORunParams = None,
                 episodes=3):
        self.envs = envs
        self.agent = agent  # the actor-critic agent, supplied by whichever agent module is in use
        self.obs_processer = ObsProcesser()
        self.action_processer = ActionProcesser(dim=flags.FLAGS.resolution)
        self.n_steps = n_steps
        self.discount = discount
        self.do_training = do_training
        self.ppo_par = ppo_par
        self.batch_counter = 0
        self.episode_counter = 0
        self.episodes = episodes

    def reset(self):
        obs = self.envs.reset()
        self.latest_obs = self.obs_processer.process(obs)

    def _log_score_to_tb(self, score):
        summary = tf.Summary()
        summary.value.add(tag='sc2/episode_score', simple_value=score)
        self.agent.summary_writer.add_summary(summary, self.episode_counter)

    def _handle_episode_end(self, timestep):
        score = timestep.observation["score_cumulative"][0]
        print("episode %d ended. Score %f" % (self.episode_counter, score))
        self._log_score_to_tb(score)
        self.episode_counter += 1

    def run_batch(self):
        mb_actions = []
        mb_obs = []
        mb_values = np.zeros((self.envs.n_envs, self.n_steps + 1), dtype=np.float32)
        mb_rewards = np.zeros((self.envs.n_envs, self.n_steps), dtype=np.float32)

        latest_obs = self.latest_obs  # state(t0)
        rnn_state = self.agent.state_init
        for n in range(self.n_steps):
            action_ids, spatial_action_2ds, value_estimate, rnn_state = self.agent.step(
                latest_obs, rnn_state)
            #print('step: ', n, action_ids, spatial_action_2ds, value_estimate)  # for debugging

            # Store actions and value estimates for all steps (done in parallel across envs)
            mb_values[:, n] = value_estimate
            mb_obs.append(latest_obs)
            mb_actions.append((action_ids, spatial_action_2ds))

            # Send the actions to the environments, get new obs and reward, store the reward
            actions_pp = self.action_processer.process(action_ids, spatial_action_2ds)
            obs_raw = self.envs.step(actions_pp)
            latest_obs = self.obs_processer.process(obs_raw)  # state(t+1)
            mb_rewards[:, n] = [t.reward for t in obs_raw]

            # Check every env's timestep for episode end
            for t in obs_raw:
                if t.last():
                    self._handle_episode_end(t)

        # Get V(t+1) and use it as the future reward from s(t+1), since we don't know
        # the actual return (bootstrapping with the network's predictions)
        mb_values[:, -1] = self.agent.get_value(latest_obs, rnn_state)

        n_step_advantage = general_n_step_advantage(mb_rewards,
                                                    mb_values,
                                                    self.discount,
                                                    lambda_par=1.0)

        full_input = {
            # these are transposed because action/obs
            # processers return [time, env, ...] shaped arrays
            FEATURE_KEYS.advantage: n_step_advantage.transpose(),
            FEATURE_KEYS.value_target: (n_step_advantage + mb_values[:, :-1]).transpose()
        }
        # Combine all experiences from every env
        full_input.update(self.action_processer.combine_batch(mb_actions))
        full_input.update(self.obs_processer.combine_batch(mb_obs))
        full_input = {k: combine_first_dimensions(v) for k, v in full_input.items()}

        if self.do_training:
            self.agent.train(
                full_input, rnn_state
            )  # You might want to reset the state between the forward and backward pass

        self.latest_obs = latest_obs  # state(t) = state(t+1)
        self.batch_counter += 1
        print('Batch %d finished' % self.batch_counter)
        sys.stdout.flush()

    def run_trained_batch(self):
        # Evaluation with a trained model: the weights are loaded and the model is run for some episodes.
        latest_obs = self.latest_obs  # state(t0)
        rnn_state = self.agent.state_init
        while self.episode_counter <= (self.episodes - 1):
            # (MINE) agent step: feed the current observation (and RNN state) to the network
            action_ids, spatial_action_2ds, value_estimate, rnn_state = self.agent.step(
                latest_obs, rnn_state)
            #print('action: ', actions.FUNCTIONS[action_ids[0]].name, 'on', 'x=', spatial_action_2ds[0][0], 'y=', spatial_action_2ds[0][1], 'Value=', value_estimate[0])  # for debugging
            actions_pp = self.action_processer.process(action_ids, spatial_action_2ds)
            obs_raw = self.envs.step(actions_pp)
            latest_obs = self.obs_processer.process(obs_raw)

            # Check every env's timestep for episode end
            for t in obs_raw:
                if t.last():
                    self._handle_episode_end(t)

            self.latest_obs = latest_obs  # state(t) = state(t+1)
            self.batch_counter += 1
            #print('Batch %d finished' % self.batch_counter)
            sys.stdout.flush()
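# A minimal sketch of what general_n_step_advantage could look like for the call in
# run_batch above (lambda_par=1.0 gives plain n-step advantages, smaller values give a
# GAE-style interpolation). This is an illustrative assumption, not the repo's verified
# helper: mb_rewards is [n_envs, n_steps] and mb_values is [n_envs, n_steps + 1], with
# the last column holding the bootstrap value V(s_{t+n}).
import numpy as np

def general_n_step_advantage(rewards, values, discount, lambda_par=1.0):
    n_envs, n_steps = rewards.shape
    deltas = rewards + discount * values[:, 1:] - values[:, :-1]  # one-step TD errors
    advantage = np.zeros_like(rewards)
    gae = np.zeros(n_envs, dtype=np.float32)
    for t in reversed(range(n_steps)):  # accumulate discounted TD errors right-to-left
        gae = deltas[:, t] + discount * lambda_par * gae
        advantage[:, t] = gae
    return advantage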
class Runner(object):
    def __init__(self,
                 envs,
                 agent: ActorCriticAgent,
                 n_steps=5,
                 discount=0.99,
                 do_training=True,
                 ppo_par: PPORunParams = None):
        self.envs = envs
        self.agent = agent
        self.obs_processer = ObsProcesser()
        self.action_processer = ActionProcesser(dim=flags.FLAGS.resolution)
        self.n_steps = n_steps
        self.discount = discount
        self.do_training = do_training
        self.ppo_par = ppo_par
        self.batch_counter = 0
        self.episode_counter = 0
        assert self.agent.mode in [ACMode.PPO, ACMode.A2C]
        self.is_ppo = self.agent.mode == ACMode.PPO

        if self.is_ppo:
            assert ppo_par is not None
            assert n_steps * envs.n_envs % ppo_par.batch_size == 0
            assert n_steps * envs.n_envs >= ppo_par.batch_size
            self.ppo_par = ppo_par

    def reset(self):
        obs = self.envs.reset()
        #print(min_distance_to_enemy(obs[0], minimap=True))
        self.last_min_dist_to_enemy = min_distance_to_enemy(obs[0], minimap=True)
        #print(count_units(obs[0], minimap=False))
        self.units_in_frame = count_units(obs[0], minimap=False)
        self.latest_obs = self.obs_processer.process(obs)

    def _log_score_to_tb(self, score):
        summary = tf.Summary()
        summary.value.add(tag='sc2/episode_score', simple_value=score)
        self.agent.summary_writer.add_summary(summary, self.episode_counter)

    def _log_modified_to_tb(self, score):
        summary = tf.Summary()
        summary.value.add(tag='sc2/episode_score_modified', simple_value=score)
        self.agent.summary_writer.add_summary(summary, self.episode_counter)

    def _handle_episode_end(self, timestep):
        score = timestep.observation["score_cumulative"][0]
        print("episode %d ended. Score %f" % (self.episode_counter, score))
        self._log_score_to_tb(score)
        self.episode_counter += 1

    def _train_ppo_epoch(self, full_input):
        total_obs = self.n_steps * self.envs.n_envs
        shuffle_idx = np.random.permutation(total_obs)
        batches = dict_of_lists_to_list_of_dicst({
            k: np.split(v[shuffle_idx], total_obs // self.ppo_par.batch_size)
            for k, v in full_input.items()
        })
        for b in batches:
            self.agent.train(b)

    def run_batch(self):
        mb_actions = []
        mb_obs = []
        mb_values = np.zeros((self.envs.n_envs, self.n_steps + 1), dtype=np.float32)
        mb_rewards = np.zeros((self.envs.n_envs, self.n_steps), dtype=np.float32)
        mb_rewards_modified = np.zeros((self.envs.n_envs, self.n_steps), dtype=np.float32)

        latest_obs = self.latest_obs

        for n in range(self.n_steps):
            # We could calculate the value estimate from obs when training,
            # but saving the values here makes the n-step reward calculation a bit easier
            action_ids, spatial_action_2ds, value_estimate = self.agent.step(latest_obs)

            mb_values[:, n] = value_estimate
            mb_obs.append(latest_obs)
            mb_actions.append((action_ids, spatial_action_2ds))

            actions_pp = self.action_processer.process(action_ids, spatial_action_2ds)
            obs_raw = self.envs.step(actions_pp)
            latest_obs = self.obs_processer.process(obs_raw)
            mb_rewards[:, n] = [t.reward for t in obs_raw]

            # NEW: reward shaping based on the change in minimum distance to the enemy
            last_dist = self.last_min_dist_to_enemy
            #print(last_dist)
            curr_dist = min_distance_to_enemy(obs_raw[0], minimap=True)
            #print(curr_dist)
            if last_dist < INF and curr_dist < INF:
                mb_rewards_modified[:, n] = [
                    t.reward + (last_dist - curr_dist) / 20 for t in obs_raw
                ]
            self.last_min_dist_to_enemy = curr_dist
            ###

            for t in obs_raw:
                if t.last():
                    self._handle_episode_end(t)

        mb_values[:, -1] = self.agent.get_value(latest_obs)

        n_step_advantage = general_n_step_advantage(
            mb_rewards,
            mb_rewards_modified,
            mb_values,
            self.discount,
            lambda_par=self.ppo_par.lambda_par if self.is_ppo else 1.0)

        full_input = {
            # these are transposed because action/obs
            # processers return [time, env, ...] shaped arrays
            FEATURE_KEYS.advantage: n_step_advantage.transpose(),
            FEATURE_KEYS.value_target: (n_step_advantage + mb_values[:, :-1]).transpose()
        }
        full_input.update(self.action_processer.combine_batch(mb_actions))
        full_input.update(self.obs_processer.combine_batch(mb_obs))
        full_input = {k: combine_first_dimensions(v) for k, v in full_input.items()}

        if not self.do_training:
            pass
        elif self.agent.mode == ACMode.A2C:
            self.agent.train(full_input)
        elif self.agent.mode == ACMode.PPO:
            for epoch in range(self.ppo_par.n_epochs):
                self._train_ppo_epoch(full_input)
            self.agent.update_theta()

        self.latest_obs = latest_obs
        self.batch_counter += 1
        sys.stdout.flush()
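# A minimal sketch of the dict_of_lists_to_list_of_dicst helper used by
# _train_ppo_epoch above (an illustrative assumption, not this repo's verified
# implementation): it turns {key: [minibatch_0, minibatch_1, ...]} into
# [{key: minibatch_0, ...}, {key: minibatch_1, ...}, ...], so each element can be
# fed to agent.train() as one shuffled PPO minibatch.
def dict_of_lists_to_list_of_dicst(dict_of_lists):
    n_batches = len(next(iter(dict_of_lists.values())))
    return [
        {k: v[i] for k, v in dict_of_lists.items()}
        for i in range(n_batches)
    ]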
class Runner(object):
    def __init__(self,
                 envs,
                 agent: A2CAgent,
                 n_steps=5,
                 discount=0.99,
                 do_training=True):
        self.envs = envs
        self.agent = agent
        self.obs_processer = ObsProcesser()
        self.action_processer = ActionProcesser(dim=flags.FLAGS.resolution)
        self.n_steps = n_steps
        self.discount = discount
        self.do_training = do_training
        self.batch_counter = 0
        self.episode_counter = 0

    def reset(self):
        obs = self.envs.reset()
        self.latest_obs = self.obs_processer.process(obs)

    def _log_score_to_tb(self, score):
        summary = tf.Summary()
        summary.value.add(tag='sc2/episode_score', simple_value=score)
        self.agent.summary_writer.add_summary(summary, self.episode_counter)

    def _handle_episode_end(self, timestep):
        score = timestep.observation["score_cumulative"][0]
        print("episode %d ended. Score %f" % (self.episode_counter, score))
        self._log_score_to_tb(score)
        self.episode_counter += 1

    def run_batch(self):
        dim = (self.envs.n_envs, self.n_steps)
        mb_rewards = np.zeros(dim, dtype=np.float32)
        mb_actions = []
        mb_obs = []

        latest_obs = self.latest_obs
        for n in range(self.n_steps):
            # the value estimate is computed by the agent anyway, but it is not used here
            action_ids, spatial_action_2ds = self.agent.step(latest_obs)
            mb_obs.append(latest_obs)
            mb_actions.append((action_ids, spatial_action_2ds))

            actions_pp = self.action_processer.process(action_ids, spatial_action_2ds)
            obs_raw = self.envs.step(actions_pp)
            latest_obs = self.obs_processer.process(obs_raw)
            mb_rewards[:, n] = [t.reward for t in obs_raw]

            for t in obs_raw:
                if t.last():
                    self._handle_episode_end(t)

        last_state_values = self.agent.get_value(latest_obs)
        n_step_rewards = calculate_n_step_reward(mb_rewards, self.discount, last_state_values)

        mb_actions_combined = self.action_processer.combine_batch(mb_actions)
        mb_obs_combined = self.obs_processer.combine_batch(mb_obs)

        if self.do_training:
            self.agent.train(
                n_step_rewards.transpose(),
                mb_obs_combined=mb_obs_combined,
                mb_actions_combined=mb_actions_combined
            )

        self.latest_obs = latest_obs
        self.batch_counter += 1
        sys.stdout.flush()
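# A minimal sketch of calculate_n_step_reward as used above (an illustrative assumption,
# not the repo's verified helper): the discounted n-step return, bootstrapped with the
# critic's value of the final state and accumulated right-to-left. mb_rewards is
# [n_envs, n_steps]; last_state_values is [n_envs].
import numpy as np

def calculate_n_step_reward(rewards, discount, last_state_values):
    returns = np.zeros_like(rewards)
    future = last_state_values.astype(np.float32)
    for t in reversed(range(rewards.shape[1])):
        future = rewards[:, t] + discount * future
        returns[:, t] = future
    return returns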