KOE: Here we take the action, observe the reward and done flag, and skip ahead.

game.set_action(a_t.tolist())
skiprate = agent.frame_per_action
game.advance_action(skiprate)  # Repeats the action skiprate times and returns the state after that.

game_state = game.get_state()  # Observe again after we take the action
is_terminated = game.is_episode_finished()
r_t = game.get_last_reward()
'''
# KOEComment: My Unity agent also skips 5 frames between actions, controlled in the Unity interface.
# The action space in Unity has 4 branches, with multiple actions in each! Those can also be combined!
# I need the ANN output to be able to select all combinations.
# TODO: I believe step() just wants the index of the action.
observation, reward, done, info = env.step(action_idx)
# print("obs after step: ", total_size(observation))
if reward != 0:
    print("Got reward: ", reward)
    print("Taking action ", action_idx)
# TODO: How to step ahead multiple steps? I asked on GitHub - check what they suggest.

# observation is the image; vector_observations are the measurements:
# battery, eaten_poison, eaten_food
meas = info['brain_info'].vector_observations

if done:
    print("Game done at timestep ", t)
    if (food - poison) > max_reward:
        max_reward = food - poison
    GAME += 1
    reward_buffer.append(food - poison)
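One way to let a single network output cover every combination of branched actions is to enumerate the Cartesian product of the branches and index into it. The sketch below is not from the original code: the branch sizes are placeholders, and the real values would come from the Unity behavior's action spec (env.action_space.nvec for a MultiDiscrete space). Note that gym_unity's flatten_branched=True option performs this same MultiDiscrete-to-Discrete flattening automatically.

import itertools

# Hypothetical branch sizes; replace with the sizes reported by the environment.
branch_sizes = [3, 3, 3, 2]
all_combinations = list(itertools.product(*[range(n) for n in branch_sizes]))

def index_to_branched_action(action_idx):
    # The network has len(all_combinations) outputs; the chosen index picks
    # one concrete setting for every branch at once.
    return list(all_combinations[action_idx])

print(len(all_combinations), index_to_branched_action(0))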
import math

import h5py
import numpy as np
from gym_unity.envs.unity_env import UnityEnv


def sampleTrajectory():
    action_repeat = 300
    action_range_around_zero = 20  # should be even
    # action_range_around_zero = [-9, -8, -7, -6, -5, 0, 5, 6, 7, 8, 9]
    period = 1
    if action_range_around_zero % 2 != 0:
        return False
    env = UnityEnv(
        "/homes/gkumar/Documents/UnityProjects/mazeContinuousTarget_fixed_camera_data_collection/Build/mazeContinuousTarget_fixed_camera_data_collection",
        0,
        use_visual=True,
        uint8_visual=True)
    list_of_data = []
    for i in range(int(-1 * action_range_around_zero / 2),
                   int(action_range_around_zero / 2 + 1),
                   period):  # velocity X [-5, -3, -1, 1, 3, 5]
        for j in range(int(-1 * action_range_around_zero / 2),
                       int(action_range_around_zero / 2 + 1),
                       period):  # velocity Y [-5, -3, -1, 1, 3, 5]
            print(i, j)
            for k in range(int(-1 * action_range_around_zero / 2),
                           int(action_range_around_zero / 2 + 1),
                           period):  # action X [-5, -3, -1, 1, 3, 5]
                for l in range(int(-1 * action_range_around_zero / 2),
                               int(action_range_around_zero / 2 + 1),
                               period):  # action Y [-5, -3, -1, 1, 3, 5]
                    single_tuple = np.zeros(4 + 2 * action_repeat)
                    obs_fovea = env.reset()
                    obs_fovea_next, reward, done, info = env.step([[i], [j], [k], [l]])
                    # action
                    single_tuple[0] = i
                    single_tuple[1] = j
                    # velocity
                    single_tuple[2] = k
                    single_tuple[3] = l
                    for m in range(0, action_repeat):
                        single_tuple[3 + m * 2 + 1] = info["brain_info"].vector_observations[0][2]
                        single_tuple[3 + m * 2 + 2] = info["brain_info"].vector_observations[0][3]
                        x_vel_new = info["brain_info"].vector_observations[0][6]
                        y_vel_new = info["brain_info"].vector_observations[0][7]
                        if math.sqrt(
                                math.pow(single_tuple[3 + m * 2 + 1] - single_tuple[4], 2) +
                                math.pow(single_tuple[3 + m * 2 + 2] - single_tuple[5], 2)) < 6:
                            obs_fovea_next, reward, done, info = env.step(
                                [[i], [j], [x_vel_new], [y_vel_new]])
                        else:
                            for n in range(m, action_repeat):
                                single_tuple[3 + n * 2 + 1] = single_tuple[3 + (m - 1) * 2 + 1]
                                single_tuple[3 + n * 2 + 2] = single_tuple[3 + (m - 1) * 2 + 2]
                            break
                    list_of_data.append(single_tuple)
    h5f = h5py.File('data.h5', 'w')
    h5f.create_dataset('dataset_1', data=list_of_data)
    h5f.close()
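For reference, a hedged sketch (not part of the original script) of reading the recorded file back: each row stores the four loop parameters followed by action_repeat pairs of x/y observation values, so the layout below follows directly from how single_tuple is filled above.

import h5py
import numpy as np

with h5py.File('data.h5', 'r') as h5f:
    data = np.array(h5f['dataset_1'])

params = data[:, :4]     # the i, j, k, l values used for each rollout
readings = data[:, 4:]   # alternating x/y observations, action_repeat pairs per row
xs = readings[:, 0::2]
ys = readings[:, 1::2]
print(data.shape, xs.shape, ys.shape)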
class UnityEnvWrapper:
    def __init__(self, env_config=None, use_eval=False, rpc_mode=False):
        self.env = None
        if not rpc_mode:
            assert env_config is not None
            self.launch(env_config, use_eval)

    def launch(self, env_config, use_eval=False):
        environment_path = (env_config["environment_path_eval"]
                            if use_eval else env_config["environment_path"])
        port = env_config.get("port", 0)
        if use_eval and port:
            port += 2
        use_visual = env_config.get("use_visual", False)
        use_vector = env_config.get("use_vector", True)
        multiagent = env_config.get("multiagent", False)
        uint8_visual = env_config.get("uint8_visual", True)
        flatten_branched = env_config.get("flatten_branched", True)
        self.env = UnityEnv(
            environment_path,
            port,
            use_visual=use_visual,
            use_vector=use_vector,
            uint8_visual=uint8_visual,
            multiagent=multiagent,
            flatten_branched=flatten_branched,
        )
        self.action_space = self.env._action_space
        self.observation_space = self.env._observation_space
        # Agent names must be unique among **all** agents.
        self.agent_name = [
            f'{port}_{i}' for i in range(self.env.number_agents)
        ]

    def _transform_list_to_dict(self, objs):
        return {name: obj for name, obj in zip(self.agent_name, objs)}

    def _transform_dict_to_list(self, objs):
        return [objs[name] for name in self.agent_name]

    def step(self, act, action_settings=None):
        action = np.stack(self._transform_dict_to_list(act)).tolist()
        observation, reward, done, info = self.env.step(action)
        transform = self._transform_list_to_dict
        info = list(map(json.loads, info['text_observation']))
        for i, x in enumerate(info):
            x['done'] = done[i]
        done = [False] * 4
        done_dict = transform(done)
        done_dict['__all__'] = False  # no early termination (for logging)
        return transform(observation), transform(reward), done_dict, transform(info)

    def reset(self, reset_settings=None):
        obs = self.env.reset()
        return self._transform_list_to_dict(obs)

    def get_env_spaces(self):
        spaces = self.action_space, self.observation_space, self.agent_name
        p = pickle.dumps(spaces)
        z = zlib.compress(p)
        return z

    def get_action_count(self):
        if isinstance(self.env.action_space, gym.spaces.Discrete):
            return self.env.action_space.n
        elif isinstance(self.env.action_space, gym.spaces.MultiDiscrete):
            return self.env.action_space.nvec.tolist()
        raise NotImplementedError

    def sample(self):
        return self.env.action_space.sample()

    def number_agents(self):
        return self.env.number_agents

    def env_close(self):
        if self.env:
            self.env.close()
            self.env = None

    def close(self):
        self.env_close()

    def hello(self):
        print('Hello World')
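A hedged usage sketch of the wrapper follows. The build path, port, and configuration values are placeholders rather than values from the original code, and it assumes the Unity environment emits one JSON text observation per agent (which step() parses) and matches the hardcoded agent count in step().

env_config = {
    "environment_path": "/path/to/build/my_unity_env",  # placeholder path
    "port": 5005,
    "use_visual": True,
    "use_vector": False,
    "multiagent": True,
    "uint8_visual": True,
    "flatten_branched": True,
}

wrapper = UnityEnvWrapper(env_config=env_config)
obs = wrapper.reset()  # dict keyed by the '<port>_<index>' agent names
actions = {name: wrapper.sample() for name in wrapper.agent_name}
obs, rewards, dones, infos = wrapper.step(actions)
wrapper.close()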
from baselines import deepq, logger
import time
from gym_unity.envs.unity_env import UnityEnv
import subprocess as sp
import os

env = UnityEnv("../unity_envs/kais_banana", 0, use_visual=True,
               uint8_visual=True, flatten_branched=True)

act = deepq.learn(
    env,
    network='cnn',
    total_timesteps=0,
    load_path="logs_backup/model")  # "unity_model.pkl"

# Visualizing
# TODO: Maybe slow down the simulation by inserting some delays here.
while True:
    obs, done = env.reset(), False
    episode_rew = 0
    while not done:
        env.render()
        obs, rew, done, _ = env.step(act(obs[None])[0])
        episode_rew += rew
        time.sleep(0.05)
    print("Episode reward", episode_rew)
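For completeness, a minimal sketch of how a checkpoint such as logs_backup/model could be produced before running the visualization script above. The hyperparameters are illustrative assumptions, not values from the original code; only the environment path and network type are taken from it.

from baselines import deepq
from gym_unity.envs.unity_env import UnityEnv

env = UnityEnv("../unity_envs/kais_banana", 0, use_visual=True,
               uint8_visual=True, flatten_branched=True)
# Train a DQN with a CNN policy and save the variables where the
# visualization script expects to find them via load_path.
act = deepq.learn(env, network='cnn', lr=1e-4, total_timesteps=100000,
                  buffer_size=50000, exploration_fraction=0.1,
                  exploration_final_eps=0.02, print_freq=10)
act.save("logs_backup/model")
env.close()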