class AnimalAIWrapper(gym.Env):
    """Single-arena Gym wrapper around an AnimalAI ``UnityEnvironment``.

    Observations are the first visual observation of the ``"Learner"`` brain
    scaled by 255 and cast to ``uint8``. Actions are indices into a table that
    enumerates every combination of the two action branches.
    """

    def __init__(
        self,
        worker_id,
        env_path,
        config_path,
        reduced_actions=False,
        docker_training=False,
    ):
        """Launch the Unity environment and build the discrete action table.

        :param worker_id: Passed to ``UnityEnvironment`` both as ``worker_id``
            and as ``seed``.
        :param env_path: Path of the Unity executable (``file_name``).
        :param config_path: Path handed to ``ArenaConfig``; the time limit is
            read from the ``t`` attribute of its first arena.
        :param reduced_actions: If true the first branch is limited to
            ``{0, 1}`` (6 actions), otherwise both branches use ``{0, 1, 2}``
            (9 actions).
        :param docker_training: Forwarded unchanged to ``UnityEnvironment``.
        """
        super(AnimalAIWrapper, self).__init__()
        self.config = ArenaConfig(config_path)
        self.time_limit = self.config.arenas[0].t
        self.env = UnityEnvironment(
            file_name=env_path,
            worker_id=worker_id,
            seed=worker_id,
            n_arenas=1,
            arenas_configurations=self.config,
            docker_training=docker_training,
        )

        if reduced_actions:
            branches = itertools.product([0, 1], [0, 1, 2])
        else:
            branches = itertools.product([0, 1, 2], repeat=2)
        # index -> {"Learner": array of shape (1, 2)} as expected by env.step
        lookup = {
            index: {"Learner": np.array([pair], dtype=float)}
            for index, pair in enumerate(branches)
        }
        self.action_map = lambda a: lookup[a]

        self.observation_space = gym.spaces.Box(0, 255, [84, 84, 3], dtype=np.uint8)
        self.action_space = gym.spaces.Discrete(len(lookup))
        self.t = 0

    def _process_state(self, state):
        """Unpack the ``"Learner"`` brain info into ``(img, vec, reward, done)``."""
        learner = state["Learner"]
        img = 255 * learner.visual_observations[0][0]
        vec = learner.vector_observations[0]
        r = learner.rewards[0]
        done = learner.local_done[0]
        return np.uint8(img), vec, r, done

    def reset(self):
        """Reset the arena and return the first frame.

        The environment is reset repeatedly until it reports a state that is
        not already flagged as done.
        """
        self.t = 0
        while True:
            img, _, _, done = self._process_state(
                self.env.reset(arenas_configurations=self.config))
            if not done:
                return img

    def step(self, action):
        """Apply one discrete action.

        :param action: Index into the action table. May be a plain ``int`` or
            any object exposing ``.item()`` (NumPy scalar, 0-d tensor, ...).
        :return: ``(observation, reward, done, info)`` where ``done`` is also
            set once ``self.t`` reaches ``self.time_limit``.
        """
        # The original required ``.item()``; plain ints are now accepted too.
        index = action.item() if hasattr(action, "item") else int(action)
        obs, _, r, done = self._process_state(
            self.env.step(vector_action=self.action_map(index)))
        self.t += 1
        done = done or self.t >= self.time_limit
        return obs, r, done, {}

    def close(self):
        """Shut down the underlying Unity environment."""
        self.env.close()
class AnimalAIEnv(gym.Env):
    """
    Provides Gym wrapper for Unity Learning Environments.
    Multi-agent environments use lists for object types, as done here:
    https://github.com/openai/multiagent-particle-envs
    """

    def __init__(self, environment_filename: str, worker_id=0, docker_training=False, n_arenas=1, seed=0,
                 arenas_configurations=None, greyscale=False, retro=True, inference=False, resolution=None):
        """
        Environment initialization
        :param environment_filename: The UnityEnvironment path or file to be wrapped in the gym.
        :param worker_id: Worker number for environment.
        :param docker_training: Whether this is running within a docker environment and should use a virtual
            frame buffer (xvfb).
        :param n_arenas: number of arenas to create in the environment (one agent per arena)
        :param seed: forwarded unchanged to UnityEnvironment.
        :param arenas_configurations: an ArenaConfig to configure the items present in each arena, will spawn random
            objects randomly if not provided
        :param greyscale: whether the visual observations should be grayscaled or not
            (always enabled when ``retro`` is true)
        :param retro: Resize visual observation to 84x84 (uint8) and flattens action space.
        :param inference: forwarded unchanged to UnityEnvironment.
        :param resolution: forwarded unchanged to UnityEnvironment.
        :raises UnityGymException: if the environment does not expose exactly one brain, has no visual
            observations, stacks vector observations, or has several agents in retro mode.
        """
        self._env = UnityEnvironment(file_name=environment_filename,
                                     worker_id=worker_id,
                                     seed=seed,
                                     docker_training=docker_training,
                                     n_arenas=n_arenas,
                                     arenas_configurations=arenas_configurations,
                                     inference=inference,
                                     resolution=resolution)
        # Initialised here so render() before the first reset() returns None
        # instead of raising AttributeError.
        self.visual_obs = None
        self.vector_obs = None
        self.inference = inference
        self.resolution = resolution
        self._current_state = None
        self._n_agents = None
        self._flattener = None
        self._greyscale = greyscale or retro
        self.retro = retro
        self.game_over = False  # Hidden flag used by Atari environments to determine if the game is over
        self.arenas_configurations = arenas_configurations

        self.flatten_branched = self.retro
        self.uint8_visual = self.retro

        # Check brain configuration
        if len(self._env.brains) != 1:
            raise UnityGymException(
                "There can only be one brain in a UnityEnvironment "
                "if it is wrapped in a gym.")
        self.brain_name = self._env.external_brain_names[0]
        brain = self._env.brains[self.brain_name]

        if brain.number_visual_observations == 0:
            raise UnityGymException("Environment provides no visual observations.")

        if brain.num_stacked_vector_observations != 1:
            # The previous message ("provides no vector observations") did not
            # describe the condition that is actually tested.
            raise UnityGymException(
                "There can only be one stacked vector observation in a UnityEnvironment "
                "if it is wrapped in a gym.")

        # Check for number of agents in scene.
        initial_info = self._env.reset(arenas_configurations=arenas_configurations)[self.brain_name]
        self._check_agents(len(initial_info.agents))

        if self.retro and self._n_agents > 1:
            # The constructor parameter is called n_arenas (one agent per arena).
            raise UnityGymException("Only one agent is allowed in retro mode, set n_arenas to 1.")

        # Set observation and action spaces
        if len(brain.vector_action_space_size) == 1:
            self._action_space = spaces.Discrete(brain.vector_action_space_size[0])
        elif self.flatten_branched:
            self._flattener = ActionFlattener(brain.vector_action_space_size)
            self._action_space = self._flattener.action_space
        else:
            self._action_space = spaces.MultiDiscrete(brain.vector_action_space_size)

        self.action_meanings = brain.vector_action_descriptions

        depth = 1 if self._greyscale else 3

        if self.retro:
            # NOTE(review): retro observations returned by _single_step are
            # squeezed to 2-D before resizing, so they appear to have shape
            # (84, 84) while this space declares (84, 84, 1) - confirm with
            # callers before changing either side.
            self._observation_space = spaces.Box(
                0, 255,
                dtype=np.uint8,
                shape=(84, 84, depth)
            )
        else:
            camera_height = brain.camera_resolutions[0]["height"]
            camera_width = brain.camera_resolutions[0]["width"]
            max_float = np.finfo(np.float32).max

            image_space = spaces.Box(
                0, 1.0,
                dtype=np.float32,
                shape=(self._n_agents, camera_height, camera_width, depth)
            )
            vector_space = spaces.Box(-max_float, max_float,
                                      shape=(self._n_agents, brain.vector_observation_space_size))
            self._observation_space = spaces.Tuple((image_space, vector_space))

    def reset(self, arenas_configurations=None):
        """Resets the state of the environment and returns an initial observation.
        In the case of multi-agent environments, this is a list.

        :param arenas_configurations: forwarded to the underlying environment's reset.
        Returns: observation (object/list): the initial observation of the
            space.
        """
        info = self._env.reset(arenas_configurations=arenas_configurations)[self.brain_name]
        self._check_agents(len(info.agents))
        self.game_over = False

        if self._n_agents == 1:
            obs, _, _, _ = self._single_step(info)
        else:
            obs, _, _, _ = self._multi_step(info)
        return obs

    def step(self, action):
        """Run one timestep of the environment's dynamics. When end of
        episode is reached, you are responsible for calling `reset()`
        to reset this environment's state.
        Accepts an action and returns a tuple (observation, reward, done, info).
        In the case of multi-agent environments, these are lists.
        Args:
            action (object/list): an action provided by the environment
        Returns:
            observation (object/list): agent's observation of the current environment
            reward (float/list) : amount of reward returned after previous action
            done (boolean/list): whether the episode has ended.
            info (dict): contains auxiliary diagnostic information, including BrainInfo.
        Raises:
            UnityGymException: in multi-agent mode, if `action` is not a list
                holding exactly one action per agent.
        """
        if self._n_agents > 1:
            if not isinstance(action, list):
                raise UnityGymException("The environment was expecting `action` to be a list.")
            if len(action) != self._n_agents:
                raise UnityGymException(
                    "The environment was expecting a list of {} actions.".format(self._n_agents))
            if self._flattener is not None:
                # Action space is discrete and flattened - we expect a list of scalars
                action = [self._flattener.lookup_action(_act) for _act in action]
            action = np.array(action)
        elif self._flattener is not None:
            # Translate action into list
            action = self._flattener.lookup_action(action)

        info = self._env.step(action)[self.brain_name]
        self._check_agents(len(info.agents))
        self._current_state = info

        if self._n_agents == 1:
            obs, reward, done, info = self._single_step(info)
            self.game_over = done
        else:
            obs, reward, done, info = self._multi_step(info)
            self.game_over = all(done)
        return obs, reward, done, info

    def _single_step(self, info):
        """Convert a single-agent BrainInfo into a Gym ``(obs, reward, done, info)`` tuple."""
        self.visual_obs = self._preprocess_single(info.visual_observations[0][0, :, :, :])
        self.vector_obs = info.vector_observations[0]

        if self._greyscale:
            self.visual_obs = self._greyscale_obs_single(self.visual_obs)

        if self.retro:
            self.visual_obs = self._resize_observation(self.visual_obs)
            default_observation = self.visual_obs
        else:
            default_observation = self.visual_obs, self.vector_obs

        return default_observation, info.rewards[0], info.local_done[0], {
            "text_observation": info.text_observations[0],
            "brain_info": info}

    def _preprocess_single(self, single_visual_obs):
        """Scale one frame to uint8 when ``uint8_visual`` is set, else return it unchanged."""
        if self.uint8_visual:
            return (255.0 * single_visual_obs).astype(np.uint8)
        return single_visual_obs

    def _multi_step(self, info):
        """Convert a multi-agent BrainInfo into lists of observations, rewards and dones."""
        self.visual_obs = self._preprocess_multi(info.visual_observations)
        self.vector_obs = info.vector_observations

        if self._greyscale:
            self.visual_obs = self._greyscale_obs_multi(self.visual_obs)

        default_observation = self.visual_obs

        return list(default_observation), info.rewards, info.local_done, {
            "text_observation": info.text_observations,
            "brain_info": info}

    def _preprocess_multi(self, multiple_visual_obs):
        """Apply the uint8 scaling of ``_preprocess_single`` to every entry."""
        if self.uint8_visual:
            return [(255.0 * _visual_obs).astype(np.uint8) for _visual_obs in multiple_visual_obs]
        return multiple_visual_obs

    def render(self, mode='rgb_array'):
        """Return the most recent visual observation (``None`` before the first reset)."""
        return self.visual_obs

    def close(self):
        """Close the underlying Unity environment."""
        self._env.close()

    def get_action_meanings(self):
        """Return the brain's action descriptions captured at construction time."""
        return self.action_meanings

    def seed(self, seed=None):
        """Sets the seed for this env's random number generator(s).
        Currently not implemented.
        """
        # ``self.name`` is never assigned on this class, so logging it raised
        # AttributeError; the class name is used instead.
        logger.warning("Could not seed environment %s", type(self).__name__)
        return

    @staticmethod
    def _resize_observation(observation):
        """
        Re-sizes visual observation to 84x84
        """
        obs_image = Image.fromarray(observation)
        obs_image = obs_image.resize((84, 84), Image.NEAREST)
        return np.array(obs_image)

    def _to_greyscale(self, frame):
        """Average ``frame`` over axis 2 and drop singleton axes.

        In uint8 mode the mean is floored and cast back to uint8, as before.
        In float mode (frames in [0, 1]) the mean is kept as float32; flooring
        it, as the code previously did, collapsed nearly every pixel to 0.
        """
        grey = np.mean(frame, axis=2).squeeze()
        if self.uint8_visual:
            return np.floor(grey).astype(np.uint8)
        return grey.astype(np.float32)

    def _greyscale_obs_single(self, obs):
        """Greyscale a single frame."""
        return self._to_greyscale(obs)

    def _greyscale_obs_multi(self, obs):
        """Greyscale every frame in ``obs`` and return them as a list."""
        return [self._to_greyscale(o) for o in obs]

    def _check_agents(self, n_agents):
        """Record the agent count on first call and reject any later change.

        :raises UnityGymException: if ``n_agents`` differs from the recorded count.
        """
        if self._n_agents is None:
            self._n_agents = n_agents
            logger.info("{} agents within environment.".format(n_agents))
        elif self._n_agents != n_agents:
            raise UnityGymException("The number of agents in the environment has changed since "
                                    "initialization. This is not supported.")

    @property
    def metadata(self):
        return {'render.modes': ['rgb_array']}

    @property
    def reward_range(self):
        return -float('inf'), float('inf')

    @property
    def spec(self):
        return None

    @property
    def action_space(self):
        return self._action_space

    @property
    def observation_space(self):
        return self._observation_space

    @property
    def number_agents(self):
        return self._n_agents
# Epsilon-greedy rollout fragment: act in the environment one step at a time
# and record each transition in the replay buffer. Relies on names defined
# outside this fragment (image_buffer, replay_buffer, q_main, env, epi_i, ...).
s_idx = image_buffer.get_current_index()
input_state = image_buffer.get_state(s_idx)
ep_reward = 0.0
ep_count = 0
# Exploration rate: the larger of two linear schedules in epi_i, one starting
# at 1.0 and reaching 0 at epi_i == 100, the other starting at 0.05 and
# reaching 0 at epi_i == 1000 (assuming true division). Past epi_i == 1000 the
# expression becomes negative.
epsilon = max(1.0 * (1 - epi_i / 100), 0.05 * (1 - epi_i / 1000))
# NOTE(review): within the lines visible here the loop has no exit condition
# and ep_reward is never updated; `end` is only stored in the replay buffer.
# Confirm whether the remainder of the loop body lives elsewhere.
while True:
    # Choose an action category from the Q-network for the current state,
    # reshaped to a batch of one.
    a_category = q_main.epsilon_sample(
        torch.FloatTensor(input_state).to(device).view(
            1, input_channel_size, height, width), epsilon)
    # Map the category to the action actually sent to the environment.
    a_deploy = action_dict[a_category]
    info = env.step(a_deploy)["Learner"]
    end = info.local_done[0]
    ep_count += 1
    r = info.rewards[0] * reward_scale
    # Push the new frame into the image buffer and read back the next state.
    s2_frame = info.visual_observations[0][0]
    image_buffer.animal_add(s2_frame)
    s2_idx = image_buffer.get_current_index()
    input_state = image_buffer.get_state(s2_idx)
    # The transition is stored by image-buffer index, not by frame:
    # (state index, action, reward, done flag, next state index).
    replay_buffer.store(np.array([s_idx]), np.array([a_category]),
                        np.array([r]), np.array([end]), np.array([s2_idx]))
    s_idx = s2_idx
class Worker(object):
    """A3C worker that owns one AnimalAI Unity environment and a local ACNet."""

    def __init__(self, name, globalAC):
        """Launch the worker's environment and build its local network.

        :param name: Worker name; its trailing decimal digits are used as the
            Unity ``worker_id``.
        :param globalAC: Global network handed to ``ACNet``.
        :raises ValueError: if ``name`` does not end in a digit.
        """
        env_id = self._worker_id_from_name(name)
        self.env = UnityEnvironment(file_name='env/AnimalAI',
                                    worker_id=env_id,
                                    seed=0,
                                    docker_training=False,
                                    n_arenas=1,
                                    play=False,
                                    inference=True,
                                    resolution=None)
        self.env.reset(train_mode=True)
        self.name = name
        self.AC = ACNet(name, globalAC)

    @staticmethod
    def _worker_id_from_name(name):
        """Return the integer formed by all trailing digits of ``name``.

        Reading only the last character (as before) maps e.g. "W_10" to 0,
        which collides with worker 0.
        """
        suffix = name[len(name.rstrip("0123456789")):]
        if not suffix:
            raise ValueError(
                "worker name {!r} must end with a numeric id".format(name))
        return int(suffix)

    @staticmethod
    def _flatten_frame(brain):
        """Return the brain's visual observation as a (1, 84*84*3) float32 row."""
        frame = np.array(brain.visual_observations, dtype='float32')
        return frame.reshape(84, 84, 3).flatten()[np.newaxis, :]

    def work(self):
        """Run episodes until the coordinator stops or MAX_GLOBAL_EP is reached.

        Each episode lasts exactly MAX_STEPS agent decisions. Every
        UPDATE_GLOBAL_ITER total steps, and at the end of each episode, the
        buffered transitions are turned into discounted value targets, pushed
        to the global network, and the local network is re-synchronised.
        """
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            reset = self.env.reset(train_mode=True, arenas_configurations=ARENA)
            s = self._flatten_frame(reset['Learner'])
            ep_r = 0
            rnn_state = self.AC.state_init
            for ep_t in range(MAX_STEPS):
                a = self.AC.choose_action(s, rnn_state)
                rnn_state = a[2]
                if a[0] == 0:
                    # Action 0 repeats the same environment action 30 times;
                    # only the final step's result is kept.
                    for _ in range(30):
                        info = self.env.step(vector_action=[0, 1])
                else:
                    info = self.env.step(vector_action=a[1])
                brain = info['Learner']
                s_ = self._flatten_frame(brain)
                r = brain.rewards[0]
                # NOTE(review): brain.local_done is not consulted; an episode
                # ends only when the step budget is exhausted.
                end = ep_t == MAX_STEPS - 1
                if r == 0:
                    # Replace zero reward with a small per-step penalty.
                    r = -0.0125
                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a[0])
                buffer_r.append(r)

                if total_step % UPDATE_GLOBAL_ITER == 0 or end:  # network update
                    if end:
                        v_s_ = 0
                    else:
                        # Bootstrap from the value estimate of the next state.
                        v_s_ = SESS.run(
                            self.AC.v, {
                                self.AC.s: s_,
                                self.AC.state_in[0]: rnn_state[0],
                                self.AC.state_in[1]: rnn_state[1]
                            })[0, 0]
                    # Discounted returns, accumulated backwards then reversed.
                    buffer_v_target = []
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    feed_dict = {
                        self.AC.s: np.vstack(buffer_s),
                        self.AC.a_his: np.array(buffer_a),
                        self.AC.v_target: np.vstack(buffer_v_target),
                        self.AC.state_in[0]: rnn_state[0],
                        self.AC.state_in[1]: rnn_state[1]
                    }
                    self.AC.update_global(feed_dict)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()

                s = s_
                total_step += 1
                if end:
                    # Record the episode reward as an exponential moving average.
                    if len(GLOBAL_RUNNING_R) == 0:
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(0.99 * GLOBAL_RUNNING_R[-1] + 0.01 * ep_r)
                    # Without this increment the MAX_GLOBAL_EP bound of the
                    # outer loop could never be reached.
                    GLOBAL_EP += 1
                    break
# Playback script: load saved DQN weights and run the policy in the Unity
# environment for up to 1000 steps.
model.load_state_dict(torch.load("./models/dqn/dqn.pt"))
env=UnityEnvironment(file_name=env_path)
# Reset the environment
action_info = env.reset(arenas_configurations_input=arena_config_in, train_mode=False)
obs = action_info[brain_name].visual_observations[0][0]
state = get_state(obs)
for step in range(1000):
    # Pause between steps.
    time.sleep(0.05)
    # Greedy action: take the index of the largest model output.
    # (The original comment said "random action", which does not match the code.)
    action_values = model(state)
    action = np.argmax(action_values.cpu().data.numpy())
    conv_action = convert_action(action)
    action_info = env.step(conv_action)
    obs = action_info[brain_name].visual_observations[0][0]
    # NOTE(review): reward, done and max_reach are only read by the
    # commented-out debug prints below; the loop does not stop or reset when
    # `done` is set.
    reward = action_info[brain_name].rewards[0]
    done = action_info[brain_name].local_done[0]
    max_reach=action_info[brain_name].max_reached
    next_state = get_state(obs)
    state = next_state
    # Debug display
    #print('\n ===== {} step ======'.format(step))
    #print('\naction=', action)
    #print('\nstate=', state.shape)
    #print('\nreward=', reward)
    #print('\ndone=', done)
    #print('\nmax_reach=', max_reach)
    #plt.imshow(state[0][0])