def test_step():
    """Exercise ``UnityEnvironment.step`` against a fully mocked socket.

    The process launcher, the socket and the executable lookup are patched so
    no real Unity binary is needed. The test steps the environment, checks
    that malformed actions (and stepping after the episode finished) raise
    ``UnityActionException``, and finally validates the structure of the
    last ``BrainInfo`` that was returned.
    """
    brain_key = 'RealFakeBrain'

    with mock.patch('subprocess.Popen'), \
            mock.patch('socket.socket') as mock_socket, \
            mock.patch('glob.glob') as mock_glob:
        mock_glob.return_value = ['FakeLaunchPath']
        mock_socket.return_value.accept.return_value = (mock_socket, 0)
        mock_socket.recv.return_value.decode.return_value = dummy_start

        env = UnityEnvironment(' ')
        brain = env.brains[brain_key]

        def full_action(info):
            # One zero per action dimension for every agent present in `info`.
            agent_count = len(info[brain_key].agents)
            return [0] * brain.vector_action_space_size * agent_count

        mock_socket.recv.side_effect = dummy_reset
        brain_info = env.reset()

        mock_socket.recv.side_effect = dummy_step
        brain_info = env.step(full_action(brain_info))
        # An action vector of the wrong length must be rejected.
        with pytest.raises(UnityActionException):
            env.step([0])
        brain_info = env.step(full_action(brain_info))
        # A well-formed action is rejected too at this point.
        with pytest.raises(UnityActionException):
            env.step(full_action(brain_info))
        env.close()

        info = brain_info[brain_key]
        assert env.global_done
        assert isinstance(brain_info, dict)
        assert isinstance(info, BrainInfo)
        assert isinstance(info.visual_observations, list)
        assert isinstance(info.vector_observations, np.ndarray)
        assert len(info.visual_observations) == brain.number_visual_observations
        assert info.vector_observations.shape[0] == len(info.agents)
        assert info.vector_observations.shape[1] == (
            brain.vector_observation_space_size
            * brain.num_stacked_vector_observations)
        assert not info.local_done[0]
        assert info.local_done[2]
# Brain object for the brain name selected earlier in the script.
brain = env.brains[brain_name]


# ### 2. Examine the State and Action Spaces
#
# In this environment, a double-jointed arm can move to target locations. A
# reward of `+0.1` is provided for each step that the agent's hand is in the
# goal location. Thus, the goal of your agent is to maintain its position at
# the target location for as many time steps as possible.
#
# The observation space consists of `33` variables corresponding to position,
# rotation, velocity, and angular velocities of the arm. Each action is a
# vector with four numbers, corresponding to torque applicable to two joints.
# Every entry in the action vector must be a number between `-1` and `1`.
#
# Run the code cell below to print some information about the environment.

# In[4]:

# Reset the environment in training mode and keep this brain's info.
env_info = env.reset(train_mode=True)[brain_name]

# Number of agents reported by the reset.
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# Size of each action vector.
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# Examine the state space: one row of `states` per agent.
states = env_info.vector_observations
state_size = states.shape[1]
print(
    'There are {} agents. '
    'Each observes a state with length: {}'.format(states.shape[0], state_size)
)
print('The state for the first agent looks like:', states[0])
class UnityEnvV0(Env, Serializable):
    """Single-brain, single-agent wrapper around a ``UnityEnvironment``.

    Only the first agent of the (single) external brain is exposed. The
    observation is that agent's vector observation, optionally extended with
    the current time step when ``time_state`` is true. Camera frames are
    buffered in ``self.frames`` when recording is enabled and the brain has
    exactly one visual observation.
    """

    def __init__(self,
                 app_name,
                 time_state=False,
                 idx=0,
                 is_render=False,
                 no_graphics=False,
                 recording=True):
        """Launch the Unity scene and derive the action/observation spaces.

        :param app_name: Path of the Unity executable, forwarded as ``file_name``.
        :param time_state: Append the current time step to every observation.
        :param idx: Worker id forwarded to ``UnityEnvironment``.
        :param is_render: Keep the latest visual observation for ``render``.
        :param no_graphics: Forwarded to ``UnityEnvironment``.
        :param recording: Buffer camera frames in ``self.frames`` (only
            effective when the brain has exactly one visual observation).
        """
        # Must stay the first statement so locals() only holds the arguments.
        Serializable.quick_init(self, locals())

        # Unity scene
        self._env = UnityEnvironment(file_name=app_name,
                                     worker_id=idx,
                                     no_graphics=no_graphics)
        self.id = 0
        self.name = app_name
        self.idx = idx
        self.is_render = is_render
        self.time_state = time_state
        self.time_step = 0
        # Defined up front so `position` is readable before the first reset.
        self._pos = None

        # Check brain configuration
        assert len(self._env.brains) == 1
        self.brain_name = self._env.external_brain_names[0]
        brain = self._env.brains[self.brain_name]

        # The original code performs one reset during construction; the
        # returned info is not needed here.
        self._env.reset()

        # Visual observation spaces were explicitly disabled in the original
        # (`... and False`), so the attribute is always False.
        self.use_visual = False
        self.recording = brain.number_visual_observations == 1 and recording

        # Set observation and action spaces
        if brain.vector_action_space_type == "discrete":
            # NOTE(review): a one-element discrete space looks suspicious;
            # kept as in the original - confirm the intended size.
            self._action_space = Discrete(1)
        else:
            high = np.array([np.inf] * brain.vector_action_space_size)
            self._action_space = Box(-high, high)

        # Vector observations only (see `use_visual` above); one extra slot
        # when the time step is appended.
        obs_size = brain.vector_observation_space_size
        if self.time_state:
            obs_size += 1
        high = np.array([np.inf] * obs_size)
        self._observation_space = Box(-high, high)

        # video buffer
        self.frames = []

    def _observe(self, info):
        """Update cached render/position data and return the agent's state."""
        if self.is_render:
            self.observation = info.visual_observations[0]
        state = info.vector_observations[0][:]
        self._pos = info.vector_observations[0][:2]
        if self.time_state:
            state = np.hstack((state, [self.time_step]))
            self.time_step += 1
        return state

    def _record(self, info):
        """Buffer the first camera frame, touching it only when recording.

        Indexing the visual observations unconditionally fails for brains
        without a camera, so the lookup is guarded by ``self.recording``.
        """
        if self.recording:
            self._collect_frames(info.visual_observations[0][0])

    def reset(self):
        """Reset the Unity scene and return the flattened initial state."""
        self.frames = []
        # Every episode starts at time step 0, even when the previous one was
        # cut short without the environment reporting `done`.
        self.time_step = 0
        info = self._env.reset()[self.brain_name]
        state = self._observe(info)
        self._record(info)
        return state.flatten()

    def step(self, action):
        """Apply ``action`` to the single agent and return a ``Step``."""
        info = self._env.step([action])[self.brain_name]
        state = self._observe(info)
        reward = info.rewards[0]
        done = info.local_done[0]
        if done:
            self.time_step = 0
        self._record(info)
        return Step(observation=state.flatten(), reward=reward, done=done)

    def terminate(self):
        """Close the underlying Unity environment."""
        self._env.close()

    def render(self, mode=None):
        """Return the latest frame as uint8, or a blank image if not rendering."""
        if self.is_render:
            x = self.observation[0] * 255
            return np.array(x).astype('uint8')
        return np.zeros((480, 360, 3))

    def _collect_frames(self, frame):
        """Append ``frame`` (scaled by 255, as uint8) to the video buffer."""
        if self.recording:
            self.frames.append(np.uint8(frame * 255))

    @property
    def action_space(self):
        return self._action_space

    @property
    def observation_space(self):
        return self._observation_space

    @property
    def position(self):
        """First two entries of the latest vector observation (None before reset)."""
        return self._pos
def main(seed=seed):
    """Train or evaluate a DDPG agent on the Unity Reacher environment.

    Creates a timestamped results directory with a file logger, builds the
    run configuration, starts the Unity environment and then either trains a
    fresh ``DDPGAgent`` (``config["mode"] == "train"``) or loads one from
    ``config["save_path"]`` and evaluates it. The environment is closed even
    when training or evaluation raises.

    :param seed: Recorded in the configuration as ``random_seed``.
    """
    # ---------------------------------------------------------------------
    # Logger
    # ---------------------------------------------------------------------
    save_path = f"./results/Reacher_DDPG_{pd.Timestamp.utcnow().value}"
    os.makedirs(save_path, exist_ok=True)

    logger = logging.getLogger()
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s : %(message)s')
    handler = logging.FileHandler(
        f"{save_path}/logs_navigation_{pd.Timestamp.utcnow().value}.log")
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    # ---------------------------------------------------------------------
    # Inputs
    # ---------------------------------------------------------------------
    n_episodes = 300
    config = dict(
        # Environment parameters
        env_name="Reacher",
        n_episodes=n_episodes,
        length_episode=1500,
        save_every=100,
        save_path=save_path,
        mode="train",  # "train" or "test"
        evaluate_every=5000,  # Number of training episodes before 1 evaluation episode
        eps_decay=1,  # Epsilon decay rate

        # Agent Parameters
        agent="DDPG",
        hidden_layers_actor=(200, 150),  # (50, 50, 15),  # (200, 150),
        hidden_layers_critic_body=(400, ),  # (50, 50,),
        hidden_layers_critic_head=(300, ),  # (50,),  # (300,)
        func_critic_body="F.leaky_relu",
        func_critic_head="F.leaky_relu",
        func_actor_body="F.leaky_relu",
        lr_scheduler=None,  # {'scheduler_type': "multistep",  # "step", "exp" or "decay", "multistep"
        #                      'gamma': 0.5,  # 0.99999,
        #                      'step_size': 1,
        #                      'milestones': [15*1000 * i for i in range(1, 6)],
        #                      'max_epochs': n_episodes},
        TAU=1e-3,  # for soft update of target parameters
        BUFFER_SIZE=int(1e6),  # replay buffer size
        BATCH_SIZE=128,  # minibatch size
        GAMMA=0.99,  # discount factor
        LR_ACTOR=1e-3,  # learning rate of the actor
        LR_CRITIC=1e-3,  # learning rate of the critic
        WEIGHT_DECAY=0,  # L2 weight decay
        UPDATE_EVERY=1,  # Number of actions before making a learning step
        action_noise="OU",
        action_noise_scale=1,
        weights_noise=None,
        state_normalizer="BatchNorm",  # "RunningMeanStd" or "BatchNorm"
        warmup=0,  # Number of random actions to start with as a warm-up
        start_time=str(pd.Timestamp.utcnow()),
        random_seed=seed,
        threshold=30)

    logger.warning("+=" * 90)
    logger.warning(f" RUNNING SIMULATION WITH PARAMETERS config={config}")
    logger.warning("+=" * 90)

    # ------------------------------------------------------------
    #  1. Initialization
    # ------------------------------------------------------------
    # 1. Start the Environment
    # env = UnityEnvironment(file_name=f'./Reacher_Linux_2/Reacher.x86_64')  # Linux
    env = UnityEnvironment(file_name=f'./{config["env_name"]}')  # mac OS

    # Everything below runs with the Unity process alive; the finally clause
    # guarantees it is shut down even if training/evaluation fails.
    try:
        # get the default brain
        brain_name = env.brain_names[0]
        brain = env.brains[brain_name]

        # reset the environment
        env_info = env.reset(train_mode=True)[brain_name]

        # number of agents
        num_agents = len(env_info.agents)
        print('Number of agents:', num_agents)
        config["n_agents"] = num_agents

        # size of each action
        action_size = brain.vector_action_space_size
        print('Size of each action:', action_size)

        # examine the state space
        states = env_info.vector_observations
        state_size = states.shape[1]
        print('There are {} agents. Each observes a state with length: {}'.format(
            states.shape[0], state_size))
        print('The state for the first agent looks like:', states[0])
        config.update(dict(action_size=action_size, state_size=state_size))

        # ------------------------------------------------------------
        #  2. Training
        # ------------------------------------------------------------
        # Unity Monitor
        monitor = UnityMonitor(env=env, config=config)

        if config["mode"] == "train":
            # NOTE(review): the networks are always built with seed 0, not
            # with the `seed` argument recorded in config["random_seed"].
            # Kept as in the original - confirm whether that is intended.
            model_seed = 0

            # The activation names come from the hard-coded config above;
            # never feed externally supplied strings to eval().
            critic_body_func = eval(config["func_critic_body"])
            critic_head_func = eval(config["func_critic_head"])

            # Actor model
            actor = SimpleNeuralNetHead(
                action_size,
                SimpleNeuralNetBody(state_size,
                                    config["hidden_layers_actor"],
                                    seed=model_seed),
                func=F.tanh,
                seed=model_seed)
            # The target networks are not handed to the agent below (those
            # arguments are commented out); they are still built so the
            # sequence of construction calls is unchanged.
            actor_target = SimpleNeuralNetHead(
                action_size,
                SimpleNeuralNetBody(state_size,
                                    config["hidden_layers_actor"],
                                    seed=model_seed),
                func=F.tanh,
                seed=model_seed)

            # Critic model
            critic = DeepNeuralNetHeadCritic(
                action_size,
                SimpleNeuralNetBody(state_size,
                                    config["hidden_layers_critic_body"],
                                    func=critic_body_func,
                                    seed=model_seed),
                hidden_layers_sizes=config["hidden_layers_critic_head"],
                func=critic_head_func,
                end_func=None,
                seed=model_seed)
            critic_target = DeepNeuralNetHeadCritic(
                action_size,
                SimpleNeuralNetBody(state_size,
                                    config["hidden_layers_critic_body"],
                                    func=critic_body_func,
                                    seed=model_seed),
                hidden_layers_sizes=config["hidden_layers_critic_head"],
                func=critic_head_func,
                end_func=None,
                seed=model_seed)

            # DDPG Agent
            agent = DDPGAgent(
                state_size=state_size,
                action_size=action_size,
                model_actor=actor,
                model_critic=critic,
                # actor_target=actor_target, critic_target=critic_target,
                action_space_low=-1,
                action_space_high=1,
                config=config,
            )

            # Training
            start = pd.Timestamp.utcnow()
            scores = monitor.run(agent)
            logger.info("Average Score last 100 episodes: {}".format(
                np.mean(scores[-100:])))
            elapsed_time = pd.Timedelta(pd.Timestamp.utcnow() -
                                        start).total_seconds()
            logger.info(f"Elapsed Time: {elapsed_time} seconds")

        # ------------------------------------------------------------
        #  3. Testing
        # ------------------------------------------------------------
        else:
            agent = DDPGAgent.load(filepath=config['save_path'], mode="test")
            scores = monitor.run(agent)
            logger.info(
                f"Test Score over {len(scores)} episodes: {np.mean(scores)}")
            config["test_scores"] = scores
            config["best_test_score"] = max(scores)
            config["avg_test_score"] = np.mean(scores)
    finally:
        # When finished (or on failure), close the environment.
        logger.info("Closing...")
        env.close()
class UnityEnv(IEnvironment):
    """``IEnvironment`` adapter exposing the first agent of a Unity scene.

    Only the first brain reported by the environment and the first agent of
    that brain are used.
    """

    def __init__(self, name):
        """Start the Unity executable at ``name`` and cache its first brain name."""
        drl_logger.info("Initializing environment.",
                        extra={"params": {
                            "name": name,
                        }})

        self.env = UnityEnvironment(file_name=name)
        self.brain_name = self.env.brain_names[0]
        # Added to the reward of the step on which the episode ends.
        self.termination_reward = 0

    def action_offset(self):
        """Return 0."""
        return 0

    def close(self):
        """Close the Unity environment."""
        self.env.close()

    def get_action_space(self):
        """Not implemented for Unity environments; returns ``None``.

        The original body only contained commented-out Gym code that logged
        whether the action space was discrete and the observation space.
        """
        return None

    def render(self, mode):
        """No-op."""
        pass

    def reset(self):
        """Reset in training mode.

        :return: ``(state, new_life)`` where ``state`` is the first agent's
            vector observation and ``new_life`` is always ``True``.
        """
        # Use the brain name cached in __init__ so reset() and step() always
        # address the same brain.
        env_info = self.env.reset(train_mode=True)[self.brain_name]
        state = env_info.vector_observations[0]  # first agent only
        new_life = True
        return state, new_life

    def start_game_action(self):
        """Return ``None``."""
        return None

    def step(self, action):
        """Send ``action`` to the environment.

        :return: ``(next_state, reward, done, new_life)`` for the first
            agent; ``new_life`` is always ``False``.
        """
        env_info = self.env.step(action)[self.brain_name]
        next_state = env_info.vector_observations[0]
        reward = env_info.rewards[0]
        done = env_info.local_done[0]

        if done:
            reward += self.termination_reward

        new_life = False
        return next_state, reward, done, new_life
class TrainerController(object):
    """Drives one training (or inference) run of trainers against a Unity environment."""

    def __init__(self, env_path, run_id, save_freq, curriculum_file,
                 fast_simulation, load, train, worker_id, keep_checkpoints,
                 lesson, seed, docker_target_name, trainer_config_path,
                 no_graphics):
        """
        :param env_path: Location to the environment executable to be loaded.
        :param run_id: The sub-directory name for model and summary statistics
        :param save_freq: Frequency at which to save model
        :param curriculum_file: Curriculum json file for environment
        :param fast_simulation: Whether to run the game at training speed
        :param load: Whether to load the model or randomly initialize
        :param train: Whether to train model, or only run inference
        :param worker_id: Number to add to communication port (5005). Used for multi-environment
        :param keep_checkpoints: How many model checkpoints to keep
        :param lesson: Start learning from this lesson
        :param seed: Random seed used for training. ``-1`` draws a random seed.
        :param docker_target_name: Name of docker volume that will contain all data.
        :param trainer_config_path: Fully qualified path to location of trainer configuration file
        :param no_graphics: Whether to run the Unity simulator in no-graphics mode
        """
        self.trainer_config_path = trainer_config_path
        if env_path is not None:
            # Strip out executable extensions if passed
            env_path = (env_path.strip()
                        .replace('.app', '')
                        .replace('.exe', '')
                        .replace('.x86_64', '')
                        .replace('.x86', ''))

        # Recognize and use docker volume if one is passed as an argument
        if docker_target_name == '':
            self.docker_training = False
            self.model_path = './experiments/{run_id}'.format(run_id=run_id)
            self.curriculum_file = curriculum_file
            self.summaries_dir = './summaries'
        else:
            self.docker_training = True
            self.model_path = '/{docker_target_name}/experiments/{run_id}'.format(
                docker_target_name=docker_target_name, run_id=run_id)
            if env_path is not None:
                env_path = '/{docker_target_name}/{env_name}'.format(
                    docker_target_name=docker_target_name, env_name=env_path)
            if curriculum_file is None:
                self.curriculum_file = None
            else:
                self.curriculum_file = '/{docker_target_name}/{curriculum_file}'.format(
                    docker_target_name=docker_target_name,
                    curriculum_file=curriculum_file)
            self.summaries_dir = '/{docker_target_name}/summaries'.format(
                docker_target_name=docker_target_name)

        self.logger = logging.getLogger("unityagents")
        self.run_id = run_id
        self.save_freq = save_freq
        self.lesson = lesson
        self.fast_simulation = fast_simulation
        self.load_model = load
        self.train_model = train
        self.worker_id = worker_id
        self.keep_checkpoints = keep_checkpoints
        self.trainers = {}
        if seed == -1:
            seed = np.random.randint(0, 999999)
        self.seed = seed
        np.random.seed(self.seed)
        tf.set_random_seed(self.seed)
        self.env = UnityEnvironment(file_name=env_path,
                                    worker_id=self.worker_id,
                                    curriculum=self.curriculum_file,
                                    seed=self.seed,
                                    docker_training=self.docker_training,
                                    no_graphics=no_graphics)
        if env_path is None:
            self.env_name = 'editor_' + self.env.academy_name
        else:
            # Extract out name of environment
            self.env_name = os.path.basename(os.path.normpath(env_path))

    def _get_progress(self):
        """Return the curriculum progress measure, or None when not applicable.

        With measure type ``"progress"`` this is the mean of
        ``get_step / get_max_steps`` over the external brains; with
        ``"reward"`` it is the sum of the trainers' last rewards. ``None`` is
        returned without a curriculum file or for any other measure type.
        """
        if self.curriculum_file is None:
            return None

        brain_names = self.env.external_brain_names
        measure_type = self.env.curriculum.measure_type
        if measure_type == "progress":
            progress = 0
            for brain_name in brain_names:
                trainer = self.trainers[brain_name]
                progress += trainer.get_step / trainer.get_max_steps
            return progress / len(brain_names)
        if measure_type == "reward":
            progress = 0
            for brain_name in brain_names:
                progress += self.trainers[brain_name].get_last_reward
            return progress
        return None

    def _process_graph(self):
        """Collect (and log) the graph node names to export for every trainer."""
        nodes = []
        scopes = []
        for trainer in self.trainers.values():
            if trainer.graph_scope is None:
                continue
            scope = trainer.graph_scope + '/'
            if scope == '/':
                scope = ''
            scopes.append(scope)
            if trainer.parameters["trainer"] == "imitation":
                names = ["action"]
            else:
                names = ["action", "value_estimate", "action_probs"]
            if trainer.parameters["use_recurrent"]:
                names = names + ["recurrent_out", "memory_size"]
            nodes.extend(scope + name for name in names)

        if len(scopes) > 1:
            self.logger.info("List of available scopes :")
            for scope in scopes:
                self.logger.info("\t" + scope)
        self.logger.info("List of nodes to export :")
        for node in nodes:
            self.logger.info("\t" + node)
        return nodes

    def _save_model(self, sess, saver, steps=0):
        """
        Saves current model to checkpoint folder.
        :param sess: Current Tensorflow session.
        :param steps: Current number of steps in training process.
        :param saver: Tensorflow saver for session.
        """
        last_checkpoint = self.model_path + '/model-' + str(steps) + '.cptk'
        saver.save(sess, last_checkpoint)
        tf.train.write_graph(sess.graph_def, self.model_path,
                             'raw_graph_def.pb', as_text=False)
        self.logger.info("Saved Model")

    def _export_graph(self):
        """
        Exports latest saved model to .bytes format for Unity embedding.
        """
        target_nodes = ','.join(self._process_graph())
        ckpt = tf.train.get_checkpoint_state(self.model_path)
        freeze_graph.freeze_graph(
            input_graph=self.model_path + '/raw_graph_def.pb',
            input_binary=True,
            input_checkpoint=ckpt.model_checkpoint_path,
            output_node_names=target_nodes,
            output_graph=self.model_path + '/' + self.env_name + "_" +
            self.run_id + '.bytes',
            clear_devices=True,
            initializer_nodes="",
            input_saver="",
            restore_op_name="save/restore_all",
            filename_tensor_name="save/Const:0")

    def _initialize_trainers(self, trainer_config, sess):
        """Build one trainer per external brain from ``trainer_config``.

        Each brain starts from a copy of the ``'default'`` section; a section
        named after the brain overrides it. A non-dict entry is followed as
        an alias until a dict section is reached.

        :raises UnityEnvironmentException: for an unknown ``trainer`` type.
        """
        trainer_parameters_dict = {}
        self.trainers = {}
        brain_names = self.env.external_brain_names
        for brain_name in brain_names:
            trainer_parameters = trainer_config['default'].copy()
            if len(brain_names) > 1:
                graph_scope = re.sub('[^0-9a-zA-Z]+', '-', brain_name)
                trainer_parameters['graph_scope'] = graph_scope
                trainer_parameters['summary_path'] = '{basedir}/{name}'.format(
                    basedir=self.summaries_dir,
                    name=str(self.run_id) + '_' + graph_scope)
            else:
                trainer_parameters['graph_scope'] = ''
                trainer_parameters['summary_path'] = '{basedir}/{name}'.format(
                    basedir=self.summaries_dir, name=str(self.run_id))
            if brain_name in trainer_config:
                _brain_key = brain_name
                while not isinstance(trainer_config[_brain_key], dict):
                    _brain_key = trainer_config[_brain_key]
                trainer_parameters.update(trainer_config[_brain_key])
            trainer_parameters_dict[brain_name] = trainer_parameters.copy()

        for brain_name in brain_names:
            parameters = trainer_parameters_dict[brain_name]
            if parameters['trainer'] == "imitation":
                self.trainers[brain_name] = BehavioralCloningTrainer(
                    sess, self.env, brain_name, parameters,
                    self.train_model, self.seed)
            elif parameters['trainer'] == "ppo":
                self.trainers[brain_name] = PPOTrainer(
                    sess, self.env, brain_name, parameters,
                    self.train_model, self.seed)
            else:
                raise UnityEnvironmentException(
                    "The trainer config contains an unknown trainer type for brain {}"
                    .format(brain_name))

    def _load_config(self):
        """Read and parse the YAML trainer configuration file.

        :raises UnityEnvironmentException: when the file cannot be opened or
            decoded.
        """
        try:
            with open(self.trainer_config_path) as data_file:
                # safe_load: the config is plain data, and yaml.load without
                # an explicit Loader is unsafe and rejected by recent PyYAML.
                trainer_config = yaml.safe_load(data_file)
                return trainer_config
        except IOError:
            raise UnityEnvironmentException(
                """Parameter file could not be found here {}.
                                            Will use default Hyper parameters"""
                .format(self.trainer_config_path))
        except UnicodeDecodeError:
            raise UnityEnvironmentException(
                "There was an error decoding Trainer Config from this path : {}"
                .format(self.trainer_config_path))

    @staticmethod
    def _create_model_path(model_path):
        """Create ``model_path`` if it does not exist yet.

        :raises UnityEnvironmentException: when the folder cannot be created.
        """
        try:
            if not os.path.exists(model_path):
                os.makedirs(model_path)
        except Exception:
            raise UnityEnvironmentException(
                "The folder {} containing the generated model could not be accessed."
                " Please make sure the permissions are set correctly.".format(
                    model_path))

    def start_learning(self):
        """Run the training/inference loop, then close the environment.

        Checkpoints are written every ``save_freq`` global steps and once
        more when the loop ends or is interrupted with Ctrl-C. When training,
        the frozen graph is exported after the environment has been closed.

        :raises UnityEnvironmentException: when loading was requested but no
            checkpoint exists under ``model_path``.
        """
        try:
            self.env.curriculum.set_lesson_number(self.lesson)
            trainer_config = self._load_config()
            self._create_model_path(self.model_path)

            tf.reset_default_graph()

            with tf.Session() as sess:
                self._initialize_trainers(trainer_config, sess)
                for trainer in self.trainers.values():
                    self.logger.info(trainer)
                init = tf.global_variables_initializer()
                saver = tf.train.Saver(max_to_keep=self.keep_checkpoints)
                # Instantiate model parameters
                if self.load_model:
                    self.logger.info('Loading Model...')
                    ckpt = tf.train.get_checkpoint_state(self.model_path)
                    if ckpt is None:
                        # Fail with a clear error instead of dereferencing
                        # None on the restore call below.
                        message = (
                            'The model {0} could not be found. Make sure you specified the right '
                            '--run-id'.format(self.model_path))
                        self.logger.info(message)
                        raise UnityEnvironmentException(message)
                    saver.restore(sess, ckpt.model_checkpoint_path)
                else:
                    sess.run(init)
                global_step = 0  # This is only for saving the model
                self.env.curriculum.increment_lesson(self._get_progress())
                curr_info = self.env.reset(train_mode=self.fast_simulation)
                if self.train_model:
                    for trainer in self.trainers.values():
                        trainer.write_tensorboard_text('Hyperparameters',
                                                       trainer.parameters)
                try:
                    # In inference mode the loop only ends on interruption.
                    while any(t.get_step <= t.get_max_steps
                              for t in self.trainers.values()
                              ) or not self.train_model:
                        if self.env.global_done:
                            self.env.curriculum.increment_lesson(
                                self._get_progress())
                            curr_info = self.env.reset(
                                train_mode=self.fast_simulation)
                            for trainer in self.trainers.values():
                                trainer.end_episode()
                        # Decide and take an action
                        take_action_vector = {}
                        take_action_memories = {}
                        take_action_text = {}
                        take_action_outputs = {}
                        for brain_name, trainer in self.trainers.items():
                            (take_action_vector[brain_name],
                             take_action_memories[brain_name],
                             take_action_text[brain_name],
                             take_action_outputs[brain_name]
                             ) = trainer.take_action(curr_info)
                        new_info = self.env.step(
                            vector_action=take_action_vector,
                            memory=take_action_memories,
                            text_action=take_action_text)
                        for brain_name, trainer in self.trainers.items():
                            trainer.add_experiences(
                                curr_info, new_info,
                                take_action_outputs[brain_name])
                            trainer.process_experiences(curr_info, new_info)
                            if (trainer.is_ready_update() and self.train_model
                                    and trainer.get_step <= trainer.get_max_steps):
                                # Perform gradient descent with experience buffer
                                trainer.update_model()
                            # Write training statistics to Tensorboard.
                            trainer.write_summary(
                                self.env.curriculum.lesson_number)
                            if (self.train_model
                                    and trainer.get_step <= trainer.get_max_steps):
                                trainer.increment_step_and_update_last_reward()
                        if self.train_model:
                            global_step += 1
                        if (global_step % self.save_freq == 0
                                and global_step != 0 and self.train_model):
                            # Save Tensorflow model
                            self._save_model(sess, steps=global_step,
                                             saver=saver)
                        curr_info = new_info
                    # Final save Tensorflow model
                    if global_step != 0 and self.train_model:
                        self._save_model(sess, steps=global_step, saver=saver)
                except KeyboardInterrupt:
                    print(
                        '--------------------------Now saving model-------------------------'
                    )
                    if self.train_model:
                        self.logger.info(
                            "Learning was interrupted. Please wait while the graph is generated."
                        )
                        self._save_model(sess, steps=global_step, saver=saver)
        finally:
            # Always shut the Unity process down, even when the run failed.
            self.env.close()
        if self.train_model:
            self._export_graph()
import numpy as np
from unityagents import UnityEnvironment

# Interactive smoke test for the "tv_maze" Unity build: continuous brains are
# driven with random normal actions, any other brain reads an integer action
# from stdin on every step.
env_name = "tv_maze"
env = UnityEnvironment(file_name=env_name, worker_id=2)
print(str(env))

default_brain = env.brain_names[0]
brain = env.brains[default_brain]
train_mode = False
prevState = None

# Seed NumPy's global RNG. (Assigning `np.random.seed = 13` would replace the
# seeding function with an int and leave the generator unseeded.)
np.random.seed(13)

# Reset parameters forwarded to the environment on every reset.
d = {"startLoc": 1, "render": 1.0, "tv": 0.0, "door": 1.0}

for episode in range(300):
    env_info = env.reset(train_mode=train_mode, config=d)[default_brain]
    done = False
    episode_rewards = 0
    for i in range(1000):
        print(env_info.states)
        if brain.action_space_type == 'continuous':
            act = np.random.randn(len(env_info.agents),
                                  brain.action_space_size)
            if False:
                # Disabled debugging aid: force the first four action
                # components to a fixed quaternion.
                quaternion = [1, 0, 0, 0]
                quaternion = np.array(quaternion)
                act[:, :4] = quaternion
            env_info = env.step(act)[default_brain]
        else:
            a = int(input("input: "))
            env_info = env.step(a)[default_brain]
import pickle

# Run-time switches for this script.
NO_GRAPHICS = True
GPU_SERVER = True
MONITOR_INTERVAL = 10
TRAIN_MODE = True

# The Linux no-visual build is used when GPU_SERVER is set, the macOS app
# bundle otherwise.
if GPU_SERVER:
    env = UnityEnvironment(file_name='../Reacher_Linux_NoVis/Reacher.x86_64',
                           no_graphics=NO_GRAPHICS)
else:
    env = UnityEnvironment(file_name='../Reacher.app',
                           no_graphics=NO_GRAPHICS)

# First brain reported by the environment.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset the environment and keep this brain's info.
env_info = env.reset(train_mode=TRAIN_MODE)[brain_name]

# Number of agents reported by the reset.
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# Size of each action vector.
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# Examine the state space: one row of `states` per agent.
states = env_info.vector_observations
state_size = states.shape[1]
print(
    'There are {} agents. '
    'Each observes a state with length: {}'.format(states.shape[0], state_size)
)
print('The state for the first agent looks like:', states[0])
def _parse_bool(text):
    """Convert a command-line token such as ``true``/``0``/``no`` to a bool.

    ``type=bool`` cannot be used with argparse because ``bool("False")`` is
    ``True``; this parser makes ``--eval False`` actually disable eval mode.
    """
    lowered = str(text).strip().lower()
    if lowered in ("1", "true", "t", "yes", "y", "on"):
        return True
    if lowered in ("0", "false", "f", "no", "n", "off", ""):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got {!r}".format(text))


def main():
    """Command-line entry point: train (or evaluate) a DDPG agent on Reacher.

    Parses the command line, starts the Unity environment, runs the requested
    number of episodes through a ``Coach``, saves the agent when the mean of
    the last 100 scores exceeds ``--min_score``, optionally saves a score
    plot, and always closes the environment afterwards.
    """
    parser = argparse.ArgumentParser(
        description=
        'Train a ddpg agent to play the Unity Environment Reacher app')
    parser.add_argument("--episodes",
                        type=int,
                        help="Number of training episodes to run",
                        default=200)
    parser.add_argument("--max_steps",
                        type=int,
                        help="Maximum steps per episode",
                        default=1000)
    parser.add_argument(
        "--saveto",
        help=
        "Save agent after training.  agent- and critic- are prepended to the specified name.",
        default='checkpoint.pth')
    parser.add_argument("--loadfrom",
                        help="Load previously saved model before training")
    parser.add_argument(
        "--min_score",
        type=float,
        help="Only save the model if the it achieves this score",
        default=30.)
    parser.add_argument("--saveplot", help="Location to save plot of scores")
    parser.add_argument(
        "--environment",
        help="Path to Unity environment for game (i.e. ./Reacher.App)",
        default="./Reacher.app")
    parser.add_argument(
        "--eval",
        type=_parse_bool,
        help=
        "Turns on eval mode, which affects the unity environment and removes the random noise from the predicted agent actions",
        default=False)
    args = parser.parse_args()

    env = UnityEnvironment(file_name=args.environment)
    try:
        brain_name = env.brain_names[0]
        brain = env.brains[brain_name]

        # reset the environment
        env_info = env.reset(train_mode=True)[brain_name]

        # number of actions
        action_size = brain.vector_action_space_size

        # examine the state space
        state = env_info.vector_observations[0]
        state_size = len(state)
        num_agents = len(env_info.agents)
        print('Number of agents:', num_agents)

        # Create agent and start training
        _agent = ddpg_agent.DDPGAgent(state_size, action_size, num_agents)
        if args.loadfrom:
            _agent.load(args.loadfrom)
        _coach = coach.Coach(_agent, env)
        scores = _coach.run_episodes(args.episodes,
                                     args.max_steps,
                                     train=not args.eval)
        mean_score = np.mean(scores[-100:])

        # Save the network if successful
        if mean_score > args.min_score and args.saveto:
            _agent.save(args.saveto)
            print("Training succeeded!")

        # Plot scores
        plt.plot(scores)
        plt.plot(moving_average(scores, 100), color='red')
        plt.ylabel('Episode scores')
        if args.saveplot:
            plt.savefig(args.saveplot, bbox_inches='tight')

        print("Your agent received a final mean score of {}".format(mean_score))
    finally:
        # Do not leave the Unity process running, even if the run failed.
        env.close()
def train_unity_ddpg(PATH, env_name, platform, env_path, policy,
                     score_threshold, timestamp, start, n_episodes, max_t,
                     num_agents):
    """
    Trains unity environments with DDPG policy

    :param PATH: Base path; the environment is loaded from ``PATH + "data/..."``
        and checkpoints are written to ``PATH + "results/..."``.
    :param env_name: Name used in the checkpoint file names.
    :param platform: Not used by this function.
    :param env_path: Environment file, relative to ``PATH + "data/"``.
    :param policy: Callable ``policy(state_size, action_size, num_agents)``
        returning an object with ``reset``, ``act``, ``step``, ``actor`` and
        ``critic``.
    :param score_threshold: Training stops once the rolling average exceeds it.
    :param timestamp: Tag used in the checkpoint file names.
    :param start: Start time (``time.time()`` based) used for the runtime display.
    :param n_episodes: Maximum number of episodes.
    :param max_t: Maximum number of steps per episode.
    :param num_agents: Ignored; replaced by the agent count the environment reports.
    :return: List with the mean score (over agents) of every episode.
    """
    total_scores = []

    from unityagents import UnityEnvironment

    env_path = PATH + f"data/{env_path}"
    env = UnityEnvironment(file_name=env_path)

    # The finally clause closes the Unity process on errors as well.
    try:
        brain_name = env.brain_names[0]
        brain = env.brains[brain_name]
        env_info = env.reset(train_mode=True)[brain_name]
        num_agents = len(env_info.agents)
        print(f"Number of agents: {num_agents}")
        states = env_info.vector_observations
        state_size = states.shape[1]
        print(
            f"There are {states.shape[0]} agents. Each observes a state with length {state_size}"
        )
        print(f"The state for the first agent looks like:\n{states[0]}")

        action_size = brain.vector_action_space_size
        print(f"Size of each action: {action_size}")

        policy = policy(state_size, action_size, num_agents)

        for i_episode in range(1, n_episodes + 1):
            env_info = env.reset(train_mode=True)[brain_name]
            states = env_info.vector_observations
            scores = np.zeros(num_agents)
            policy.reset()
            for t in range(max_t):
                actions = policy.act(states)
                # send the action to the environment
                env_info = env.step(actions)[brain_name]
                next_states = env_info.vector_observations
                rewards = env_info.rewards  # get the reward
                dones = env_info.local_done
                policy.step(states, actions, rewards, next_states, dones, t)
                states = next_states
                scores += rewards
                if np.any(dones):
                    break

            mean_score = np.mean(scores)
            min_score = np.min(scores)
            max_score = np.max(scores)
            total_scores.append(mean_score)
            # Rolling average over (up to) the last 100 episodes *including*
            # the current one. Sizing the window before the append dropped
            # the oldest score during the first 100 episodes.
            total_average_score = np.mean(total_scores[-100:])
            end = time.time()
            progress = (
                f'\rEpisode {i_episode}\tScore TAS/Mean/Max/Min: '
                f'{total_average_score:.2f}/{mean_score:.2f}/'
                f'{max_score:.2f}/{min_score:.2f}\t{calc_runtime(end-start)}')
            # Overwrite the same console line every episode.
            print(progress, end=" ")

            if i_episode % 20 == 0 or total_average_score >= score_threshold:
                fap = PATH + f'results/{env_name}_{timestamp}_checkpoint_actor.pth'
                torch.save(policy.actor.state_dict(), fap)
                fcp = PATH + f'results/{env_name}_{timestamp}_checkpoint_critic.pth'
                torch.save(policy.critic.state_dict(), fcp)
                # Same line again, terminated with a newline so it is kept.
                print(progress)

            if total_average_score > score_threshold:
                print(f"Solved in {i_episode} and {calc_runtime(end-start)}")
                break
    finally:
        env.close()

    return total_scores
summary.value.add(tag="Cumulated Reward", simple_value=episodeReward) summary.value.add(tag="Epsilon", simple_value=epsilon) summary.value.add(tag="Learning Rate", simple_value=lr) summary.value.add(tag="Episode Length", simple_value=episodeStep) writer.add_summary(summary, episode) writer.flush() with tf.Session(config=config) as sess: sess.run(init) totalStep = 0 for episode in range(maxEpisode): # initial observation episodeStep = 0 episodeReward = 0 info = env.reset()[brain_name] state = info.states[0] while True: action = RL.choose_action(state) new_info = env.step({brain_name: [action]})[brain_name] RL.store_transition(state, action, new_info.rewards[0], new_info.states[0]) episodeReward += new_info.rewards[0] if (totalStep > 200) and (totalStep % learning_freq == 0): RL.learn() state = new_info.states[0] if new_info.local_done[0]: break totalStep += 1 episodeStep += 1 if (episode % summary_freq == 0):
class UnityEnv(Env):
    """Single-agent wrapper around a Unity ML-Agents environment.

    Two observation modes are supported:

    * ``'vector'`` - the state is the first agent's vector observation.
    * ``'visual'`` - the state is the last ``n_frames`` processed camera
      frames stacked along the channel axis, with a leading batch axis.

    ``self._headless`` and ``self._train_mode`` are read here but not set in
    this class; presumably the ``Env`` base initialiser derives them from the
    ``headless`` / ``train_mode`` keyword arguments - TODO confirm.
    """

    allowed_modes = ['vector', 'visual']

    def __init__(self, filename: str, mode='vector', frame_size=(84, 84),
                 use_grayscale=True, n_frames=4, **kwargs) -> None:
        """Launch the Unity binary and derive state/action shapes.

        Args:
            filename: Path of the Unity environment binary.
            mode: One of ``allowed_modes``.
            frame_size: Target (height, width) of processed frames (visual mode).
            use_grayscale: Convert frames to a single channel (visual mode).
            n_frames: Number of frames stacked into one state (visual mode).
            **kwargs: Passed to the base initialiser; everything except
                ``headless`` and ``train_mode`` is forwarded to
                ``UnityEnvironment``.

        Raises:
            ValueError: If ``mode`` is not in ``allowed_modes``.
        """
        super().__init__(**kwargs)
        if mode not in self.allowed_modes:
            raise ValueError("Allowed modes : %s" % self.allowed_modes)

        # These two options are not UnityEnvironment arguments.
        kwargs.pop("headless", None)
        kwargs.pop("train_mode", None)

        self.mode = mode
        self.env = UnityEnvironment(filename, no_graphics=self._headless,
                                    **kwargs)
        self.brain_name = self.env.brain_names[0]
        brain = self.env.brains[self.brain_name]
        env_info = self.env.reset(train_mode=self._train_mode)[self.brain_name]

        self.nA = brain.vector_action_space_size
        self.action_shape = (self.nA,)

        if mode == 'vector':
            self.nS = len(env_info.vector_observations[0])
            self.state_shape = (self.nS,)
        elif mode == 'visual':
            self.frame_size = tuple(frame_size)
            self.use_grayscale = use_grayscale
            self.n_frames = n_frames
            self.frame_buffer = deque(maxlen=self.n_frames)
            num_channels = 1 if use_grayscale else 3
            self.state_shape = self.frame_size + (num_channels * n_frames,)

    def reset(self):
        """Reset the environment and return the initial state."""
        if self.mode == 'visual':
            self.frame_buffer.clear()
        env_info = self.env.reset(train_mode=self._train_mode)[self.brain_name]
        return self._to_state(env_info)

    def step(self, action):
        """Apply ``action``; return ``(next_state, reward, done, env_info)``."""
        env_info = self.env.step(action)[self.brain_name]
        next_state = self._to_state(env_info)
        reward = env_info.rewards[0]
        done = env_info.local_done[0]
        return next_state, reward, done, env_info

    def render(self, **kwargs):
        """No-op: this wrapper has no separate render step."""
        pass

    def close(self):
        """Shut down the Unity process so its socket/port is released."""
        self.env.close()

    def _process_frame(self, frame):
        """Drop the leading axis, resize and optionally grayscale a frame."""
        frame = np.squeeze(frame, axis=0)
        frame = resize(frame, self.frame_size, mode='constant',
                       anti_aliasing=True)
        if self.use_grayscale:
            frame = np.expand_dims(rgb2gray(frame), axis=2)
        return frame

    def _to_state(self, env_info):
        """Convert a brain-info object into the state for the current mode."""
        if self.mode == 'vector':
            return env_info.vector_observations[0]
        elif self.mode == 'visual':
            frame = self._process_frame(env_info.visual_observations[0])
            if len(self.frame_buffer) == 0:
                # First frame of an episode: fill the whole history with it.
                self.frame_buffer.extend([frame] * self.n_frames)
            else:
                self.frame_buffer.append(frame)
            # Stack along the channel axis. Reshaping the (n, H, W, C) stack
            # straight to (H, W, C * n) would interleave pixels of different
            # frames instead of keeping each frame intact.
            result = np.concatenate(list(self.frame_buffer), axis=2)
            result = np.expand_dims(result, axis=0)
            return result
def main():
    """Evaluate the saved best MADDPG agent on the Unity Tennis environment.

    Reads ``./assets/best_agent/config.json``, launches the environment named
    in it, rebuilds the actor/critic networks, loads the saved weights and
    runs the monitor in test mode. Log records go to a fresh directory under
    ``./results``. The environment is closed even if evaluation fails.
    """
    import json

    # ------------------------------------------------------------
    # Logger
    # ------------------------------------------------------------
    save_path = f"./results/Tennis_DDPG_{pd.Timestamp.utcnow().value}"
    os.makedirs(save_path, exist_ok=True)

    logger = logging.getLogger()
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s : %(message)s')
    handler = logging.FileHandler(
        f"{save_path}/logs_p3_{pd.Timestamp.utcnow().value}.log")
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    # ------------------------------------------------------------
    # Inputs
    # ------------------------------------------------------------
    with open("./assets/best_agent/config.json", "r") as f:
        config = json.load(f)
    config["mode"] = "test"
    config["n_episodes"] = 10
    config["warmup"] = 0

    logger.warning("+=" * 90)
    logger.warning(f" RUNNING SIMULATION WITH PARAMETERS config={config}")
    logger.warning("+=" * 90)

    # ------------------------------------------------------------
    # 1. Initialization
    # ------------------------------------------------------------
    env = UnityEnvironment(file_name=f'./{config["env_name"]}')  # mac OS
    try:
        # Get the default brain.
        brain_name = env.brain_names[0]
        brain = env.brains[brain_name]

        # Reset the environment.
        env_info = env.reset(train_mode=True)[brain_name]

        # Number of agents.
        num_agents = len(env_info.agents)
        print('Number of agents:', num_agents)
        config["n_agents"] = num_agents

        # Size of each action.
        action_size = brain.vector_action_space_size
        print('Size of each action:', action_size)

        # Examine the state space.
        states = env_info.vector_observations
        state_size = states.shape[1]
        print('There are {} agents. Each observes a state with length: {}'.format(
            states.shape[0], state_size))
        print('The state for the first agent looks like:', states[0])
        config.update(dict(action_size=action_size, state_size=state_size))

        # ------------------------------------------------------------
        # 2. Models
        # ------------------------------------------------------------
        monitor = UnityMonitor(env=env, config=config)

        seed = 0
        actor = SimpleNeuralNetHead(
            action_size,
            SimpleNeuralNetBody(state_size, config["hidden_layers_actor"],
                                seed=seed),
            func=torch.tanh,
            seed=seed)

        # SECURITY NOTE: eval() executes whatever string the config file
        # holds. That is only acceptable while config.json is a trusted,
        # locally produced file.
        critic = DeepNeuralNetHeadCritic(
            action_size * num_agents,
            SimpleNeuralNetBody(state_size * num_agents,
                                config["hidden_layers_critic_body"],
                                func=eval(config["func_critic_body"]),
                                seed=seed),
            hidden_layers_sizes=config["hidden_layers_critic_head"],
            func=eval(config["func_critic_head"]),
            end_func=None,
            seed=seed)

        agent = MADDPGAgent(
            state_size=state_size,
            action_size=action_size,
            model_actor=actor,
            model_critic=critic,
            action_space_low=-1,
            action_space_high=1,
            config=config,
        )

        # ------------------------------------------------------------
        # 3. Testing
        # ------------------------------------------------------------
        logger.warning("Entering Test Mode!")
        # NOTE(review): this overrides config["n_episodes"] = 10 set above -
        # confirm which of the two values is intended.
        monitor.n_episodes = 100
        env.reset(train_mode=False)
        env.warmup = 0
        agent.warmup = 0
        for a in agent.agents:
            a.warmup = 0
        agent.load(filepath="./assets/best_agent", mode="test")

        scores = monitor.run(agent)
        logger.info(f"Test Score over {len(scores)} episodes: {np.mean(scores)}")
        config["test_scores"] = scores
        config["best_test_score"] = max(scores)
        config["avg_test_score"] = np.mean(scores)
    finally:
        # Release the Unity process even when evaluation raised.
        logger.info("Closing...")
        env.close()
class Env:
    '''A convenience class for generating episodes and memories

    This convenience class is a context manager that can be used for
    generating a Unity environment. The Unity environment and the OpenAI Gym
    environment operate slightly differently, and hence it is difficult to
    create a uniform algorithm that is able to solve everything at the same
    time. This class tries to solve that problem.
    '''

    def __init__(self, fileName, showEnv=False, trainMode=True):
        '''Initialize the environment

        This only records the settings that are later used for generating the
        Unity environment; the environment itself is launched in
        ``__enter__``. It assumes that you will provide a binary file for
        generating the environment.

        The environment can be generated in a *headless* mode by leaving
        ``showEnv`` as ``False``, in which case no window is shown at startup.
        This is good for training, as well as for running the environment
        without an X server, e.g. remotely.

        With ``trainMode`` the environment is primed for training: each frame
        finishes as soon as possible. This is not good for observing what is
        happening, but it significantly increases the speed of training.

        Arguments:
            fileName {str} -- Path to the binary file. This file must be the
                same as the one for which the `unityagents` package has been
                generated.

        Keyword Arguments:
            showEnv {bool} -- Set this to ``True`` if you want to view the
                environment (default: {False})
            trainMode {bool} -- Set this to ``True`` if you want the
                environment to be in training mode (i.e. fast execution)
                (default: {True})
        '''
        self.no_graphics = not showEnv
        self.trainMode = trainMode
        self.fileName = fileName
        self.states = None

    def __enter__(self):
        '''Launch the Unity environment for use in a ``with`` statement

        Starts the binary, selects the default brain, performs an initial
        reset and records the number of agents and the action size.

        Returns:
            ``self`` -- the same instance, now holding a live environment
        '''
        try:
            self.env = UnityEnvironment(file_name=self.fileName,
                                        no_graphics=self.no_graphics)

            # get the default brain
            self.brain_name = self.env.brain_names[0]
            self.brain = self.env.brains[self.brain_name]
            self.env_info = self.env.reset(
                train_mode=self.trainMode)[self.brain_name]
            self.num_agents = len(self.env_info.agents)
            self.action_size = self.brain.vector_action_space_size
        except Exception as e:
            raise type(e)('lib.envs.envUnity.Env.__enter__ - ERROR - ' +
                          str(e)).with_traceback(sys.exc_info()[2])

        return self

    def reset(self):
        '''Reset the environment before starting an episode

        Returns:
            The vector observations of all agents after the reset
        '''
        try:
            # Keep the result of THIS reset; reading the observations cached
            # by ``__enter__`` would hand back a stale initial state.
            self.env_info = self.env.reset(
                train_mode=self.trainMode)[self.brain_name]
            self.states = self.env_info.vector_observations
        except Exception as e:
            raise type(e)('lib.envs.envUnity.Env.reset - ERROR - ' +
                          str(e)).with_traceback(sys.exc_info()[2])

        return self.states

    def step(self, policy):
        '''Advance one step by taking an action

        The policy is called with a copy of the current states and must
        return one action per agent. The environment is advanced by one step
        and the new states are stored for the next call. The policy's output
        is not validated.

        Arguments:
            policy {function} -- Takes the state vectors and returns the
                action vectors.

        Returns:
            list -- One tuple ``(s_t, a_t, r_{t+1}, s_{t+1}, d)`` per agent.
                A list is returned even when there is a single agent.
        '''
        try:
            states = self.states.copy()
            actions = policy(states)
            env_info = self.env.step(actions)[self.brain_name]

            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            self.states = next_states

            results = [
                (states[i], actions[i], rewards[i], next_states[i], dones[i])
                for i in range(self.num_agents)
            ]
        except Exception as e:
            raise type(e)('lib.envs.envUnity.Env.step - ERROR - ' +
                          str(e)).with_traceback(sys.exc_info()[2])

        return results

    def episode(self, policy, maxSteps=None):
        '''Generate data for an entire episode

        Resets the environment and then steps it until any agent reports
        ``done`` or ``maxSteps`` steps have been taken.

        Arguments:
            policy {function} -- Takes the current states and returns the
                action vectors.

        Keyword Arguments:
            maxSteps {int or None} -- The maximum number of steps to play
                before the episode is cut off (default: {None}, in which case
                the episode continues until it actually finishes)

        Returns:
            list -- A list of lists, one per agent, each holding that agent's
                transition tuples in order. A list of lists is returned even
                when there is a single agent.
        '''
        try:
            self.reset()
            stepCount = 0
            allResults = [[] for _ in range(self.num_agents)]

            while True:
                stepCount += 1
                results = self.step(policy)

                finished = False
                for agent in range(self.num_agents):
                    transition = results[agent]
                    allResults[agent].append(transition)
                    # Index 4 of a transition is the agent's done flag.
                    finished = finished or transition[4]

                if finished:
                    break
                if (maxSteps is not None) and (stepCount >= maxSteps):
                    break
        except Exception as e:
            raise type(e)('lib.envs.envUnity.Env.episode - ERROR - ' +
                          str(e)).with_traceback(sys.exc_info()[2])

        return allResults

    def __exit__(self, exc, value, traceback):
        '''Exit the context manager and shut the environment down

        The environment is always closed so the Unity process and its socket
        are released, whether or not the ``with`` body raised.

        Arguments:
            exc -- Exception type raised in the ``with`` body, or ``None``
            value -- Exception instance, or ``None``
            traceback -- Traceback object, or ``None``

        Returns:
            bool -- Always ``True``.
        '''
        self.env.close()
        # NOTE(review): returning True suppresses every exception raised in
        # the ``with`` body. Kept for backward compatibility - confirm that
        # callers really want errors swallowed here.
        return True
Number of External Brains : 1 Lesson number : 0 Reset Parameters : goal_speed -> 1.0 goal_size -> 5.0 Unity brain name: ReacherBrain Number of Visual Observations (per agent): 0 Vector Observation space type: continuous Vector Observation space size (per agent): 33 Number of stacked Vector Observation: 1 Vector Action space type: continuous Vector Action space size (per agent): 4 Vector Action descriptions: , , , ''' # reset the environment env_info = env.reset(train_mode=True)[brain_name] # number of agents num_agents = len(env_info.agents) print('Number of agents:', num_agents) # size of each action action_size = brain.vector_action_space_size print('Size of each action:', action_size) # examine the state space states = env_info.vector_observations state_size = states.shape[1] print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size)) print('The state for the first agent looks like:', states[0])
class UnityEnv:
    '''
    Unity environment wrapper that standardizes the UnityEnv design to work in Lab.
    Access Agents properties by: Agents - AgentSpace - AEBSpace - EnvSpace - Envs
    Index ``a`` addresses a Unity brain (the lab's agent) and ``b`` a body of that agent.
    '''

    def __init__(self, env_spec, env_space, e=0):
        self.env_spec = env_spec
        self.env_space = env_space
        self.info_space = env_space.info_space
        self.e = e
        util.set_attr(self, self.env_spec)
        self.name = self.env_spec['name']
        # body containers are filled in later, before post_body_init runs
        self.body_e = None
        self.nanflat_body_e = None  # nanflatten version of bodies
        self.body_num = None

        # derive a 4-digit worker id from the pid, env index and a unique id
        id_str = f'{os.getpid()}{self.e+int(ps.unique_id())}'
        worker_id = int(id_str[-4:])
        self.u_env = UnityEnvironment(file_name=util.get_env_path(self.name), worker_id=worker_id)

        # spaces for NN auto input/output inference
        logger.warn('Unity environment observation_space and action_space are constructed with invalid range. Use only their shapes.')
        self.observation_spaces = []
        self.action_spaces = []
        for a, _name in enumerate(self.u_env.brain_names):
            observation_shape = (self.get_observable_dim(a)['state'],)
            if self.get_brain(a).state_space_type == 'discrete':
                low, high, dtype = 0, 1, np.int32
            else:
                low, high, dtype = 0.0, 1.0, np.float32
            self.observation_spaces.append(
                gym.spaces.Box(low=low, high=high, shape=observation_shape, dtype=dtype))

            if self.is_discrete(a):
                action_space = gym.spaces.Discrete(self.get_action_dim(a))
            else:
                action_space = gym.spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32)
            self.action_spaces.append(action_space)

        for observation_space, action_space in zip(self.observation_spaces, self.action_spaces):
            set_gym_space_attr(observation_space)
            set_gym_space_attr(action_space)

        # TODO experiment to find out optimal benchmarking max_timestep, set
        # TODO ensure clock_speed from env_spec
        self.clock_speed = 1
        self.clock = Clock(self.clock_speed)
        self.done = False

    def check_u_brain_to_agent(self):
        '''Assert that the number of unity brains equals the number of agents'''
        u_brain_num = self.u_env.number_brains
        agent_num = len(self.body_e)
        assert u_brain_num == agent_num, f'There must be a Unity brain for each agent. e:{self.e}, brain: {u_brain_num} != agent: {agent_num}.'

    def check_u_agent_to_body(self, env_info_a, a):
        '''Assert that the number of unity agents equals the number of non-nan bodies of agent a'''
        u_agent_num = len(env_info_a.agents)
        body_num = util.count_nonan(self.body_e[a])
        assert u_agent_num == body_num, f'There must be a Unity agent for each body; a:{a}, e:{self.e}, agent_num: {u_agent_num} != body_num: {body_num}.'

    def get_brain(self, a):
        '''Return the unity brain (the unity-equivalent of an agent) at index a'''
        return self.u_env.brains[self.u_env.brain_names[a]]

    def get_env_info(self, env_info_dict, a):
        '''Return the entry of env_info_dict that belongs to the brain at index a'''
        return env_info_dict[self.u_env.brain_names[a]]

    @lab_api
    def post_body_init(self):
        '''Run init for components that need bodies to exist first, e.g. memory or architecture.'''
        self.nanflat_body_e = util.nanflatten(self.body_e)
        for flat_idx, flat_body in enumerate(self.nanflat_body_e):
            flat_body.nanflat_e_idx = flat_idx
        self.body_num = len(self.nanflat_body_e)
        self.check_u_brain_to_agent()
        logger.info(util.self_desc(self))

    def is_discrete(self, a):
        '''Tell whether the brain at index a takes discrete actions'''
        brain = self.get_brain(a)
        return brain.is_discrete()

    def get_action_dim(self, a):
        '''Return the action dim of the brain at index a'''
        brain = self.get_brain(a)
        return brain.get_action_dim()

    def get_action_space(self, a):
        '''Return the action space built for the brain at index a'''
        return self.action_spaces[a]

    def get_observable_dim(self, a):
        '''Return the observable dim of the brain at index a'''
        brain = self.get_brain(a)
        return brain.get_observable_dim()

    def get_observable_types(self, a):
        '''Return the observable types of the brain at index a'''
        brain = self.get_brain(a)
        return brain.get_observable_types()

    def get_observation_space(self, a):
        '''Return the observation space built for the brain at index a'''
        return self.observation_spaces[a]

    @lab_api
    def reset(self):
        '''Reset the unity env and return the (reward, state, done) data for this env'''
        self.done = False
        train_mode = (util.get_lab_mode() != 'dev')
        env_info_dict = self.u_env.reset(train_mode=train_mode, config=self.env_spec.get('unity'))
        _reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
        for (a, b), body in util.ndenumerate_nonan(self.body_e):
            env_info_a = self.get_env_info(env_info_dict, a)
            self.check_u_agent_to_body(env_info_a, a)
            state_e[(a, b)] = env_info_a.states[b]
            done_e[(a, b)] = self.done
        return _reward_e, state_e, done_e

    @lab_api
    def step(self, action_e):
        '''Step the unity env with the flattened actions; resets instead if the env is done'''
        # TODO implement clock_speed: step only if self.clock.to_step()
        if self.done:
            return self.reset()
        flat_action_e = util.nanflatten(action_e)
        env_info_dict = self.u_env.step(flat_action_e)
        reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
        for (a, b), body in util.ndenumerate_nonan(self.body_e):
            env_info_a = self.get_env_info(env_info_dict, a)
            ab = (a, b)
            reward_e[ab] = env_info_a.rewards[b]
            state_e[ab] = env_info_a.states[b]
            done_e[ab] = env_info_a.local_done[b]
        # done when every body is done, or the clock passed the timestep limit
        self.done = (util.nonan_all(done_e) or self.clock.get('t') > self.max_timestep)
        return reward_e, state_e, done_e

    @lab_api
    def close(self):
        '''Close the underlying unity env'''
        self.u_env.close()
class UnityEnv(BaseEnv):
    r"""
    Basic Unity ML Agent environment (only the first brain and its first agent are used).

    config example:
    "env": {
        "name": "Reacher",
        "type": "unity",
        "seed": 0,
        "to_render": True,
        "frame_sleep": 0.001,
        "max_steps": 1000,
        "one_hot": None,
        "action_bins": None,
        "reward_scale": None,
        "num_envs": None,
    }
    """

    def __init__(self, config):
        super(UnityEnv, self).__init__(config)
        env_file = get_env_path(self.name)
        self._env = UnityEnvironment(file_name=env_file, seed=self.seed)
        self.patch_gym_spaces(self._env)
        self._set_attr_from_u_env(self._env)
        # TODO: Logging
        print(utils.describe(self))

    def reset(self):
        r"""
        Reset the Unity environment.
        :return: vector observation of the first agent of the first brain
        """
        self.done = False
        # NOTE(review): train_mode is driven by to_render as-is - confirm this is not inverted.
        info_dict = self._env.reset(train_mode=self.to_render)
        return self._get_env_info(info_dict, 0).vector_observations[0]

    def step(self, action):
        r"""
        Advance the Unity environment by one step.
        :param action: action passed straight to the Unity environment
        :return: (state, reward, done, env_info) for the first agent of the first brain
        """
        env_info = self._get_env_info(self._env.step(action), 0)
        return (
            env_info.vector_observations[0],
            env_info.rewards[0],
            env_info.local_done[0],
            env_info,
        )

    def render(self):
        pass

    def close(self):
        self._env.close()

    def _get_brain(self, env, brain_index):
        r"""
        Get the unity-equivalent of agent, i.e. brain, to access its info
        :param env: Unity environment
        :param brain_index: position of the brain in env.brain_names
        :return: the brain object
        """
        return env.brains[env.brain_names[brain_index]]

    def patch_gym_spaces(self, env):
        r"""
        For standardization, use gym spaces to represent observation and action spaces for Unity.
        Iterates through all brains (multiagent), builds one observation space and one action
        space per brain, and sets the first pair on env as observation_space / action_space.
        :param env: Unity environment
        :return: (observation_spaces, action_spaces) lists
        """
        observation_spaces, action_spaces = [], []
        for brain_index, _ in enumerate(env.brain_names):
            brain = self._get_brain(env, brain_index)
            # TODO: Logging
            utils.describe(brain)
            observation_shape = (brain.get_observable_dim()['state'],)
            action_dim = (brain.get_action_dim(),)
            discrete = brain.is_discrete()
            dtype = np.int32 if discrete else np.float32
            if discrete:
                action_space = spaces.Discrete(brain.get_action_dim())
            else:
                action_space = spaces.Box(low=0.0, high=1.0, shape=action_dim, dtype=dtype)
            # the observation dtype follows the action type here
            observation_space = spaces.Box(low=0, high=1, shape=observation_shape, dtype=dtype)
            utils.set_gym_space_attr(observation_space)
            utils.set_gym_space_attr(action_space)
            observation_spaces.append(observation_space)
            action_spaces.append(action_space)

        # set for singleton
        env.observation_space = observation_spaces[0]
        env.action_space = action_spaces[0]
        return observation_spaces, action_spaces

    def _get_env_info(self, env_info_dict, index):
        r"""
        Unity API returns a env_info_dict. Use this method to pull the brain(env)-specific entry.
        :param env_info_dict: dict keyed by brain name
        :param index: position of the brain in brain_names
        :return: the entry for that brain
        """
        return env_info_dict[self._env.brain_names[index]]
class ExperienceManager:
    """Runs an Agent against a Unity environment and records episode scores."""

    def __init__(self):
        # placeholders, populated by initEnviroment()
        self.env = None
        self.brain_name = None
        self.agent = None

    def initEnviroment(self):
        """Start the Unity environment, read the space sizes and build the Agent."""
        print('Initialize env')
        # launch the Unity binary and pick its default brain
        self.env = UnityEnvironment(file_name=BANANA_INSTALLATION)
        self.brain_name = self.env.brain_names[0]

        # an initial reset gives access to the first observation
        first_info = self.env.reset(train_mode=TRAIN_MODE)[self.brain_name]

        # sizes of the action and state spaces
        brain = self.env.brains[self.brain_name]
        self.action_size = brain.vector_action_space_size
        self.state_size = len(first_info.vector_observations[0])

        self.agent = Agent(state_size=self.state_size,
                           action_size=self.action_size)
        print('Env init done')

    def runEpisode(self):
        """Play one episode with greedy actions and return its total reward."""
        total = 0
        reset_info = self.env.reset(train_mode=TRAIN_MODE)[self.brain_name]
        state = reset_info.vector_observations[0]

        finished = False
        while not finished:
            action = self.agent.greedy_action(state)
            step_info = self.env.step(action)[self.brain_name]

            next_state = step_info.vector_observations[0]
            reward = step_info.rewards[0]
            finished = step_info.local_done[0]

            # hand the transition to the agent (stores it and learns)
            self.agent.step(state, action, reward, next_state, finished)

            state = next_state
            total += reward

        return total

    def runEperiment(self, n_episodes=EPISODES_NUM):
        """Initialize the environment, run n_episodes episodes, return all scores."""
        self.initEnviroment()

        scores = []
        scores_window = deque(maxlen=100)  # last 100 scores
        template = '\rEpisode {}\tAverage Score: {:.2f}'

        for i_episode in range(1, n_episodes + 1):
            episode_score = self.runEpisode()
            scores_window.append(episode_score)
            scores.append(episode_score)

            # progress line that overwrites itself
            print(template.format(i_episode, np.mean(scores_window)), end="")
            # keep one line on screen for every 100 episodes
            if i_episode % 100 == 0:
                print(template.format(i_episode, np.mean(scores_window)))

        return scores
class TrainerController(object): def __init__(self, env_path, run_id, save_freq, curriculum_file, fast_simulation, load, train, worker_id, keep_checkpoints, lesson, seed, docker_target_name, trainer_config_path, use_data_gatherer): """ :param env_path: Location to the environment executable to be loaded. :param run_id: The sub-directory name for model and summary statistics :param save_freq: Frequency at which to save model :param curriculum_file: Curriculum json file for environment :param fast_simulation: Whether to run the game at training speed :param load: Whether to load the model or randomly initialize :param train: Whether to train model, or only run inference :param worker_id: Number to add to communication port (5005). Used for multi-environment :param keep_checkpoints: How many model checkpoints to keep :param lesson: Start learning from this lesson :param seed: Random seed used for training. :param docker_target_name: Name of docker volume that will contain all data. :param trainer_config_path: Fully qualified path to location of trainer configuration file """ ''' Here's a small change (this only happens if code is launched with the '--data-gatherer' flag) ''' self.use_data_gatherer = use_data_gatherer self.trainer_config_path = trainer_config_path env_path = (env_path.strip().replace('.app', '').replace( '.exe', '').replace('.x86_64', '').replace('.x86', '') ) # Strip out executable extensions if passed # Recognize and use docker volume if one is passed as an argument if docker_target_name == '': self.docker_training = False self.model_path = './models/{run_id}'.format(run_id=run_id) self.curriculum_file = curriculum_file self.summaries_dir = './summaries' else: self.docker_training = True self.model_path = '/{docker_target_name}/models/{run_id}'.format( docker_target_name=docker_target_name, run_id=run_id) env_path = '/{docker_target_name}/{env_name}'.format( docker_target_name=docker_target_name, env_name=env_path) if curriculum_file is None: 
self.curriculum_file = None else: self.curriculum_file = '/{docker_target_name}/{curriculum_file}'.format( docker_target_name=docker_target_name, curriculum_file=curriculum_file) self.summaries_dir = '/{docker_target_name}/summaries'.format( docker_target_name=docker_target_name) self.logger = logging.getLogger("unityagents") self.run_id = run_id self.save_freq = save_freq self.lesson = lesson self.fast_simulation = fast_simulation self.load_model = load self.train_model = train self.worker_id = worker_id self.keep_checkpoints = keep_checkpoints self.trainers = {} if seed == -1: seed = np.random.randint(0, 999999) self.seed = seed np.random.seed(self.seed) tf.set_random_seed(self.seed) self.env = UnityEnvironment(file_name=env_path, worker_id=self.worker_id, curriculum=self.curriculum_file, seed=self.seed, docker_training=self.docker_training) self.env_name = os.path.basename( os.path.normpath(env_path)) # Extract out name of environment def _get_progress(self): if self.curriculum_file is not None: progress = 0 if self.env.curriculum.measure_type == "progress": for brain_name in self.env.external_brain_names: progress += self.trainers[ brain_name].get_step / self.trainers[ brain_name].get_max_steps return progress / len(self.env.external_brain_names) elif self.env.curriculum.measure_type == "reward": for brain_name in self.env.external_brain_names: progress += self.trainers[brain_name].get_last_reward return progress else: return None else: return None def _process_graph(self): nodes = [] scopes = [] for brain_name in self.trainers.keys(): if self.trainers[brain_name].graph_scope is not None: scope = self.trainers[brain_name].graph_scope + '/' if scope == '/': scope = '' scopes += [scope] if self.trainers[brain_name].parameters[ "trainer"] == "imitation": nodes += [scope + x for x in ["action"]] elif not self.trainers[brain_name].parameters["use_recurrent"]: nodes += [ scope + x for x in ["action", "value_estimate", "action_probs"] ] else: node_list = [ "action", 
"value_estimate", "action_probs", "recurrent_out", "memory_size" ] nodes += [scope + x for x in node_list] if len(scopes) > 1: self.logger.info("List of available scopes :") for scope in scopes: self.logger.info("\t" + scope) self.logger.info("List of nodes to export :") for n in nodes: self.logger.info("\t" + n) return nodes def _save_model(self, sess, saver, steps=0): """ Saves current model to checkpoint folder. :param sess: Current Tensorflow session. :param steps: Current number of steps in training process. :param saver: Tensorflow saver for session. """ last_checkpoint = self.model_path + '/model-' + str(steps) + '.cptk' saver.save(sess, last_checkpoint) tf.train.write_graph(sess.graph_def, self.model_path, 'raw_graph_def.pb', as_text=False) self.logger.info("Saved Model") def _export_graph(self): """ Exports latest saved model to .bytes format for Unity embedding. """ target_nodes = ','.join(self._process_graph()) ckpt = tf.train.get_checkpoint_state(self.model_path) freeze_graph.freeze_graph( input_graph=self.model_path + '/raw_graph_def.pb', input_binary=True, input_checkpoint=ckpt.model_checkpoint_path, output_node_names=target_nodes, ######## FOLOWING LINE UGLY FIX: only return first 20 characters of run_id ###### output_graph=self.model_path + '/' + self.env_name + "_" + self.run_id[:20] + '.bytes', clear_devices=True, initializer_nodes="", input_saver="", restore_op_name="save/restore_all", filename_tensor_name="save/Const:0") def _initialize_trainers(self, trainer_config, sess): trainer_parameters_dict = {} self.trainers = {} for brain_name in self.env.external_brain_names: trainer_parameters = trainer_config['default'].copy() if len(self.env.external_brain_names) > 1: graph_scope = re.sub('[^0-9a-zA-Z]+', '-', brain_name) trainer_parameters['graph_scope'] = graph_scope trainer_parameters['summary_path'] = '{basedir}/{name}'.format( basedir=self.summaries_dir, name=str(self.run_id) + '_' + graph_scope) else: trainer_parameters['graph_scope'] = '' 
trainer_parameters['summary_path'] = '{basedir}/{name}'.format( basedir=self.summaries_dir, name=str(self.run_id)) if brain_name in trainer_config: _brain_key = brain_name while not isinstance(trainer_config[_brain_key], dict): _brain_key = trainer_config[_brain_key] for k in trainer_config[_brain_key]: trainer_parameters[k] = trainer_config[_brain_key][k] trainer_parameters_dict[brain_name] = trainer_parameters.copy() for brain_name in self.env.external_brain_names: if trainer_parameters_dict[brain_name]['trainer'] == "imitation": self.trainers[brain_name] = BehavioralCloningTrainer( sess, self.env, brain_name, trainer_parameters_dict[brain_name], self.train_model, self.seed) elif trainer_parameters_dict[brain_name]['trainer'] == "ppo": self.trainers[brain_name] = PPOTrainer( sess, self.env, brain_name, trainer_parameters_dict[brain_name], self.train_model, self.seed, self.use_data_gatherer) else: raise UnityEnvironmentException( "The trainer config contains an unknown trainer type for brain {}" .format(brain_name)) def _load_config(self): try: with open(self.trainer_config_path) as data_file: trainer_config = yaml.load(data_file) return trainer_config except IOError: raise UnityEnvironmentException( """Parameter file could not be found here {}. Will use default Hyper parameters""" .format(self.trainer_config_path)) except UnicodeDecodeError: raise UnityEnvironmentException( "There was an error decoding Trainer Config from this path : {}" .format(self.trainer_config_path)) @staticmethod def _create_model_path(model_path): try: if not os.path.exists(model_path): os.makedirs(model_path) except Exception: raise UnityEnvironmentException( "The folder {} containing the generated model could not be accessed." 
" Please make sure the permissions are set correctly.".format( model_path)) def start_learning(self): self.env.curriculum.set_lesson_number(self.lesson) trainer_config = self._load_config() self._create_model_path(self.model_path) tf.reset_default_graph() with tf.Session() as sess: self._initialize_trainers(trainer_config, sess) for k, t in self.trainers.items(): self.logger.info(t) init = tf.global_variables_initializer() saver = tf.train.Saver(max_to_keep=self.keep_checkpoints) # Instantiate model parameters if self.load_model: self.logger.info('Loading Model...') ckpt = tf.train.get_checkpoint_state(self.model_path) if ckpt is None: self.logger.info( 'The model {0} could not be found. Make sure you specified the right ' '--run-id'.format(self.model_path)) saver.restore(sess, ckpt.model_checkpoint_path) else: sess.run(init) global_step = 0 # This is only for saving the model self.env.curriculum.increment_lesson(self._get_progress()) curr_info = self.env.reset(train_mode=self.fast_simulation) if self.train_model: for brain_name, trainer in self.trainers.items(): trainer.write_tensorboard_text('Hyperparameters', trainer.parameters) try: while any([ t.get_step <= t.get_max_steps for k, t in self.trainers.items() ]) or not self.train_model: if debug_print: print("|", end='', flush=True) if self.env.global_done: self.env.curriculum.increment_lesson( self._get_progress()) curr_info = self.env.reset( train_mode=self.fast_simulation) for brain_name, trainer in self.trainers.items(): trainer.end_episode() if data_gatherer['reset_after_each_frame']: curr_info = self.env.reset( train_mode=self.fast_simulation) # Decide and take an action take_action_vector, take_action_memories, take_action_text, take_action_outputs = {}, {}, {}, {} for brain_name, trainer in self.trainers.items(): (take_action_vector[brain_name], take_action_memories[brain_name], take_action_text[brain_name], take_action_outputs[brain_name] ) = trainer.take_action(curr_info) new_info = 
self.env.step(vector_action=take_action_vector, memory=take_action_memories, text_action=take_action_text) ''' ----- ''' ''' Enabling data gathering disables the normal functionality.... ''' if self.use_data_gatherer: if data_gatherer['firstRun']: print("---") print("NORMAL FUNCTIONALITY DISABLED!") print( "Now we just sample stats from the initial distribution and save them:" ) print("Save dir: {}".format(data_gatherer['dir'])) print("---") print( "If you did not expect to see this, NOW is the time to [ctrl-C]! (otherwise: [enter] to continue...)" ) ''' Create the folder-structure if it is needed: ''' paths = [ settings['dir_base'], settings['dir_base'] + settings['project'], data_gatherer['dir'] ] for p in paths: if not os.path.isdir(p): os.makedirs(p) print("Created path: {}".format(p)) else: print("Reusing existing: {}".format(p)) ape = input() data_gatherer['firstRun'] = False #if data_gatherer['reset_after_each_frame']: # curr_info = self.env.reset(train_mode=self.fast_simulation) # take_action_vector, take_action_memories, take_action_text, take_action_outputs = {}, {}, {}, {} # for brain_name, trainer in self.trainers.items(): # (take_action_vector[brain_name], # take_action_memories[brain_name], # take_action_text[brain_name], # take_action_outputs[brain_name]) = trainer.take_action(curr_info) # new_info = self.env.step(vector_action=take_action_vector, memory=take_action_memories, # text_action=take_action_text) is_done = False for x in new_info: for l in range(len(new_info[x].agents)): is_done = is_done or new_info[x].local_done[l] if data_gatherer['idx'] == data_gatherer[ 'n'] or is_done: #WRITE_TO_FILE.... print("Saving chunk {}... 
({} samples)".format( data_gatherer['n_chunks'], data_gatherer['idx'])) with open( data_gatherer['dir'] + data_gatherer['file_base'] + "chunk{}.pkl".format( str(data_gatherer['n_chunks']).zfill( 5)), 'wb') as outfile: pickle.dump( data_gatherer['data'] [:data_gatherer['idx'], :, :, :].reshape( (-1, ) + data_gatherer['obs_size']), outfile, pickle.HIGHEST_PROTOCOL) #Prep next: data_gatherer['n_chunks'] += 1 data_gatherer['data'] = np.empty( data_gatherer['size'], dtype=np.uint8) data_gatherer['idx'] = 0 if data_gatherer['n_chunks'] == 1500: print("Total samples gathered: {}".format( (data_gatherer['n_chunks'] - 1000) * 1000)) exit() data_gatherer['data'][ data_gatherer['idx'], :, :, :] = ( 255 * new_info["PepperBrain"].visual_observations[0] ).astype(np.uint8) data_gatherer['idx'] += 1 if data_gatherer['reset_after_each_frame']: continue ''' ----- ''' if settings['store_as_int']: for key in new_info: for x in range( len(new_info[key].visual_observations)): new_info[key].visual_observations[x] = ( 255 * new_info[key].visual_observations[x] ).astype(np.uint8) for brain_name, trainer in self.trainers.items(): if debug_print: print(".", end='', flush=True) trainer.add_experiences( curr_info, new_info, take_action_outputs[brain_name]) trainer.process_experiences(curr_info, new_info) if trainer.is_ready_update( ) and self.train_model and trainer.get_step <= trainer.get_max_steps: if debug_print: print("!", end='', flush=True) # Perform gradient descent with experience buffer print("Updating model... ", end='', flush=True) t = time.time() trainer.update_model() print("[x] Done in {} seconds.".format( time.time())) # Write training statistics to Tensorboard. 
if debug_print: print(",", end='', flush=True) trainer.write_summary( self.env.curriculum.lesson_number) if self.train_model and trainer.get_step <= trainer.get_max_steps: if debug_print: print("?", end='', flush=True) trainer.increment_step() trainer.update_last_reward() if self.train_model and trainer.get_step <= trainer.get_max_steps: global_step += 1 if global_step % self.save_freq == 0 and global_step != 0 and self.train_model: if debug_print: print("x", end='', flush=True) # Save Tensorflow model self._save_model(sess, steps=global_step, saver=saver) curr_info = new_info # Final save Tensorflow model if global_step != 0 and self.train_model: self._save_model(sess, steps=global_step, saver=saver) except KeyboardInterrupt: if self.train_model: self.logger.info( "Learning was interrupted. Please wait while the graph is generated." ) self._save_model(sess, steps=global_step, saver=saver) pass self.env.close() if self.train_model: self._export_graph()
# ---------------------------------------------------------------------------
# Stage 1: supervised fit of `model` on the recorded (state, action) pairs.
# ---------------------------------------------------------------------------
num_epochs = 300
batch_size = 128
model.fit(
    states_train,
    actions_train,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_split=0.1,
    shuffle=True,
)
print()
print(model.metrics_names)
test_error = model.evaluate(states_test, actions_test)
print("Test error:", test_error)

# ---------------------------------------------------------------------------
# Stage 2: roll the fitted model out in the simulator (DAgger iterations).
# ---------------------------------------------------------------------------
env = UnityEnvironment(file_name="drone_sim_external", worker_id=0)
num_dagger_iterations = 10
steps = 1000
for iterations in range(num_dagger_iterations):
    done = False
    env.reset(train_mode=False)
    # Every roll-out starts from an all-zero (1, 13) state.
    states = np.zeros((1, 13))
    threshold = 10
    for i in range(steps):
        predicted = model.predict(states)
        # A trailing 0 is appended to the predicted action before it is
        # sent to the simulator.
        action = np.hstack((predicted[0], 0))
        brainInf = env.step(action)['DroneBrain']
        states = brainInf.states
        delta = states[0][3:6] - states[0][9:12]
        norm = np.linalg.norm(delta)
        # TODO: FIGURE OUT HOW TO GENERATE LABELED ACTIONS

model.save("trained_model.h5")
class UnityEnvHelper:
    """Convenience wrapper exposing the first agent of a Unity environment.

    The helper launches a ``UnityEnvironment``, selects its first brain and
    keeps the most recent ``BrainInfo`` in ``self.ue_info`` so that the
    state, reward and done flag of agent 0 can be read back after every
    ``reset``/``step``.
    """

    def __init__(self, file_name, no_graphics=True, seed=8888):
        """Launch the environment and cache action/state sizes.

        :param file_name: Path of the Unity environment to launch.
        :param no_graphics: Forwarded to ``UnityEnvironment``.
        :param seed: Seed forwarded to ``UnityEnvironment``.
        """
        self.seed = seed
        self.uenv = UnityEnvironment(file_name=file_name,
                                     seed=self.seed,
                                     no_graphics=no_graphics)
        # The first brain reported by the environment is the one we drive.
        self.brain_name = self.uenv.brain_names[0]
        self.brain = self.uenv.brains[self.brain_name]
        self.action_size = self.brain.vector_action_space_size
        # Reset in training mode so an observation is available for sizing.
        self.reset(True)
        self.state_size = len(self.ue_info.vector_observations[0])

    def __del__(self):
        """Best-effort shutdown of the wrapped environment."""
        try:
            self.uenv.close()
            del self.uenv
        except Exception:
            # Covers a missing ``uenv`` (constructor failed, or already
            # closed) and errors raised by ``close``.  KeyboardInterrupt and
            # SystemExit are deliberately NOT swallowed.
            pass

    def reset(self, train_mode=True):
        """Restart the episode and remember the resulting ``BrainInfo``.

        :param train_mode: Forwarded to ``UnityEnvironment.reset``.
        """
        self.ue_info = self.uenv.reset(train_mode=train_mode)[self.brain_name]

    def step(self, state, action):
        """Apply ``action`` and return the resulting transition.

        :param state: The state the action was chosen in; it is passed in
            only so that it can be echoed back in the returned dict.
        :param action: Action forwarded to ``UnityEnvironment.step``.
        :return: dict with keys ``state``, ``action``, ``reward``,
            ``next_state`` and ``done``.
        """
        self.ue_info = self.uenv.step(action)[self.brain_name]
        return {
            'state': state,
            'action': action,
            'reward': self.reward(),
            'next_state': self.state(),
            'done': self.done()
        }

    def state(self):
        """Return agent 0's vector observation from the last reset/step."""
        return self.ue_info.vector_observations[0]

    def reward(self):
        """Return agent 0's reward from the last reset/step."""
        return self.ue_info.rewards[0]

    def done(self):
        """Return agent 0's done flag from the last reset/step."""
        return self.ue_info.local_done[0]
class Environment():
    """
    Gym-style facade over a Unity environment.

    Only the first brain and the first agent are exposed, and `step`
    returns a 4-tuple shaped like Gym's, so DQN code written against Gym
    can be pointed at a Unity build with minimal changes.
    """

    def __init__(self, filename_path, worker_id=0, train_mode=True,
                 no_graphics=False, seed=0):
        # Launch the Unity build.
        self._env = UnityEnvironment(
            file_name=filename_path,
            worker_id=worker_id,
            no_graphics=no_graphics,
            seed=seed,
        )

        # The default brain is the first one the environment reports.
        first_brain = self._env.brain_names[0]
        self._brain_name = first_brain
        self._brain = self._env.brains[first_brain]

        # Prime the wrapper with an initial observation.
        self.train_mode = train_mode
        initial = self._env.reset(train_mode=train_mode)
        self._env_info = initial[first_brain]
        self._state = self._env_info.vector_observations[0]

        # Sizes exposed to the learning algorithm.
        self.state_size = len(self._state)
        self.action_size = self._brain.vector_action_space_size

    def reset(self):
        """Reset the environment and return the first agent's state vector."""
        by_brain = self._env.reset(train_mode=self.train_mode)
        self._env_info = by_brain[self._brain_name]
        self._state = self._env_info.vector_observations[0]
        return self._state

    def step(self, action):
        """Send `action` and return `(next_state, reward, done, dummy)`.

        The trailing `dummy` (always 0) only keeps the tuple the same
        length as Gym's.
        """
        self._env_info = self._env.step(action)[self._brain_name]
        latest = self._env_info
        return (
            latest.vector_observations[0],
            latest.rewards[0],
            latest.local_done[0],
            0,
        )

    def close(self):
        """Shut down the wrapped Unity environment."""
        self._env.close()
os.makedirs(summary_path) init = tf.global_variables_initializer() saver = tf.train.Saver() with tf.Session() as sess: # Instantiate model parameters if load_model: print('Loading Model...') ckpt = tf.train.get_checkpoint_state(model_path) saver.restore(sess, ckpt.model_checkpoint_path) else: sess.run(init) steps = sess.run(ppo_model.global_step) summary_writer = tf.summary.FileWriter(summary_path) info = env.reset(train_mode=train_model)[brain_name] trainer = Trainer(ppo_model, sess, info, is_continuous, use_observations) while steps <= max_steps or not train_model: if env.global_done: info = env.reset(train_mode=train_model)[brain_name] # Decide and take an action new_info = trainer.take_action(info, env, brain_name) info = new_info trainer.process_experiences(info, time_horizon, gamma, lambd) if len(trainer.training_buffer['actions']) > buffer_size and train_model: # Perform gradient descent with experience buffer trainer.update_model(batch_size, num_epoch) if steps % summary_freq == 0 and steps != 0 and train_model: # Write training statistics to tensorboard. trainer.write_summary(summary_writer, steps) if steps % save_freq == 0 and steps != 0 and train_model:
if avg_score >= TARGET_SCORE: torch.save(agents.actor_local.state_dict(), "ckpt/{}".format(ACTOR_CHECKPOINT_NAME)) torch.save(agents.critic_local.state_dict(), "ckpt/{}".format(CRITIC_CHECKPOINT_NAME)) break return scores env = UnityEnvironment(file_name=ENV_PATH, no_graphics=GRAPHICS_OFF) brain_name = env.brain_names[0] brain = env.brains[brain_name] env_info = env.reset(train_mode=GRAPHICS_OFF)[brain_name] num_agents = len(env_info.agents) action_size = brain.vector_action_space_size states = env_info.vector_observations state_size = states.shape[1] print('Number of agents: {}'.format(num_agents)) print('Number of actions: {}'.format(action_size)) print('Number of states: {}'.format(state_size)) print('First state: {}'.format(states[0])) if torch.cuda.is_available(): print("trainining on GPU") else: print("training on CPU")
# env_1.reset(train_mode=False) # env_2.reset(train_mode=False) env_path = util.get_env_path('gridworld') # use train_mode = False to debug, i.e. render env at real size, real time train_mode = False # UnityEnvironment interfaces python with Unity, # and contains brains for controlling connected agents. env = UnityEnvironment(file_name=env_path) print(str(env)) # get the default brain default_brain = env.brain_names[0] brain = env.brains[default_brain] env_info = env.reset(train_mode=train_mode)[default_brain] ''' is_continuous = (brain.action_space_type == 'continuous') use_observations = (brain.number_observations > 0) use_states = (brain.state_space_size > 0) - reset env with param, returns dict of {brain: BrainInfo} env.reset(train_mode=train_mode) env_info = env.reset(train_mode=train_mode)[default_brain] - list of 4D np arrays. nth element = nth observation (pixel-wise) of the brain env_info.observations - 2D np array of (batch_size, state_size) for cont and discrete env_info.states.shape - 2D np array of (batch_size, memory_size) which corresponds to
critic_layer_dim_1=args['critic_layer_dim_1'], critic_layer_dim_2=args['critic_layer_dim_2'], critic_layer_dim_3=args['critic_layer_dim_3']) return agent projects = [ "01Run", "02Run", "03Run", "04Run", "05Run", "06Run", "07Run", "08Run", "09Run", "10Run" ] dfs_args = [] agents = [] unity_environment_path = "./Tennis_Linux/Tennis.x86_64" env = UnityEnvironment(file_name=unity_environment_path) brain_name = env.brain_names[0] brain = env.brains[brain_name] env_info = env.reset(train_mode=False)[brain_name] action_size = brain.vector_action_space_size states = env_info.vector_observations state_size = states.shape[1] agent_2 = loadagent_chkpt("04Run", 9) agent_1 = loadagent_chkpt("06Run", 4) game = TourDDPG(agent_1, agent_2) result = play(env, game, 100) print(result)
def evaluate(agent_dir: Path,
             number_of_episodes: int = 1000,
             maximum_timestaps: int = 1000,
             environment_path: str = DEFAULT_ENVIRONMENT_EXECUTABLE_PATH):
    """Evaluate an agent on some episodes.

    Note that the agent is not trained during the evaluation and the
    exploration is set to 0. Thus the results really reflect the final
    performance of the agent.

    Args:
        agent_dir: Directory holding ``checkpoint.pth``. The score histogram
            (``evaluation_histogram.png``) and the raw scores
            (``scores_evaluation.csv``) are written into the same directory.
        number_of_episodes: Number of evaluation episodes to play.
        maximum_timestaps: Upper bound on the steps played per episode.
        environment_path: Path of the Unity environment executable.

    Returns:
        None. If no checkpoint exists a warning is logged and nothing is
        evaluated or written.
    """
    agent_path = agent_dir / 'checkpoint.pth'
    if not agent_path.exists():
        logging.warning(f'No saved parameters found for agent in {agent_dir}.')
        return
    hist_path = agent_dir / 'evaluation_histogram.png'
    scores_path = agent_dir / 'scores_evaluation.csv'

    env = UnityEnvironment(file_name=environment_path, no_graphics=True)
    try:
        brain_name = env.brain_names[0]
        brain = env.brains[brain_name]
        action_size = brain.vector_action_space_size
        # An initial reset is needed only to read the observation size.
        env_info = env.reset(train_mode=True)[brain_name]
        state_size = len(env_info.vector_observations[0])

        agent = DqnAgent(state_size=state_size, action_size=action_size, device=DEVICE)
        agent.load(agent_path)

        scores = []
        for _ in tqdm(range(number_of_episodes)):
            env_info = env.reset(train_mode=True)[brain_name]
            state = env_info.vector_observations[0]
            score = 0
            for _step in range(maximum_timestaps):
                # epsilon=0.0: always take the greedy action.
                action = agent.act(state, epsilon=0.0)
                env_info = env.step(action)[brain_name]
                state = env_info.vector_observations[0]
                score += env_info.rewards[0]
                if env_info.local_done[0]:
                    break
            scores.append(score)
    finally:
        # Always shut the simulator down, even if an episode raised;
        # otherwise the Unity process is left running.
        env.close()

    scores_ts = pd.Series(scores)
    plt.hist(scores, bins=100, color='steelblue')
    # The median marker spans the full current height of the histogram.
    y_limits = plt.ylim()
    med = scores_ts.median()
    plt.vlines(med, *y_limits, linewidth=2, linestyle='--', color='orange',
               label=f'median: {med}')
    plt.legend()
    plt.savefig(hist_path)
    scores_ts.to_csv(scores_path, index=False)
# Exploration script: launch a Unity environment, print its description and
# the first agent's initial state, then reset the environment once per
# episode for 100 episodes.
from unityagents import UnityEnvironment

env_name = "env19"  # Name of the Unity environment binary to launch
train_mode = True  # Whether to run the environment in training or inference mode

# Start the Unity environment identified by `env_name`.
env = UnityEnvironment(file_name=env_name)

# Examine environment parameters
print(str(env))

# Set the default brain to work with (the first one the environment reports)
default_brain = env.brain_names[0]
brain = env.brains[default_brain]

# Reset the environment and keep the info object for the default brain
env_info = env.reset(train_mode=train_mode)[default_brain]

# Examine the state space for the default brain (state of the first agent)
print("Agent state looks like: \n{}".format(env_info.states[0]))

# Examine the observation space for the default brain
# for observation in env_info.observations:
#     print("Agent observations look like:")
#     if observation.shape[3] == 3:
#         plt.imshow(observation[0,:,:,:])
#     else:
#         plt.imshow(observation[0,:,:,0])

# Each episode starts from a fresh reset; `done` is cleared at the start of
# every episode.
for episode in range(100):
    env_info = env.reset(train_mode=train_mode)[default_brain]
    done = False
class UnityEnv(gym.Env):
    """Gym adapter for a Unity environment with exactly one brain.

    Only the first agent's vector observation, reward and done flag are
    exposed. ``reset`` returns the observation wrapped in a length-1 array,
    and ``step`` wraps each of its four results the same way. When the brain
    has exactly one visual observation, every stepped frame is also appended
    to ``self.frames``.
    """

    def __init__(self, app_name=None, idx=0):
        """Launch the environment found under ``assets/<app_name>``.

        :param app_name: File name of the Unity build, resolved relative to
            the ``assets`` directory next to this module.
        :param idx: Worker id forwarded to ``UnityEnvironment``.
        """
        app_path = os.path.join(os.path.dirname(__file__), 'assets', app_name)
        no_graphics = False
        self.num_envs = 1

        # create environment
        self._env = UnityEnvironment(file_name=app_path,
                                     worker_id=idx,
                                     no_graphics=no_graphics)
        self.name = app_name

        # Only accept environments with exactly one brain
        assert len(self._env.brains) == 1
        self.brain_name = self._env.external_brain_names[0]
        self.brain = self._env.brains[self.brain_name]

        # visualization: frames are recorded only for exactly one camera
        self.use_visual = (self.brain.number_visual_observations == 1)

        # action space dimension
        if self.brain.vector_action_space_type == "discrete":
            # NOTE(review): a one-element discrete space is declared here
            # regardless of the brain's action count - confirm this is
            # intended rather than vector_action_space_size.
            self._a_dim = Discrete(1)
        else:
            high = np.array([np.inf] * (self.brain.vector_action_space_size))
            self._a_dim = Box(-high, high)

        # observation space dimension: always the vector observation.
        # (The former pixel-space branch was guarded by `and False` and
        # referenced an undefined name, so it could never run.)
        high = np.array([np.inf] * self.brain.vector_observation_space_size)
        self._ob_dim = Box(-high, high)

        # video buffer
        self.frames = []

    def reset(self):
        """Reset the environment, clear recorded frames, return the state."""
        self.frames = []
        info = self._env.reset()[self.brain_name]
        state = info.vector_observations[0]
        return np.array([state])

    def step(self, action):
        """Apply ``action`` for the single agent.

        :return: ``(state, reward, done, info)``, each wrapped in a
            length-1 array; ``info`` is always ``[None]``.
        """
        info = self._env.step([action])[self.brain_name]
        state = info.vector_observations[0]
        reward = info.rewards[0]
        done = info.local_done[0]
        # Only touch the visual observations when one exists; indexing an
        # empty list would raise IndexError for vector-only environments.
        if self.use_visual:
            self._collect_frames(info.visual_observations[0])
        return np.array([state]), np.array([reward]), np.array([done]), np.array([None])

    def close(self):
        """Shut down the wrapped Unity environment."""
        self._env.close()

    def _collect_frames(self, frame):
        """Append ``frame`` to the video buffer when visuals are enabled."""
        if self.use_visual:
            self.frames.append(frame)

    @property
    def action_space(self):
        return self._a_dim

    @property
    def observation_space(self):
        return self._ob_dim
def dqn(n_episodes=10000, max_t=1000, eps_start=1.0, eps_end=0.05,
        eps_decay=0.995, train_mode=True):
    """Deep Q-Learning.

    Launches the Banana environment, trains a freshly created ``Agent`` and
    stops early once the mean score over the last 100 episodes exceeds 13.0,
    saving the local Q-network to ``checkpoint_vanilla.pth``.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
        train_mode (bool): set environment into training mode if True.

    Returns
    =======
        list: the score of every episode played.
    """
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon

    env = UnityEnvironment(file_name="Banana/Banana.exe", base_port=64738,
                           no_graphics=True)
    try:
        brain_name = env.brain_names[0]
        brain = env.brains[brain_name]
        action_size = brain.vector_action_space_size
        # Initial reset: needed to size the agent; also starts episode 1.
        env_info = env.reset(train_mode=train_mode)[brain_name]
        state_size = len(env_info.vector_observations[0])
        agent = Agent(state_size=state_size, action_size=action_size, seed=0)

        for i_episode in range(1, n_episodes + 1):
            if i_episode > 1:
                # Start every later episode from a fresh reset and USE its
                # observation. Previously the reset result was discarded, so
                # the episode began from the previous terminal observation,
                # and no reset happened at all when max_t was reached.
                env_info = env.reset(train_mode=train_mode)[brain_name]
            state = env_info.vector_observations[0]
            score = 0
            for t in range(max_t):
                action = np.int32(agent.act(state, eps))
                env_info = env.step(action)[brain_name]       # send the action to the environment
                next_state = env_info.vector_observations[0]  # get the next state
                reward = env_info.rewards[0]                  # get the reward
                done = env_info.local_done[0]                 # see if episode has finished
                agent.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break
            scores_window.append(score)          # save most recent score
            scores.append(score)                 # save most recent score
            eps = max(eps_end, eps_decay * eps)  # decrease epsilon
            mean_score = np.mean(scores_window)
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, mean_score), end="")
            if i_episode % 100 == 0:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                    i_episode, mean_score))
            if mean_score > 13.0:
                print(
                    '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                    .format(i_episode - 100, mean_score))
                torch.save(agent.qnetwork_local.state_dict(),
                           'checkpoint_vanilla.pth')
                break
    finally:
        # Release the simulator (and its fixed port) even on error, so that
        # a later call can launch the environment again.
        env.close()
    return scores
def train_agent(
        env: unityagents.UnityEnvironment,
        agent: agents.DDPGAgent,
        n_episodes: int = 200,
        mean_score_threshold: float = 30.0,
        max_t: int = 1000,
        has_ou_noise: bool = True,
        scores_maxlen: int = 100,
        ou_noise_sigma_start: float = 0.5,
        ou_noise_sigma_end: float = 0.01,
        ou_noise_sigma_decay: float = 0.99,
        n_random_episodes: int = 100,
        logging_freq: int = 10,
        checkpoints_dir: typing.Optional[pathlib.Path] = None,
        checkpoints_freq: int = 50,
) -> pd.DataFrame:
    """
    Train agent for Unity Tennis environment and return results.

    The first ``n_random_episodes`` episodes use clipped Gaussian random
    actions and only fill the replay memory; afterwards the agent acts and
    learns. An episode's score is the maximum of the per-agent returns.

    Parameters
    ----------
    env
        Unity environment
    agent
        An instance of Deep Reinforcement Learning Agent from drl_ctrl.agents module
    n_episodes
        Maximum number of (non-random) episodes
    mean_score_threshold
        Threshold on the mean of the score window at which training is
        considered solved
    max_t
        Maximum number of time steps per episode.
        NOTE(review): currently not enforced - an episode only ends when the
        environment reports a done flag. Confirm whether a cap is wanted.
    has_ou_noise
        If True, Ornstein-Uhlenbeck noise is added to actions
    scores_maxlen
        Maximum length of scores window
    ou_noise_sigma_start
        Ornstein-Uhlenbeck noise sigma starting value per episode
    ou_noise_sigma_end
        Ornstein-Uhlenbeck noise sigma minimum value per episode
    ou_noise_sigma_decay
        Ornstein-Uhlenbeck noise sigma multiplicative decay
    n_random_episodes
        Number of random episodes to gather experience
    logging_freq
        Logging frequency (in episodes)
    checkpoints_dir
        Model checkpoints output directory
    checkpoints_freq
        Once the threshold is reached, a checkpoint is written on episodes
        that are a multiple of this value; on any other episode training
        stops

    Returns
    -------
    pd.DataFrame
        One row per episode: index, score, window-average score, number of
        time steps, episode duration and cumulative duration.
    """
    logger = logging.getLogger(__name__)
    scores = []
    scores_avg100 = []
    scores_window = deque(maxlen=scores_maxlen)
    time_started = time.time()
    times_total = []
    times_per_episode = []
    time_steps = []
    i_last_checkpoint = 0

    for i_episode in range(1, (n_random_episodes + n_episodes + 1)):
        time_started_episode = time.time()
        is_random_episode = i_episode <= n_random_episodes
        brain_name = env.brain_names[0]
        env_info = env.reset(train_mode=True)[brain_name]
        agent.reset()
        states = env_info.vector_observations
        num_agents = len(env_info.agents)
        agent_scores = np.zeros(num_agents)
        ou_noise_sigma = ou_noise_sigma_start  # sigma restarts every episode
        t = 1
        while True:
            # choose action (for each agent)
            if is_random_episode:
                action_size = env.brains[brain_name].vector_action_space_size
                actions = np.random.randn(num_agents, action_size)
                actions = np.clip(actions, -1, 1)
            else:
                actions = agent.act(states,
                                    ou_noise_sigma=ou_noise_sigma,
                                    add_noise=has_ou_noise)
                ou_noise_sigma = max(ou_noise_sigma_end,
                                     ou_noise_sigma * ou_noise_sigma_decay)

            # take action in the environment (for each agent)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            dones = env_info.local_done
            agent_scores += env_info.rewards

            if is_random_episode:
                # Warm-up: store the transition without learning.
                agent.memory.add_batch(states, actions, env_info.rewards,
                                       next_states, dones)
            else:
                agent.step(states, actions, env_info.rewards, next_states,
                           dones)

            # roll over states to next time step
            states = next_states

            # exit loop if episode finished
            if np.any(dones):
                break
            t += 1

        score = float(np.max(agent_scores))
        scores_window.append(score)
        scores.append(score)
        mean_score = np.mean(scores_window)
        scores_avg100.append(mean_score)
        times_total.append(time.time() - time_started)
        times_per_episode.append(time.time() - time_started_episode)
        time_steps.append(t)

        if i_episode % logging_freq == 0:
            logger.info(f'\rEp: {i_episode}'
                        f'\tSigma({t}): {ou_noise_sigma:.3f}'
                        f'\tScore: {score:.2f}'
                        f'\tAvg. Score: {mean_score:.2f}'
                        f'\tTime_e: {times_per_episode[-1]:.3f}s'
                        f'\tTime: {times_total[-1]:.3f}s')

        if len(scores_window) == scores_maxlen and mean_score >= mean_score_threshold:
            if (checkpoints_dir is not None and
                    ((i_episode - i_last_checkpoint) % checkpoints_freq) == 0):
                checkpoint_dir = checkpoints_dir.joinpath(
                    f"episode_{i_episode}")
                checkpoint_dir.mkdir(parents=True, exist_ok=True)
                torch.save(
                    agent.actor_local.state_dict(),
                    str(path_util.mk_path_weights_actor_local(checkpoint_dir)))
                torch.save(
                    agent.actor_target.state_dict(),
                    str(path_util.mk_path_weights_actor_target(checkpoint_dir)))
                torch.save(
                    agent.critic_local.state_dict(),
                    str(path_util.mk_path_weights_critic_local(checkpoint_dir)))
                # All four weight files belong to the per-episode directory;
                # the critic target used to be written to the parent
                # `checkpoints_dir`, where later checkpoints overwrote it.
                torch.save(
                    agent.critic_target.state_dict(),
                    str(path_util.mk_path_weights_critic_target(checkpoint_dir)))
                logger.info(
                    f'\nSaved model checkpoint to {str(checkpoint_dir)}')
            else:
                logger.info(
                    f'\nEnvironment solved in {i_episode - scores_maxlen:d} episodes!'
                    f'\nScore: {score:.2f}'
                    f'\tAverage Score: {mean_score:.2f}'
                    f'\tAverage Time_e: {np.mean(times_per_episode):.3f}s'
                    f'\tTotal Time: {times_total[-1]:.3f}s')
                break

    return pd.DataFrame.from_records(
        zip(range(len(scores)), scores, scores_avg100, time_steps,
            times_per_episode, times_total),
        columns=[
            cfg.COL_EPISODE, cfg.COL_SCORE, cfg.COL_SCORE_AVG100,
            cfg.COL_N_TIME_STEPS, cfg.COL_TIME_PER_EPISODE, cfg.COL_TIME_TOTAL
        ])
class TrainerController(object):
    """Owns the Unity environment and one trainer per external brain, and
    runs the step / learn / checkpoint / export loop over them."""

    def __init__(self, env_path, run_id, save_freq, curriculum_file, fast_simulation, load, train,
                 worker_id, keep_checkpoints, lesson, seed, docker_target_name, trainer_config_path,
                 no_graphics):
        """
        :param env_path: Location to the environment executable to be loaded.
        :param run_id: The sub-directory name for model and summary statistics
        :param save_freq: Frequency at which to save model
        :param curriculum_file: Curriculum json file for environment
        :param fast_simulation: Whether to run the game at training speed
        :param load: Whether to load the model or randomly initialize
        :param train: Whether to train model, or only run inference
        :param worker_id: Number to add to communication port (5005). Used for multi-environment
        :param keep_checkpoints: How many model checkpoints to keep
        :param lesson: Start learning from this lesson
        :param seed: Random seed used for training.
        :param docker_target_name: Name of docker volume that will contain all data.
        :param trainer_config_path: Fully qualified path to location of trainer configuration file
        :param no_graphics: Whether to run the Unity simulator in no-graphics mode
        """
        self.trainer_config_path = trainer_config_path
        if env_path is not None:
            # Strip out executable extensions if passed
            env_path = (env_path.strip()
                        .replace('.app', '')
                        .replace('.exe', '')
                        .replace('.x86_64', '')
                        .replace('.x86', ''))

        # Recognize and use docker volume if one is passed as an argument
        if docker_target_name == '':
            self.docker_training = False
            self.model_path = './models/{run_id}'.format(run_id=run_id)
            self.curriculum_file = curriculum_file
            self.summaries_dir = './summaries'
        else:
            # Inside docker every path is rooted at the mounted volume.
            self.docker_training = True
            self.model_path = '/{docker_target_name}/models/{run_id}'.format(
                docker_target_name=docker_target_name,
                run_id=run_id)
            if env_path is not None:
                env_path = '/{docker_target_name}/{env_name}'.format(docker_target_name=docker_target_name,
                                                                     env_name=env_path)
            if curriculum_file is None:
                self.curriculum_file = None
            else:
                self.curriculum_file = '/{docker_target_name}/{curriculum_file}'.format(
                    docker_target_name=docker_target_name,
                    curriculum_file=curriculum_file)
            self.summaries_dir = '/{docker_target_name}/summaries'.format(docker_target_name=docker_target_name)

        self.logger = logging.getLogger("unityagents")
        self.run_id = run_id
        self.save_freq = save_freq
        self.lesson = lesson
        self.fast_simulation = fast_simulation
        self.load_model = load
        self.train_model = train
        self.worker_id = worker_id
        self.keep_checkpoints = keep_checkpoints
        self.trainers = {}
        # A seed of -1 means "pick one at random".
        if seed == -1:
            seed = np.random.randint(0, 999999)
        self.seed = seed
        np.random.seed(self.seed)
        tf.set_random_seed(self.seed)
        self.env = UnityEnvironment(file_name=env_path, worker_id=self.worker_id,
                                    curriculum=self.curriculum_file, seed=self.seed,
                                    docker_training=self.docker_training,
                                    no_graphics=no_graphics)
        if env_path is None:
            # No executable given: name the run after the academy.
            self.env_name = 'editor_' + self.env.academy_name
        else:
            self.env_name = os.path.basename(os.path.normpath(env_path))  # Extract out name of environment

    def _get_progress(self):
        """Return the curriculum progress measure, or None.

        With measure type "progress" this is the mean of step/max_steps over
        the external brains; with "reward" it is the sum of the trainers'
        last rewards. None is returned when no curriculum file is set or the
        measure type is anything else.
        """
        if self.curriculum_file is not None:
            progress = 0
            if self.env.curriculum.measure_type == "progress":
                for brain_name in self.env.external_brain_names:
                    progress += self.trainers[brain_name].get_step / self.trainers[brain_name].get_max_steps
                return progress / len(self.env.external_brain_names)
            elif self.env.curriculum.measure_type == "reward":
                for brain_name in self.env.external_brain_names:
                    progress += self.trainers[brain_name].get_last_reward
                return progress
            else:
                return None
        else:
            return None

    def _process_graph(self):
        """Collect (and log) the graph node names to export for all trainers.

        :return: List of node names, each prefixed with its trainer's graph
            scope followed by '/' (no prefix for an empty scope).
        """
        nodes = []
        scopes = []
        for brain_name in self.trainers.keys():
            if self.trainers[brain_name].graph_scope is not None:
                scope = self.trainers[brain_name].graph_scope + '/'
                if scope == '/':
                    # Empty graph scope: nodes live at the graph root.
                    scope = ''
                scopes += [scope]
                if self.trainers[brain_name].parameters["trainer"] == "imitation":
                    nodes += [scope + x for x in ["action"]]
                else:
                    nodes += [scope + x for x in ["action", "value_estimate", "action_probs"]]
                if self.trainers[brain_name].parameters["use_recurrent"]:
                    nodes += [scope + x for x in ["recurrent_out", "memory_size"]]
        if len(scopes) > 1:
            self.logger.info("List of available scopes :")
            for scope in scopes:
                self.logger.info("\t" + scope)
        self.logger.info("List of nodes to export :")
        for n in nodes:
            self.logger.info("\t" + n)
        return nodes

    def _save_model(self, sess, saver, steps=0):
        """
        Saves current model to checkpoint folder.
        :param sess: Current Tensorflow session.
        :param steps: Current number of steps in training process.
        :param saver: Tensorflow saver for session.
        """
        last_checkpoint = self.model_path + '/model-' + str(steps) + '.cptk'
        saver.save(sess, last_checkpoint)
        tf.train.write_graph(sess.graph_def, self.model_path, 'raw_graph_def.pb', as_text=False)
        self.logger.info("Saved Model")

    def _export_graph(self):
        """
        Exports latest saved model to .bytes format for Unity embedding.
        """
        target_nodes = ','.join(self._process_graph())
        ckpt = tf.train.get_checkpoint_state(self.model_path)
        freeze_graph.freeze_graph(input_graph=self.model_path + '/raw_graph_def.pb',
                                  input_binary=True,
                                  input_checkpoint=ckpt.model_checkpoint_path,
                                  output_node_names=target_nodes,
                                  output_graph=self.model_path + '/' + self.env_name + "_" + self.run_id + '.bytes',
                                  clear_devices=True, initializer_nodes="", input_saver="",
                                  restore_op_name="save/restore_all", filename_tensor_name="save/Const:0")

    def _initialize_trainers(self, trainer_config, sess):
        """Build one trainer per external brain into ``self.trainers``.

        :param trainer_config: Parsed trainer configuration; must contain a
            'default' section and may contain per-brain sections.
        :param sess: Tensorflow session handed to every trainer.
        :raises UnityEnvironmentException: if a brain's 'trainer' entry is
            neither "imitation" nor "ppo".
        """
        trainer_parameters_dict = {}
        self.trainers = {}
        for brain_name in self.env.external_brain_names:
            trainer_parameters = trainer_config['default'].copy()
            if len(self.env.external_brain_names) > 1:
                # Several brains share one graph: give each its own scope
                # and summary directory, derived from the brain name.
                graph_scope = re.sub('[^0-9a-zA-Z]+', '-', brain_name)
                trainer_parameters['graph_scope'] = graph_scope
                trainer_parameters['summary_path'] = '{basedir}/{name}'.format(
                    basedir=self.summaries_dir,
                    name=str(self.run_id) + '_' + graph_scope)
            else:
                trainer_parameters['graph_scope'] = ''
                trainer_parameters['summary_path'] = '{basedir}/{name}'.format(
                    basedir=self.summaries_dir,
                    name=str(self.run_id))
            if brain_name in trainer_config:
                # A brain entry may be a string naming another entry; follow
                # the chain until an actual parameter dict is reached.
                _brain_key = brain_name
                while not isinstance(trainer_config[_brain_key], dict):
                    _brain_key = trainer_config[_brain_key]
                for k in trainer_config[_brain_key]:
                    trainer_parameters[k] = trainer_config[_brain_key][k]
            trainer_parameters_dict[brain_name] = trainer_parameters.copy()
        for brain_name in self.env.external_brain_names:
            if trainer_parameters_dict[brain_name]['trainer'] == "imitation":
                self.trainers[brain_name] = BehavioralCloningTrainer(sess, self.env, brain_name,
                                                                     trainer_parameters_dict[brain_name],
                                                                     self.train_model, self.seed)
            elif trainer_parameters_dict[brain_name]['trainer'] == "ppo":
                self.trainers[brain_name] = PPOTrainer(sess, self.env, brain_name,
                                                       trainer_parameters_dict[brain_name],
                                                       self.train_model, self.seed)
            else:
                raise UnityEnvironmentException("The trainer config contains an unknown trainer type for brain {}"
                                                .format(brain_name))

    def _load_config(self):
        """Parse the YAML trainer configuration at ``self.trainer_config_path``.

        :return: The parsed configuration.
        :raises UnityEnvironmentException: if the file cannot be opened or
            decoded.
        """
        try:
            with open(self.trainer_config_path) as data_file:
                # NOTE(review): yaml.load without an explicit Loader can
                # construct arbitrary objects and is deprecated/rejected by
                # recent PyYAML releases. Consider yaml.safe_load once it is
                # confirmed that configs only use plain YAML types.
                trainer_config = yaml.load(data_file)
                return trainer_config
        except IOError:
            raise UnityEnvironmentException("""Parameter file could not be found here {}.
                                            Will use default Hyper parameters"""
                                            .format(self.trainer_config_path))
        except UnicodeDecodeError:
            raise UnityEnvironmentException("There was an error decoding Trainer Config from this path : {}"
                                            .format(self.trainer_config_path))

    @staticmethod
    def _create_model_path(model_path):
        """Create ``model_path`` (including parents) if it does not exist.

        :raises UnityEnvironmentException: if the directory cannot be
            checked or created.
        """
        try:
            if not os.path.exists(model_path):
                os.makedirs(model_path)
        except Exception:
            raise UnityEnvironmentException("The folder {} containing the generated model could not be accessed."
                                            " Please make sure the permissions are set correctly."
                                            .format(model_path))

    def start_learning(self):
        """Run the main loop until every trainer has passed its max steps.

        In inference mode (``train`` False) the loop runs until interrupted.
        A Ctrl-C saves the model (when training) before shutting down; the
        graph is exported afterwards when training.

        :raises UnityEnvironmentException: if ``load`` was requested but no
            checkpoint exists under the model path.
        """
        self.env.curriculum.set_lesson_number(self.lesson)
        trainer_config = self._load_config()
        self._create_model_path(self.model_path)

        tf.reset_default_graph()

        with tf.Session() as sess:
            self._initialize_trainers(trainer_config, sess)
            for k, t in self.trainers.items():
                self.logger.info(t)
            init = tf.global_variables_initializer()
            saver = tf.train.Saver(max_to_keep=self.keep_checkpoints)
            # Instantiate model parameters
            if self.load_model:
                self.logger.info('Loading Model...')
                ckpt = tf.train.get_checkpoint_state(self.model_path)
                if ckpt is None:
                    # Fail with the explanatory message instead of logging it
                    # and then crashing on `None.model_checkpoint_path`.
                    raise UnityEnvironmentException(
                        'The model {0} could not be found. Make sure you specified the right '
                        '--run-id'.format(self.model_path))
                saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                sess.run(init)
            global_step = 0  # This is only for saving the model
            self.env.curriculum.increment_lesson(self._get_progress())
            curr_info = self.env.reset(train_mode=self.fast_simulation)
            if self.train_model:
                for brain_name, trainer in self.trainers.items():
                    trainer.write_tensorboard_text('Hyperparameters', trainer.parameters)
            try:
                while any(t.get_step <= t.get_max_steps for t in self.trainers.values()) \
                        or not self.train_model:
                    if self.env.global_done:
                        self.env.curriculum.increment_lesson(self._get_progress())
                        curr_info = self.env.reset(train_mode=self.fast_simulation)
                        for brain_name, trainer in self.trainers.items():
                            trainer.end_episode()
                    # Decide and take an action
                    take_action_vector, take_action_memories, take_action_text, take_action_outputs = {}, {}, {}, {}
                    for brain_name, trainer in self.trainers.items():
                        (take_action_vector[brain_name],
                         take_action_memories[brain_name],
                         take_action_text[brain_name],
                         take_action_outputs[brain_name]) = trainer.take_action(curr_info)
                    new_info = self.env.step(vector_action=take_action_vector, memory=take_action_memories,
                                             text_action=take_action_text)
                    for brain_name, trainer in self.trainers.items():
                        trainer.add_experiences(curr_info, new_info, take_action_outputs[brain_name])
                        trainer.process_experiences(curr_info, new_info)
                        if trainer.is_ready_update() and self.train_model \
                                and trainer.get_step <= trainer.get_max_steps:
                            # Perform gradient descent with experience buffer
                            trainer.update_model()
                        # Write training statistics to Tensorboard.
                        trainer.write_summary(self.env.curriculum.lesson_number)
                        if self.train_model and trainer.get_step <= trainer.get_max_steps:
                            trainer.increment_step_and_update_last_reward()
                    if self.train_model:
                        global_step += 1
                    if global_step % self.save_freq == 0 and global_step != 0 and self.train_model:
                        # Save Tensorflow model
                        self._save_model(sess, steps=global_step, saver=saver)
                    curr_info = new_info
                # Final save Tensorflow model
                if global_step != 0 and self.train_model:
                    self._save_model(sess, steps=global_step, saver=saver)
            except KeyboardInterrupt:
                print('--------------------------Now saving model-------------------------')
                if self.train_model:
                    self.logger.info("Learning was interrupted. Please wait while the graph is generated.")
                    self._save_model(sess, steps=global_step, saver=saver)
        self.env.close()
        if self.train_model:
            self._export_graph()
class UnityEnv(gym.Env):
    """
    Provides Gym wrapper for Unity Learning Environments.
    Multi-agent environments use lists for object types, as done here:
    https://github.com/openai/multiagent-particle-envs
    """

    def __init__(self, params):
        """Launch the Unity environment and build the Gym spaces from its brain.

        Args:
            params (dict): configuration with the keys ``'path'`` (environment
                file name), ``'worker_id'``, ``'seed'``, ``'visual_mode'``
                (use the first visual observation as the observation) and
                ``'multiagent_mode'`` (expect more than one agent).

        Raises:
            KeyError: if one of the keys above is missing from ``params``.
            UnityGymException: if the environment does not have exactly one
                brain, if visual mode is requested without visual
                observations, if vector observations are stacked, or if the
                number of agents does not match ``'multiagent_mode'``.
        """
        environment_filename = params['path']
        # NOTE(review): 'worker_id' is required but is not forwarded to
        # UnityEnvironment below -- confirm whether that is intended.
        worker_id = params['worker_id']
        seed = params['seed']
        use_visual = params['visual_mode']
        multiagent = params['multiagent_mode']

        self._env = UnityEnvironment(environment_filename, seed=seed)
        self.name = self._env.academy_name
        self.visual_obs = None
        self._action_space_size = None
        self._current_state = None
        self._n_agents = None
        self._multiagent = multiagent

        # Check brain configuration
        if len(self._env.brains) != 1:
            raise UnityGymException(
                "There can only be one brain in a UnityEnvironment "
                "if it is wrapped in a gym.")
        self.brain_name = self._env.external_brain_names[0]
        brain = self._env.brains[self.brain_name]

        if use_visual and brain.number_visual_observations == 0:
            raise UnityGymException(
                "`use_visual` was set to True, however there are no"
                " visual observations as part of this environment.")
        self.use_visual = brain.number_visual_observations >= 1 and use_visual

        if brain.number_visual_observations > 1:
            logger.warning(
                "The environment contains more than one visual observation. "
                "Please note that only the first will be provided in the observation."
            )

        if brain.num_stacked_vector_observations != 1:
            raise UnityGymException(
                "There can only be one stacked vector observation in a UnityEnvironment "
                "if it is wrapped in a gym.")

        # Check for number of agents in scene.
        initial_info = self._env.reset()[self.brain_name]
        self._check_agents(len(initial_info.agents))

        # Set observation and action spaces.
        # The action size is accepted either as a plain int or as a sequence
        # of sizes, so both branches below work for either representation.
        action_sizes = [int(n) for n in np.atleast_1d(brain.vector_action_space_size)]
        if brain.vector_action_space_type == "discrete":
            if len(action_sizes) == 1:
                self._action_space = spaces.Discrete(action_sizes[0])
            else:
                self._action_space = spaces.MultiDiscrete(action_sizes)
        else:
            self._action_space_size = brain.vector_action_space_size
            high = np.array([1] * action_sizes[0])
            self._action_space = spaces.Box(-high, high, dtype=np.float32)
        high = np.array([np.inf] * brain.vector_observation_space_size)
        self.action_meanings = brain.vector_action_descriptions
        if self.use_visual:
            if brain.camera_resolutions[0]["blackAndWhite"]:
                depth = 1
            else:
                depth = 3
            self._observation_space = spaces.Box(
                0, 1, dtype=np.float32,
                shape=(brain.camera_resolutions[0]["height"],
                       brain.camera_resolutions[0]["width"],
                       depth))
        else:
            self._observation_space = spaces.Box(-high, high, dtype=np.float32)

    def reset(self, train_mode=True):
        """Resets the state of the environment and returns an initial observation.
        In the case of multi-agent environments, this is a list.

        Args:
            train_mode (bool): forwarded to the wrapped environment's reset.

        Returns:
            observation (object/list): the initial observation of the space.

        Raises:
            UnityGymException: if the number of agents is not the expected one.
        """
        info = self._env.reset(train_mode)[self.brain_name]
        n_agents = len(info.agents)
        self._check_agents(n_agents)

        if not self._multiagent:
            obs, reward, done, info = self._single_step(info)
        else:
            obs, reward, done, info = self._multi_step(info)
        return obs

    def step(self, action):
        """Run one timestep of the environment's dynamics. When end of
        episode is reached, you are responsible for calling `reset()`
        to reset this environment's state.
        Accepts an action and returns a tuple (observation, reward, done, info).
        In the case of multi-agent environments, these are lists.

        Args:
            action (object/list): the action to apply; in multi-agent mode a
                list holding one action per agent.

        Returns:
            observation (object/list): agent's observation of the current environment
            reward (float/list) : amount of reward returned after previous action
            done (boolean/list): whether the episode has ended.
            info (dict): contains auxiliary diagnostic information, including BrainInfo.

        Raises:
            UnityGymException: in multi-agent mode, if `action` is not a list
                with one entry per agent; in any mode, if the number of agents
                changed.
        """
        if self._multiagent:
            if not isinstance(action, list):
                raise UnityGymException(
                    "The environment was expecting `action` to be a list.")
            if len(action) != self._n_agents:
                raise UnityGymException(
                    "The environment was expecting a list of {} actions.".
                    format(self._n_agents))
            else:
                action = np.array(action)

        info = self._env.step(action)[self.brain_name]
        n_agents = len(info.agents)
        self._check_agents(n_agents)
        self._current_state = info

        if not self._multiagent:
            obs, reward, done, info = self._single_step(info)
        else:
            obs, reward, done, info = self._multi_step(info)
        return obs, reward, done, info

    def _single_step(self, info):
        """Convert the brain info into a Gym step tuple for the first agent."""
        if self.use_visual:
            # Only the first visual observation of the first agent is used.
            self.visual_obs = info.visual_observations[0][0, :, :, :]
            default_observation = self.visual_obs
        else:
            default_observation = info.vector_observations[0, :]

        return default_observation, info.rewards[0], info.local_done[0], {
            "text_observation": info.text_observations[0],
            "brain_info": info
        }

    def _multi_step(self, info):
        """Convert the brain info into a Gym step tuple covering all agents."""
        if self.use_visual:
            self.visual_obs = info.visual_observations
            default_observation = self.visual_obs
        else:
            default_observation = info.vector_observations
        return list(default_observation), info.rewards, info.local_done, {
            "text_observation": info.text_observations,
            "brain_info": info
        }

    def render(self, mode='rgb_array'):
        """Return the most recently stored visual observation (None if none)."""
        return self.visual_obs

    def close(self):
        """Close the wrapped Unity environment."""
        self._env.close()

    def get_action_meanings(self):
        """Return the brain's action descriptions."""
        return self.action_meanings

    def seed(self, seed=None):
        """Sets the seed for this env's random number generator(s).
        Currently not implemented.
        """
        logger.warning("Could not seed environment %s", self.name)
        return

    def _check_agents(self, n_agents):
        """Validate the agent count against the configured mode.

        The first call records `n_agents`; later calls require the same count.

        Raises:
            UnityGymException: if the count contradicts single/multi-agent
                mode or differs from the recorded count.
        """
        if not self._multiagent and n_agents > 1:
            raise UnityGymException(
                "The environment was launched as a single-agent environment, however "
                "there is more than one agent in the scene.")
        elif self._multiagent and n_agents <= 1:
            raise UnityGymException(
                "The environment was launched as a multi-agent environment, however "
                "there is only one agent in the scene.")
        if self._n_agents is None:
            self._n_agents = n_agents
            logger.info("{} agents within environment.".format(n_agents))
        elif self._n_agents != n_agents:
            raise UnityGymException(
                "The number of agents in the environment has changed since "
                "initialization. This is not supported.")

    @property
    def metadata(self):
        return {'render.modes': ['rgb_array']}

    @property
    def reward_range(self):
        return -float('inf'), float('inf')

    @property
    def spec(self):
        return None

    @property
    def action_space_size(self):
        # Only set for continuous action spaces; None otherwise.
        return self._action_space_size

    @property
    def action_space(self):
        return self._action_space

    @property
    def observation_space(self):
        return self._observation_space

    @property
    def number_agents(self):
        return self._n_agents