def test_step():
    with mock.patch('subprocess.Popen'):
        with mock.patch('socket.socket') as mock_socket:
            with mock.patch('glob.glob') as mock_glob:
                mock_glob.return_value = ['FakeLaunchPath']
                mock_socket.return_value.accept.return_value = (mock_socket, 0)
                mock_socket.recv.return_value.decode.return_value = dummy_start
                env = UnityEnvironment(' ')
                brain = env.brains['RealFakeBrain']
                mock_socket.recv.side_effect = dummy_reset
                brain_info = env.reset()
                mock_socket.recv.side_effect = dummy_step
                brain_info = env.step([0] * brain.vector_action_space_size * len(brain_info['RealFakeBrain'].agents))
                with pytest.raises(UnityActionException):
                    env.step([0])
                brain_info = env.step([0] * brain.vector_action_space_size * len(brain_info['RealFakeBrain'].agents))
                with pytest.raises(UnityActionException):
                    env.step([0] * brain.vector_action_space_size * len(brain_info['RealFakeBrain'].agents))
                env.close()
                assert env.global_done
                assert isinstance(brain_info, dict)
                assert isinstance(brain_info['RealFakeBrain'], BrainInfo)
                assert isinstance(brain_info['RealFakeBrain'].visual_observations, list)
                assert isinstance(brain_info['RealFakeBrain'].vector_observations, np.ndarray)
                assert len(brain_info['RealFakeBrain'].visual_observations) == brain.number_visual_observations
                assert brain_info['RealFakeBrain'].vector_observations.shape[0] == \
                    len(brain_info['RealFakeBrain'].agents)
                assert brain_info['RealFakeBrain'].vector_observations.shape[1] == \
                    brain.vector_observation_space_size * brain.num_stacked_vector_observations
                assert not brain_info['RealFakeBrain'].local_done[0]
                assert brain_info['RealFakeBrain'].local_done[2]
def test_ppo_model_cc_visual_curio(mock_communicator, mock_launcher):
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=False, visual_inputs=2)
            env = UnityEnvironment(' ')
            model = PPOModel(env.brains["RealFakeBrain"], use_curiosity=True)
            init = tf.global_variables_initializer()
            sess.run(init)

            run_list = [model.output, model.all_probs, model.value, model.entropy,
                        model.learning_rate, model.intrinsic_reward]
            feed_dict = {model.batch_size: 2,
                         model.sequence_length: 1,
                         model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
                         model.next_vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
                         model.output: [[0.0, 0.0], [0.0, 0.0]],
                         model.visual_in[0]: np.ones([2, 40, 30, 3]),
                         model.visual_in[1]: np.ones([2, 40, 30, 3]),
                         model.next_visual_in[0]: np.ones([2, 40, 30, 3]),
                         model.next_visual_in[1]: np.ones([2, 40, 30, 3])}
            sess.run(run_list, feed_dict=feed_dict)
            env.close()
def test_initialization():
    with mock.patch('subprocess.Popen'):
        with mock.patch('socket.socket') as mock_socket:
            with mock.patch('glob.glob') as mock_glob:
                mock_glob.return_value = ['FakeLaunchPath']
                mock_socket.return_value.accept.return_value = (mock_socket, 0)
                mock_socket.recv.return_value.decode.return_value = dummy_start
                env = UnityEnvironment(' ')
                with pytest.raises(UnityActionException):
                    env.step([0])
                assert env.brain_names[0] == 'RealFakeBrain'
                env.close()
def test_close():
    with mock.patch('subprocess.Popen'):
        with mock.patch('socket.socket') as mock_socket:
            with mock.patch('glob.glob') as mock_glob:
                mock_glob.return_value = ['FakeLaunchPath']
                mock_socket.return_value.accept.return_value = (mock_socket, 0)
                mock_socket.recv.return_value.decode.return_value = dummy_start
                env = UnityEnvironment(' ')
                assert env._loaded
                env.close()
                assert not env._loaded
                mock_socket.close.assert_called_once()
def test_ppo_model_discrete():
    d_action_c_state_start = '''{
      "AcademyName": "RealFakeAcademy",
      "resetParameters": {},
      "brainNames": ["RealFakeBrain"],
      "externalBrainNames": ["RealFakeBrain"],
      "logPath": "RealFakePath",
      "apiNumber": "API-3",
      "brainParameters": [{
          "vectorObservationSize": 3,
          "numStackedVectorObservations": 2,
          "vectorActionSize": 2,
          "memorySize": 0,
          "cameraResolutions": [{"width": 30, "height": 40, "blackAndWhite": false}],
          "vectorActionDescriptions": ["", ""],
          "vectorActionSpaceType": 0,
          "vectorObservationSpaceType": 1
      }]
    }'''.encode()

    tf.reset_default_graph()
    with mock.patch('subprocess.Popen'):
        with mock.patch('socket.socket') as mock_socket:
            with mock.patch('glob.glob') as mock_glob:
                # End of mock
                with tf.Session() as sess:
                    with tf.variable_scope("FakeGraphScope"):
                        mock_glob.return_value = ['FakeLaunchPath']
                        mock_socket.return_value.accept.return_value = (mock_socket, 0)
                        mock_socket.recv.return_value.decode.return_value = d_action_c_state_start
                        env = UnityEnvironment(' ')
                        model = PPOModel(env.brains["RealFakeBrain"])
                        init = tf.global_variables_initializer()
                        sess.run(init)

                        run_list = [model.output, model.all_probs, model.value, model.entropy,
                                    model.learning_rate]
                        feed_dict = {model.batch_size: 2,
                                     model.sequence_length: 1,
                                     model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]]),
                                     model.visual_in[0]: np.ones([2, 40, 30, 3])}
                        sess.run(run_list, feed_dict=feed_dict)
                        env.close()
def test_cc_bc_model(mock_communicator, mock_launcher):
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=False, visual_inputs=0)
            env = UnityEnvironment(' ')
            model = BehavioralCloningModel(env.brains["RealFakeBrain"])
            init = tf.global_variables_initializer()
            sess.run(init)

            run_list = [model.sample_action, model.policy]
            feed_dict = {model.batch_size: 2,
                         model.sequence_length: 1,
                         model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]])}
            sess.run(run_list, feed_dict=feed_dict)
            env.close()
def test_ppo_model_dc_vector(mock_communicator, mock_launcher):
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            mock_communicator.return_value = MockCommunicator(
                discrete_action=True, visual_inputs=0)
            env = UnityEnvironment(' ')
            model = PPOModel(env.brains["RealFakeBrain"])
            init = tf.global_variables_initializer()
            sess.run(init)

            run_list = [model.output, model.all_probs, model.value, model.entropy,
                        model.learning_rate]
            feed_dict = {model.batch_size: 2,
                         model.sequence_length: 1,
                         model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]])}
            sess.run(run_list, feed_dict=feed_dict)
            env.close()
def test_cc_bc_model():
    c_action_c_state_start = '''{
      "AcademyName": "RealFakeAcademy",
      "resetParameters": {},
      "brainNames": ["RealFakeBrain"],
      "externalBrainNames": ["RealFakeBrain"],
      "logPath": "RealFakePath",
      "apiNumber": "API-3",
      "brainParameters": [{
          "vectorObservationSize": 3,
          "numStackedVectorObservations": 2,
          "vectorActionSize": 2,
          "memorySize": 0,
          "cameraResolutions": [],
          "vectorActionDescriptions": ["", ""],
          "vectorActionSpaceType": 1,
          "vectorObservationSpaceType": 1
      }]
    }'''.encode()

    tf.reset_default_graph()
    with mock.patch('subprocess.Popen'):
        with mock.patch('socket.socket') as mock_socket:
            with mock.patch('glob.glob') as mock_glob:
                # End of mock
                with tf.Session() as sess:
                    with tf.variable_scope("FakeGraphScope"):
                        mock_glob.return_value = ['FakeLaunchPath']
                        mock_socket.return_value.accept.return_value = (mock_socket, 0)
                        mock_socket.recv.return_value.decode.return_value = c_action_c_state_start
                        env = UnityEnvironment(' ')
                        model = BehavioralCloningModel(env.brains["RealFakeBrain"])
                        init = tf.global_variables_initializer()
                        sess.run(init)

                        run_list = [model.sample_action, model.policy]
                        feed_dict = {model.batch_size: 2,
                                     model.sequence_length: 1,
                                     model.vector_in: np.array([[1, 2, 3, 1, 2, 3], [3, 4, 5, 3, 4, 5]])}
                        sess.run(run_list, feed_dict=feed_dict)
                        env.close()
plt.plot(scores["index"], scores["scores_avg"], color=sns.xkcd_rgb["amber"]) plt.legend(["Scores", "MA(%d)" % window_size]) if ARGS.double_dqn: plt.savefig(os.path.join(ARGS.figure_dir, "score_plot_double_dqn.png")) elif ARGS.dueling_dqn: plt.savefig(os.path.join(ARGS.figure_dir, "score_plot_dueling_dqn.png")) else: plt.savefig(os.path.join(ARGS.figure_dir, "score_plot_dqn.png")) if __name__ == "__main__": env = UnityEnvironment(file_name="Banana_Linux_NoVis/Banana.x86_64", worker_id=1, seed=1) # get the default brain brain_name = env.brain_names[0] brain = env.brains[brain_name] # initialize an agent agent = Agent(state_size=37, action_size=4, seed=1, args=ARGS) # training a dqn agent scores = dqn(n_episodes=3000, max_t=1000) # visualization plot_scores(scores, window_size=ARGS.window_size)
# Imports
import agent
from unityagents import UnityEnvironment
import train_model as tm

env = UnityEnvironment(file_name="./Banana_Windows_x86_64/Banana.exe")

# Get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset environment
env_info = env.reset(train_mode=True)[brain_name]

# Size of task (parameters we need)
state = env_info.vector_observations[0]
state_size = len(state)
action_space_size = brain.vector_action_space_size

# Initialise an agent
agent = agent.Agent(state_size, action_space_size)

# Train agent with environment and rewards
scores = tm.deep_q_learning(agent, env, brain_name)

# Plot scores
tm.plot_scores(scores)

# Properly close environment
env.close()
                    default=5e-4, metavar='L',
                    help='discount factor (default: 0.99)')
parser.add_argument('--seed', type=int, default=123, metavar='N',
                    help='random seed (default: 123)')
parser.add_argument('--render', action='store_true',
                    help='render the environment')
args = parser.parse_args()

# creating Banana unity environment instance
env = UnityEnvironment(file_name="Banana_Linux/Banana.x86_64", no_graphics=True)

brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]

action_size = brain.vector_action_space_size
state = env_info.vector_observations[0]
state_size = len(state)

# policy instance (model)
policy = Policy(state_size, action_size)

# load the pre-trained weight file
# policy.load_state_dict(torch.load('checkpoint_re.pth'))
# policy.train()
# python test_agent.py --model checkpoint.pth
import argparse
import sys
import os

from unityagents import UnityEnvironment
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch

from dqn_agent import Agent

if __name__ == '__main__':
    env = UnityEnvironment(file_name="VisualBanana.app")

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents in the environment
    print('Number of agents:', len(env_info.agents))

    # number of actions
    action_size = brain.vector_action_space_size
    print('Number of actions:', action_size)
def train_agent(
    env: unityagents.UnityEnvironment,
    agent: agents.DDPGAgent,
    n_episodes: int = 200,
    mean_score_threshold: float = 30.0,
    max_t: int = 1000,
    has_ou_noise: bool = True,
    scores_maxlen: int = 100,
    ou_noise_sigma_start: float = 0.5,
    ou_noise_sigma_end: float = 0.01,
    ou_noise_sigma_decay: float = 0.99,
    n_random_episodes: int = 100,
    logging_freq: int = 10,
    checkpoints_dir: typing.Optional[pathlib.Path] = None,
    checkpoints_freq: int = 50,
) -> pd.DataFrame:
    """
    Train agent for the Unity Tennis environment and return results.

    Parameters
    ----------
    env
        Unity environment
    agent
        An instance of a Deep Reinforcement Learning Agent from the drl_ctrl.agents module
    n_episodes
        Maximum number of episodes
    mean_score_threshold
        Threshold for the mean of the last 100 scores at which to stop training and save results
    max_t
        Maximum number of time steps per episode
    has_ou_noise
        If True, Ornstein-Uhlenbeck noise is added to actions
    scores_maxlen
        Maximum length of scores window
    ou_noise_sigma_start
        Ornstein-Uhlenbeck noise sigma starting value per episode
    ou_noise_sigma_end
        Ornstein-Uhlenbeck noise sigma minimum value per episode
    ou_noise_sigma_decay
        Ornstein-Uhlenbeck noise sigma multiplicative decay
    n_random_episodes
        Number of random episodes to gather experience
    logging_freq
        Logging frequency
    checkpoints_dir
        Model checkpoints output directory
    checkpoints_freq
        Checkpoint frequency to check if agent scores achieve the average score threshold
    """
    logger = logging.getLogger(__name__)

    scores = []
    scores_avg100 = []
    scores_window = deque(maxlen=scores_maxlen)
    time_started = time.time()
    times_total = []
    times_per_episode = []
    time_steps = []
    i_last_checkpoint = 0

    for i_episode in range(1, (n_random_episodes + n_episodes + 1)):
        time_started_episode = time.time()
        brain_name = env.brain_names[0]
        env_info = env.reset(train_mode=True)[brain_name]
        agent.reset()
        states = env_info.vector_observations
        num_agents = len(env_info.agents)
        agent_scores = np.zeros(num_agents)
        ou_noise_sigma = ou_noise_sigma_start
        t = 1
        while True:
            # choose action (for each agent)
            if i_episode <= n_random_episodes:
                action_size = env.brains[brain_name].vector_action_space_size
                actions = np.random.randn(num_agents, action_size)
                actions = np.clip(actions, -1, 1)
            else:
                actions = agent.act(states, ou_noise_sigma=ou_noise_sigma, add_noise=has_ou_noise)
                ou_noise_sigma = max(ou_noise_sigma_end, ou_noise_sigma * ou_noise_sigma_decay)
            # take action in the environment (for each agent)
            env_info = env.step(actions)[brain_name]
            # get next state (for each agent)
            next_states = env_info.vector_observations
            # see if episode finished
            dones = env_info.local_done
            # update the score (for each agent)
            agent_scores += env_info.rewards
            if i_episode <= n_random_episodes:
                agent.memory.add_batch(states, actions, env_info.rewards, next_states, dones)
            else:
                agent.step(states, actions, env_info.rewards, next_states, dones)
            # roll over states to next time step
            states = next_states
            # exit loop if episode finished
            if np.any(dones):
                break
            t += 1

        score = float(np.max(agent_scores))
        scores_window.append(score)
        scores.append(score)
        scores_avg100.append(np.mean(scores_window))
        times_total.append(time.time() - time_started)
        times_per_episode.append(time.time() - time_started_episode)
        time_steps.append(t)

        if i_episode % logging_freq == 0:
            logger.info(f'\rEp: {i_episode}'
                        f'\tSigma({t}): {ou_noise_sigma:.3f}'
                        f'\tScore: {score:.2f}'
                        f'\tAvg. Score: {np.mean(scores_window):.2f}'
                        f'\tTime_e: {times_per_episode[-1]:.3f}s'
                        f'\tTime: {times_total[-1]:.3f}s')

        if len(scores_window) == scores_maxlen and np.mean(scores_window) >= mean_score_threshold:
            if (checkpoints_dir is not None
                    and ((i_episode - i_last_checkpoint) % checkpoints_freq) == 0):
                checkpoint_dir = checkpoints_dir.joinpath(f"episode_{i_episode}")
                checkpoint_dir.mkdir(parents=True, exist_ok=True)
                torch.save(agent.actor_local.state_dict(),
                           str(path_util.mk_path_weights_actor_local(checkpoint_dir)))
                torch.save(agent.actor_target.state_dict(),
                           str(path_util.mk_path_weights_actor_target(checkpoint_dir)))
                torch.save(agent.critic_local.state_dict(),
                           str(path_util.mk_path_weights_critic_local(checkpoint_dir)))
                torch.save(agent.critic_target.state_dict(),
                           str(path_util.mk_path_weights_critic_target(checkpoint_dir)))
                logger.info(f'\nSaved model checkpoint to {str(checkpoints_dir)}')
            else:
                logger.info(f'\nEnvironment solved in {i_episode - 100:d} episodes!'
                            f'\nScore: {score:.2f}'
                            f'\tAverage Score: {np.mean(scores_window):.2f}'
                            f'\tAverage Time_e: {np.mean(times_per_episode):.3f}s'
                            f'\tTotal Time: {times_total[-1]:.3f}s')
                break

    return pd.DataFrame.from_records(
        zip(range(len(scores)), scores, scores_avg100, time_steps, times_per_episode, times_total),
        columns=[
            cfg.COL_EPISODE, cfg.COL_SCORE, cfg.COL_SCORE_AVG100,
            cfg.COL_N_TIME_STEPS, cfg.COL_TIME_PER_EPISODE, cfg.COL_TIME_TOTAL
        ])
from unityagents import UnityEnvironment
from utils import sim_act
from model import DeterministicActor
import numpy as np
import torch

# sim options
NUM_SIMS = 5  # Maximum number of training episodes

# observation and action specs of each agent
osize = 24
asize = 2

# create environment
env = UnityEnvironment(file_name='tennis.app')

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# initialize actors and critics
actor = DeterministicActor(osize, asize, seed=0)
actor.load_state_dict(torch.load('checkpoint_actor_cpu.pth'))

# ------ Train loop -------
for ep_count in range(NUM_SIMS):
    # reset the environment
    env_info = env.reset(train_mode=False)[brain_name]
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
# %matplotlib inline

from unityagents import UnityEnvironment
from drl.agent import Agent

# get environment
env = UnityEnvironment(file_name='Tennis_Linux/Tennis.x86_64')

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

env_info = env.reset(train_mode=False)[brain_name]
num_agents = len(env_info.agents)
action_size = brain.vector_action_space_size
states = env_info.vector_observations
state_size = states.shape[1]

agent = Agent(state_size=state_size, action_size=action_size, random_seed=1337)
agent.load(torch.load("checkpoints/checkpoint_actor_v2_solved.pth"),
           torch.load("checkpoints/checkpoint_critic_v2_solved.pth"))

print_every = 100
scores_deque = deque(maxlen=print_every)
scores_total = []
# # Multiple env classes simultaneously
# env_path = environment.get_env_path('3dball')
# env_1 = UnityEnvironment(file_name=env_path, worker_id=1)
# env_path = environment.get_env_path('gridworld')
# env_2 = UnityEnvironment(file_name=env_path, worker_id=2)
# env_1.reset(train_mode=False)
# env_2.reset(train_mode=False)

env_path = util.get_env_path('gridworld')
# use train_mode = False to debug, i.e. render env at real size, real time
train_mode = False

# UnityEnvironment interfaces python with Unity,
# and contains brains for controlling connected agents.
env = UnityEnvironment(file_name=env_path)
print(str(env))

# get the default brain
default_brain = env.brain_names[0]
brain = env.brains[default_brain]
env_info = env.reset(train_mode=train_mode)[default_brain]

'''
is_continuous = (brain.action_space_type == 'continuous')
use_observations = (brain.number_observations > 0)
use_states = (brain.state_space_size > 0)

- reset env with param, returns dict of {brain: BrainInfo}
env.reset(train_mode=train_mode)
env_info = env.reset(train_mode=train_mode)[default_brain]
    config = JobConfig(filename)
    train_ddpg_agent_job(config)
elif command == "hyperopt":
    if args[0] == '-f':
        metaconf_file = args[1]
        path = generate_random_configuration_files(metaconf_file)
    else:
        raise ValueError(f"Unknown parameter {args[0]}")
elif command == "demo":
    try:
        checkpoint_path = args[0]
        conf = JobConfig(checkpoint_path + "config.yml")
        env = UnityEnvironment(file_name="./resources/Tennis_Linux/Tennis.x86_64")
        brain_name = env.brain_names[0]
        brain = env.brains[brain_name]
        env_info = env.reset(train_mode=False)[brain_name]
        num_agents = len(env_info.agents)
        action_size = brain.vector_action_space_size
        states = env_info.vector_observations
        state_size = states.shape[1]
        agent = DDPGAgent(state_size, action_size, conf.random_seed, conf.buffer_size,
                          conf.batch_size, conf.gamma, conf.tau, conf.lr_actor,
                          conf.lr_critic, conf.weight_decay, conf.sigma,
                          conf.actor_nn_size, conf.critic_nn_size,
save_freq = int(options['--save-freq'])
env_name = options['<env>']

# Algorithm-specific parameters for tuning
gamma = float(options['--gamma'])
lambd = float(options['--lambd'])
time_horizon = int(options['--time-horizon'])
beta = float(options['--beta'])
num_epoch = int(options['--num-epoch'])
epsilon = float(options['--epsilon'])
buffer_size = int(options['--buffer-size'])
learning_rate = float(options['--learning-rate'])
hidden_units = int(options['--hidden-units'])
batch_size = int(options['--batch-size'])

env = UnityEnvironment(file_name=env_name)
print(str(env))
brain_name = env.brain_names[0]

tf.reset_default_graph()

# Create the Tensorflow model graph
ppo_model = create_agent_model(env, lr=learning_rate, h_size=hidden_units,
                               epsilon=epsilon, beta=beta)

is_continuous = (env.brains[brain_name].action_space_type == "continuous")
use_observations = (env.brains[brain_name].number_observations > 0)

if not os.path.exists(model_path):
    os.makedirs(model_path)
from unityagents import UnityEnvironment
from tqdm import tqdm
import numpy as np
import torch
from torch import FloatTensor, LongTensor, cuda
import sys

DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

if sys.platform == "darwin":
    env = UnityEnvironment(file_name="./Banana.app")
else:
    env = UnityEnvironment(file_name="/data/Banana_Linux_NoVis/Banana.x86_64")
    # env = UnityEnvironment(file_name="./Banana_Linux")

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
print(brain_name)
print(brain)

state_size = brain.vector_observation_space_size
print("State size: ", state_size)

# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents in the environment
print('Number of agents:', len(env_info.agents))
from unityagents import UnityEnvironment
import numpy as np
import torch
import matplotlib.pyplot as plt

from agent import Agent

num_agents = 1

# please do not modify the line below
env = UnityEnvironment(file_name="Reacher_Windows_x86_64/Reacher.exe")

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents in the environment
print('Number of agents:', len(env_info.agents))

# number of actions
action_size = brain.vector_action_space_size
print('Number of actions:', action_size)

# examine the state space
state = env_info.vector_observations[0]
print('States look like:', state)
state_size = len(state)
from unityagents import UnityEnvironment
import numpy as np
from tqdm import tqdm

from agent import DQAgent
from utils import draw

unity_environment_path = "./Banana_Linux/Banana.x86_64"
best_model_path = "./best_model.checkpoint"

if __name__ == "__main__":
    # prepare environment
    env = UnityEnvironment(file_name=unity_environment_path)
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]

    # number of actions
    action_size = brain.vector_action_space_size

    # examine the state space
    state = env_info.vector_observations[0]
    state_size = len(state)

    agent = DQAgent(state_size, action_size)
    agent.load(best_model_path)

    test_scores = []
    for i_episode in tqdm(range(1, 101)):
        score = 0  # initialize the score
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
class UnityEnv:
    '''
    Class for all Envs.
    Standardizes the UnityEnv design to work in Lab.
    Access Agents properties by:
    Agents - AgentSpace - AEBSpace - EnvSpace - Envs
    '''

    def __init__(self, env_spec, env_space, e=0):
        self.env_spec = env_spec
        self.env_space = env_space
        self.info_space = env_space.info_space
        self.e = e
        util.set_attr(self, self.env_spec)
        self.name = self.env_spec['name']
        self.body_e = None
        self.nanflat_body_e = None  # nanflatten version of bodies
        self.body_num = None

        worker_id = int(f'{os.getpid()}{self.e+int(ps.unique_id())}'[-4:])
        self.u_env = UnityEnvironment(file_name=util.get_env_path(self.name), worker_id=worker_id)
        # spaces for NN auto input/output inference
        logger.warn('Unity environment observation_space and action_space are constructed with invalid range. Use only their shapes.')
        self.observation_spaces = []
        self.action_spaces = []
        for a in range(len(self.u_env.brain_names)):
            observation_shape = (self.get_observable_dim(a)['state'],)
            if self.get_brain(a).state_space_type == 'discrete':
                observation_space = gym.spaces.Box(low=0, high=1, shape=observation_shape, dtype=np.int32)
            else:
                observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=observation_shape, dtype=np.float32)
            self.observation_spaces.append(observation_space)
            if self.is_discrete(a):
                action_space = gym.spaces.Discrete(self.get_action_dim(a))
            else:
                action_space = gym.spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32)
            self.action_spaces.append(action_space)
        for observation_space, action_space in zip(self.observation_spaces, self.action_spaces):
            set_gym_space_attr(observation_space)
            set_gym_space_attr(action_space)

        # TODO experiment to find out optimal benchmarking max_timestep, set
        # TODO ensure clock_speed from env_spec
        self.clock_speed = 1
        self.clock = Clock(self.clock_speed)
        self.done = False

    def check_u_brain_to_agent(self):
        '''Check the size match between unity brain and agent'''
        u_brain_num = self.u_env.number_brains
        agent_num = len(self.body_e)
        assert u_brain_num == agent_num, f'There must be a Unity brain for each agent. e:{self.e}, brain: {u_brain_num} != agent: {agent_num}.'

    def check_u_agent_to_body(self, env_info_a, a):
        '''Check the size match between unity agent and body'''
        u_agent_num = len(env_info_a.agents)
        body_num = util.count_nonan(self.body_e[a])
        assert u_agent_num == body_num, f'There must be a Unity agent for each body; a:{a}, e:{self.e}, agent_num: {u_agent_num} != body_num: {body_num}.'

    def get_brain(self, a):
        '''Get the unity-equivalent of agent, i.e. brain, to access its info'''
        name_a = self.u_env.brain_names[a]
        brain_a = self.u_env.brains[name_a]
        return brain_a

    def get_env_info(self, env_info_dict, a):
        name_a = self.u_env.brain_names[a]
        env_info_a = env_info_dict[name_a]
        return env_info_a

    @lab_api
    def post_body_init(self):
        '''Run init for components that need bodies to exist first, e.g. memory or architecture.'''
        self.nanflat_body_e = util.nanflatten(self.body_e)
        for idx, body in enumerate(self.nanflat_body_e):
            body.nanflat_e_idx = idx
        self.body_num = len(self.nanflat_body_e)
        self.check_u_brain_to_agent()
        logger.info(util.self_desc(self))

    def is_discrete(self, a):
        '''Check if an agent (brain) is subject to discrete actions'''
        return self.get_brain(a).is_discrete()

    def get_action_dim(self, a):
        '''Get the action dim for an agent (brain) in env'''
        return self.get_brain(a).get_action_dim()

    def get_action_space(self, a):
        return self.action_spaces[a]

    def get_observable_dim(self, a):
        '''Get the observable dim for an agent (brain) in env'''
        return self.get_brain(a).get_observable_dim()

    def get_observable_types(self, a):
        '''Get the observable for an agent (brain) in env'''
        return self.get_brain(a).get_observable_types()

    def get_observation_space(self, a):
        return self.observation_spaces[a]

    @lab_api
    def reset(self):
        self.done = False
        env_info_dict = self.u_env.reset(train_mode=(util.get_lab_mode() != 'dev'), config=self.env_spec.get('unity'))
        _reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
        for (a, b), body in util.ndenumerate_nonan(self.body_e):
            env_info_a = self.get_env_info(env_info_dict, a)
            self.check_u_agent_to_body(env_info_a, a)
            state = env_info_a.states[b]
            state_e[(a, b)] = state
            done_e[(a, b)] = self.done
        return _reward_e, state_e, done_e

    @lab_api
    def step(self, action_e):
        # TODO implement clock_speed: step only if self.clock.to_step()
        if self.done:
            return self.reset()
        action_e = util.nanflatten(action_e)
        env_info_dict = self.u_env.step(action_e)
        reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
        for (a, b), body in util.ndenumerate_nonan(self.body_e):
            env_info_a = self.get_env_info(env_info_dict, a)
            reward_e[(a, b)] = env_info_a.rewards[b]
            state_e[(a, b)] = env_info_a.states[b]
            done_e[(a, b)] = env_info_a.local_done[b]
        self.done = (util.nonan_all(done_e) or self.clock.get('t') > self.max_timestep)
        return reward_e, state_e, done_e

    @lab_api
    def close(self):
        self.u_env.close()
        score = 0
        env_info = env.reset(train_mode=False)[brain_name]
        state = env_info.vector_observations[0]
        while True:
            action = agent.act(state)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]  # get the next state
            reward = env_info.rewards[0]                  # get the reward
            done = env_info.local_done[0]                 # see if episode has finished
            state = next_state
            score += reward
            if done:
                break
        print('episodes %d get score %d' % (i, score))
    env.close()


if __name__ == '__main__':
    env = UnityEnvironment(file_name="./Banana_Env/Banana_Linux/Banana.x86_64")
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]

    # number of actions
    action_size = brain.vector_action_space_size

    # examine the state space
    state = env_info.vector_observations[0]
    state_size = len(state)

    agent = getattr(agent, cfgs.AGENT_TYPE)(state_size, action_size, seed=0)

    show_results(env, brain_name, agent)
parser.add_argument('--model', type=str, help='Path to trained model', default='checkpoint.pth')
parser.add_argument('--type', type=str, help='NN type - NoisyDueling, Dueling or Q', default='NoisyDueling')
parser.add_argument('--cuda', dest='cuda', action='store_true')
parser.add_argument('--no_cuda', dest='cuda', action='store_false')
parser.set_defaults(cuda=True)

print('Testing')
args = parser.parse_args()

env = UnityEnvironment(file_name=args.environment)

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment
env_info = env.reset(train_mode=False)[brain_name]

# initialize agent
action_size = brain.vector_action_space_size
state = env_info.vector_observations[0]
state_size = len(state)
agent = Agent(state_size=state_size, action_size=action_size, seed=0,
# ### 1. Start the Environment
#
# Run the next code cell to install a few packages. This line will take a few minutes to run!
# The environments corresponding to both versions of the environment are already saved in the
# Workspace and can be accessed at the file paths provided below.
#
# Please select one of the two options below for loading the environment.

# In[3]:

from unityagents import UnityEnvironment
import numpy as np

# select this option to load version 1 (with a single agent) of the environment
env = UnityEnvironment(file_name='Tennis_Linux/Tennis.x86_64', no_graphics=True)

# Environments contain **_brains_** which are responsible for deciding the actions of their
# associated agents. Here we check for the first brain available, and set it as the default
# brain we will be controlling from Python.

# In[4]:

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
for brains in env.brain_names:
    print(brains)

# ### 2. Examine the State and Action Spaces
#
# Run the code cell below to print some information about the environment.
def evaluate(agent_dir: Path,
             number_of_episodes: int = 1000,
             maximum_timestaps: int = 1000,
             environment_path: str = DEFAULT_ENVIRONMENT_EXECUTABLE_PATH):
    """Evaluate an agent on some episodes.

    Note that the agent is not trained during the evaluation and the exploration is set to 0.
    Thus the results really reflect the final performance of the agent."""
    agent_path = agent_dir / 'checkpoint.pth'
    if not agent_path.exists():
        logging.warning(f'No saved parameters found for agent in {agent_dir}.')
        return
    hist_path = agent_dir / 'evaluation_histogram.png'
    scores_path = agent_dir / 'scores_evaluation.csv'

    env = UnityEnvironment(file_name=environment_path, no_graphics=True)
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    action_size = brain.vector_action_space_size
    env_info = env.reset(train_mode=True)[brain_name]
    state_size = len(env_info.vector_observations[0])

    agent = DqnAgent(state_size=state_size, action_size=action_size, device=DEVICE)
    agent.load(agent_path)

    scores = []
    for _ in tqdm(list(range(1, number_of_episodes + 1))):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        for t in range(maximum_timestaps):
            action = agent.act(state, epsilon=0.0)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)

    scores_ts = pd.Series(scores)
    plt.hist(scores, bins=100, color='steelblue')
    ylim = plt.ylim()  # y-range for the median marker line
    med = scores_ts.median()
    plt.vlines(med, *ylim, linewidth=2, linestyle='--', color='orange', label=f'median: {med}')
    plt.legend()
    plt.savefig(hist_path)
    scores_ts.to_csv(scores_path, index=False)
args = vars(parser.parse_args())
print(args)

for key, value in args.items():
    exec(f'{key} = {value}')

os.system(f'mkdir -p results/model-{model_num}')
with open(f'results/model-{model_num}/training_params.json', 'w') as outfile:
    json.dump(args, outfile)

# env = UnityEnvironment(file_name="Tennis_Linux/Tennis.x86_64", worker_id=int(f'4{model_num}'))
# env = UnityEnvironment(file_name="Tennis_Linux_NoVis/Tennis.x86_64", worker_id=int(f'{model_num}'))
# env = UnityEnvironment(file_name="Soccer_Linux/Soccer.x86_64", worker_id=int(f'5{model_num}'))
# env = UnityEnvironment(file_name="Soccer_Linux_NoVis/Soccer.x86_64", worker_id=int(f'5{model_num}'))
env = UnityEnvironment(file_name="Tennis.app", worker_id=1000)

brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]
num_agents = len(env_info.agents)
action_size = brain.vector_action_space_size
states = env_info.vector_observations
state_size = states.shape[1]

agent = Agents(state_size=state_size, action_size=action_size, num_agents=num_agents,
               random_seed=seed, fc1_units=fc1_units, fc2_units=fc2_units,
               BUFFER_SIZE=BUFFER_SIZE, BATCH_SIZE=BATCH_SIZE, GAMMA=GAMMA, TAU=TAU,
               LR_ACTOR=LR_ACTOR, LR_CRITIC=LR_CRITIC,
               CRITIC_WEIGHT_DECAY=CRITIC_WEIGHT_DECAY)
def main():
    env = UnityEnvironment(file_name="./Tennis_Linux/Tennis.x86_64")
    print_env_info(env)
    random_play(env)
    env.close()
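# The print_env_info and random_play helpers called above are not shown in this snippet.
# A minimal sketch of what random_play could look like for the two-agent Tennis environment
# follows, assuming the standard unityagents reset/step API used elsewhere in this file;
# the function body and the episode count are illustrative assumptions, not the original code.
import numpy as np


def random_play(env, n_episodes=3):
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    for _ in range(n_episodes):
        env_info = env.reset(train_mode=False)[brain_name]
        num_agents = len(env_info.agents)
        scores = np.zeros(num_agents)
        while True:
            # uniform random actions clipped to the valid [-1, 1] range
            actions = np.clip(np.random.randn(num_agents, brain.vector_action_space_size), -1, 1)
            env_info = env.step(actions)[brain_name]
            scores += env_info.rewards
            if np.any(env_info.local_done):
                break
        print('Episode score (max over agents):', np.max(scores))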
                    default='cuda', help="Select device for training and inference")
parser.add_argument('--discount-rate', type=float, default=0.99, help='')
parser.add_argument('--tau', type=float, default=0.95, help='')
parser.add_argument('--gradient-clip', type=float, default=5, help='')
parser.add_argument('--rollout-length', type=int, default=2048, help='')
parser.add_argument('--ppo-epochs', type=int, default=10, help='')
parser.add_argument('--ppo-clip', type=float, default=2.0, help='')
parser.add_argument('--batch-size', type=int, default=32, help='')
parser.add_argument('--entropy-coefficent', type=float, default=1E-2, help='')
parser.add_argument('--required-reward', type=float, default=30, help='')
parser.add_argument('--learning-rate', type=float, default=3E-4, help='')
parser.add_argument('--hidden-units', type=int, default=512, help='')
args = parser.parse_args()

env = UnityEnvironment(file_name='../Reacher_Linux/Reacher.x86_64')
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]

_STATE_SIZE = env_info.vector_observations.shape[1]
_NUM_ACTIONS = brain.vector_action_space_size
_NUM_AGENTS = len(env_info.agents)


def play(policy, args):
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations
    scores = np.zeros(_NUM_AGENTS)
    while True:
        actions, _, _, _ = policy(torch.FloatTensor(states).to(args.device))
    '--eps_start', help='starting value of epsilon', nargs=1)
parse.add_argument('-e', '--eps_end', help='minimum value of epsilon', nargs=1)
parse.add_argument(
    '-d', '--eps_decay',
    help='multiplicative factor (per episode) for decreasing epsilon', nargs=1)

# set parameters for agent training
n_episodes, max_t, eps_start, eps_end, eps_decay = arg_parser(parse.parse_args())

# start environment
env = UnityEnvironment(file_name=ENVIRONMENT_PATH)

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of actions available to the agent
action_size = brain.vector_action_space_size

# examine state space
state = env_info.vector_observations[0]

# get size of the state
import gym
import random
import torch
import numpy as np
import sys
from tqdm import tqdm
from unityagents import UnityEnvironment
from collections import deque
import matplotlib.pyplot as plt

from ddpg_agent import Agent, ReplayBuffer, OUNoise

env = UnityEnvironment(file_name='app/Reacher.app')
print('Loaded env')

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using device: {}".format(device))

# Get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
print(brain)

# Reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# Number of agents
num_agents = len(env_info.agents)
print('Number of agents:{}'.format(num_agents))
class TrainerController(object):
    def __init__(self, env_path, run_id, save_freq, curriculum_file, fast_simulation, load, train,
                 worker_id, keep_checkpoints, lesson, seed, docker_target_name, trainer_config_path,
                 no_graphics):
        """
        :param env_path: Location to the environment executable to be loaded.
        :param run_id: The sub-directory name for model and summary statistics
        :param save_freq: Frequency at which to save model
        :param curriculum_file: Curriculum json file for environment
        :param fast_simulation: Whether to run the game at training speed
        :param load: Whether to load the model or randomly initialize
        :param train: Whether to train model, or only run inference
        :param worker_id: Number to add to communication port (5005). Used for multi-environment
        :param keep_checkpoints: How many model checkpoints to keep
        :param lesson: Start learning from this lesson
        :param seed: Random seed used for training.
        :param docker_target_name: Name of docker volume that will contain all data.
        :param trainer_config_path: Fully qualified path to location of trainer configuration file
        :param no_graphics: Whether to run the Unity simulator in no-graphics mode
        """
        self.trainer_config_path = trainer_config_path
        if env_path is not None:
            env_path = (env_path.strip()
                        .replace('.app', '')
                        .replace('.exe', '')
                        .replace('.x86_64', '')
                        .replace('.x86', ''))  # Strip out executable extensions if passed
        # Recognize and use docker volume if one is passed as an argument
        if docker_target_name == '':
            self.docker_training = False
            self.model_path = './models/{run_id}'.format(run_id=run_id)
            self.curriculum_file = curriculum_file
            self.summaries_dir = './summaries'
        else:
            self.docker_training = True
            self.model_path = '/{docker_target_name}/models/{run_id}'.format(
                docker_target_name=docker_target_name, run_id=run_id)
            if env_path is not None:
                env_path = '/{docker_target_name}/{env_name}'.format(
                    docker_target_name=docker_target_name, env_name=env_path)
            if curriculum_file is None:
                self.curriculum_file = None
            else:
                self.curriculum_file = '/{docker_target_name}/{curriculum_file}'.format(
                    docker_target_name=docker_target_name, curriculum_file=curriculum_file)
            self.summaries_dir = '/{docker_target_name}/summaries'.format(
                docker_target_name=docker_target_name)
        self.logger = logging.getLogger("unityagents")
        self.run_id = run_id
        self.save_freq = save_freq
        self.lesson = lesson
        self.fast_simulation = fast_simulation
        self.load_model = load
        self.train_model = train
        self.worker_id = worker_id
        self.keep_checkpoints = keep_checkpoints
        self.trainers = {}
        if seed == -1:
            seed = np.random.randint(0, 999999)
        self.seed = seed
        np.random.seed(self.seed)
        tf.set_random_seed(self.seed)
        self.env = UnityEnvironment(file_name=env_path, worker_id=self.worker_id,
                                    curriculum=self.curriculum_file, seed=self.seed,
                                    docker_training=self.docker_training,
                                    no_graphics=no_graphics)
        if env_path is None:
            self.env_name = 'editor_' + self.env.academy_name
        else:
            self.env_name = os.path.basename(os.path.normpath(env_path))  # Extract out name of environment

    def _get_progress(self):
        if self.curriculum_file is not None:
            progress = 0
            if self.env.curriculum.measure_type == "progress":
                for brain_name in self.env.external_brain_names:
                    progress += self.trainers[brain_name].get_step / self.trainers[brain_name].get_max_steps
                return progress / len(self.env.external_brain_names)
            elif self.env.curriculum.measure_type == "reward":
                for brain_name in self.env.external_brain_names:
                    progress += self.trainers[brain_name].get_last_reward
                return progress
            else:
                return None
        else:
            return None

    def _process_graph(self):
        nodes = []
        scopes = []
        for brain_name in self.trainers.keys():
            if self.trainers[brain_name].graph_scope is not None:
                scope = self.trainers[brain_name].graph_scope + '/'
                if scope == '/':
                    scope = ''
                scopes += [scope]
            if self.trainers[brain_name].parameters["trainer"] == "imitation":
                nodes += [scope + x for x in ["action"]]
            else:
                nodes += [scope + x for x in ["action", "value_estimate", "action_probs"]]
            if self.trainers[brain_name].parameters["use_recurrent"]:
                nodes += [scope + x for x in ["recurrent_out", "memory_size"]]
        if len(scopes) > 1:
            self.logger.info("List of available scopes :")
            for scope in scopes:
                self.logger.info("\t" + scope)
        self.logger.info("List of nodes to export :")
        for n in nodes:
            self.logger.info("\t" + n)
        return nodes

    def _save_model(self, sess, saver, steps=0):
        """
        Saves current model to checkpoint folder.
        :param sess: Current Tensorflow session.
        :param steps: Current number of steps in training process.
        :param saver: Tensorflow saver for session.
        """
        last_checkpoint = self.model_path + '/model-' + str(steps) + '.cptk'
        saver.save(sess, last_checkpoint)
        tf.train.write_graph(sess.graph_def, self.model_path, 'raw_graph_def.pb', as_text=False)
        self.logger.info("Saved Model")

    def _export_graph(self):
        """
        Exports latest saved model to .bytes format for Unity embedding.
        """
        target_nodes = ','.join(self._process_graph())
        ckpt = tf.train.get_checkpoint_state(self.model_path)
        freeze_graph.freeze_graph(input_graph=self.model_path + '/raw_graph_def.pb',
                                  input_binary=True,
                                  input_checkpoint=ckpt.model_checkpoint_path,
                                  output_node_names=target_nodes,
                                  output_graph=self.model_path + '/' + self.env_name + "_" + self.run_id + '.bytes',
                                  clear_devices=True, initializer_nodes="", input_saver="",
                                  restore_op_name="save/restore_all",
                                  filename_tensor_name="save/Const:0")

    def _initialize_trainers(self, trainer_config, sess):
        trainer_parameters_dict = {}
        self.trainers = {}
        for brain_name in self.env.external_brain_names:
            trainer_parameters = trainer_config['default'].copy()
            if len(self.env.external_brain_names) > 1:
                graph_scope = re.sub('[^0-9a-zA-Z]+', '-', brain_name)
                trainer_parameters['graph_scope'] = graph_scope
                trainer_parameters['summary_path'] = '{basedir}/{name}'.format(
                    basedir=self.summaries_dir, name=str(self.run_id) + '_' + graph_scope)
            else:
                trainer_parameters['graph_scope'] = ''
                trainer_parameters['summary_path'] = '{basedir}/{name}'.format(
                    basedir=self.summaries_dir, name=str(self.run_id))
            if brain_name in trainer_config:
                _brain_key = brain_name
                while not isinstance(trainer_config[_brain_key], dict):
                    _brain_key = trainer_config[_brain_key]
                for k in trainer_config[_brain_key]:
                    trainer_parameters[k] = trainer_config[_brain_key][k]
            trainer_parameters_dict[brain_name] = trainer_parameters.copy()
        for brain_name in self.env.external_brain_names:
            if trainer_parameters_dict[brain_name]['trainer'] == "imitation":
                self.trainers[brain_name] = BehavioralCloningTrainer(
                    sess, self.env, brain_name, trainer_parameters_dict[brain_name],
                    self.train_model, self.seed)
            elif trainer_parameters_dict[brain_name]['trainer'] == "ppo":
                self.trainers[brain_name] = PPOTrainer(
                    sess, self.env, brain_name, trainer_parameters_dict[brain_name],
                    self.train_model, self.seed)
            else:
                raise UnityEnvironmentException(
                    "The trainer config contains an unknown trainer type for brain {}"
                    .format(brain_name))

    def _load_config(self):
        try:
            with open(self.trainer_config_path) as data_file:
                trainer_config = yaml.load(data_file)
                return trainer_config
        except IOError:
            raise UnityEnvironmentException("""Parameter file could not be found here {}.
                                            Will use default Hyper parameters"""
                                            .format(self.trainer_config_path))
        except UnicodeDecodeError:
            raise UnityEnvironmentException("There was an error decoding Trainer Config from this path : {}"
                                            .format(self.trainer_config_path))

    @staticmethod
    def _create_model_path(model_path):
        try:
            if not os.path.exists(model_path):
                os.makedirs(model_path)
        except Exception:
            raise UnityEnvironmentException("The folder {} containing the generated model could not be accessed."
                                            " Please make sure the permissions are set correctly."
                                            .format(model_path))

    def start_learning(self):
        self.env.curriculum.set_lesson_number(self.lesson)
        trainer_config = self._load_config()
        self._create_model_path(self.model_path)

        tf.reset_default_graph()
        with tf.Session() as sess:
            self._initialize_trainers(trainer_config, sess)
            for k, t in self.trainers.items():
                self.logger.info(t)
            init = tf.global_variables_initializer()
            saver = tf.train.Saver(max_to_keep=self.keep_checkpoints)
            # Instantiate model parameters
            if self.load_model:
                self.logger.info('Loading Model...')
                ckpt = tf.train.get_checkpoint_state(self.model_path)
                if ckpt is None:
                    self.logger.info('The model {0} could not be found. Make sure you specified the right '
                                     '--run-id'.format(self.model_path))
                saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                sess.run(init)
            global_step = 0  # This is only for saving the model
            self.env.curriculum.increment_lesson(self._get_progress())
            curr_info = self.env.reset(train_mode=self.fast_simulation)
            if self.train_model:
                for brain_name, trainer in self.trainers.items():
                    trainer.write_tensorboard_text('Hyperparameters', trainer.parameters)
            try:
                while any([t.get_step <= t.get_max_steps for k, t in self.trainers.items()]) or not self.train_model:
                    if self.env.global_done:
                        self.env.curriculum.increment_lesson(self._get_progress())
                        curr_info = self.env.reset(train_mode=self.fast_simulation)
                        for brain_name, trainer in self.trainers.items():
                            trainer.end_episode()
                    # Decide and take an action
                    take_action_vector, take_action_memories, take_action_text, take_action_outputs = {}, {}, {}, {}
                    for brain_name, trainer in self.trainers.items():
                        (take_action_vector[brain_name],
                         take_action_memories[brain_name],
                         take_action_text[brain_name],
                         take_action_outputs[brain_name]) = trainer.take_action(curr_info)
                    new_info = self.env.step(vector_action=take_action_vector,
                                             memory=take_action_memories,
                                             text_action=take_action_text)
                    for brain_name, trainer in self.trainers.items():
                        trainer.add_experiences(curr_info, new_info, take_action_outputs[brain_name])
                        trainer.process_experiences(curr_info, new_info)
                        if trainer.is_ready_update() and self.train_model and trainer.get_step <= trainer.get_max_steps:
                            # Perform gradient descent with experience buffer
                            trainer.update_model()
                        # Write training statistics to Tensorboard.
                        trainer.write_summary(self.env.curriculum.lesson_number)
                        if self.train_model and trainer.get_step <= trainer.get_max_steps:
                            trainer.increment_step_and_update_last_reward()
                    if self.train_model:
                        global_step += 1
                    if global_step % self.save_freq == 0 and global_step != 0 and self.train_model:
                        # Save Tensorflow model
                        self._save_model(sess, steps=global_step, saver=saver)
                    curr_info = new_info
                # Final save Tensorflow model
                if global_step != 0 and self.train_model:
                    self._save_model(sess, steps=global_step, saver=saver)
            except KeyboardInterrupt:
                print('--------------------------Now saving model-------------------------')
                if self.train_model:
                    self.logger.info("Learning was interrupted. Please wait while the graph is generated.")
                    self._save_model(sess, steps=global_step, saver=saver)
                pass
        self.env.close()
        if self.train_model:
            self._export_graph()
def load_env(env_path: str, no_graphics: bool = False) -> UnityEnvironment:
    return UnityEnvironment(file_name=env_path, no_graphics=no_graphics)
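# A brief usage sketch for the load_env helper above; the executable path and the
# inspection lines are illustrative assumptions, not part of the original snippet.
env = load_env("./Reacher_Linux/Reacher.x86_64", no_graphics=True)  # placeholder path
brain_name = env.brain_names[0]                    # default brain
env_info = env.reset(train_mode=True)[brain_name]  # initial BrainInfo for that brain
print('agents:', len(env_info.agents),
      'actions:', env.brains[brain_name].vector_action_space_size)
env.close()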
ACTORNET_PATH = './checkpoint_actor.pth'
NUM_EPISODES_TEST = 100
MAX_T_TEST = 200


def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']


if not VIS:
    path_prefix = '_NoVis'
else:
    path_prefix = ''

env = UnityEnvironment(file_name='Tennis_Linux' + path_prefix + '/Tennis.x86_64')

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)
import argparse
from time import sleep

from unityagents import UnityEnvironment

from maddpg import MADDPGAgent
from config import config
from train import training_loop
from play import play_loop

parser = argparse.ArgumentParser()
parser.add_argument('--train', action='store_true', dest='train', help='Set the train mode')
parser.add_argument('--file_prefix', default=None,
                    help='Set the file for agent to load weights with using prefix')
parser.add_argument('--playthroughs', default=10, type=int,
                    help='Number of playthroughs played in a play mode')
parser.add_argument('--sleep', default=0, type=int,
                    help='Time before environment starts in a play mode [seconds]')
arguments = parser.parse_args()

env = UnityEnvironment(file_name='./Tennis.app', seed=config.general.seed)
brain_name = env.brain_names[0]
agent = MADDPGAgent(config=config, file_prefix=arguments.file_prefix)

if arguments.train:
    print('Train mode \n')
    training_loop(env, brain_name, agent, config)
else:
    print('Play mode \n')
    sleep(arguments.sleep)
    play_loop(env, brain_name, agent, playthrougs=arguments.playthroughs)
end="") if i_episode % 100 == 0: print('\rEpisode {}\tAverage Score: {:.2f}'.format( i_episode, np.mean(recent_scores))) if np.mean(recent_scores) >= 13.0: print( '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}' .format(i_episode - 100, np.mean(recent_scores))) torch.save(model.local_network.state_dict(), 'banana_ckpt.pth') break if __name__ == "__main__": # Environment setup env = UnityEnvironment(file_name="Banana.app") brain_name = env.brain_names[0] brain = env.brains[brain_name] env_info = env.reset(train_mode=True)[brain_name] state = env_info.vector_observations[0] state_size = len(state) action_size = brain.vector_action_space_size # Load model model = DQN(state_size=state_size, action_size=action_size, seed=0) # Start training train_dqn(model) env.close()
import os
import sys

import torch
from unityagents import UnityEnvironment

sys.path.append("../")
from agent import Agent

os.environ["CUDA_VISIBLE_DEVICES"] = "1"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("CUDA available {}".format(torch.cuda.is_available()))

env = UnityEnvironment(file_name="../Reacher_Linux/Reacher.x86_64", no_graphics=False)

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment
env_info = env.reset(train_mode=False)[brain_name]

# number of agents in the environment
print('Number of agents:', len(env_info.agents))

# number of actions
action_size = brain.vector_action_space_size
print('Number of actions:', action_size)

# examine the state space
# ##############################################################################
# SETUP
# ##############################################################################
# Generate Output Directory Paths
model_dir = os.path.join("models", MODEL_NAME)
snapshots_dir = os.path.join(model_dir, "snapshots")

# SET SEEDS FOR REPRODUCIBILITY
np.random.seed(SEED)
torch.manual_seed(SEED)

# ##############################################################################
# ENVIRONMENT
# ##############################################################################
env = UnityEnvironment(file_name=ENV_FILE, seed=SEED)

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# ##############################################################################
# AGENT
# ##############################################################################
# INITIALIZE AGENT, AND LOAD WEIGHTS FROM BEST SNAPSHOT
maddpg = MADDPG(
    actor_layer_sizes=ACTOR_LAYER_SIZES,
    critic_layer_sizes=CRITIC_LAYER_SIZES,
    clamp_actions=CLAMP_ACTIONS,
    logger=None,
)
class UnityEnv(gym.Env):
    def __init__(self, app_name=None, idx=0):
        # parameters
        app_path = os.path.join(os.path.dirname(__file__), 'assets', app_name)
        no_graphics = False
        self.num_envs = 1

        # create environment
        self._env = UnityEnvironment(file_name=app_path, worker_id=idx, no_graphics=no_graphics)
        self.name = app_name

        # only accept environments with exactly one brain
        assert len(self._env.brains) == 1
        self.brain_name = self._env.external_brain_names[0]
        self.brain = self._env.brains[self.brain_name]

        # visualization
        self.use_visual = (self.brain.number_visual_observations == 1)

        # action space dimension
        if self.brain.vector_action_space_type == "discrete":
            self._a_dim = Discrete(1)
        else:
            high = np.array([np.inf] * (self.brain.vector_action_space_size))
            self._a_dim = Box(-high, high)

        # observation space dimension
        if self.use_visual and False and no_graphics:
            high = np.array([np.inf] * self.brain.camera_resolutions[0]["height"] *
                            self.brain.camera_resolutions[0]["width"] * 3)
            self._ob_dim = Box(-high, high)
        else:
            high = np.array([np.inf] * self.brain.vector_observation_space_size)
            self._ob_dim = Box(-high, high)

        # video buffer
        self.frames = []

    def reset(self):
        self.frames = []
        info = self._env.reset()[self.brain_name]
        state = info.vector_observations[0]
        return np.array([state])

    def step(self, action):
        info = self._env.step([action])[self.brain_name]
        state = info.vector_observations[0]
        reward = info.rewards[0]
        done = info.local_done[0]
        self._collect_frames(info.visual_observations[0])
        return np.array([state]), np.array([reward]), np.array([done]), np.array([None])

    def close(self):
        self._env.close()

    def _collect_frames(self, frame):
        if self.use_visual:
            self.frames.append(frame)

    @property
    def action_space(self):
        return self._a_dim

    @property
    def observation_space(self):
        return self._ob_dim
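# A short usage sketch for the gym-style wrapper above, assuming a Unity executable
# named 'banana.app' exists under the module's assets directory; the app name, the
# step budget, and the random-action loop are illustrative assumptions only.
env = UnityEnv(app_name='banana.app', idx=0)
state = env.reset()
for _ in range(100):
    action = env.action_space.sample()          # random action from the wrapped space
    state, reward, done, _ = env.step(action)
    if done[0]:
        state = env.reset()
env.close()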
from unityagents import UnityEnvironment
import numpy as np

env = UnityEnvironment(file_name='Reacher.exe')

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(num_agents, state_size))
print('The state for the first agent looks like:', states[0])

env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
# states = env_info.vector_observations             # get the current state (for each agent)
# scores = np.zeros(num_agents)                     # initialize the score (for each agent)
# while True:
from unityagents import UnityEnvironment
import numpy as np
import matplotlib.pyplot as plt

if __name__ == '__main__':
    env = UnityEnvironment(file_name="../Banana_Env/Banana_Linux_Pixels/Banana.x86_64")

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents in the environment
    print('Number of agents:', len(env_info.agents))

    # number of actions
    action_size = brain.vector_action_space_size
    print('Number of actions:', action_size)

    # examine the state space
    state = env_info.visual_observations[0]
    print('States look like:')
    plt.imshow(np.squeeze(state))
    plt.show()
    state_size = state.shape
    print('States have shape:', state.shape)

    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    state = env_info.visual_observations[0]             # get the current state
    score = 0                                           # initialize the score