def get_agent_unity():
    sys.path.append(UNITY_PYTHONPATH)
    from unityagents import UnityEnvironment

    env = UnityEnvironment(file_name=FPATH, seed=RANDOM_SEED)
    brain_name = env.brain_names[BRAIN_INDEX]
    brain = env.brains[brain_name]
    state_size = brain.vector_observation_space_size
    action_size = brain.vector_action_space_size
    agent = Agent(state_size=state_size, action_size=action_size,
                  random_seed=RANDOM_SEED)
    return env, agent

def play():
    environment = Environment(env_path="envs/3/Tennis.exe", train_mode=False)
    num_agents = environment.get_number_of_agents()
    state_size = len(environment.get_current_state()[0])
    action_size = environment.get_number_of_actions()
    agent = Agent(state_size=state_size, action_size=action_size,
                  random_seed=0, num_agents=num_agents)
    play_env(environment, agent, num_agents)

def __init__(self, state_size, action_size, num_agents, random_seed):
    self.state_size = state_size
    self.action_size = action_size
    random.seed(random_seed)  # seed for side effect; random.seed() returns None
    self.random_seed = random_seed
    # Create the agents and store them in a list
    self.agents = [Agent(state_size, action_size, num_agents, random_seed)
                   for _ in range(num_agents)]
    # Replay memory
    self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)
    self.step_count = 0

def ddpg_train(n_episodes, seed, buffer_size, batch_size, gamma, tau,
               lr_actor, lr_critic, weight_decay):
    memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)
    agents = [Agent(state_size, action_size, seed, buffer_size, batch_size,
                    gamma, tau, lr_actor, lr_critic, weight_decay, memory)
              for _ in range(num_agents)]
    load(agents)
    scores_deque = deque(maxlen=100)
    scores = []
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        episode_scores = np.zeros(num_agents)
        # Reset each agent's noise process at the start of the episode
        for agent in agents:
            agent.reset()
        while True:
            actions = [agent.act(state) for agent, state in zip(agents, states)]
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            for agent, state, action, reward, next_state, done in zip(
                    agents, states, actions, rewards, next_states, dones):
                agent.step(state, action, reward, next_state, done)
            states = next_states
            episode_scores += np.array(rewards)
            if np.any(dones):
                break
        score = episode_scores.max()
        scores_deque.append(score)
        scores.append(score)
        print('\rEpisode: \t{} \tScore: \t{:.2f} \tAverage Score: \t{:.2f}'
              .format(i_episode, score, np.mean(scores_deque)), end="")
        if i_episode % 10 == 0:
            save(agents)
        if np.mean(scores_deque) >= 0.5:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                  .format(i_episode, np.mean(scores_deque)))
            break
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.grid()
    ax.plot(np.arange(len(scores)), scores)
    ax.set(xlabel="Episode #", ylabel="Score", title="DDPG Network")
    fig.savefig("ddpg_network.pdf")

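# The `save` and `load` helpers called by ddpg_train above are not defined in
# these snippets. A minimal sketch of what they might look like, assuming each
# Agent exposes `actor_local` and `critic_local` networks; the per-agent
# checkpoint filenames and attribute names are assumptions, not taken from the
# source:

import os

import torch


def save(agents):
    """Sketch: save each agent's actor/critic weights to numbered files."""
    for i, agent in enumerate(agents):
        torch.save(agent.actor_local.state_dict(),
                   'checkpoint_actor_{}.pth'.format(i))
        torch.save(agent.critic_local.state_dict(),
                   'checkpoint_critic_{}.pth'.format(i))


def load(agents):
    """Sketch: restore weights written by `save`, skipping missing files."""
    for i, agent in enumerate(agents):
        actor_path = 'checkpoint_actor_{}.pth'.format(i)
        critic_path = 'checkpoint_critic_{}.pth'.format(i)
        if os.path.isfile(actor_path) and os.path.isfile(critic_path):
            agent.actor_local.load_state_dict(torch.load(actor_path))
            agent.critic_local.load_state_dict(torch.load(critic_path))
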
def __init__(self, config):
    self.config = config
    # Replay memory
    self.memory = ReplayBuffer(self.config.action_size,
                               self.config.buffer_size,
                               self.config.batch_size,
                               self.config.seed)
    self.agents = [Agent(self.config) for _ in range(self.config.num_agents)]
    self.t_step = 0
    self.loss = (0.0, 0.0)

def __init__(self, num_agents, state_size, action_size, random_seed):
    super(MADDPG, self).__init__()
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents
    random.seed(random_seed)  # seed for side effect; random.seed() returns None
    self.seed = random_seed
    self.discount_factor = GAMMA
    self.tau = TAU
    self.iter = 0
    self.maddpg_agents = [Agent(state_size, action_size, random_seed)
                          for _ in range(num_agents)]

def load_trained_agent(filepath):
    """Load the results and parameters of a trained agent."""
    checkpoint = torch.load(filepath)
    agent = Agent(state_size=checkpoint['state_size'],
                  action_size=checkpoint['action_size'],
                  random_seed=checkpoint['seed'],
                  hidden_layers=checkpoint['hidden_layers'],
                  n_agents=checkpoint['n_agents'])
    agent.actor_local.load_state_dict(checkpoint['al_state_dict'])
    agent.critic_local.load_state_dict(checkpoint['cl_state_dict'])
    return agent

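# For load_trained_agent above to work, the checkpoint must have been written
# with matching keys. A minimal sketch of the save side; the agent attributes
# used here are assumptions inferred from the keys that load_trained_agent
# reads, not taken from the source:

import torch


def save_trained_agent(agent, filepath):
    """Sketch: persist an agent's hyperparameters and network weights."""
    checkpoint = {
        'state_size': agent.state_size,
        'action_size': agent.action_size,
        'seed': agent.seed,
        'hidden_layers': agent.hidden_layers,
        'n_agents': agent.n_agents,
        'al_state_dict': agent.actor_local.state_dict(),
        'cl_state_dict': agent.critic_local.state_dict(),
    }
    torch.save(checkpoint, filepath)
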
def __init__(self):
    # Create the Gym environment
    self.env = gym.make('DeeplengDocking-v1')
    rospy.loginfo("Gym environment done")
    self.agent = Agent(state_size=13, action_size=3, random_seed=2)
    # Set the logging system
    rospack = rospkg.RosPack()
    pkg_path = rospack.get_path('deepleng_control')
    outdir = pkg_path + '/training_results'
    # env = wrappers.Monitor(env, outdir, force=True)
    # rospy.loginfo("Monitor Wrapper started")
    self.max_episodes = 200
    self.max_timesteps = 1000

def __init__(self, state_size, action_size, n_agents, random_seed):
    self.state_size = state_size
    self.action_size = action_size
    random.seed(random_seed)  # seed for side effect; random.seed() returns None
    self.seed = random_seed
    self.ma = [Agent(state_size, action_size, i, n_agents, random_seed)
               for i in range(n_agents)]
    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
    self.t_step = 0

def __init__(self, state_size, action_size, num_agents, random_seeds):
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents
    self.random_seeds = random_seeds
    self.agents = [Agent(self.state_size, self.action_size, random_seeds[i])
                   for i in range(self.num_agents)]
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed=7)

def __init__(self, state_size, action_size, seed=42):
    super(MADDPG, self).__init__()
    self.agents = [Agent(state_size, action_size, lr_actor=LR_ACTOR,
                         lr_critic=LR_CRITIC, agent_number=0, epsilon=EPSILON,
                         epsilon_decay=EPSILON_DECAY, weight_decay=WEIGHT_DECAY,
                         clipgrad=CLIPGRAD),
                   Agent(state_size, action_size, lr_actor=LR_ACTOR,
                         lr_critic=LR_CRITIC, agent_number=1, epsilon=EPSILON,
                         epsilon_decay=EPSILON_DECAY, weight_decay=WEIGHT_DECAY,
                         clipgrad=CLIPGRAD)]
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    # Initialize tracking of params; build the config dict once and reuse it
    # for both trackers
    hyperparams = {"buffer_size": BUFFER_SIZE, "batch_size": BATCH_SIZE,
                   "learn_every": LEARN_EVERY, "learn_number": LEARN_NUMBER,
                   "lr_actor": LR_ACTOR, "lr_critic": LR_CRITIC,
                   "gamma": GAMMA, "tau": TAU, "epsilon": EPSILON,
                   "epsilon_decay": EPSILON_DECAY,
                   "weight_decay": WEIGHT_DECAY, "clipgrad": CLIPGRAD}
    wandb.login()
    wandb.init(project=project_name, name=name, config=hyperparams)
    jovian.log_hyperparams(project=project_name, name=name, config=hyperparams)

def __init__(self, state_size, action_size, num_agents, random_seed):
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents
    random.seed(random_seed)  # seed for side effect; random.seed() returns None
    self.seed = random_seed
    # Instantiate multiple agents
    self.agents = [Agent(state_size, action_size, random_seed, num_agents)
                   for _ in range(num_agents)]
    # Instantiate the replay memory buffer (shared between agents)
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

def ddpg(n_episodes=2000, store_every=10):
    scores_deque = deque(maxlen=store_every)
    scores = []
    agents = Agent(state_size=state_size, action_size=action_size,
                   num_agents=num_agents, random_seed=0)
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=GRAPHICS_OFF)[brain_name]
        state = env_info.vector_observations
        agents.reset()
        score = np.zeros(num_agents)
        while True:
            action = agents.act(state)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            agents.step(state, action, rewards, next_state, dones)
            state = next_state
            score += rewards
            if np.any(dones):
                break
        scores_deque.append(np.mean(score))
        scores.append(np.mean(score))
        avg_score = np.mean(scores_deque)
        print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}\t {}'.format(
            i_episode, avg_score, np.mean(score),
            strftime("%H:%M:%S", gmtime())), end="")
        if i_episode % store_every == 0 or avg_score >= TARGET_SCORE:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, avg_score))
            if avg_score >= TARGET_SCORE:
                torch.save(agents.actor_local.state_dict(),
                           "ckpt/{}".format(ACTOR_CHECKPOINT_NAME))
                torch.save(agents.critic_local.state_dict(),
                           "ckpt/{}".format(CRITIC_CHECKPOINT_NAME))
                break
    return scores

def load_agents(state_size, action_size, num_agents, memory):
    # Create the agents
    agents = []
    for i in range(num_agents):
        agents.append(Agent(state_size, action_size, 1, memory))
    # Load the checkpoints for each agent's four networks
    for i in range(len(agents)):
        checkpoint = torch.load('./data/checkpoint_agent_{}_qnetwork_local.pth'.format(i))
        agents[i].qnetwork_local.load_state_dict(checkpoint['state_dict'])
        checkpoint = torch.load('./data/checkpoint_agent_{}_qnetwork_target.pth'.format(i))
        agents[i].qnetwork_target.load_state_dict(checkpoint['state_dict'])
        checkpoint = torch.load('./data/checkpoint_agent_{}_munetwork_local.pth'.format(i))
        agents[i].munetwork_local.load_state_dict(checkpoint['state_dict'])
        checkpoint = torch.load('./data/checkpoint_agent_{}_munetwork_target.pth'.format(i))
        agents[i].munetwork_target.load_state_dict(checkpoint['state_dict'])
    return agents

def main(args):
    print(args)
    env = UnityEnvironment(file_name=args.path)
    env_wr = EnvWrapper(env)
    agent = Agent(state_size=33, action_size=4, random_seed=10)
    scores = train(env_wr, agent, n_episodes=args.episodes)
    fig, ax = plt.subplots(1, 1, figsize=(8, 8))
    ax.plot(scores)
    ax.set_xlabel("Episodes")
    ax.set_ylabel("Score")
    fig.savefig("scores.png")
    env_wr.close()

def __init__(self, agent_count, state_size, action_size, random_seed):
    self.action_size = action_size
    self.state_size = state_size
    self.agent_count = agent_count
    self.agents = [Agent(agent_count, state_size, action_size, random_seed)
                   for _ in range(agent_count)]
    random.seed(random_seed)
    # Noise process
    self.noise = OUNoise(action_size, random_seed)
    self.exploration = 1.0
    # Replay memory
    self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)
    self.step_count = 0

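# The wrapper above instantiates OUNoise, which is not defined in these
# snippets. A sketch of a standard Ornstein-Uhlenbeck noise process, as
# commonly paired with DDPG; the parameter defaults are conventional
# assumptions, not taken from this source:

import copy
import random

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new noise sample."""
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state
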
def ddpg_train(n_episodes, seed, buffer_size, batch_size, gamma, tau,
               lr_actor, lr_critic, weight_decay):
    scores = []
    scores_deque = deque(maxlen=100)
    agent = Agent(n_agents, state_size, action_size, seed, buffer_size,
                  batch_size, gamma, tau, lr_actor, lr_critic, weight_decay)
    load(agent)
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations
        agent.reset()  # reset the agent noise
        score = np.zeros(n_agents)
        while True:
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]  # send the actions to the environment
            next_states = env_info.vector_observations  # get the next states
            rewards = env_info.rewards  # get the rewards
            dones = env_info.local_done  # see if the episode has finished
            agent.step(states, actions, rewards, next_states, dones)
            score += rewards  # update the score
            states = next_states  # roll over the states to the next time step
            if np.any(dones):  # exit the loop if the episode finished
                break
        scores.append(np.mean(score))
        scores_deque.append(np.mean(score))
        print('\rEpisode: \t{} \tScore: \t{:.2f} \tAverage Score: \t{:.2f}'
              .format(i_episode, np.mean(score), np.mean(scores_deque)), end="")
        if i_episode % 10 == 0:  # save a checkpoint every 10 episodes
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
        if np.mean(scores_deque) >= 30.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                  .format(i_episode, np.mean(scores_deque)))
            break
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.grid()
    ax.plot(np.arange(len(scores)), scores)
    ax.set(xlabel="Episode #", ylabel="Score", title="DDPG Network")
    fig.savefig("ddpg_network.pdf")

def __init__(self, state_size, action_size, discount_factor=0.95, tau=0.02):
    self.num_agents = 2
    self.maddpg = [Agent(state_size, action_size, 2)
                   for _ in range(self.num_agents)]
    self.state_size = state_size
    self.action_size = action_size
    self.discount_factor = discount_factor
    self.tau = tau
    self.iter = 0
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                               self.num_agents)
    self.t_step = 0

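# Several of the wrappers above keep a `tau` for soft target updates but the
# update itself is not shown. A minimal sketch of the standard DDPG soft
# update, assuming local and target networks with matching parameter shapes;
# this is conventional practice, not code taken from this source:

def soft_update(local_model, target_model, tau):
    """Blend local weights into the target: theta_target = tau*theta_local + (1-tau)*theta_target."""
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)
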
def main():
    # Load version 1 (with 1 agent) of the Unity Reacher environment
    env_name = "Reacher_Windows_x86_64_version1/Reacher.exe"
    no_graphics = True
    env = UnityEnvironment(file_name=env_name, no_graphics=no_graphics)

    # Environments contain brains, which are responsible for deciding the
    # actions of their associated agents. Here we check for the first brain
    # available and set it as the default brain we will control from Python.
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # Reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # Number of agents
    num_agents = len(env_info.agents)
    print("Number of agents : ", num_agents)

    # Size of each action
    action_size = brain.vector_action_space_size
    print("Size of each action : ", action_size)

    # Examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print("There are {} agents. Each observes a state with length: {}".format(
        states.shape[0], state_size))
    print('The state for the first agent looks like:', states[0])

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    random_seed = 12345
    agent = Agent(state_size, action_size, random_seed, device=device)
    scores = train_ddpg_v1(env, agent, n_episodes=300)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(1, len(scores) + 1), scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.show()

def __init__(self, agent_count, state_size, action_size, random_seed):
    """Initialize a MultiAgent object.

    Params
    ======
        agent_count (int): number of agents
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
    """
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
    self.agents = [Agent(memory=self.memory,
                         state_size=state_size,
                         action_size=action_size,
                         random_seed=random_seed)
                   for _ in range(agent_count)]

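# The MultiAgent above wires one shared ReplayBuffer into every Agent but
# shows no act/step logic. A sketch of how the delegation might look, written
# as free functions; the Agent methods `act`/`step` and their signatures are
# assumptions modeled on the other snippets in this collection, not on this
# source:

def multi_agent_act(agents, states, add_noise=True):
    # One action per agent, each computed from that agent's own observation
    return [agent.act(state, add_noise) for agent, state in zip(agents, states)]


def multi_agent_step(agents, states, actions, rewards, next_states, dones):
    # Each agent records its own transition; because the replay buffer is
    # shared, every agent's experience is available to all of them at
    # sampling time
    for agent, state, action, reward, next_state, done in zip(
            agents, states, actions, rewards, next_states, dones):
        agent.step(state, action, reward, next_state, done)
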
def launch(app_path, train_or_test, save_or_load_path, hyper_file):
    env = UnityEnvironment(file_name=app_path)
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # Reset the environment
    env_info = env.reset(train_mode=train_or_test)[brain_name]

    # Number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # Size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # Examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size))

    agent = Agent(state_size=state_size, action_size=action_size,
                  n_agents=num_agents, random_seed=42)

    if train_or_test:
        if hyper_file is None:
            scores = train_ddpg(env, agent, num_agents, save_or_load_path)
        else:
            with open(hyper_file) as f:
                variables = json.load(f)
            if not {"n_episodes", "max_t", "print_every"}.issubset(variables):
                print("Parameters file is not well specified")
                env.close()
                return  # stop here: scores would be undefined below
            scores = train_ddpg(env, agent, num_agents, save_or_load_path,
                                variables["n_episodes"], variables["max_t"],
                                variables["print_every"])
        plot_scores(scores, True)
    else:
        agent.qnetwork_local.load_state_dict(torch.load(save_or_load_path))
        test_ddpg(env, agent, num_agents)
    env.close()

def main():
    env = UnityEnvironment(file_name='data/Reacher_Linux/Reacher.x86_64')

    # Get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # Reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # Number of agents
    num_agents = len(env_info.agents)

    # Size of each action
    action_size = brain.vector_action_space_size

    # Examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]

    n_agent = 20
    agent = Agent(state_size=state_size, action_size=action_size,
                  random_seed=2, n_agent=n_agent)

    # Load the trained model
    agent.actor_local.load_state_dict(torch.load('model/checkpoint_actor.pth'))
    agent.critic_local.load_state_dict(torch.load('model/checkpoint_critic.pth'))

    env_info = env.reset(train_mode=False)[brain_name]
    state = env_info.vector_observations
    for t in range(1000):
        action = [agent.act(state[agent_x], agent_x, add_noise=False)
                  for agent_x in range(n_agent)]
        env_info = env.step(action)[brain_name]
        next_state = env_info.vector_observations
        reward = env_info.rewards
        done = env_info.local_done
        state = next_state
        if all(done):
            break
    env.close()

def trained_agent():
    # Training hyperparameters are irrelevant for evaluation, so pass zeros
    agent = Agent(n_agents, state_size, action_size, 0, 0, 0, 0, 0, 0, 0, 0)
    load(agent)
    for episode in range(3):
        env_info = env.reset(train_mode=False)[brain_name]
        states = env_info.vector_observations
        score = np.zeros(n_agents)
        while True:
            actions = agent.act(states, add_noise=False)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            score += rewards
            states = next_states
            if np.any(dones):
                break
        print('Episode: \t{} \tScore: \t{:.2f}'.format(episode, np.mean(score)))
    env.close()

def main():
    # Select this option to load version 2 (with 20 agents) of the environment
    env = UnityEnvironment(file_name='data/Reacher_Linux/Reacher.x86_64')

    # Get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # Reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # Number of agents
    n_agent = len(env_info.agents)

    # Size of each action
    action_size = brain.vector_action_space_size

    # Size of the state space
    state_size = env_info.vector_observations.shape[1]

    # Train
    agent = Agent(state_size=state_size, action_size=action_size,
                  random_seed=2, n_agent=n_agent)
    scores = ddpg(env, agent, n_agent)

def ddpg(n_episodes=1000, max_t=500, print_every=100):
    scores_deque = deque(maxlen=print_every)
    scores = []

    # Create the environment and the agent
    terminating_angle = 15
    env = CubeEnv(np.deg2rad(terminating_angle))
    agent = Agent(state_size=3, action_size=1, random_seed=2)
    plotter = LivePlotter(env, max_t, terminating_angle, n_episodes)

    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        agent.reset()
        score = 0
        done = False
        plotter.reset()
        while not done:
            # Select the next action and update the system
            action = agent.act(state) * 10
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            # Update plots and metrics
            state = next_state
            score += reward
            plotter.add_data_from_env(env)
        scores_deque.append(score)
        scores.append(score)
        plotter.add_score(score)
        print('\rEpisode {}\tScore: {}'.format(i_episode, score), end="")
        # Display the plots
        plotter.display()
        # Save the model
        torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
        torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
    return scores

def test():
    agent = Agent(state_size=33, action_size=4, seed=0)
    load_model(agent.critic_local, 'solved_critic_trained_model.pth')
    load_model(agent.actor_local, 'solved_actor_trained_model.pth')
    env_info = env.reset(train_mode=False)[brain_name]
    state = env_info.vector_observations
    score = np.zeros(1)
    while True:
        action = agent.act(state, 0, False)
        env_info = env.step(action)[brain_name]
        next_state = env_info.vector_observations  # get the next state
        reward = env_info.rewards                  # get the reward
        done = env_info.local_done                 # see if the episode has finished
        state = next_state
        score += reward
        if np.any(done):
            print('\r\tTest Score: {:.2f}'.format(score[0]))
            break

def __init__(self, state_size, action_size, num_agents, random_seed, config):
    """Initialize the MultiAgent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        num_agents (int): number of agents
        random_seed (int): random seed
        config (dict): hyperparameters, including "batch_size" and "buffer_size"
    """
    self.agents = [Agent(state_size, action_size, num_agents, random_seed, config)
                   for _ in range(num_agents)]
    self.num_agents = num_agents
    self.action_size = action_size
    # Replay memory
    self.batch_size = config["batch_size"]
    self.memory = ReplayBuffer(action_size, config["buffer_size"],
                               self.batch_size, random_seed, num_agents)

def ddpg_test():
    # Training hyperparameters are irrelevant for evaluation, so pass zeros
    agents = [Agent(state_size, action_size, 0, 0, 0, 0, 0, 0, 0, 0, 0)
              for _ in range(num_agents)]
    load(agents)
    for i_episode in range(3):
        env_info = env.reset(train_mode=False)[brain_name]
        states = env_info.vector_observations
        # Reset each agent's noise process at the start of the episode
        for agent in agents:
            agent.reset()
        while True:
            actions = [agent.act(state) for agent, state in zip(agents, states)]
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            dones = env_info.local_done
            states = next_states
            if np.any(dones):
                break

def __init__(self, num_agents=2, state_size=24, action_size=2):
    """Initialize a maddpg_agent wrapper.

    Params
    ======
        num_agents (int): the number of agents in the environment
        state_size (int): dimension of each state
        action_size (int): dimension of each action
    """
    self.num_agents = num_agents
    self.state_size = state_size
    self.action_size = action_size
    self.agents = [Agent(state_size, action_size, i + 1, random_seed=0)
                   for i in range(num_agents)]
    # Replay memory shared by all agents
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed=0)

def runner(chkp_actor=None, chkp_critic=None):
    '''
    This function loads the environment and the agent. By default it runs in
    training mode, but if checkpoint files are passed it runs in eval mode.

    Params
    ======
        chkp_actor (None|file): file containing an actor checkpoint saved during training.
        chkp_critic (None|file): file containing a critic checkpoint saved during training.
    '''
    # Instantiate the Unity environment
    env = UnityEnvironment(file_name='./Reacher_Linux/Reacher.x86_64')

    # Get the first brain
    brain_name = env.brain_names[0]

    # Get the action size
    brain = env.brains[brain_name]
    action_size = brain.vector_action_space_size

    # Get the state size
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations
    state_size = states.shape[1]

    # Instantiate the Agent
    agent = Agent(state_size=state_size, action_size=action_size, random_seed=2)

    if chkp_actor:
        cp_actor = torch.load(chkp_actor)
        cp_critic = torch.load(chkp_critic)
        agent.actor_local.load_state_dict(cp_actor)
        agent.critic_local.load_state_dict(cp_critic)
        ddpg(agent, env, brain_name, n_episodes=100, train=False)
    else:
        ddpg(agent, env, brain_name, train=True)

    env.close()