def watch_agent(env_name, agent_ckpt, steps):
    device = torch.device(DEVICE)

    if env_name == 'reacher':
        env = UnityEnv(env_file='data/Reacher.exe', no_graphics=False)
        policy = ReacherActorCritic(env.state_size, env.action_size).to(device)
    else:
        env = UnityEnv(env_file='data/Crawler/Crawler_Windows_x86_64.exe',
                       no_graphics=False, mlagents=True)
        policy = CrawlerActorCritic(env.state_size, env.action_size).to(device)

    checkpoint = torch.load(agent_ckpt, map_location=DEVICE)
    policy.load_state_dict(checkpoint)

    running_rewards = np.zeros(env.num_agents)
    scores = np.zeros(env.num_agents)
    state = env.reset(train=False)

    for step_i in range(steps):
        action, _, _, _ = policy(torch.from_numpy(state).float().to(device))
        state, r, done = env.step(action.detach().cpu().numpy())
        running_rewards += r

        # check if agent is done
        agents_are_done = True
        for i in range(env.num_agents):
            if done[i] and scores[i] == 0:
                scores[i] = running_rewards[i]
            if scores[i] == 0:
                agents_are_done = False
        if agents_are_done:
            break

    env.close()
    print(f'Average score of 20 agents is: {np.mean(scores):.2f}')
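# A minimal sketch of invoking watch_agent above. The checkpoint path is a
# placeholder (any actor-critic checkpoint saved by this repo's training
# scripts would do), and 1000 steps is an arbitrary illustrative value.
if __name__ == '__main__':
    watch_agent('reacher', 'saved_models/ppo.ckpt', 1000)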
def main(path='model_checkpoints'):
    # seed = 1234

    ### For unity ###
    env = UnityEnv(env_file='Environments/Reacher_Linux_20/Reacher.x86_64',
                   no_graphics=False)

    # number of agents
    num_agents = env.num_agents
    print('Number of agents:', num_agents)

    # size of each action
    action_size = env.action_size

    # examine the state space
    state_size = env.state_size
    print('Size of each action: {}, Size of the state space {}'.format(
        action_size, state_size))

    ### For gym ###
    # K = 2
    # env = gym.make('MountainCarContinuous-v0')
    # nS = env.observation_space.shape[0]
    # nA = env.action_space.shape[0]
    # K_envs = MultiEnv(env, nS, K)

    ddpg_config = Config('maddpg')
    maddpg = MultiAgent(env, state_size, action_size, ddpg_config)
    maddpg.load_weights(ddpg_config.critic_path, ddpg_config.actor_path)
    maddpg.evaluate()
def main(algo):
    seed = 7

    # Load the ENV
    ### For running in VSCode ###
    # env = UnityEnv(env_file='Environments/Tennis_Linux/Tennis.x86_64', no_graphics=True)
    ### For running from terminal ###
    env = UnityEnv(env_file='../Environments/Tennis_Linux/Tennis.x86_64',
                   no_graphics=True)

    # number of agents
    num_agents = env.num_agents
    print('Number of agents:', num_agents)

    # size of each action
    action_size = env.action_size

    # examine the state space
    state_size = env.state_size
    print('Size of each action: {}, Size of the state space {}'.format(
        action_size, state_size))

    ddpg_config = Config(algo)
    maddpg = MultiAgent(env, state_size, action_size, ddpg_config)
    maddpg.seed_replay_buffer()
    maddpg.train()
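# A minimal, hypothetical entry point for the training script above. The
# argparse flag name and the default algorithm string ('maddpg', as passed to
# Config elsewhere in this repo) are illustrative assumptions, not part of the
# original file.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--algo', default='maddpg')
    args = parser.parse_args()
    main(args.algo)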
def main(path='model_checkpoints'):
    seed = 1234
    env = UnityEnv(env_file='Environments/Reacher_Linux_20/Reacher.x86_64',
                   no_graphics=False)

    # number of agents
    num_agents = env.num_agents
    print('Number of agents:', num_agents)

    # size of each action
    action_size = env.action_size

    # examine the state space
    state_size = env.state_size
    print('Size of each action: {}, Size of the state space {}'.format(
        action_size, state_size))

    path = 'model_checkpoints/ppo.ckpt'
    agent = PPO(env, action_size, state_size, seed)
    agent.load_weights(path)

    rewards = []
    state = env.reset()
    for i in range(4000):
        action, _, _, _ = agent.policy(state)
        next_state, reward, done = env.step(action.cpu().numpy())
        # print(next_state, reward, done)
        state = next_state
        rewards.append(np.sum(reward))  # sum of the per-agent rewards at this step
    env.close()
    print("The agent achieved an average score of {:.2f}".format(
        np.mean(rewards)))
def main(path='model_checkpoints'):
    seed = 1234
    env = UnityEnv(env_file='../Environments/Tennis_Linux/Tennis.x86_64',
                   no_graphics=False)

    # number of agents
    num_agents = env.num_agents
    print('Number of agents:', num_agents)

    # size of each action
    action_size = env.action_size

    # examine the state space
    state_size = env.state_size
    print('Size of each action: {}, Size of the state space {}'.format(
        action_size, state_size))

    config = Config('ddpg')
    path = '/home/shuza/Code/Udacity_multiplayer/DDPG/model_weights/ddpg.ckpt'
    agent = Agent(state_size * 2, action_size * 2, Actor, Critic, config)
    agent.load_weights(path)

    rewards = []
    state = env.reset()
    for i in range(4000):
        action = agent.evaluate(state.reshape(-1))
        next_state, reward, done = env.step(action.reshape(2, -1))
        # print(next_state, reward, done)
        state = next_state
        rewards.append(np.sum(reward))  # sum of both agents' rewards at this step
        if done.any():
            break
    env.close()
    print("The agent achieved an average score of {:.2f}".format(
        np.mean(rewards)))
def watch_rnd_game(steps):
    env = UnityEnv(env_file='data/Crawler/Crawler_Windows_x86_64.exe',
                   no_graphics=False, mlagents=True)
    env.reset(train=False)
    rewards = np.zeros(env.num_agents)

    for i in range(steps):
        action = np.random.rand(env.num_agents, env.action_size)
        _, r, done = env.step(action)
        rewards += r
        if done.all():
            break

    print(f'Average score of 20 agents is: {np.mean(rewards):.2f}')
    env.close()
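# Quick illustrative call: watch randomly acting crawlers for a bounded number
# of steps. The step count of 1000 is an arbitrary choice for the sketch, not a
# value taken from the repository.
if __name__ == '__main__':
    watch_rnd_game(1000)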
def main(algo):
    seed = 7
    path = 'model_checkpoints/ppo.ckpt'

    # Load the ENV
    # env = UnityEnv(env_file='Environments/Reacher_Linux_one/Reacher.x86_64', no_graphics=True)
    env = UnityEnv(env_file='Environments/Tennis_Linux/Tennis.x86_64',
                   no_graphics=True)

    # number of agents
    num_agents = env.num_agents
    print('Number of agents:', num_agents)

    # size of each action
    action_size = env.action_size

    # examine the state space
    state_size = env.state_size
    print('Size of each action: {}, Size of the state space {}'.format(
        action_size, state_size))

    config = Config(algo)
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        device2 = torch.device("cuda:1")
    else:
        device = torch.device('cpu')

    agent = PPO(action_size, state_size, seed, device, config)

    # Leftover experiment with one PPO agent per GPU:
    # try:
    #     agent_a = PPO(action_size, state_size, seed, device, config)
    #     agent_b = PPO(action_size, state_size, seed, device2, config)
    #     print('Double GPU')
    # except:
    #     print('Single GPU')
    #     agent_a = PPO(action_size, state_size, seed, device, config)
    #     agent_b = PPO(action_size, state_size, seed, device, config)

    train_ppo(env, agent, EPISODES, path)
clip = random.choice(clips)
nstep = random.choice(nsteps)
epoch = random.choice(epochs)
gae_tau = random.choice(gae_taus)
weight_decay = random.choice(weight_decays)
lrate_decay = random.choice(lrate_decays)
lrate_schedule = lambda it: lrate_decay**it

summary = f'nbatch_{nbatch:d}_lrate_{lrate:.0E}_clip_{clip:.2f}'
summary += f'_nstep_{nstep:d}_epoch_{epoch:d}_gae_{gae_tau:.2f}'
summary += f'_lrdecay_{lrate_decay}_wdcay_{weight_decay}'
writer = SummaryWriter(os.path.join(root_logdir, summary))

# create new environment
env = UnityEnv(env_file='data/Crawler/Crawler_Windows_x86_64.exe',
               mlagents=True)

# create new policy
policy = CrawlerActorCritic(env.state_size, env.action_size).to(device)

# create agent
a = Agent(env, policy,
          nsteps=nstep,
          gamma=gamma,
          epochs=epoch,
          nbatchs=nbatch,
          ratio_clip=clip,
          lrate=lrate,
          gradient_clip=gradient_clip,
          beta=beta,
episodes = 2000   # total number of episodes to run
steps = 2000      # maximum number of steps per episode
upd_every = 1     # update agents every # of steps
batch_size = 128
expl_theta = 0.15
expl_sigma = 0.2
lrate_actor = 1e-3
lrate_critic = 1e-3
tau = 0.02

# environment
env = UnityEnv()

agent = Agent(env.state_size * 2, env.action_size, Actor, Critic,
              exploration_sigma=expl_sigma,
              exploration_theta=expl_theta,
              lrate_actor=lrate_actor,
              lrate_critic=lrate_critic,
              update_every=upd_every,
              batch_size=batch_size,
              tau=tau)

# logging
scores = deque(maxlen=100)
        writer.add_scalar('data/score', mean, ep_i)

        if mean > 0.50 and mean > last_saved:
            summary += " (saved)"
            last_saved = mean
            agent.save('saved_models/tennis_ddpg.ckpt')

        print(summary)


if __name__ == '__main__':
    # hyperparameters
    episodes = 2000
    steps = 2000

    # environment
    env = UnityEnv(no_graphics=False)
    state_size = env.state_size * 2
    action_size = env.action_size * 2

    # agent
    agent = Agent(state_size, action_size, Actor, Critic,
                  lrate_critic=1e-3,
                  lrate_actor=1e-4,
                  tau=0.01,
                  buffer_size=1e6,
                  batch_size=256,
                  gamma=0.99,
                  exploration_mu=0.0,
class Trainer:
    def __init__(self, params):
        seed = params['general_params']['seed']
        self.__set_seed(seed=seed)

        env_params = params['env_params']
        env_params['seed'] = seed
        self.env = UnityEnv(params=env_params)

        agent_params = params['agent_params']
        agent_params['state_size'] = self.env.observation_space.shape[0]
        agent_params['action_size'] = self.env.action_space.n
        self.agent = Agent(params=agent_params)

        trainer_params = params['trainer_params']
        self.learning_rate_decay = trainer_params['learning_rate_decay']
        self.max_eps = trainer_params['max_eps']
        self.final_eps = trainer_params['final_eps']
        self.eps_decay = trainer_params['eps_decay']
        self.b_decay = trainer_params['b_decay']
        self.results_path = trainer_params['results_path']
        self.model_path = trainer_params['model_path']

        # data gathering variables
        self.avg_rewards = []
        self.scores = []
        self.score = 0

        print("Configuration:")
        pprint(params)
        logging.info("Configuration: {}".format(params))

    def train(self, num_of_episodes):
        reward_window = deque(maxlen=100)
        self.eps_decay = (self.final_eps / self.max_eps)**(1 / (0.2 * num_of_episodes))
        reward_matrix = np.zeros((num_of_episodes, 300))

        for episode_i in range(1, num_of_episodes):
            state = self.env.reset()
            done = False
            total_reward = 0
            total_loss = 0
            # self.agent.eps = self.max_eps / (episode_i + 1)
            self.agent.eps *= self.eps_decay
            # self.agent.b = 1 - np.exp(-self.b_decay * episode_i)
            counter = 0

            while not done:
                action = self.agent.choose_action(state)
                next_state, reward, done, _ = self.env.step(action)
                self.agent.step(state, action, reward, next_state, done)
                state = next_state
                # DEBUG
                # logging.info("epsiode: {}, reward: {}, counter: {}, action: {}, actions: {}, fc1 weight data: {}".
                #              format(episode_i, reward, counter, action, actions,
                #                     self.agent.get_qlocal().fc1.weight.data))
                total_loss += self.agent.agent_loss
                total_reward += reward
                reward_matrix[episode_i, counter] = reward
                counter += 1

            reward_window.append(total_reward)

            print(
                '\rEpisode {}\tCurrent Score: {:.2f}\tAverage Score: {:.2f} '
                '\t\tTotal loss: {:.2f}\tEpsilon: {:.2f}\tBeta: {:.2f}\tLearning rate: {:.4f}'
                .format(episode_i, total_reward, np.mean(reward_window),
                        total_loss, self.agent.eps, self.agent.b,
                        self.agent.learning_rate),
                end="")
            logging.info(
                'Episode {}\tCurrent Score: {:.2f}\tAverage Score: {:.2f} '
                '\t\tTotal loss: {:.2f}\tEpsilon: {:.2f}\tBeta: {:.2f}\tLearning rate: {:.4f}'
                .format(episode_i, total_reward, np.mean(reward_window),
                        total_loss, self.agent.eps, self.agent.b,
                        self.agent.learning_rate))

            self.agent.learning_rate *= self.learning_rate_decay
            self.agent.set_learning_rate(self.agent.learning_rate)

            if episode_i % 100 == 0:
                avg_reward = np.mean(np.array(reward_window))
                print("\rEpisode: {}\tAverage total reward: {:.2f}".format(
                    episode_i, avg_reward))
                self.avg_rewards.append(avg_reward)

                if avg_reward >= 13.0:
                    print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                          .format(episode_i - 100, avg_reward))
                    torch.save(
                        self.agent.get_qlocal().state_dict(),
                        self.model_path + 'checkpoint_{}.pth'.format(
                            datetime.datetime.today().strftime('%Y-%m-%d_%H-%M')))

        t = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M')
        reward_matrix.dump(self.results_path + 'reward_matrix_new_{}.dat'.format(t))
        np.array(self.avg_rewards).dump(self.results_path +
                                        'average_rewards_new_{}.dat'.format(t))

    def test(self, checkpoint_filename, time_span=10):
        checkpoint_path = self.model_path + checkpoint_filename
        self.agent.get_qlocal().load_state_dict(torch.load(checkpoint_path))

        for t in range(time_span):
            state = self.env.reset(train_mode=False)
            self.score = 0
            done = False
            while not done:
                action = self.agent.choose_action(state, 'test')
                sys.stdout.flush()
                self.env.render()
                state, reward, done, _ = self.env.step(action)
                self.score += reward
            print('\nFinal score:', self.score)

        self.env.close()

    @staticmethod
    def __set_seed(seed):
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        random.seed(seed)
        np.random.seed(seed)
def make_env(file_name, wrapped=False):
    if wrapped:
        env = UnityEnv(environment_filename=file_name)
    else:
        env = UnityEnvironment(file_name=file_name)
    return env
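# Minimal usage sketch for make_env above; 'data/Reacher.exe' is a placeholder
# path for a Unity build. wrapped=True selects the gym-style UnityEnv wrapper,
# otherwise the raw ML-Agents UnityEnvironment is returned.
env = make_env('data/Reacher.exe', wrapped=True)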
class Trainer:
    def __init__(self, params):
        seed = params['general_params']['seed']
        self.__set_seed(seed=seed)

        env_params = params['env_params']
        env_params['seed'] = seed
        self.env = UnityEnv(params=env_params)

        agent_params = params['agent_params']
        agent_params['state_size'] = self.env.observation_space.shape[0]
        agent_params['action_size'] = self.env.action_space_size
        self.agent = AgentPPO(params=agent_params)

        trainer_params = params['trainer_params']
        self.learning_rate_decay = trainer_params['learning_rate_decay']
        self.results_path = trainer_params['results_path']
        self.model_path = trainer_params['model_path']
        self.t_max = trainer_params['t_max']

        # data gathering variables
        self.avg_rewards = []
        self.scores = []
        self.score = 0

        print("PPO agent.")
        print("Configuration:")
        pprint(params)
        logging.info("Configuration: {}".format(params))

    def train(self, num_of_episodes):
        logging.info("Training:")
        reward_window = deque(maxlen=100)
        # reward_matrix = np.zeros((num_of_episodes, 300))

        for episode_i in range(1, num_of_episodes):
            state = self.env.reset()
            total_reward = 0
            total_loss = 0
            counter = 0
            total_action_mean = 0
            total_action_std = 0

            for t in range(self.t_max):
                action, log_probs, mean, std = self.agent.choose_action(state)
                next_state, reward, done, _ = self.env.step(action)
                self.agent.step(state, action, reward, next_state, done, log_probs)
                state = next_state
                # DEBUG
                # logging.info("epsiode: {}, reward: {}, counter: {}, action: {}".
                #              format(episode_i, reward, counter, action))
                total_loss += self.agent.agent_loss
                total_reward += np.array(reward)
                counter += 1
                # running averages of the policy's action mean and std
                total_action_mean = total_action_mean * (counter - 1) / counter + np.mean(mean) / counter
                total_action_std = total_action_std * (counter - 1) / counter + np.mean(std) / counter

            reward_window.append(total_reward)
            self.avg_rewards.append(np.mean(total_reward))

            print(
                '\rEpisode {}\tCurrent Score: {:.2f}\tAverage Score: {:.2f}\tMean: {:.2f} \tStd {:.2f} '
                '\t\tTotal loss: {:.2f}\tLearning rate (actor): {:.4f}\tLearning rate (critic): {:.4f}'
                .format(episode_i, np.mean(total_reward), np.mean(reward_window),
                        total_action_mean, total_action_std, total_loss,
                        self.agent.learning_rate_policy,
                        self.agent.learning_rate_value_fn),
                end="")
            # logging.info('Episode {}\tCurrent Score (average over 20 robots): {:.2f}\tAverage Score (over episodes): {:.2f} '
            #              '\t\tTotal loss: {:.2f}\tLearning rate (actor): {:.4f}\tLearning rate (critic): {:.4f}'
            #              .format(episode_i, np.mean(total_reward), np.mean(reward_window),
            #                      total_loss, self.agent.learning_rate_policy, self.agent.learning_rate_value_fn))

            self.agent.learning_rate_policy *= self.learning_rate_decay
            self.agent.learning_rate_value_fn *= self.learning_rate_decay
            self.agent.set_learning_rate(self.agent.learning_rate_policy,
                                         self.agent.learning_rate_value_fn)

            if episode_i % 100 == 0:
                avg_reward = np.mean(np.array(reward_window))
                print("\rEpisode: {}\tAverage total reward: {:.2f}".format(
                    episode_i, avg_reward))

                if avg_reward >= 30.0:
                    print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                          .format(episode_i - 100, avg_reward))
                    if not os.path.exists(self.model_path):
                        os.makedirs(self.model_path)
                    torch.save(
                        self.agent.get_actor().state_dict(),
                        self.model_path + 'checkpoint_actor_{}.pth'.format(
                            datetime.datetime.today().strftime('%Y-%m-%d_%H-%M')))
                    torch.save(
                        self.agent.get_critic().state_dict(),
                        self.model_path + 'checkpoint_critic_{}.pth'.format(
                            datetime.datetime.today().strftime('%Y-%m-%d_%H-%M')))

        t = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M')
        # reward_matrix.dump(self.results_path + 'reward_matrix_new_{}.dat'.format(t))
        np.array(self.avg_rewards).dump(self.results_path +
                                        'average_rewards_new_{}.dat'.format(t))

    def test(self, checkpoint_actor_filename, checkpoint_critic_filename,
             time_span=10):
        checkpoint_actor_path = self.model_path + checkpoint_actor_filename
        checkpoint_critic_path = self.model_path + checkpoint_critic_filename
        self.agent.get_actor().load_state_dict(torch.load(checkpoint_actor_path))
        self.agent.get_critic().load_state_dict(torch.load(checkpoint_critic_path))

        for t in range(time_span):
            state = self.env.reset(train_mode=False)
            self.score = 0
            # done = False
            while True:
                action = self.agent.choose_action(state, 'test')
                sys.stdout.flush()
                self.env.render()
                state, reward, done, _ = self.env.step(action)
                self.score += np.array(reward)
                if any(done):
                    break
            print('\nFinal score:', self.score)

        self.env.close()

    @staticmethod
    def __set_seed(seed):
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        random.seed(seed)
        np.random.seed(seed)
gamma = 0.99
timesteps = 100
ratio_clip = 0.2
batch_size = int(32 * 20)
epochs = 10
gradient_clip = 10.0
lrate = 1e-4
log_each = 10
beta = 0.01
gae_tau = 0.95
decay_steps = None
solved = 30.0
out_file = 'saved_models/ppo.ckpt'

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

env = UnityEnv(env_file='data/Reacher/Reacher.exe')
policy = ReacherActorCritic(env.state_size, env.action_size).to(device)

a = Agent(env, policy,
          timesteps=timesteps,
          gamma=gamma,
          epochs=epochs,
          batch_size=batch_size,
          ratio_clip=ratio_clip,
          lrate=lrate,
          gradient_clip=gradient_clip,
          beta=beta,
          gae_tau=gae_tau)

train(a,
      iterations=iterations,
class Trainer:
    def __init__(self, params):
        seed = params['general_params']['seed']
        self.__set_seed(seed=seed)

        env_params = params['env_params']
        env_params['seed'] = seed
        self.env = UnityEnv(params=env_params)

        agent_params = params['agent_params']
        self.__num_of_agents = self.env.observation_space.shape[0]
        state_size = self.env.observation_space.shape[1]
        action_size = self.env.action_space_size
        agent_params['num_of_agents'] = self.__num_of_agents
        agent_params['state_size'] = state_size
        agent_params['action_size'] = action_size
        self.agents = Agents(params=agent_params)

        trainer_params = params['trainer_params']
        self.learning_rate_decay = trainer_params['learning_rate_decay']
        self.results_path = trainer_params['results_path']
        self.model_path = trainer_params['model_path']
        self.t_max = trainer_params['t_max']
        self.exploration_noise = UOProcess()

        # data gathering variables
        self.avg_rewards = []
        self.scores = []
        self.score = 0
        self.sigma = 0.5

        print("MADDPG agent.")
        print("Configuration:")
        pprint(params)
        logging.info("Configuration: {}".format(params))

    def train(self, num_of_episodes):
        logging.info("Training:")
        reward_window = deque(maxlen=100)

        for episode_i in range(1, num_of_episodes):
            states = self.env.reset()
            self.agents.reset(self.sigma)
            scores = np.zeros(self.env.observation_space.shape[0])
            total_loss = 0
            self.sigma *= 0.99
            counter = 0

            for t in range(self.t_max):
                actions = self.agents.choose_action(states)
                next_states, rewards, dones, _ = self.env.step(actions)
                self.agents.step(states, actions, rewards, next_states, dones)
                states = next_states
                # DEBUG
                # logging.info("epsiode: {}, reward: {}, counter: {}, action: {}".
                #              format(episode_i, reward, counter, action))
                total_loss += self.agents.agent_loss
                scores += rewards
                counter += 1
                if any(dones):
                    break

            reward_window.append(np.max(scores))
            self.avg_rewards.append(np.mean(np.array(reward_window)))

            print(
                '\rEpisode {}\tCurrent Score: {:.4f}\tAverage Score: {:.4f} '
                '\t\tTotal loss: {:.2f}\tLearning rate (actor): {:.4f}\tLearning rate (critic): {:.4f}'
                .format(episode_i, np.max(scores), np.mean(reward_window),
                        total_loss, self.agents.learning_rate_actor,
                        self.agents.learning_rate_critic),
                end="")
            logging.info(
                'Episode {}\tCurrent Score: {:.4f}\tAverage Score (over episodes): {:.4f} '
                '\t\tTotal loss: {:.2f}\tLearning rate (actors): {:.4f}\tLearning rate (critic): {:.4f}'
                .format(episode_i, np.max(scores), np.mean(reward_window),
                        total_loss, self.agents.learning_rate_actor,
                        self.agents.learning_rate_critic))

            self.agents.learning_rate_actor *= self.learning_rate_decay
            self.agents.learning_rate_critic *= self.learning_rate_decay
            self.agents.set_learning_rate(self.agents.learning_rate_actor,
                                          self.agents.learning_rate_critic)

            if episode_i % 100 == 0:
                avg_reward = np.mean(np.array(reward_window))
                print("\rEpisode: {}\tAverage total reward: {:.2f}".format(
                    episode_i, avg_reward))

                if avg_reward >= 1.0:
                    print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                          .format(episode_i - 100, avg_reward))
                    if not os.path.exists(self.model_path):
                        os.makedirs(self.model_path)
                    t = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M')
                    torch.save(self.agents.get_actor()[0].state_dict(),
                               self.model_path + 'checkpoint_actor1_{}.pth'.format(t))
                    torch.save(self.agents.get_actor()[1].state_dict(),
                               self.model_path + 'checkpoint_actor2_{}.pth'.format(t))
                    torch.save(self.agents.get_critic().state_dict(),
                               self.model_path + 'checkpoint_critic_{}.pth'.format(t))
                    np.array(self.avg_rewards).dump(
                        self.results_path + 'average_rewards_{}.dat'.format(t))

        t = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M')
        # reward_matrix.dump(self.results_path + 'reward_matrix_new_{}.dat'.format(t))
        np.array(self.avg_rewards).dump(self.results_path +
                                        'average_rewards_{}.dat'.format(t))

    def test(self, checkpoint_actor1_filename, checkpoint_actor2_filename,
             checkpoint_critic_filename, time_span=10):
        checkpoint_actor1_path = self.model_path + checkpoint_actor1_filename
        checkpoint_actor2_path = self.model_path + checkpoint_actor2_filename
        checkpoint_critic_path = self.model_path + checkpoint_critic_filename
        self.agents.get_actor()[0].load_state_dict(torch.load(checkpoint_actor1_path))
        self.agents.get_actor()[1].load_state_dict(torch.load(checkpoint_actor2_path))
        self.agents.get_critic().load_state_dict(torch.load(checkpoint_critic_path))

        for t in range(time_span):
            state = self.env.reset(train_mode=False)
            self.score = 0
            # done = False
            while True:
                action = self.agents.choose_action(state, 'test')
                state, reward, done, _ = self.env.step(action)
                self.score += np.array(np.max(reward))
                if any(done):
                    break
            print('\nFinal score:', self.score)

        self.env.close()

    @staticmethod
    def __set_seed(seed):
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        random.seed(seed)
        np.random.seed(seed)
            next_state, reward, done = env.step(action.reshape(2, -1))
            score += reward
            state = next_state
            if done.any():
                break
        scores.append(np.max(score))

    return scores


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--agent', '-a', default='saved_models/tennis_ddpg.ckpt')
    args = parser.parse_args()

    # create environment
    env = UnityEnv(no_graphics=False)
    state_size = env.state_size * 2
    action_size = env.action_size * 2

    # restore agent checkpoint
    agent = Agent(state_size, action_size, Actor, Critic, restore=args.agent)

    # watch agent
    scores = watch(env, agent, 10)
    print(f'Average score over 10 episodes: {np.mean(scores):.2f}')

    env.close()
#               'worker_id': 0,
#               'seed': np.random.randint(1000),
#               'visual_mode': False,
#               'multiagent_mode': True}
# env_name = 'Reacher'
# env = UnityEnv(env_params)

env_params = {
    'path': '../Reacher_Linux/Reacher.x86_64',
    'worker_id': 0,
    'seed': np.random.randint(1000),
    'visual_mode': False,
    'multiagent_mode': False
}
env_name = 'Reacher'
env = UnityEnv(env_params)

# env_name = 'MountainCarContinuous-v0'
# env = gym.make(env_name)
# Pendulum-v0
# MountainCarContinuous-v0
# LunarLanderContinuous-v2

try:
    observation = env.reset(train_mode=False)
except:
    observation = env.reset()
# observation = env.reset()

action_space = env.action_space
observation_space = env.observation_space

params = dict()
params['action_dim'] = len(env.action_space.low)
params['state_dim'] = len(observation_space.low)
params['num_episodes'] = 200
params['buffer_size'] = int(1e6)  # replay buffer size