import datetime
from collections import deque

import numpy as np
import torch

from unityagents import UnityEnvironment  # Unity ML-Agents wrapper used by the Reacher environment

# DDPGAgent and the environment helpers (start_env, reset_env_info, env_step,
# get_state_size, get_action_size, get_total_agents) are defined elsewhere in the project.


def ddpg_run(episodes=1000, seed=42):
    env = start_env()
    env_info = reset_env_info(env)
    state_size = get_state_size(env_info)
    action_size = get_action_size(env)
    print('Seed used:', seed)
    total_agents = get_total_agents(env_info)
    agent = DDPGAgent(total_agents, state_size, action_size, seed)

    scores = []
    scores_window = deque(maxlen=100)  # rolling window of the last 100 episode scores

    for episode in range(1, episodes + 1):
        init_time = datetime.datetime.now()
        env_info = reset_env_info(env)
        score = np.zeros(total_agents)
        dones = np.zeros(total_agents)
        agent.reset()
        critic_losses = []
        actor_losses = []

        while not np.any(dones):
            states = env_info.vector_observations
            actions = agent.act(states, add_noise=True)
            env_info = env_step(env, actions)
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            critic_loss, actor_loss = agent.step(states, actions, rewards, next_states, dones)
            critic_losses.append(critic_loss)
            actor_losses.append(actor_loss)
            # print('\rActor Loss: {:.6f} - Critic Loss: {:.6f}'.format(actor_loss, critic_loss), end='')
            score += rewards

        scores_window.append(np.mean(score))
        scores.append(np.mean(score))

        print('Ep. {}/{} - Avg Global Score: {:.2f} - Avg Ep. Score: {:.2f} - '
              'Min Ep. Score: {:.2f} - Max Ep. Score: {:.2f} - '
              'Actor loss: {:.6f}, Critic loss: {:.6f} - time: {}'.format(
                  episode, episodes, np.mean(scores_window), np.mean(score),
                  np.min(score), np.max(score), np.mean(actor_losses),
                  np.mean(critic_losses), datetime.datetime.now() - init_time))

        if np.mean(scores_window) >= 30.0 and episode >= 100:
            print('\nEnvironment solved (mean of 30.0 for 100 episodes) in {:d} episodes!'
                  '\tAverage Score: {:.2f}'.format(episode, np.mean(scores_window)))
            torch.save(agent.actor_local.state_dict(), 'actor_local_checkpoint.pth')
            torch.save(agent.actor_target.state_dict(), 'actor_target_checkpoint.pth')
            torch.save(agent.critic_local.state_dict(), 'critic_local_checkpoint.pth')
            torch.save(agent.critic_target.state_dict(), 'critic_target_checkpoint.pth')
            break

    env.close()
    return scores
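# Usage sketch (not part of the original code): ddpg_run() returns the per-episode
# average scores, which can be plotted to inspect training progress. The episode
# count below is an arbitrary example value; matplotlib is only needed for the plot.
if __name__ == '__main__':
    import matplotlib.pyplot as plt

    scores = ddpg_run(episodes=500, seed=42)

    plt.plot(range(1, len(scores) + 1), scores)
    plt.xlabel('Episode')
    plt.ylabel('Average score across agents')
    plt.show()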
def play():
    env = UnityEnvironment(file_name='./Reacher.app')

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))

    # create the agent and load pretrained weights
    agent = DDPGAgent(state_size=state_size, action_size=action_size, seed=0)
    agent.policy_local.load_state_dict(torch.load('policy.pth'))

    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    state = env_info.vector_observations[0]             # get the current state of the first agent
    score = 0                                           # initialize the score
    while True:
        action = agent.act(state, add_noise=False)      # select an action
        env_info = env.step(action)[brain_name]         # send the action to the environment
        next_state = env_info.vector_observations[0]    # get the next state
        reward = env_info.rewards[0]                    # get the reward
        done = env_info.local_done[0]                   # see if the episode has finished
        score += reward                                 # update the score
        state = next_state                              # roll over the state to the next time step
        if done:                                        # exit the loop if the episode has finished
            break

    print('Total score this episode: {}'.format(score))
    env.close()
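# Note: play() loads weights into agent.policy_local, whereas ddpg_run() above saves
# actor checkpoints named actor_local_checkpoint.pth. If both functions are used with
# the same DDPGAgent class, the checkpoint would be loaded into the matching attribute,
# e.g. (an assumption, depending on how DDPGAgent is defined):
#     agent.actor_local.load_state_dict(torch.load('actor_local_checkpoint.pth'))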
def testAgent():
    print("Testing the Agent")
    agent = DDPGAgent(state_size=state_size, action_size=action_size, n_agents=n_agents, seed=0,
                      pretrainedWeightsFile='checkpoint_actor.pth', train=False)

    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    states = env_info.vector_observations               # get the current states
    score = np.zeros(n_agents)                          # initialize the scores
    while True:
        actions = agent.act(states)                     # select actions
        env_info = env.step(actions)[brain_name]        # send the actions to the environment
        next_states = env_info.vector_observations      # get the next states
        rewards = env_info.rewards                      # get the rewards
        dones = env_info.local_done                     # see if the episode has finished
        score += np.array(rewards)                      # update the scores
        states = next_states                            # roll over the states to the next time step
        if np.any(dones):                               # exit the loop if the episode has finished
            break

    print("Score: {}".format(np.mean(score)))
    return score
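# testAgent() above relies on module-level names (env, brain_name, state_size,
# action_size, n_agents) being defined before it is called. A minimal setup sketch,
# assuming the same Reacher.app path and unityagents API used in play() above:
if __name__ == '__main__':
    env = UnityEnvironment(file_name='./Reacher.app')
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    env_info = env.reset(train_mode=True)[brain_name]
    n_agents = len(env_info.agents)
    state_size = env_info.vector_observations.shape[1]
    action_size = brain.vector_action_space_size

    testAgent()
    env.close()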