# --- One training episode for an N-agent environment (Unity ML-Agents style). ---
# NOTE(review): this chunk was recovered from a whitespace-mangled paste; the
# line breaks/indentation below are reconstructed from the statement order.
# Relies on names defined outside this view: agent, env, brain_name, states,
# num_agents, scores, scores_deque, i_episode, print_every — verify in caller.
agent.reset()
score = 0
# One action slot per agent; presumably a 4-dimensional continuous action
# space (e.g. Reacher) — TODO confirm against the environment's action size.
actions = [np.array([0.0, 0.0, 0.0, 0.0])] * num_agents
# Train until environment ends the episode
while True:
    for env_agent_idx in range(num_agents):
        # Let deep learning agent act based on states
        actions[env_agent_idx] = agent.act(states[env_agent_idx])
    # Send all agents' actions to the environment in a single step.
    env_info = env.step(actions)[brain_name]
    next_states = env_info.vector_observations
    rewards = env_info.rewards
    dones = env_info.local_done
    for env_agent_idx in range(num_agents):
        # Save to replay buffer
        agent.memorize(states[env_agent_idx], actions[env_agent_idx], \
                       rewards[env_agent_idx], next_states[env_agent_idx], \
                       dones[env_agent_idx])
    # Learn
    # NOTE(review): placement after the memorize loop (one learn call per
    # environment step) is inferred from the flattened source — confirm it
    # was not intended to run once per agent inside the loop.
    agent.step()
    states = next_states
    # Episode score accumulates the mean reward across agents per step.
    score += np.sum(rewards) / len(rewards)
    # Episode ends as soon as ANY agent reports done.
    if np.any(dones):
        break
# Check and track scores
scores_deque.append(score)          # rolling window (deque) for the moving average
scores.append(score)                # full history of per-episode scores
average_score = np.mean(scores_deque)
# Carriage-return print keeps the progress line updating in place.
print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'.format(
    i_episode, average_score, score), end="")
if i_episode % print_every == 0:
    # NOTE(review): chunk is truncated here — the body of this periodic
    # reporting branch is not visible in this view.
# --- One training episode for a two-agent environment (Tennis-style). ---
# NOTE(review): recovered from a whitespace-mangled paste; line breaks and
# indentation below are reconstructed from the statement order. Relies on
# names defined outside this view: env_info (from a preceding env.reset),
# agent, env, brain_name, scores, scores_deque, i_episode — verify in caller.
states = env_info.vector_observations
score = 0
steps = 0
# Train until environment ends the episode
while True:
    steps += 1
    # Let deep learning agent act based on states
    # The same agent instance acts for both players (shared policy).
    action_0 = agent.act(states[0])
    action_1 = agent.act(states[1])
    # Send action to Unity environment
    env_info = env.step([action_0, action_1])[brain_name]
    states_next = env_info.vector_observations
    rewards = env_info.rewards
    dones = env_info.local_done
    # Save experiences to replay buffer
    # Both agents' transitions go into the one shared replay buffer.
    agent.memorize(states[0], action_0, rewards[0], states_next[0], dones[0])
    agent.memorize(states[1], action_1, rewards[1], states_next[1], dones[1])
    # Learn
    # Called twice per environment step — presumably one learning update per
    # stored experience; TODO confirm this matches the agent's design.
    agent.update_step()
    agent.update_step()
    states = states_next
    # Episode score accumulates the mean reward across the two agents.
    score += np.sum(rewards) / len(rewards)
    # Episode ends as soon as ANY agent reports done.
    if np.any(dones):
        break
# Check and track scores
scores_deque.append(score)          # rolling window (deque) for the moving average
scores.append(score)                # full history of per-episode scores
score_average = np.mean(scores_deque)
# Carriage-return print keeps the progress line updating in place.
print('\rEpisode {}\tAverage Score: {:.4f}\tScore: {:.4f}'.format(
    i_episode, score_average, score),
    # NOTE(review): chunk is truncated here — the remaining print arguments
    # (e.g. an end="" keyword) are not visible in this view.