Example #1
        print('Size of observations: {}'.format(self.n_states()))
        print('Example state:', self.states()[0])

    def close(self):
        self.env.close()


env = UnityEnvWrapper(no_graphics=False)
# state_size is the raw observation size plus one agent-id flag
# (the flag is appended to each observation in play() below)
agent1 = Agent(state_size=env.n_states() + 1,
               action_size=env.n_actions(),
               random_seed=2)
agent2 = Agent(state_size=env.n_states() + 1,
               action_size=env.n_actions(),
               random_seed=2)

# Both agents share a single actor: local network, target network and optimizer
agent2.actor_local = agent1.actor_local
agent2.actor_target = agent1.actor_target
agent2.actor_optimizer = agent1.actor_optimizer

print(env.n_agents(), env.n_states(), env.n_actions())


def play():
    agent1.actor_local.load_state_dict(torch.load('checkpoint_actor_1.pth'))
    agent1.actor_local.eval()
    # agent2.actor_local.load_state_dict(torch.load('checkpoint_actor_2.pth'))
    # agent2.actor_local.eval()

    state = env.reset(train_mode=False)
    while True:
        state1 = np.concatenate([state[0], [1]])
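The play() snippet above is cut off inside its loop. Below is a minimal continuation sketch, not part of the original: it assumes that UnityEnvWrapper.step() returns (next_states, rewards, dones) and that agent 2's observation gets the id flag 2.

        # Hedged continuation sketch -- the lines below are assumptions, not source code
        state2 = np.concatenate([state[1], [2]])            # assumed id flag for agent 2
        actions = [agent1.act(state1, add_noise=False),
                   agent2.act(state2, add_noise=False)]
        state, rewards, dones = env.step(actions)           # assumed return signature
        if np.any(dones):
            break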
class MADDPG():
    def __init__(self, state_size, action_size, random_seed):
        """Initialize 2 Agent objects.
        
        Params
        ======
            state_size (int): dimension of one agent's observation
            action_size (int): dimension of each action
        """
        self.state_size = state_size
        self.action_size = action_size
        # Initialize the agents
        self.ddpg_agent0 = Agent(state_size, action_size, random_seed=0)
        self.ddpg_agent1 = Agent(state_size, action_size, random_seed=1)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def act(self, states, rand=False):
        """Agents act with actor_local"""
        if rand == False:
            action0 = self.ddpg_agent0.act(states[0])
            action1 = self.ddpg_agent1.act(states[1])
            actions = [action0, action1]
            return actions
        if rand == True:
            actions = np.random.randn(2, 2)
            actions = np.clip(actions, -1, 1)
            return actions

    def step(self, states, actions, rewards, next_states, dones, learn=True):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        state0 = states[0]
        state1 = states[1]

        action0 = actions[0]
        action1 = actions[1]

        reward0 = rewards[0]
        reward1 = rewards[1]

        next_state0 = next_states[0]
        next_state1 = next_states[1]

        done0 = dones[0]
        done1 = dones[1]

        self.memory.add(state0, state1, action0, action1, reward0, reward1,
                        next_state0, next_state1, done0, done1)

        if learn and len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def learn(self, experiences, GAMMA):
        s0, s1, a0, a1, r0, r1, next_s0, next_s1, d0, d1 = experiences

        # next actions (for CRITIC network)
        a_next0 = self.ddpg_agent0.actor_target(next_s0)
        a_next1 = self.ddpg_agent1.actor_target(next_s1)

        # action predictions (for ACTOR network)
        a_pred0 = self.ddpg_agent0.actor_local(s0)
        a_pred1 = self.ddpg_agent1.actor_local(s1)

        # Each DDPG agent learns from its own perspective, which is why the
        # states, actions, rewards, etc. are swapped when training the second agent.
        self.ddpg_agent0.learn(s0, s1, a0, a1, r0, r1, next_s0, next_s1, d0,
                               d1, a_next0, a_next1, a_pred0, a_pred1)
        self.ddpg_agent1.learn(s1, s0, a1, a0, r1, r0, next_s1, next_s0, d1,
                               d0, a_next1, a_next0, a_pred1, a_pred0)
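For context, here is a minimal sketch of a training loop that could drive the MADDPG class above. It is an illustration only: it assumes the Unity ML-Agents objects used in the other snippets (env, brain_name, state_size, action_size, numpy as np), and the names maddpg and n_episodes are introduced here, not taken from the original.

maddpg = MADDPG(state_size, action_size, random_seed=0)
n_episodes = 2000                                    # illustrative value
for i_episode in range(1, n_episodes + 1):
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations            # one observation per agent
    scores = np.zeros(2)
    while True:
        actions = np.vstack(maddpg.act(states))      # both agents act with actor_local
        env_info = env.step(actions)[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        maddpg.step(states, actions, rewards, next_states, dones)  # store and learn
        states = next_states
        scores += rewards
        if np.any(dones):
            break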
Example #3
# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

agent_1 = Agent(state_size=state_size, action_size=action_size, random_seed=2)
agent_2 = Agent(state_size=state_size, action_size=action_size, random_seed=3)
# The two agents share the replay buffer and all actor/critic networks
agent_2.memory = agent_1.memory
agent_2.actor_local = agent_1.actor_local
agent_2.actor_target = agent_1.actor_target
agent_2.critic_local = agent_1.critic_local
agent_2.critic_target = agent_1.critic_target
t_max = 1000
print_every = 100
maxlen = 100

score = []
ev_score = []
scores_deque = deque(maxlen=maxlen)
for i_episode in range(1, env.n_episodes + 1):  # loop over training episodes
    env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
    states = env_info.vector_observations  # get the current state (for each agent)
    scores = np.zeros(num_agents)  # initialize the score (for each agent)
    agent_1.reset()
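The training loop above is truncated after agent_1.reset(). Below is a hedged sketch of how such an episode loop typically continues; the Agent.act/step signatures are assumed from the standard single-agent DDPG implementation and are not part of the original snippet.

    # Hedged continuation sketch -- assumed, not taken from the original source
    agent_2.reset()
    for t in range(t_max):
        action_1 = agent_1.act(states[0])                     # assumed per-agent act()
        action_2 = agent_2.act(states[1])
        env_info = env.step(np.vstack([action_1, action_2]))[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        agent_1.step(states[0], action_1, rewards[0], next_states[0], dones[0])
        agent_2.step(states[1], action_2, rewards[1], next_states[1], dones[1])
        states = next_states
        scores += rewards
        if np.any(dones):
            break
    scores_deque.append(np.max(scores))
    score.append(np.max(scores))
    if i_episode % print_every == 0:
        print('Episode {}\tAverage Score: {:.3f}'.format(i_episode, np.mean(scores_deque)))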
# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

random_seed = 1
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
scores = np.zeros(num_agents)

if torch.cuda.is_available():
    trained_model = torch.load('checkpoint_actor.pth')
else:
    trained_model = torch.load('checkpoint_actor.pth', map_location={'cuda:0': 'cpu'})

agent = Agent(state_size=state_size, action_size=action_size, random_seed=random_seed)
agent.actor_local = Actor(state_size, action_size, random_seed).to(device)
agent.actor_local.load_state_dict(trained_model)

env_info = env.reset(train_mode=False)[brain_name]     # reset the environment
states = env_info.vector_observations                  # get the current state (for each agent)

while True:
    action = agent.act(states, add_noise=False)         # select actions without noise
    env_info = env.step(action)[brain_name]             # send the actions to the environment
    states = env_info.vector_observations               # get next state (for each agent)
    rewards = env_info.rewards                          # get reward (for each agent)
    dones = env_info.local_done                         # see if the episode has finished
    scores += rewards
    if np.any(dones):                                   # exit the loop when the episode ends
        break