# (tail of the UnityEnvWrapper class definition)
        print('Size of observations: {}'.format(self.n_states()))
        print('Example state:', self.states()[0])

    def close(self):
        self.env.close()


env = UnityEnvWrapper(no_graphics=False)

# Both agents share one actor (local, target, and optimizer); only the critics differ.
agent1 = Agent(state_size=env.n_states() + 1, action_size=env.n_actions(), random_seed=2)
agent2 = Agent(state_size=env.n_states() + 1, action_size=env.n_actions(), random_seed=2)
agent2.actor_local = agent1.actor_local
agent2.actor_target = agent1.actor_target
agent2.actor_optimizer = agent1.actor_optimizer

print(env.n_agents(), env.n_states(), env.n_actions())


def play():
    agent1.actor_local.load_state_dict(torch.load('checkpoint_actor_1.pth'))
    agent1.actor_local.eval()
    # agent2.actor_local.load_state_dict(torch.load('checkpoint_actor_2.pth'))
    # agent2.actor_local.eval()
    state = env.reset(train_mode=False)
    while True:
        state1 = np.concatenate([state[0], [1]])
        # The original cell is truncated here; the rest of the loop is a
        # reconstruction, assuming the wrapper's step() returns
        # (next_states, rewards, dones) and that agent 2's observation is
        # tagged with index 2, mirroring agent 1.
        state2 = np.concatenate([state[1], [2]])
        actions = [agent1.act(state1, add_noise=False),
                   agent2.act(state2, add_noise=False)]
        state, rewards, dones = env.step(actions)
        if np.any(dones):
            break
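# The "+ 1" on state_size and the np.concatenate([state[0], [1]]) call above
# tag each observation with a scalar agent index, so the single shared actor
# can tell the two agents apart. A minimal sketch of that augmentation (the
# augment helper and the observation size are hypothetical, for illustration):
import numpy as np

def augment(observation, agent_index):
    """Append a scalar agent index so one shared policy can distinguish agents."""
    return np.concatenate([observation, [agent_index]])

obs = np.zeros(8)            # illustrative observation size
state1 = augment(obs, 1)     # agent 1 is tagged with 1; agent 2 would get 2
assert state1.shape == (9,)  # matches state_size = env.n_states() + 1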
class MADDPG():
    def __init__(self, state_size, action_size, random_seed):
        """Initialize two Agent objects.

        Params
        ======
            state_size (int): dimension of one agent's observation
            action_size (int): dimension of each action
            random_seed (int): seed for the shared replay buffer
        """
        self.state_size = state_size
        self.action_size = action_size

        # Initialize the agents
        self.ddpg_agent0 = Agent(state_size, action_size, random_seed=0)
        self.ddpg_agent1 = Agent(state_size, action_size, random_seed=1)

        # Replay memory, shared by both agents
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def act(self, states, rand=False):
        """Return each agent's actor_local action, or uniform random actions if rand=True."""
        if rand:
            return np.clip(np.random.randn(2, 2), -1, 1)
        action0 = self.ddpg_agent0.act(states[0])
        action1 = self.ddpg_agent1.act(states[1])
        return [action0, action1]

    def step(self, states, actions, rewards, next_states, dones, learn=True):
        """Save the joint experience in replay memory, then learn from a random sample."""
        self.memory.add(states[0], states[1], actions[0], actions[1],
                        rewards[0], rewards[1], next_states[0], next_states[1],
                        dones[0], dones[1])
        if learn and len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def learn(self, experiences, gamma):
        s0, s1, a0, a1, r0, r1, next_s0, next_s1, d0, d1 = experiences

        # Next actions (for the CRITIC networks)
        a_next0 = self.ddpg_agent0.actor_target(next_s0)
        a_next1 = self.ddpg_agent1.actor_target(next_s1)

        # Action predictions (for the ACTOR networks)
        a_pred0 = self.ddpg_agent0.actor_local(s0)
        a_pred1 = self.ddpg_agent1.actor_local(s1)

        # Each DDPG agent learns from its own perspective, which is why the
        # states, actions, rewards, etc. are swapped for the second call.
        self.ddpg_agent0.learn(s0, s1, a0, a1, r0, r1, next_s0, next_s1, d0, d1,
                               a_next0, a_next1, a_pred0, a_pred1)
        self.ddpg_agent1.learn(s1, s0, a1, a0, r1, r0, next_s1, next_s0, d1, d0,
                               a_next1, a_next0, a_pred1, a_pred0)
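# MADDPG above relies on a ReplayBuffer whose add() takes the ten joint
# fields and whose sample() returns them as ten batched tensors, in the same
# order learn() unpacks them. That class is not shown in this cell; the
# following is a minimal sketch of what it could look like -- everything
# beyond the constructor signature and the field order is an assumption.
import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class ReplayBuffer:
    """Fixed-size buffer storing joint two-agent experiences (a sketch)."""

    FIELDS = ('state0', 'state1', 'action0', 'action1', 'reward0', 'reward1',
              'next_state0', 'next_state1', 'done0', 'done1')

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.batch_size = batch_size
        self.memory = deque(maxlen=buffer_size)
        self.experience = namedtuple('Experience', self.FIELDS)
        random.seed(seed)

    def add(self, *fields):
        """Store one joint experience (ten fields, in FIELDS order)."""
        self.memory.append(self.experience(*fields))

    def sample(self):
        """Return the ten fields as batched float tensors, in FIELDS order."""
        batch = random.sample(self.memory, k=self.batch_size)
        return tuple(
            torch.from_numpy(
                np.vstack([getattr(e, f) for e in batch]).astype(np.float32)
            ).to(device)
            for f in self.FIELDS
        )

    def __len__(self):
        return len(self.memory)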
# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

agent_1 = Agent(state_size=state_size, action_size=action_size, random_seed=2)
agent_2 = Agent(state_size=state_size, action_size=action_size, random_seed=3)

# The two agents share the replay memory and all four networks.
agent_2.memory = agent_1.memory
agent_2.actor_local = agent_1.actor_local
agent_2.actor_target = agent_1.actor_target
agent_2.critic_local = agent_1.critic_local
agent_2.critic_target = agent_1.critic_target

t_max = 1000
print_every = 100
maxlen = 100
n_episodes = 1000  # episode budget (assumed; the original read env.n_episodes)
score = []
ev_score = []
scores_deque = deque(maxlen=maxlen)

for i_episode in range(1, n_episodes + 1):
    env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
    states = env_info.vector_observations              # get the current state (for each agent)
    scores = np.zeros(num_agents)                      # initialize the score (for each agent)
    agent_1.reset()
    # The original cell is truncated here; the remainder of the episode loop
    # is a reconstruction, assuming the usual per-agent DDPG Agent API
    # (act, and step(state, action, reward, next_state, done)).
    agent_2.reset()
    for t in range(t_max):
        actions = [agent_1.act(states[0]), agent_2.act(states[1])]
        env_info = env.step(actions)[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        agent_1.step(states[0], actions[0], rewards[0], next_states[0], dones[0])
        agent_2.step(states[1], actions[1], rewards[1], next_states[1], dones[1])
        scores += rewards
        states = next_states
        if np.any(dones):
            break
    scores_deque.append(np.max(scores))
    score.append(np.max(scores))
    if i_episode % print_every == 0:
        print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
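# Nothing in the cell above persists the trained weights, while the
# evaluation cell below loads 'checkpoint_actor.pth'. A plausible bridge is
# a save step like this (the filenames are assumptions; saving only
# agent_1's networks suffices because the two agents share them):
torch.save(agent_1.actor_local.state_dict(), 'checkpoint_actor.pth')
torch.save(agent_1.critic_local.state_dict(), 'checkpoint_critic.pth')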
# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

random_seed = 1
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
scores = np.zeros(num_agents)

# Load the trained actor weights, remapping them to the CPU when no GPU is available.
if torch.cuda.is_available():
    trained_model = torch.load('checkpoint_actor.pth')
else:
    trained_model = torch.load('checkpoint_actor.pth', map_location={'cuda:0': 'cpu'})

agent = Agent(state_size=state_size, action_size=action_size, random_seed=random_seed)
agent.actor_local = Actor(state_size, action_size, random_seed).to(device)
agent.actor_local.load_state_dict(trained_model)

env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
states = env_info.vector_observations               # get the current state (for each agent)
while True:
    action = agent.act(states, add_noise=False)
    env_info = env.step(action)[brain_name]
    states = env_info.vector_observations           # get next state (for each agent)
    # The original cell is truncated here; accumulating the rewards and
    # stopping at episode end is a reconstruction.
    scores += env_info.rewards
    if np.any(env_info.local_done):
        break
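# Assuming the loop above breaks on local_done, reporting the episode score
# and releasing the environment rounds off the evaluation:
print('Evaluation score (max over agents): {:.2f}'.format(np.max(scores)))
env.close()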