Code Example #1
File: lunar_lander.py  Project: jimzers/ddpg-pytorch
agent = Agent(alpha=0.000025, beta=0.00025, input_dims=[8], tau=0.001, env=env, batch_size=64, layer1_size=400, layer2_size=300, n_actions=2)

episodes = 1000

np.random.seed(42)

tau_hist = []
score_hist = []
for i in range(episodes):
    done = False
    score = 0
    state = env.reset()
    while not done:
        act = agent.choose_action(state)
        next_state, reward, done, _ = env.step(act)
        agent.store(state, act, reward, next_state, int(done))
        agent.learn()
        score += reward
        state = next_state

    agent.save_models()
    score_hist.append(score)
    tau_hist.append(agent.tau)
    avg_score = np.mean(score_hist[-100:])
    print('episode ' + str(i + 1) + ' score %.2f' % score +
          ' average score %.2f' % avg_score)

episodes = np.arange(1, episodes + 1)
plot_curve(episodes, score_hist, tau_hist)
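
The loop above relies on a project-local plot_curve helper that is not shown in the snippet. A minimal sketch of such a helper using matplotlib follows; the signature matches the call above, but the body is an assumption for illustration, not the project's actual implementation.

import matplotlib.pyplot as plt

def plot_curve(episodes, score_hist, tau_hist):
    """Hypothetical helper: plot per-episode scores and the tau history side by side."""
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
    ax1.plot(episodes, score_hist)
    ax1.set_xlabel('episode')
    ax1.set_ylabel('score')
    ax2.plot(episodes, tau_hist)
    ax2.set_xlabel('episode')
    ax2.set_ylabel('tau')
    fig.tight_layout()
    plt.show()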
Code Example #2
class MADDPG():
    def __init__(self, state_size, action_size, random_seed):
        """Initialize 2 Agent objects.
        
        Params
        ======
            state_size (int): dimension of one agent's observation
            action_size (int): dimension of each action
        """
        self.state_size = state_size
        self.action_size = action_size
        # Initialize the agents
        self.ddpg_agent0 = Agent(state_size, action_size, random_seed=0)
        self.ddpg_agent1 = Agent(state_size, action_size, random_seed=1)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def act(self, states, rand=False):
        """Agents act with actor_local."""
        if not rand:
            action0 = self.ddpg_agent0.act(states[0])
            action1 = self.ddpg_agent1.act(states[1])
            return [action0, action1]
        # Random exploration: sample Gaussian noise and clip to the action range [-1, 1]
        actions = np.random.randn(2, 2)
        actions = np.clip(actions, -1, 1)
        return actions

    def step(self, states, actions, rewards, next_states, dones, learn=True):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        state0 = states[0]
        state1 = states[1]

        action0 = actions[0]
        action1 = actions[1]

        reward0 = rewards[0]
        reward1 = rewards[1]

        next_state0 = next_states[0]
        next_state1 = next_states[1]

        done0 = dones[0]
        done1 = dones[1]

        self.memory.add(state0, state1, action0, action1, reward0, reward1,
                        next_state0, next_state1, done0, done1)

        if learn and len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def learn(self, experiences, GAMMA):
        s0, s1, a0, a1, r0, r1, next_s0, next_s1, d0, d1 = experiences

        # next actions (for CRITIC network)
        a_next0 = self.ddpg_agent0.actor_target(next_s0)
        a_next1 = self.ddpg_agent1.actor_target(next_s1)

        # action predictions (for ACTOR network)
        a_pred0 = self.ddpg_agent0.actor_local(s0)
        a_pred1 = self.ddpg_agent1.actor_local(s1)

        # Each DDPG agent learns separately from its own perspective, which is
        # why the states, actions, etc. are swapped for the second agent.
        self.ddpg_agent0.learn(s0, s1, a0, a1, r0, r1, next_s0, next_s1, d0,
                               d1, a_next0, a_next1, a_pred0, a_pred1)
        self.ddpg_agent1.learn(s1, s0, a1, a0, r1, r0, next_s1, next_s0, d1,
                               d0, a_next1, a_next0, a_pred1, a_pred0)
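
A brief usage sketch of this class in a two-agent training loop: the environment interface (stacked per-agent observations, gym-style 4-tuple step returns) and the hyperparameter values below are assumptions for illustration, not part of the original project.

# Hypothetical two-agent environment loop; env, state_size=24 and the
# episode count are assumed, not taken from the original project.
maddpg = MADDPG(state_size=24, action_size=2, random_seed=42)

for episode in range(1000):
    states = env.reset()                      # shape (2, state_size), one row per agent
    scores = np.zeros(2)
    dones = [False, False]
    while not any(dones):
        actions = maddpg.act(states)          # one action per agent
        next_states, rewards, dones, _ = env.step(actions)
        maddpg.step(states, actions, rewards, next_states, dones)
        scores += rewards
        states = next_states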