Example #1
 def __init__(self, agent_init_params=None, alg_types=None,
              gamma=0.95, tau=0.01, lr=0.01, hidden_dim=64,
              # discrete_action=False
              ):
     """
     Inputs:
         agent_init_params (list of dict): List of dicts with parameters to
                                           initialize each agent
             num_in_pol (int): Input dimensions to policy
             num_out_pol (int): Output dimensions to policy
             num_in_critic (int): Input dimensions to critic
         alg_types (list of str): Learning algorithm for each agent (DDPG
                                    or MADDPG)
         gamma (float): Discount factor
         tau (float): Target update rate
         lr (float): Learning rate for policy and critic
         hidden_dim (int): Number of hidden dimensions for networks
         discrete_action (bool): Whether or not to use a discrete action space (currently commented out in the signature)
     """
     self.nagents = len(alg_types)
     self.alg_types = alg_types
     self.agents = [DDPGAgent(lr=lr, hidden_dim=hidden_dim, **params)
                    for params in agent_init_params]
     self.agent_init_params = agent_init_params
     self.gamma = gamma
     self.tau = tau
     self.lr = lr
     # self.discrete_action = discrete_action
     self.pol_dev = 'cpu'  # device for policies
     self.critic_dev = 'cpu'  # device for critics
     self.trgt_pol_dev = 'cpu'  # device for target policies
     self.trgt_critic_dev = 'cpu'  # device for target critics
     self.niter = 0
     # summaries tracker 
     self.agent_losses = defaultdict(list)
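
# A minimal construction sketch for the __init__ above, assuming the class is
# named MADDPG (as in the later example); the observation/action sizes (24, 2)
# and the joint critic input size (52 = 2 * (24 + 2)) are illustrative
# assumptions, not values from the original.
agent_init_params = [
    {'num_in_pol': 24, 'num_out_pol': 2, 'num_in_critic': 52},
    {'num_in_pol': 24, 'num_out_pol': 2, 'num_in_critic': 52},
]
maddpg = MADDPG(agent_init_params=agent_init_params,
                alg_types=['MADDPG', 'MADDPG'],
                gamma=0.95, tau=0.01, lr=0.01, hidden_dim=64)
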
                break

        # save most recent score
        scores_window.append(round(np.max(scores), 2))

        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))

        if np.mean(scores_window) >= 1:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode - 100, np.mean(scores_window)))
            for i, a in enumerate(agent.maddpg_agent):
                torch.save(a.actor_local.state_dict(),
                           'MADDPG_actor_{}.pth'.format(i + 1))
            break


if __name__ == "__main__":

    # instantiate agents and multiagent framework
    Player1 = DDPGAgent(state_size, action_size, num_agents, random_seed=0)
    Player2 = DDPGAgent(state_size, action_size, num_agents, random_seed=0)

    maddpg = MADDPG(agents=[Player1, Player2])

    # train both agents and save the actor models
    learn(maddpg)
    env.close()
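
# Sketch: reloading the per-agent actor weights saved in learn() above for
# evaluation. It reuses the maddpg_agent / actor_local attributes and the
# MADDPG_actor_{i}.pth file names seen there; the constructor calls mirror the
# ones in this __main__ block and are otherwise assumptions.
eval_agents = [DDPGAgent(state_size, action_size, num_agents, random_seed=0)
               for _ in range(2)]
eval_maddpg = MADDPG(agents=eval_agents)
for i, a in enumerate(eval_maddpg.maddpg_agent):
    a.actor_local.load_state_dict(torch.load('MADDPG_actor_{}.pth'.format(i + 1)))
    a.actor_local.eval()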
    "Pong-v0", "MsPacman-v0", "SpaceInvaders-v0", "Seaquest-v0",
    "LunarLanderV2", "Reacher-v2", "FrozenLake-v0"
]

env = gym.make("BipedalWalker-v2")
obs, rew, done, ep_ret, ep_len = env.reset(), 0, False, 0, 0

epochs = 100
steps_per_epoch = 50
max_ep_len = 500
replay_size = int(1e6)
start_steps = 2000
batch_size = 64
tf.set_random_seed(0)

agent = DDPGAgent(env.observation_space, env.action_space)
buffer = DDPGReplayBuffer(env.observation_space.shape[0],
                          env.action_space.shape[0],
                          size=replay_size)
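
# DDPGReplayBuffer is not defined in this excerpt; below is a minimal FIFO
# replay buffer sketch with the same constructor signature (obs_dim, act_dim,
# size). The store/sample_batch interface is an assumption about how the
# training loop would use it.
import numpy as np

class SimpleReplayBuffer:
    """Minimal FIFO experience replay buffer (sketch)."""

    def __init__(self, obs_dim, act_dim, size):
        self.obs = np.zeros((size, obs_dim), dtype=np.float32)
        self.next_obs = np.zeros((size, obs_dim), dtype=np.float32)
        self.acts = np.zeros((size, act_dim), dtype=np.float32)
        self.rews = np.zeros(size, dtype=np.float32)
        self.done = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        self.obs[self.ptr] = obs
        self.acts[self.ptr] = act
        self.rews[self.ptr] = rew
        self.next_obs[self.ptr] = next_obs
        self.done[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size=64):
        idxs = np.random.randint(0, self.size, size=batch_size)
        return dict(obs=self.obs[idxs], acts=self.acts[idxs],
                    rews=self.rews[idxs], next_obs=self.next_obs[idxs],
                    done=self.done[idxs])
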

rewards = [0]
q_losses = []
pi_losses = []
total_steps = steps_per_epoch * epochs
ep_ret = 0
ep_len = 0
for t in tqdm(range(total_steps)):
    if t > start_steps:
        act = agent.get_action(obs)
        # print(act)
    else:
        # warm-up phase (assumed): sample uniformly random actions until start_steps
        act = env.action_space.sample()

    # Begin simulation
    env_info = env.reset(train_mode=False)[brain_name] # reset the environment
    state = env_info.vector_observations[0]            # get the current state
    score = 0                                          # initialize the score
    while True:
        action = agent.act(state, add_noise=False)    # select an action (without noise)
        env_info = env.step(action)[brain_name]        # send the action to the environment
        next_state = env_info.vector_observations[0]   # get the next state
        reward = env_info.rewards[0]                   # get the reward
        done = env_info.local_done[0]                  # see if episode has finished
        score += reward                                # update the score
        state = next_state                             # roll over the state to next time step
        if done:                                       # exit loop if episode finished
            break
    
    print("Score: {}".format(score))

    
if __name__ == "__main__":
    
    # instantiate agent and load weights
    agent = DDPGAgent(state_size=33, action_size=4, model=(Actor, Critic), random_seed=0)
    agent.actor_local.load_state_dict(torch.load('DDPG_actor.pth'))
        
    # Run simulation with specified agent
    print('Running simulation with DDPG agent')
    run(agent)
    env.close()

Example #5
# create experience buffer
buffer = ExperienceBuffer(osize, asize, max_len=params["BUFFER_LENGTH"])

# create noise models
np.random.seed(0)  # set the numpy seed
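
# The "create noise models" comment above is not followed by a noise model in
# this excerpt; below is a minimal Ornstein-Uhlenbeck noise sketch of the kind
# commonly used for DDPG exploration. The class name, parameters, and the way
# it would be wired into the agents are assumptions.
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        self.state = self.mu.copy()

    def sample(self):
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state

noise_0 = OUNoise(asize)  # asize is the action dimension used above
noise_1 = OUNoise(asize)
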

# create actor network
actor = DeterministicActor(osize, asize, seed=0).to(device)
target_actor = DeterministicActor(osize, asize, seed=0).to(device)

# create critic network
critic = QCritic(osize, asize, seed=0).to(device)
target_critic = QCritic(osize, asize, seed=0).to(device)

# create DDPG agents
agent_0 = DDPGAgent(actor, critic, target_actor, target_critic, buffer, params)
agent_1 = DDPGAgent(actor, critic, target_actor, target_critic, buffer, params)
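
# Note: both agents above are constructed from the same actor/critic objects
# and the same buffer, so the two players share parameters and experience.
# The update performed inside DDPGAgent is not shown in this excerpt; below is
# a sketch of the standard DDPG update it presumably implements. Function and
# argument names are assumptions; critics are assumed to take (state, action).
import torch
import torch.nn.functional as F

def ddpg_update(batch, actor, critic, target_actor, target_critic,
                actor_opt, critic_opt, gamma=0.99, tau=0.01):
    dev = next(critic.parameters()).device
    obs = torch.as_tensor(batch['obs'], dtype=torch.float32, device=dev)
    acts = torch.as_tensor(batch['acts'], dtype=torch.float32, device=dev)
    rews = torch.as_tensor(batch['rews'], dtype=torch.float32, device=dev)
    next_obs = torch.as_tensor(batch['next_obs'], dtype=torch.float32, device=dev)
    done = torch.as_tensor(batch['done'], dtype=torch.float32, device=dev)

    # critic target: r + gamma * (1 - done) * Q'(s', mu'(s'))
    with torch.no_grad():
        q_next = target_critic(next_obs, target_actor(next_obs)).squeeze(-1)
        q_target = rews + gamma * (1.0 - done) * q_next
    q_loss = F.mse_loss(critic(obs, acts).squeeze(-1), q_target)
    critic_opt.zero_grad()
    q_loss.backward()
    critic_opt.step()

    # actor loss: maximize Q(s, mu(s))
    pi_loss = -critic(obs, actor(obs)).mean()
    actor_opt.zero_grad()
    pi_loss.backward()
    actor_opt.step()

    # Polyak-average the target networks
    with torch.no_grad():
        for p, p_targ in zip(actor.parameters(), target_actor.parameters()):
            p_targ.mul_(1.0 - tau).add_(tau * p)
        for p, p_targ in zip(critic.parameters(), target_critic.parameters()):
            p_targ.mul_(1.0 - tau).add_(tau * p)

    return q_loss.item(), pi_loss.item()
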

# ------  Train loop -------

for ep_count in range(1, MAX_EPISODES):

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations

    ep_reward = np.zeros(num_agents)
    ep_steps = 1

    while True: