def __init__(self, agent_init_params=None, alg_types=None,
             gamma=0.95, tau=0.01, lr=0.01, hidden_dim=64,
             # discrete_action=False
             ):
    """
    Inputs:
        agent_init_params (list of dict): List of dicts with parameters to
                                          initialize each agent
            num_in_pol (int): Input dimensions to policy
            num_out_pol (int): Output dimensions to policy
            num_in_critic (int): Input dimensions to critic
        alg_types (list of str): Learning algorithm for each agent
                                 (DDPG or MADDPG)
        gamma (float): Discount factor
        tau (float): Target update rate
        lr (float): Learning rate for policy and critic
        hidden_dim (int): Number of hidden dimensions for networks
        discrete_action (bool): Whether or not to use discrete action space
    """
    self.nagents = len(alg_types)
    self.alg_types = alg_types
    self.agents = [DDPGAgent(lr=lr, hidden_dim=hidden_dim, **params)
                   for params in agent_init_params]
    self.agent_init_params = agent_init_params
    self.gamma = gamma
    self.tau = tau
    self.lr = lr
    # self.discrete_action = discrete_action
    self.pol_dev = 'cpu'          # device for policies
    self.critic_dev = 'cpu'       # device for critics
    self.trgt_pol_dev = 'cpu'     # device for target policies
    self.trgt_critic_dev = 'cpu'  # device for target critics
    self.niter = 0
    # summaries tracker
    self.agent_losses = defaultdict(list)
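For reference, `tau` here sets the rate of the Polyak soft update that MADDPG applies to each agent's target networks after a learning step. A minimal sketch of that update, assuming standard PyTorch modules (the helper name `soft_update` is illustrative, not taken from this code):

import torch

def soft_update(target, source, tau):
    # Polyak averaging: theta_target <- tau * theta_source + (1 - tau) * theta_target
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)

With tau=0.01, the target networks trail the learned networks slowly, which stabilizes the bootstrapped critic targets.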
            break

        # save most recent score
        scores_window.append(round(np.max(scores), 2))
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))
        if np.mean(scores_window) >= 1:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                  .format(i_episode - 100, np.mean(scores_window)))
            for i, a in enumerate(agent.maddpg_agent):
                torch.save(a.actor_local.state_dict(),
                           'MADDPG_actor_{}.pth'.format(i + 1))
            break


if __name__ == "__main__":
    # instantiate the agents and the multi-agent wrapper
    Player1 = DDPGAgent(state_size, action_size, num_agents, random_seed=0)
    Player2 = DDPGAgent(state_size, action_size, num_agents, random_seed=0)
    maddpg = MADDPG(agents=[Player1, Player2])

    # train the agents and save the models
    learn(maddpg)
    env.close()
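The solved message subtracts 100 from `i_episode` because the average is taken over the last 100 episodes, so the result is attributed to the first episode of that window; `scores_window` is presumably a fixed-length deque. A minimal sketch of how the surrounding `learn` function would set this up (the signature and `n_episodes` are assumptions, not taken from this code):

from collections import deque

import numpy as np

def learn(agent, n_episodes=5000):
    scores_window = deque(maxlen=100)  # rolling window of the last 100 episode scores
    for i_episode in range(1, n_episodes + 1):
        scores = np.zeros(2)  # per-agent returns for the current episode
        ...                   # roll out one episode, accumulating rewards into scores
        # the bookkeeping shown above then appends max(scores) and checks the window mean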
"Pong-v0", "MsPacman-v0", "SpaceInvaders-v0", "Seaquest-v0", "LunarLanderV2", "Reacher-v2", "FrozenLake-v0" ] env = gym.make("BipedalWalker-v2") obs, rew, done, ep_ret, ep_len = env.reset(), 0, False, 0, 0 epochs = 100 steps_per_epoch = 50 max_ep_len = 500 replay_size = int(1e6) start_steps = 2000 batch_size = 64 tf.set_random_seed(0) agent = DDPGAgent(env.observation_space, env.action_space) buffer = DDPGReplayBuffer(env.observation_space.shape[0], env.action_space.shape[0], size=replay_size) rewards = [0] q_losses = [] pi_losses = [] total_steps = steps_per_epoch * epochs ep_ret = 0 ep_len = 0 for t in tqdm(range(5000)): if t > start_steps: act = agent.get_action(obs) # print(act) else:
    # Begin simulation
    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    state = env_info.vector_observations[0]             # get the current state
    score = 0                                           # initialize the score
    while True:
        action = agent.act(state, add_noise=False)    # select an action (without noise)
        env_info = env.step(action)[brain_name]       # send the action to the environment
        next_state = env_info.vector_observations[0]  # get the next state
        reward = env_info.rewards[0]                  # get the reward
        done = env_info.local_done[0]                 # see if the episode has finished
        score += reward                               # update the score
        state = next_state                            # roll over the state to the next time step
        if done:                                      # exit the loop when the episode ends
            break
    print("Score: {}".format(score))


if __name__ == "__main__":
    # instantiate the agent and load the trained weights
    agent = DDPGAgent(state_size=33, action_size=4,
                      model=(Actor, Critic), random_seed=0)
    agent.actor_local.load_state_dict(torch.load('DDPG_actor.pth'))

    # run the simulation with the specified agent
    print('Running simulation with DDPG agent')
    run(agent)
    env.close()
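One caveat, assuming the checkpoint may have been saved from a GPU run: `torch.load` would then try to restore the tensors onto CUDA, which fails on a CPU-only machine. Passing `map_location` makes the load device-agnostic:

# load the checkpoint onto the CPU regardless of where it was saved
state_dict = torch.load('DDPG_actor.pth', map_location=torch.device('cpu'))
agent.actor_local.load_state_dict(state_dict)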
# create experience buffer
buffer = ExperienceBuffer(osize, asize, max_len=params["BUFFER_LENGTH"])

# create noise models
np.random.seed(0)  # set the numpy seed

# create actor network and its target
actor = DeterministicActor(osize, asize, seed=0).to(device)
target_actor = DeterministicActor(osize, asize, seed=0).to(device)

# create critic network and its target
critic = QCritic(osize, asize, seed=0).to(device)
target_critic = QCritic(osize, asize, seed=0).to(device)

# create DDPG agents; both agents share the same networks and replay buffer
agent_0 = DDPGAgent(actor, critic, target_actor, target_critic, buffer, params)
agent_1 = DDPGAgent(actor, critic, target_actor, target_critic, buffer, params)

# ------ Train loop -------
for ep_count in range(1, MAX_EPISODES):
    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations
    ep_reward = np.zeros(num_agents)
    ep_steps = 1
    while True: