import torch
import gfootball.env as football_env


def main(args):
    set_seed(args.seed)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # initialize environment
    n_players = 3
    env = football_env.create_environment(
        env_name="academy_3_vs_1_with_keeper",
        representation="simple115",
        number_of_left_players_agent_controls=n_players,
        stacked=False,
        logdir="/tmp/football",
        write_goal_dumps=False,
        write_full_episode_dumps=False,
        render=False)

    # state and action space
    state_space_size = env.observation_space.shape[1]  # we are using the simple115 representation
    action_space_size = env.action_space.nvec.tolist()[0]  # all three players share the same action space
    # state[98:100] identifies the three controlled players

    # models: one actor and one centralized critic per player, plus "old" target copies of each
    print("loading models")
    actors = [
        Actor(state_space_size=state_space_size,
              action_space_size=action_space_size) for _ in range(n_players)
    ]
    critics = [
        Critic(state_space_size=state_space_size,
               action_space_size=action_space_size,
               n_players=n_players) for _ in range(n_players)
    ]
    old_actors = [
        Actor(state_space_size=state_space_size,
              action_space_size=action_space_size) for _ in range(n_players)
    ]
    old_critics = [
        Critic(state_space_size=state_space_size,
               action_space_size=action_space_size,
               n_players=n_players) for _ in range(n_players)
    ]
    # the target ("old") networks start as exact copies of the online networks
    for old_actor, actor in zip(old_actors, actors):
        old_actor.load_state_dict(actor.state_dict())
    for old_critic, critic in zip(old_critics, critics):
        old_critic.load_state_dict(critic.state_dict())

    # maddpg
    maddpg = MADDPG(env=env,
                    action_list=list(range(action_space_size)),
                    actors=actors,
                    critics=critics,
                    old_actors=old_actors,
                    old_critics=old_critics,
                    args=args,
                    device=device)
    print("learn")
    maddpg.learn()
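set_seed is called above but not defined in this snippet. A minimal sketch of such a helper, assuming the intent is to seed Python's, NumPy's, and PyTorch's random number generators for reproducibility:

import random

import numpy as np
import torch


def set_seed(seed):
    """Seed every RNG the training loop touches (assumed helper)."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)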
import time

import numpy as np


def update():
    # select the algorithm; unknown names fall back to plain DDPG
    if ALGORITHM == 'maddpg':
        ddpg = MADDPG(avs.n_actions, avs.n_features, 1, 'maddpg model', RETRAIN)
    elif ALGORITHM == 'ddpg':
        ddpg = DDPG(avs.n_actions, avs.n_features, 1, 'ddpg model', RETRAIN)
    else:
        ddpg = DDPG(avs.n_actions, avs.n_features, 1, 'ddpg model', RETRAIN)

    t1 = time.time()
    rewards1 = 0
    rewards2 = 0
    var = VAR  # standard deviation of the Gaussian exploration noise
    collision = 0
    avgreward1 = []
    avgreward2 = []
    collision_percentage = []

    for i in range(MAX_EPISODES):
        s1, s2 = avs.reset()
        ep_reward1 = 0
        ep_reward2 = 0
        if i % 100000 == 0 and i > IMITATION_EPISODE:
            plot(avgreward1, avgreward2, collision_percentage, i)

        for j in range(MAX_EP_STEPS):
            if RENDER:
                avs.render()

            # add exploration noise
            if i < IMITATION_EPISODE or i % 4 == 0:
                # bootstrap from the hand-coded controller during the imitation phase
                a1 = imitation(avs.agent1, avs.agent2, avs.target1)
                a2 = imitation(avs.agent2, avs.agent1, avs.target2)
            else:
                # add randomness to action selection for exploration
                a1 = ddpg.choose_action(s1)
                a1 = [
                    np.clip(np.random.normal(a1[0], var), -1, 1),
                    np.clip(np.random.normal(a1[1], var), -1, 1)
                ]
                a2 = ddpg.choose_action(s2)
                a2 = [
                    np.clip(np.random.normal(a2[0], var), -1, 1),
                    np.clip(np.random.normal(a2[1], var), -1, 1)
                ]
                # a2 = imitation(avs.agent2, avs.agent1, avs.target2)

            if DEBUG:
                time.sleep(0.1)
            s_1, r1, s_2, r2, done, info = avs.step(a1, a2)

            if ALGORITHM == 'ddpg':
                # independent learners: each agent stores only its own transition
                ddpg.store_transition(s1, a1, r1, s_1)
                ddpg.store_transition(s2, a2, r2, s_2)
            else:
                # MADDPG: each transition also carries the other agent's state and action
                ddpg.store_transition(s1, s2, a1, a2, r1, s_1, s_2)
                ddpg.store_transition(s2, s1, a2, a1, r2, s_2, s_1)

            s1 = s_1
            s2 = s_2
            ep_reward1 += r1
            ep_reward2 += r2

            if j == MAX_EP_STEPS - 1 or done:
                print("pt:", ddpg.pointer)
                print('Episode:', i, 'Step:', j,
                      ' Reward: %i' % int(ep_reward1), int(ep_reward2),
                      'Explore: %.2f' % var)
                if i >= IMITATION_EPISODE:
                    rewards1 += ep_reward1
                    rewards2 += ep_reward2
                    if r1 < -100:
                        collision += 1
                    # log 100-episode averages and the collision count per 100 episodes
                    if (i + 1) % 100 == 0:
                        avgreward1.append(rewards1 / 100)
                        avgreward2.append(rewards2 / 100)
                        collision_percentage.append(collision)
                        rewards1 = 0
                        rewards2 = 0
                        collision = 0
                break

            # start learning once the replay buffer has filled up
            if ddpg.pointer > MEMORY_CAPACITY:
                ddpg.learn()
                ddpg.learn()
                if var > MIN_VAR and i > IMITATION_EPISODE:
                    var *= DECAY  # decay the action randomness

        if i % 4 != 0 and ep_reward1 > 100 and ep_reward2 > 100 and i > IMITATION_EPISODE:
            ddpg.save(i)

    print('Running time: ', time.time() - t1)
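The plot helper called above is not shown. A minimal sketch of what it might do, assuming matplotlib; the two-panel layout and the output filename are illustrative guesses, and each list is assumed to hold one entry per 100 episodes:

import matplotlib.pyplot as plt


def plot(avgreward1, avgreward2, collision_percentage, episode):
    """Hypothetical helper: plot 100-episode average rewards and collision counts."""
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 6))
    ax1.plot(avgreward1, label='agent 1')
    ax1.plot(avgreward2, label='agent 2')
    ax1.set_xlabel('episode / 100')
    ax1.set_ylabel('average reward')
    ax1.legend()
    ax2.plot(collision_percentage)
    ax2.set_xlabel('episode / 100')
    ax2.set_ylabel('collisions per 100 episodes')
    fig.tight_layout()
    fig.savefig('training_{}.png'.format(episode))  # filename is a placeholder
    plt.close(fig)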
# Training loop for the MADDPG agents. The enclosing loop headers and the
# per-episode setup are reconstructed from the loop body; N_GAMES (the total
# number of episodes) is assumed to be defined with the other hyperparameters.
for i in range(N_GAMES):
    obs = env.reset()
    score = 0
    done = [False] * n_agents
    episode_step = 0
    while not any(done):
        env.render()
        # time.sleep(0.1)  # to slow down the action for the video
        actions = maddpg_agents.choose_action(obs)
        obs_, reward, done, info = env.step(actions)

        # flatten the per-agent observations into one global state vector
        # for the centralized critics
        state = obs_list_to_state_vector(obs)
        state_ = obs_list_to_state_vector(obs_)

        # force-terminate episodes that exceed the step limit
        if episode_step >= MAX_STEPS:
            done = [True] * n_agents

        memory.store_transition(obs, state, actions, reward, obs_, state_, done)

        # update the networks every 100 environment steps while training
        if total_steps % 100 == 0 and not evaluate:
            maddpg_agents.learn(memory)

        obs = obs_
        score += sum(reward)
        total_steps += 1
        episode_step += 1

    # per-episode bookkeeping: track the 100-episode moving average
    # and checkpoint whenever it improves
    score_history.append(score)
    avg_score = np.mean(score_history[-100:])
    if not evaluate:
        if avg_score > best_score:
            maddpg_agents.save_checkpoint()
            best_score = avg_score
    if i % PRINT_INTERVAL == 0 and i > 0:
        print('episode', i, 'average score {:.1f}'.format(avg_score))
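The centralized critics in MADDPG condition on the joint state, so the list of per-agent observations has to be flattened into a single vector. obs_list_to_state_vector is not defined in this snippet; a minimal sketch, assuming each entry of obs is a 1-D NumPy array:

import numpy as np


def obs_list_to_state_vector(observation):
    """Concatenate the per-agent observation arrays into one global state vector."""
    state = np.array([])
    for obs in observation:
        state = np.concatenate([state, obs])
    return state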
from collections import deque

import numpy as np


def train(env, num_episodes=5000, max_t=1000, warmup_episodes=0):
    """Monitor the agents' performance while training.

    Params
    ======
    - env: instance of the environment
    - num_episodes: maximum number of episodes of agent-environment interaction
    - max_t: maximum number of timesteps per episode
    - warmup_episodes: how many episodes to explore and collect samples before learning begins

    Returns
    =======
    - scores: list containing received rewards
    """
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # number of actions
    action_size = brain.vector_action_space_size
    print('Number of actions:', action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size))

    # amplitude of OU noise; this slowly decays toward 0
    noise = 1.0
    noise_reduction = 0.9999

    # list containing the max score from each episode
    episode_scores = []
    # last 100 scores
    scores_window = deque(maxlen=100)
    mean_score = 0.0

    # LEARN_EVERY, LEARN_BATCH, and SAVE_EVERY are module-level hyperparameters
    maddpg = MADDPG(state_size, action_size, num_agents * state_size,
                    num_agents * action_size)

    for i_episode in range(1, num_episodes + 1):
        # reset the environment and begin the episode
        env_info = env.reset(train_mode=True)[brain_name]
        maddpg.reset()
        # get the current state (for each agent)
        states = env_info.vector_observations
        # initialize the score (for each agent)
        scores = np.zeros(num_agents)

        for t in range(max_t):
            # select an action (for each agent)
            if i_episode > warmup_episodes:
                actions = maddpg.act(states, noise)
                noise *= noise_reduction
            else:
                # collect random samples to explore and fill the replay buffer
                actions = np.random.uniform(-1, 1, (num_agents, action_size))
            # send all actions to the environment
            env_info = env.step(actions)[brain_name]
            # get next state (for each agent)
            next_states = env_info.vector_observations
            # get reward (for each agent)
            rewards = env_info.rewards
            # see if the episode finished
            dones = env_info.local_done
            # agents perform internal updates based on sampled experience
            maddpg.step(states, actions, rewards, next_states, dones)
            # roll over states to the next time step
            states = next_states
            # learn when the time is right
            if t % LEARN_EVERY == 0 and i_episode > warmup_episodes:
                for _ in range(LEARN_BATCH):
                    maddpg.learn()
            # update the score (for each agent)
            scores += rewards
            # exit the loop if the episode finished
            if np.any(dones):
                break

        episode_max_score = np.max(scores)
        episode_scores.append(episode_max_score)
        if i_episode > warmup_episodes:
            # save the final score
            scores_window.append(episode_max_score)
            mean_score = np.mean(scores_window)
            # monitor progress
            if i_episode % 10 == 0:
                print("\rEpisode {:d}/{:d} || Average score {:.2f}".format(
                    i_episode, num_episodes, mean_score))
        else:
            print("\rWarmup episode {:d}/{:d}".format(i_episode, warmup_episodes),
                  end="")

        if i_episode % SAVE_EVERY == 0 and i_episode > warmup_episodes:
            maddpg.save_weights(i_episode)

        # check whether the task is solved
        if i_episode >= 100 and mean_score >= 0.5:
            print('\nEnvironment solved in {:d} episodes. Average score: {:.2f}'.format(
                i_episode, mean_score))
            maddpg.save_weights()
            break

        if i_episode == num_episodes:
            print("\nGame over. Too bad! Final score {:.2f}\n".format(mean_score))

    return episode_scores
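A usage sketch for this function, assuming the Udacity unityagents package and a local build of the Tennis environment; the binary path and the warmup_episodes value are placeholders to adjust for your setup:

from unityagents import UnityEnvironment

# path to the Tennis binary is machine-specific; adjust as needed
env = UnityEnvironment(file_name="Tennis.app")
scores = train(env, num_episodes=5000, max_t=1000, warmup_episodes=100)
env.close()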