Example #1
def main(args):
    set_seed(args.seed)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # initialize environment
    n_players = 3
    env = football_env.create_environment(
        env_name="academy_3_vs_1_with_keeper",
        representation="simple115",
        number_of_left_players_agent_controls=n_players,
        stacked=False,
        logdir="/tmp/football",
        write_goal_dumps=False,
        write_full_episode_dumps=False,
        render=False)

    # state and action space
    state_space_size = env.observation_space.shape[1]  # using the simple115 representation
    action_space_size = env.action_space.nvec.tolist()[0]  # all three players share the same action space
    # state[98:100] represents the three controlled players

    # model
    print("loading models")
    actors = [
        Actor(state_space_size=state_space_size,
              action_space_size=action_space_size) for _ in range(n_players)
    ]
    critics = [
        Critic(state_space_size=state_space_size,
               action_space_size=action_space_size,
               n_players=n_players) for _ in range(n_players)
    ]
    old_actors = [
        Actor(state_space_size=state_space_size,
              action_space_size=action_space_size) for _ in range(n_players)
    ]
    old_critics = [
        Critic(state_space_size=state_space_size,
               action_space_size=action_space_size,
               n_players=n_players) for _ in range(n_players)
    ]
    for old_actor, actor in zip(old_actors, actors):
        old_actor.load_state_dict(actor.state_dict())
    for old_critic, critic in zip(old_critics, critics):
        old_critic.load_state_dict(critic.state_dict())

    # maddpg
    maddpg = MADDPG(env=env,
                    action_list=list(range(action_space_size)),
                    actors=actors,
                    critics=critics,
                    old_actors=old_actors,
                    old_critics=old_critics,
                    args=args,
                    device=device)
    print("learn")
    maddpg.learn()
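
Example #1 does not show the Actor and Critic classes it instantiates. Below is a minimal sketch of what such modules might look like, assuming plain PyTorch MLPs with a softmax policy head for the discrete football actions and a centralized critic that sees every player's state and action; only the constructor arguments are taken from the call sites above, the layer sizes and network shapes are assumptions.

import torch
import torch.nn as nn


class Actor(nn.Module):
    """Decentralized policy: maps one player's observation to action probabilities."""

    def __init__(self, state_space_size, action_space_size, hidden=128):  # hidden size is illustrative
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_space_size, hidden), nn.ReLU(),
            nn.Linear(hidden, action_space_size))

    def forward(self, state):
        # probabilities over the discrete football actions
        return torch.softmax(self.net(state), dim=-1)


class Critic(nn.Module):
    """Centralized value function: conditions on all players' states and actions."""

    def __init__(self, state_space_size, action_space_size, n_players, hidden=256):  # hidden size is illustrative
        super().__init__()
        in_dim = n_players * (state_space_size + action_space_size)
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, 1))

    def forward(self, joint_states_actions):
        # joint_states_actions: concatenation of every player's state and (one-hot) action
        return self.net(joint_states_actions)

The centralized critic input reflects the MADDPG idea: each actor stays decentralized, while the critics are trained on the joint observation-action of all players.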
Example #2
def update():
    if ALGORITHM == 'maddpg':
        ddpg = MADDPG(avs.n_actions, avs.n_features, 1, 'maddpg model',
                      RETRAIN)
    elif ALGORITHM == 'ddpg':
        ddpg = DDPG(avs.n_actions, avs.n_features, 1, 'ddpg model', RETRAIN)
    else:
        ddpg = DDPG(avs.n_actions, avs.n_features, 1, 'ddpg model', RETRAIN)
    t1 = time.time()
    rewards1 = 0
    rewards2 = 0
    var = VAR
    collision = 0
    avgreward1 = []
    avgreward2 = []
    collision_percentage = []
    for i in range(MAX_EPISODES):
        s1, s2 = avs.reset()
        ep_reward1 = 0
        ep_reward2 = 0
        if i % 100000 == 0 and i > IMITATION_EPISODE:
            plot(avgreward1, avgreward2, collision_percentage, i)
        for j in range(MAX_EP_STEPS):
            if RENDER:
                avs.render()

            # choose actions: imitation policy early on (and every 4th episode),
            # otherwise noisy actions from the learned policy
            if i < IMITATION_EPISODE or i % 4 == 0:
                a1 = imitation(avs.agent1, avs.agent2, avs.target1)
                a2 = imitation(avs.agent2, avs.agent1, avs.target2)
            else:
                # add randomness to action selection for exploration
                a1 = ddpg.choose_action(s1)
                a1 = [
                    np.clip(np.random.normal(a1[0], var), -1, 1),
                    np.clip(np.random.normal(a1[1], var), -1, 1)
                ]
                a2 = ddpg.choose_action(s2)
                a2 = [
                    np.clip(np.random.normal(a2[0], var), -1, 1),
                    np.clip(np.random.normal(a2[1], var), -1, 1)
                ]
                # a2 = imitation(avs.agent2, avs.agent1, avs.target2)

            if DEBUG:
                time.sleep(0.1)
            s_1, r1, s_2, r2, done, info = avs.step(a1, a2)
            if ALGORITHM == 'ddpg':
                ddpg.store_transition(s1, a1, r1, s_1)
                ddpg.store_transition(s2, a2, r2, s_2)
            else:
                ddpg.store_transition(s1, s2, a1, a2, r1, s_1, s_2)
                ddpg.store_transition(s2, s1, a2, a1, r2, s_2, s_1)

            s1 = s_1
            s2 = s_2
            ep_reward1 += r1
            ep_reward2 += r2

            if j == MAX_EP_STEPS - 1 or done:
                print("pt:", ddpg.pointer)
                print('Episode:', i,
                      'Step:', j, ' Reward: %i' % int(ep_reward1),
                      int(ep_reward2), 'Explore: %.2f' % var)

                if i >= IMITATION_EPISODE:
                    rewards1 += ep_reward1
                    rewards2 += ep_reward2
                    if r1 < -100:
                        collision += 1
                    if (i + 1) % 100 == 0:
                        avgreward1.append(rewards1 / 100)
                        avgreward2.append(rewards2 / 100)
                        collision_percentage.append(collision)
                        rewards1 = 0
                        rewards2 = 0
                        collision = 0
                break
        if ddpg.pointer > MEMORY_CAPACITY:
            ddpg.learn()
            ddpg.learn()
            if var > MIN_VAR and i > IMITATION_EPISODE:
                var *= DECAY  # decay the action randomness
        if i % 4 != 0 and ep_reward1 > 100 and ep_reward2 > 100 and i > IMITATION_EPISODE:
            ddpg.save(i)
    print('Running time: ', time.time() - t1)
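
The exploration logic in Example #2 is the standard DDPG recipe: perturb the deterministic action with Gaussian noise of standard deviation var, clip to the action bounds, and decay var once learning starts. A standalone sketch of that pattern follows; the names VAR, DECAY and MIN_VAR mirror the constants above, but the values are illustrative assumptions.

import numpy as np

VAR, DECAY, MIN_VAR = 3.0, 0.9995, 0.05  # illustrative values, not from the example


def explore(action, var):
    """Add clipped Gaussian noise to a deterministic action in [-1, 1]."""
    return np.clip(np.random.normal(action, var), -1.0, 1.0)


var = VAR
for step in range(10):
    noisy = explore(np.array([0.2, -0.7]), var)
    if var > MIN_VAR:
        var *= DECAY  # decay the action randomness as training progresses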
Example #3
                env.render()
                #time.sleep(0.1) # to slow down the action for the video
            actions = maddpg_agents.choose_action(obs)
            obs_, reward, done, info = env.step(actions)

            state = obs_list_to_state_vector(obs)
            state_ = obs_list_to_state_vector(obs_)

            if episode_step >= MAX_STEPS:
                done = [True] * n_agents

            memory.store_transition(obs, state, actions, reward, obs_, state_,
                                    done)

            if total_steps % 100 == 0 and not evaluate:
                maddpg_agents.learn(memory)

            obs = obs_

            score += sum(reward)
            total_steps += 1
            episode_step += 1

        score_history.append(score)
        avg_score = np.mean(score_history[-100:])
        if not evaluate:
            if avg_score > best_score:
                maddpg_agents.save_checkpoint()
                best_score = avg_score
        if i % PRINT_INTERVAL == 0 and i > 0:
            print('episode', i, 'average score {:.1f}'.format(avg_score))
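
Example #3 relies on a helper obs_list_to_state_vector that is not shown. Since the centralized critics need a single flat global state, a plausible implementation simply concatenates the per-agent observations; this sketch assumes each agent's observation is a flat numpy array.

import numpy as np


def obs_list_to_state_vector(observation):
    """Concatenate per-agent observations into one global state vector."""
    return np.concatenate(
        [np.asarray(o, dtype=np.float32).ravel() for o in observation])


# e.g. three agents with 8-, 10- and 10-dimensional observations -> a 28-dim state
state = obs_list_to_state_vector([np.zeros(8), np.zeros(10), np.zeros(10)])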
Example #4
def train(env, num_episodes=5000, max_t=1000, warmup_episodes=0):
    """ Monitor agent's performance.
    
    Params
    ======
    - env: instance of the environment
    - num_episodes: maximum number of episodes of agent-environment interaction
    - max_t: maximum number of timesteps per episode
    - warmup_episodes: how many episodes to explore and collect samples before learning begins
    
    Returns
    =======
    - scores: list containing received rewards
    """

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # number of actions
    action_size = brain.vector_action_space_size
    print('Number of actions:', action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size))

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 1.0
    noise_reduction = 0.9999

    # list containing max scores from each episode
    episode_scores = []
    # last 100 scores
    scores_window = deque(maxlen=100)
    mean_score = 0.0

    maddpg = MADDPG(state_size, action_size, num_agents * state_size,
                    num_agents * action_size)

    # for each episode
    for i_episode in range(1, num_episodes + 1):
        # reset the environment and begin the episode
        env_info = env.reset(train_mode=True)[brain_name]
        maddpg.reset()

        # get the current state (for each agent)
        states = env_info.vector_observations

        # initialize the score (for each agent)
        scores = np.zeros(num_agents)

        for t in range(max_t):
            # select an action (for each agent)
            if i_episode > warmup_episodes:
                actions = maddpg.act(states, noise)
                noise *= noise_reduction
            else:
                # Collect random samples to explore and fill the replay buffer
                actions = np.random.uniform(-1, 1, (num_agents, action_size))

            # send all actions to the environment
            env_info = env.step(actions)[brain_name]

            # get next state (for each agent)
            next_states = env_info.vector_observations

            # get reward (for each agent)
            rewards = env_info.rewards

            # see if episode finished
            dones = env_info.local_done

            # agents perform internal updates based on sampled experience
            maddpg.step(states, actions, rewards, next_states, dones)

            # roll over states to next time step
            states = next_states

            # learn when time is right
            if t % LEARN_EVERY == 0 and i_episode > warmup_episodes:
                for _ in range(LEARN_BATCH):
                    maddpg.learn()

            # update the score (for each agent)
            scores += rewards

            # exit loop if episode finished
            if np.any(dones):
                break

        episode_max_score = np.max(scores)
        episode_scores.append(episode_max_score)

        if i_episode > warmup_episodes:
            # save final score
            scores_window.append(episode_max_score)
            mean_score = np.mean(scores_window)
            # monitor progress
            if i_episode % 10 == 0:
                print("\rEpisode {:d}/{:d} || Average score {:.2f}".format(
                    i_episode, num_episodes, mean_score))
        else:
            print("\rWarmup episode {:d}/{:d}".format(i_episode,
                                                      warmup_episodes),
                  end="")

        if i_episode % SAVE_EVERY == 0 and i_episode > warmup_episodes:
            maddpg.save_weights(i_episode)

        # check if task is solved
        if i_episode >= 100 and mean_score >= 0.5:
            print(
                '\nEnvironment solved in {:d} episodes. Average score: {:.2f}'.
                format(i_episode, mean_score))
            maddpg.save_weights()
            break
    if i_episode == num_episodes:
        print("\nGame over. Too bad! Final score {:.2f}\n".format(mean_score))
    return episode_scores
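
The env object in Example #4 follows the Unity ML-Agents brain API (brain_names, vector_observations, local_done), as used in the Udacity Tennis project. A hedged usage sketch, creating such an environment, running train, and plotting the returned scores; the package import, file path, and episode counts are assumptions rather than values taken from the example.

import numpy as np
import matplotlib.pyplot as plt
from unityagents import UnityEnvironment  # assumed: env API matches this package

env = UnityEnvironment(file_name="Tennis.app")  # path to the local build is an assumption
scores = train(env, num_episodes=5000, max_t=1000, warmup_episodes=300)  # episode counts are illustrative
env.close()

# 100-episode moving average of the per-episode max score
window = 100
avg = [np.mean(scores[max(0, i - window):i + 1]) for i in range(len(scores))]
plt.plot(scores, alpha=0.3, label='episode max score')
plt.plot(avg, label='100-episode average')
plt.axhline(0.5, linestyle='--', label='solved threshold')
plt.xlabel('episode')
plt.ylabel('score')
plt.legend()
plt.show()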