Example #1
def agent(obs_dict, config_dict):
    global prev_direction

    env = make('hungry_geese')
    # agent = QAgent(rows=11, columns=11, num_actions=3)
    agent = PPOAgent(rows=11, columns=11, num_actions=3)
    model_name = ''
    agent.load_model_weights('models/' + model_name + '.h5')

    state = preprocess_state(obs_dict, prev_direction)
    action = agent.select_action(state)
    direction = get_direction(prev_direction, action)
    prev_direction = direction
    return env.specification.action.enum[direction]
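For context, an agent callable with this (obs_dict, config_dict) signature can be smoke-tested locally before submitting it to Kaggle. The snippet below is a minimal sketch, assuming the kaggle_environments package is installed, the function above (together with its preprocess_state/get_direction helpers and model weights) lives in the same module, and prev_direction is initialized at module level:

from kaggle_environments import make

prev_direction = 0  # the agent above expects this module-level global to exist

# pit the PPO agent against three built-in greedy opponents
test_env = make('hungry_geese', debug=True)
steps = test_env.run([agent, 'greedy', 'greedy', 'greedy'])

# each step holds one record per goose; print the final rewards
print([goose['reward'] for goose in steps[-1]])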
Example #2
def test_run():
    """
    use this to test runner
    """
    env = BillardsEnv()
    task = AvoidanceTask(env)

    actor_critic = Policy(env.get_obs_shape(), task.get_action_space()) # TODO: get action space not yet implemented in env
    agent = PPOAgent(actor_critic, clip_param=0.1,
                     ppo_epoch=4, num_mini_batch=32,
                     value_loss_coef=0.5, entropy_coef=0.01,
                     lr=2e-4, eps=1e-5, max_grad_norm=40)

    runner = PPORunner(env=env, task=task, device='cuda', summary_path='./summary/5', agent=agent, actor_critic=actor_critic)

    num_batches = 2000

    try:
        for i in range(num_batches):
            runner.run_batch()

            if i % 100 == 0:
                torch.save(actor_critic, './models/model_5_' + str(i) + '.pt')

    except:
        # save a checkpoint if training crashes or is interrupted
        torch.save(actor_critic, './models/model_5_crash.pt')
        exit(0)

    torch.save(actor_critic, './models/model_5_end.pt')
Example #3
def main():
    seaborn.set()

    # create agents with LSTM policy network
    agents = [PPOAgent(game.actions,
                       LSTMPolicy(game.state_shape()[0], game.actions),
                       lr=5e-5, discount=0.99, eps=0.1)
              for _ in range(game.num_players)]

    # load agents if resuming
    for i, a in enumerate(agents):
        path = find_latest(args.agents, 'agent_{}_*.pt'.format(i))
        print(f'Resuming agent {i} from path "{path}"')
        a.load(path)

    # load generator
    path = find_latest(args.generator, 'generator_[0-9]*.pt')
    print(f'Resuming generator from path "{path}"')
    generator = RaceTrackGenerator.from_file(path)
    latent = generator.latent_size

    # agents on own boards
    own_boards = torch.zeros(args.num_boards, RaceConfig.max_segments, 2, device=device)
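    # hand-crafted zig-zag layout: blocks of 16 segments alternate the first track parameter between -1 and +1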
    for i in range(0, RaceConfig.max_segments, 16):
        own_boards[:, i: i + 16, 0] = 2 * ((i // 16) % 2) - 1

    run_evaluation(agents, game, own_boards, name='Own')

    # agents on random boards
    val = 1.
    random_boards = torch.zeros(args.num_boards, RaceConfig.max_segments, 2, device=device)
    random_boards[:, :, 0].uniform_(-val, val)

    run_evaluation(agents, game, random_boards, name='Random')

    # generated dummy boards
    dummy_generator = RaceTrackGenerator(latent, lr=1e-5, asynchronous=True)
    generated_boards = dummy_generator.generate(RaceConfig.max_segments, args.num_boards)

    run_evaluation(agents, game, generated_boards, name='Dummy')

    # generated boards
    generated_boards = generator.generate(RaceConfig.max_segments, args.num_boards, t=10.)

    run_evaluation(agents, game, generated_boards, name='Generated')
Example #4
def ppo_train(model_name, load_model=False, actor_filename=None, critic_filename=None, optimizer_filename=None):
    print("PPO -- Training")

    env = make('hungry_geese')
    trainer = env.train(['greedy', None, 'agents/boilergoose.py', 'agents/handy_rl.py'])

    agent = PPOAgent(rows=11, columns=11, num_actions=3)
    memory = Memory()

    if load_model:
        agent.load_model_weights(actor_filename, critic_filename)
        agent.load_optimizer_weights(optimizer_filename)

    episode = 0
    start_episode = 0
    end_episode = 50000
    reward_threshold = None
    threshold_reached = False
    epochs = 4
    batch_size = 128
    current_frame = 0

    training_rewards = []
    evaluation_rewards = []
    last_1000_ep_reward = []

    for episode in range(start_episode + 1, end_episode + 1):
        obs_dict = trainer.reset()
        ep_reward, ep_steps, done = 0, 0, False
        prev_direction = 0

        while not done:
            current_frame += 1
            ep_steps += 1

            state = preprocess_state(obs_dict, prev_direction)
            action = agent.select_action(state, training=True)
            direction = get_direction(prev_direction, action)
            next_obs_dict, _, done, _ = trainer.step(env.specification.action.enum[direction])
            reward = calculate_reward(obs_dict, next_obs_dict)
            next_state = preprocess_state(next_obs_dict, direction)
            memory.add(state, action, reward, next_state, float(done))

            obs_dict = next_obs_dict
            prev_direction = direction

            ep_reward += reward

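            # on-policy update: every `batch_size` frames, fit for `epochs` passes on
            # everything gathered since the last update, then clear the rollout buffer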
            if current_frame % batch_size == 0:
                for _ in range(epochs):
                    states, actions, rewards, next_states, dones = memory.get_all_samples()
                    agent.fit(states, actions, rewards, next_states, dones)
                memory.clear()
                agent.update_networks()

        print("EPISODE " + str(episode) + " - REWARD: " + str(ep_reward) + " - STEPS: " + str(ep_steps))

        if len(last_1000_ep_reward) == 1000:
            last_1000_ep_reward = last_1000_ep_reward[1:]
        last_1000_ep_reward.append(ep_reward)

        if reward_threshold:
            if len(last_1000_ep_reward) == 1000:
                if np.mean(last_1000_ep_reward) >= reward_threshold:
                    print("You solved the task after" + str(episode) + "episodes")
                    agent.save_model_weights('models/ppo_actor_' + model_name + '_' + str(episode) + '.h5',
                                             'models/ppo_critic_' + model_name + '_' + str(episode) + '.h5')
                    threshold_reached = True
                    break

        if episode % 1000 == 0:
            print('Episode ' + str(episode) + '/' + str(end_episode))

            last_1000_ep_reward_mean = np.mean(last_1000_ep_reward).round(3)
            training_rewards.append(last_1000_ep_reward_mean)
            print('Average reward in last 1000 episodes: ' + str(last_1000_ep_reward_mean))
            print()

        if episode % 1000 == 0:
            eval_reward = 0
            for i in range(100):
                obs_dict = trainer.reset()
                done = False
                prev_direction = 0
                while not done:
                    state = preprocess_state(obs_dict, prev_direction)
                    action = agent.select_action(state)
                    direction = get_direction(prev_direction, action)
                    next_obs_dict, _, done, _ = trainer.step(env.specification.action.enum[direction])
                    reward = calculate_reward(obs_dict, next_obs_dict)
                    obs_dict = next_obs_dict
                    prev_direction = direction
                    eval_reward += reward
            eval_reward /= 100
            evaluation_rewards.append(eval_reward)
            print("Evaluation reward: " + str(eval_reward))
            print()

        if episode % 5000 == 0:
            agent.save_model_weights('models/ppo_actor_' + model_name + '_' + str(episode) + '.h5',
                                     'models/ppo_critic_' + model_name + '_' + str(episode) + '.h5')
            agent.save_optimizer_weights('models/ppo_' + model_name + '_' + str(episode) + '_optimizer.npy')

    agent.save_model_weights('models/ppo_actor_' + model_name + '_' + str(end_episode) + '.h5',
                             'models/ppo_critic_' + model_name + '_' + str(end_episode) + '.h5')
    agent.save_optimizer_weights('models/ppo_' + model_name + '_' + str(end_episode) + '_optimizer.npy')

    if threshold_reached:
        plt.plot([i for i in range(start_episode + 1000, episode, 1000)], training_rewards)
    else:
        plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)], training_rewards)
    plt.title("Reward")
    plt.show()

    plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)], evaluation_rewards)
    plt.title('Evaluation rewards')
    plt.show()
Example #5
def main():
    run_path = args.resume_path if args.resume_path else find_next_run_dir(
        'experiments')
    print(f'Running experiment {run_path}')

    episode = 0
    finish_mean = 0.

    # create agents with LSTM policy network
    agents = [
        PPOAgent(game.actions,
                 LSTMPolicy(game.state_shape()[0], game.actions),
                 lr=5e-5,
                 discount=0.99,
                 eps=0.1) for _ in range(game.num_players)
    ]

    # load agents if resuming
    for i, a in enumerate(agents):
        path = find_latest(args.agents, f'agent_{i}_*.pt')
        print(f'Resuming agent {i} from path "{path}"')
        a.load(path)

    # create discriminator
    discriminator = RaceWinnerDiscriminator(game.num_players,
                                            lr=1e-5,
                                            betas=(0.5, 0.9))

    # create generator
    generator = RaceTrackGenerator(args.latent, lr=1e-5, betas=(0.3, 0.9))

    if args.resume_path:
        path = find_latest(args.resume_path, 'discriminator_[0-9]*.pt')
        print(f'Resuming discriminator from path "{path}"')
        discriminator.load(path)

        path = find_latest(args.resume_path, 'generator_[0-9]*.pt')
        print(f'Resuming generator from path "{path}"')
        generator.load(path)

        path = find_latest(args.resume_path, 'params_*.pt')
        print(f'Resuming params from path "{path}"')
        params = torch.load(path)
        episode = params['episode']
        finish_mean = params['finish_mean']

    summary_writer = SummaryWriter(os.path.join(run_path, 'summary'),
                                   purge_step=episode)
    result = {}

    while True:
        if episode % 30 == 0:
            print(f'-- episode {episode}')

        # -- training discriminator
        boards = generator.generate(RaceConfig.max_segments,
                                    args.batch_size).detach()
        boards = torch.cat((boards, predefined_tracks()), dim=0)
        boards = torch.cat(
            (boards, -boards),
            dim=0)  # mirror levels to train more robust discriminator
        rboards = boards.repeat(args.trials, 1, 1)

        states, any_valid = game.reset(rboards)
        game.record(0)

        # run agents to find who wins
        with torch.no_grad():
            while any_valid and not game.finished():
                actions = torch.stack(
                    [a.act(s, training=False) for a, s in zip(agents, states)],
                    dim=0)
                states, rewards = game.step(actions)

        for a in agents:
            a.reset()

        cur_mean = game.finishes.float().mean().item()
        finish_mean = 0.9 * finish_mean + 0.1 * cur_mean
        result['game/finishes'] = cur_mean

        # discriminator: compute loss and perform backward pass
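        # the +1 shift maps "no winner" to class 0, and averaging over the repeated
        # trials of each board turns raw outcomes into per-board win frequencies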
        winners = one_hot(game.winners() + 1, num_classes=game.num_players + 1)
        winners = winners.view(args.trials, -1,
                               *winners.shape[1:]).float().mean(0)
        dloss, dacc = discriminator.train(boards.detach(), winners)
        result['discriminator/loss'] = dloss
        result['discriminator/accuracy'] = dacc

        # -- train generator
        for _ in range(args.generator_train_steps):
            generated = generator.generate(RaceConfig.max_segments,
                                           args.generator_batch_size)
            pred_winners = discriminator.forward(generated)
            gloss, galoss = generator.train(pred_winners, args.generator_beta)
            result['generator/loss'] = gloss
            if galoss:
                result['generator/aux_loss'] = galoss

        # log data
        for p in range(game.num_players):
            result[f'game/win_rates/player_{p}'] = winners[:, p + 1].mean().item()
        result['game/invalid'] = winners[:, 0].mean().item()

        # save episode
        if episode % 100 == 0:
            game.record_episode(
                os.path.join(run_path, 'videos', f'episode_{episode}'))
            # save boards as images in tensorboard
            for i, img in enumerate(game.tracks_images(top_n=args.batch_size)):
                result[f'game/boards_{i}'] = np.transpose(img, axes=(2, 0, 1))

        # save networks
        if episode % 500 == 0:
            discriminator.save(
                os.path.join(run_path, f'discriminator_{episode}.pt'))
            generator.save(os.path.join(run_path, f'generator_{episode}.pt'))
            torch.save({
                'episode': episode,
                'finish_mean': finish_mean
            }, os.path.join(run_path, f'params_{episode}.pt'))

        # save data to tensorboard
        for tag, data in result.items():
            if isinstance(data, np.ndarray):
                summary_writer.add_image(tag, data, global_step=episode)
            else:
                summary_writer.add_scalar(tag, data, global_step=episode)
        # -----
        if episode % 1000 == 0:
            gc.collect()
        result.clear()
        episode += 1
Example #6
from agents import CommandLineAgent, DeepQLearningAgent, PPOAgent, RandomAgent
from environments.tictactoe import TicTacToeGameState
from runners import run_for_n_games_and_print_stats, run_step

if __name__ == "__main__":
    gs = TicTacToeGameState()
    agent0 = PPOAgent(state_space_size=gs.get_vectorized_state().shape[0],
                      action_space_size=gs.get_action_space_size())
    agent1 = RandomAgent()

    for i in range(100):
        run_for_n_games_and_print_stats([agent0, agent1], gs, 5000)

    run_for_n_games_and_print_stats([agent0, agent1], gs, 100)

    gs_clone = gs.clone()
    while not gs_clone.is_game_over():
        run_step([agent0, CommandLineAgent()], gs_clone)
        print(gs_clone)

    gs_clone = gs.clone()
    while not gs_clone.is_game_over():
        run_step([CommandLineAgent(), agent1], gs_clone)
        print(gs_clone)
    "MsPacman-v0",
    "SpaceInvaders-v0",
    "Seaquest-v0",
    "LunarLanderV2",
    "Reacher-v2",
    "FrozenLake-v0"
]

env = gym.make('MountainCar-v0')
obs, rew, done, ep_ret, ep_len = env.reset(), 0, False, 0, 0

epochs = 10
local_steps_per_epoch = 1000
# tf.set_random_seed(22222)

agent = PPOAgent(env.observation_space, env.action_space)
buffer = Buffer(env.observation_space.shape, env.action_space.shape, size=local_steps_per_epoch)

rewards = [0]
for epoch in tqdm(range(epochs)):
    # print("Epoch {} Reward {}".format(epoch, rewards[-1]))
    for t in range(local_steps_per_epoch):
        act, v_t, logp_pi = agent.get_action(obs)

        buffer.store(obs, act, rew, v_t, logp_pi) # Last var is logpi (not used in vpg)

        obs, rew, done, _ = env.step(act[0])
        ep_ret += rew
        ep_len += 1

        if done or (t==local_steps_per_epoch-1):
Example #8
        'action_size': brain.vector_action_space_size,
        'number_of_agents': len(env_info.agents)
    },
    'pytorch': {
        'device':
        torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    },
    'hyperparameters': {
        'discount_rate': 0.99,
        'tau': 0.95,
        'gradient_clip': 5,
        'rollout_length': 2048,
        'optimization_epochs': 10,
        'ppo_clip': 0.2,
        'log_interval': 2048,
        'max_steps': 1e5,
        'mini_batch_number': 32,
        'entropy_coefficent': 0.01,
        'episode_count': 250,
        'hidden_size': 512,
        'adam_learning_rate': 3e-4,
        'adam_epsilon': 1e-5
    }
}

policy = PPOPolicyNetwork(config)
optimizer = optim.Adam(policy.parameters(),
                       config['hyperparameters']['adam_learning_rate'],
                       eps=config['hyperparameters']['adam_epsilon'])
agent = PPOAgent(env, brain_name, policy, optimizer, config)
agent.train(1500)
Example #9
    reset_pcnt = True
    using_pcnt = True
    bn_flag = 2

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    actor_critic = ActorCriticConv(num_dwell, input_channel, input_size,
                                   std_scale, bn_flag)
    actor_critic.to(device)

    agent = PPOAgent(actor_critic,
                     num_dwell,
                     horizon,
                     max_grad_norm,
                     coef_value_loss,
                     coef_entropy_loss,
                     clip_param,
                     training_batch,
                     ppo_epochs,
                     constraint,
                     data_folder="./pg_data")

    if load_pretrain_model:
        print("Loading trained model from ", model_save_dir,
              '{}-all-{}.ckpt'.format(total_supposed_steps, model_name))
        model_path = os.path.join(
            model_save_dir, '{}-all-{}.ckpt'.format(total_supposed_steps,
                                                    model_name))
        state_dict = torch.load(model_path)
        agent.model.load_state_dict(state_dict, strict=False)
Example #10
import random

from agents import DeepQLearningAgent, RandomAgent, TabQLearningAgent, DeepQLearningExperienceReplayAgent, PPOAgent
from environments.battle_royale import BattleRoyalGameWorldTerminal, BattleRoyale
from runners import run_for_n_games_and_print_stats, run_step
import tensorflow as tf
if __name__ == "__main__":
    tf.compat.v1.disable_eager_execution()
    state_space_size = 19
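    # one trainable PPO agent in slot 0; the remaining five players are random opponents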
    list_agent = [
        PPOAgent(state_space_size=state_space_size,
                 action_space_size=48,
                 episodes_count_between_training=1)
        if i < 1 else RandomAgent() for i in range(6)
    ]
    for i in range(100):
        #random.shuffle(list_agent)
        gs = BattleRoyalGameWorldTerminal(i,
                                          numberofPlayer=6,
                                          list_agent=list_agent)
        gs.run()

    #list_agent[0].epsilon = -1
    #list_agent[1].epsilon = -1
    gs2 = BattleRoyale(numberofPlayer=6, list_agent=list_agent)
    gs2.run()