Example 1
def play_hexus(mode, episodes, board_level):
    """Start Hexus game and training."""
    print('<><><><>HEXUS<><><><>')
    if mode == 'train':
        # Train agent to go first
        agent = Agent(Hexus,
                      epsilon=5e-1,
                      learning_rate=25e-2,
                      board_level=board_level)
        n = episodes
        history = agent.train(n)
        print('After {} Episodes'.format(n))

    elif mode == 'hyper':
        # Hyperparameter optimization: grid search over epsilon and learning rate
        max_e = 0.0
        max_lr = 0.0
        max_reward = 0.0
        epsilons = [1e-1, 2e-1, 9e-2, 1e-2, 9e-3]
        learning_rates = [1e-1, 2e-1, 3e-1, 25e-2, 9e-2]
        for epsilon in epsilons:
            for learning_rate in learning_rates:
                agent = Agent(Hexus,
                              player='B',
                              epsilon=epsilon,
                              learning_rate=learning_rate)
                n = 10000
                history = agent.train(n, history=[])
                total = history[1][-1]  # final cumulative reward
                print(total)
                if total > max_reward:
                    max_reward = total
                    max_e = epsilon
                    max_lr = learning_rate
        print('Max e: {}'.format(max_e))
        print('Max lr: {}'.format(max_lr))
        print('Max reward: {}'.format(max_reward))

    else:
        print('Mode {} is invalid.'.format(mode))
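
For context, a minimal sketch of how an entry point like play_hexus might be wired to a command line. The argparse flags below are illustrative assumptions, not part of the original project:

import argparse

def main():
    # Hypothetical CLI wrapper around play_hexus; flag names are assumptions.
    parser = argparse.ArgumentParser(description='Play Hexus with an RL agent.')
    parser.add_argument('--mode', choices=['train', 'hyper'], default='train')
    parser.add_argument('--episodes', type=int, default=10000)
    parser.add_argument('--board-level', type=int, default=1)
    args = parser.parse_args()
    play_hexus(args.mode, args.episodes, args.board_level)

if __name__ == '__main__':
    main()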
Example 2
def play_tictactoe(mode):
    """Start TicTacToe game with RL Agent."""
    print('==TIC TAC TOE==')
    game = TicTacToe()

    if mode == 'train':
        agent = Agent(game)
        history = agent.train(10000)
        print('After 10000 Episodes')

        # Plot Reward Stats
        rfig, raxs = plt.subplots(nrows=3, ncols=1)
        rax_reward1 = raxs[0]
        rax_reward1.grid()
        rax_reward2 = raxs[1]
        rax_reward2.grid()
        rax_reward3 = raxs[2]
        rax_reward3.grid()

        rax_reward1.plot(history[0][:100], history[1][:100])
        rax_reward1.set(ylabel='Cumulative Reward', title='Tic Tac Toe Cumulative Reward Episodes')

        rax_reward2.plot(history[0][:1000], history[1][:1000], color='g')
        rax_reward2.set(ylabel='Cumulative Reward')

        rax_reward3.plot(history[0][:10000], history[1][:10000], color='r')
        rax_reward3.set(xlabel='Episode', ylabel='Cumulative Reward')

        rfig.savefig('tictactoe_reward.png')

        # Plot Qtable Memory Usage Stats
        memfig, memaxs = plt.subplots(nrows=3, ncols=1)
        memax_reward1 = memaxs[0]
        memax_reward1.grid()
        memax_reward2 = memaxs[1]
        memax_reward2.grid()
        memax_reward3 = memaxs[2]
        memax_reward3.grid()

        memax_reward1.plot(history[0][:100], history[2][:100])
        memax_reward1.set(ylabel='Size (KB)', title='Tic Tac Toe QTable Size Episodes')

        memax_reward2.plot(history[0][:1000], history[2][:1000], color='g')
        memax_reward2.set(ylabel='Size (KB)')

        memax_reward3.plot(history[0][:10000], history[2][:10000], color='r')
        memax_reward3.set(xlabel='Episode', ylabel='Size (KB)')

        memfig.savefig('tictactoe_memory.png')
        plt.show()

        agent.save_values(path='data/tictactoe_qtable.json')
        agent.stats()
        agent.demo()

    elif mode == 'demo':
        with open('data/tictactoe_qtable.json') as f:
            qtable = json.load(f)
        agent = Agent(game, qtable=qtable)
        agent.demo()

    else:
        print('Mode {} is invalid.'.format(mode))
Example 3
def play_chomp(mode):
    """Start Chomp game and training."""
    print('=====CHOMP=====')
    # A square board has a known optimal strategy, giving an easy sanity check that the agent is learning.
    game = Chomp(rows=4, cols=4)
    if mode == 'train':
        # Train agent to go first
        agent = Agent(game, epsilon=9e-3, learning_rate=25e-2)
        n = 10000
        history = agent.train(n)
        print('After {} Episodes'.format(n))

        # Plot Reward Stats
        rfig, raxs = plt.subplots(nrows=3, ncols=1)
        rax_reward1 = raxs[0]
        rax_reward1.grid()
        rax_reward2 = raxs[1]
        rax_reward2.grid()
        rax_reward3 = raxs[2]
        rax_reward3.grid()

        rax_reward1.plot(history[0][:100], history[1][:100])
        rax_reward1.set(ylabel='Cumulative Reward', title='Chomp 4x4 Cumulative Reward')

        rax_reward2.plot(history[0][:1000], history[1][:1000], color='g')
        rax_reward2.set(ylabel='Cumulative Reward')

        rax_reward3.plot(history[0][:n], history[1][:n], color='r')
        rax_reward3.set(xlabel='Episode', ylabel='Cumulative Reward')

        rfig.savefig('chomp_reward.png')

        # Plot Qtable Memory Usage Stats
        memfig, memaxs = plt.subplots(nrows=3, ncols=1)
        memax_reward1 = memaxs[0]
        memax_reward1.grid()
        memax_reward2 = memaxs[1]
        memax_reward2.grid()
        memax_reward3 = memaxs[2]
        memax_reward3.grid()

        memax_reward1.plot(history[0][:100], history[2][:100])
        memax_reward1.set(ylabel='Size (KB)', title='Chomp 4x4 QTable Size')

        memax_reward2.plot(history[0][:1000], history[2][:1000], color='g')
        memax_reward2.set(ylabel='Size (KB)')

        memax_reward3.plot(history[0][:n], history[2][:n], color='r')
        memax_reward3.set(xlabel='Episode', ylabel='Size (KB)')

        memfig.savefig('chomp_memory.png')
        plt.show()

        agent.save_values(path='data/chomp_qtable.json')
        agent.demo()

    elif mode == 'hyper':
        # Hyperparameter optimization: grid search over epsilon and learning rate
        max_e = 0.0
        max_lr = 0.0
        max_reward = 0.0
        epsilons = [1e-1, 2e-1, 9e-2, 1e-2, 9e-3]
        learning_rates = [1e-1, 2e-1, 3e-1, 25e-2, 9e-2]
        for epsilon in epsilons:
            for learning_rate in learning_rates:
                agent = Agent(game, qtable={}, player='X', epsilon=epsilon, learning_rate=learning_rate)
                n = 10000
                history = agent.train(n, history=[])
                total = history[1][-1]  # final cumulative reward
                print(total)
                if total > max_reward:
                    max_reward = total
                    max_e = epsilon
                    max_lr = learning_rate
        print('Max e: {}'.format(max_e))
        print('Max lr: {}'.format(max_lr))
        print('Max reward: {}'.format(max_reward))

    elif mode == 'demo':
        with open('data/chomp_qtable.json') as f:
            qtable = json.load(f)
        agent = Agent(game, qtable=qtable)
        agent.demo()
    else:
        print('Mode {} is invalid.'.format(mode))
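
The 'hyper' branches in Examples 1 and 3 repeat the same nested grid search. Below is a compact, behavior-equivalent sketch using itertools.product; agent_factory is a hypothetical callable standing in for the game-specific Agent construction:

import itertools

def grid_search(agent_factory, epsilons, learning_rates, n=10000):
    """Return (best_reward, best_epsilon, best_learning_rate)."""
    best_reward, best_e, best_lr = 0.0, 0.0, 0.0
    for epsilon, learning_rate in itertools.product(epsilons, learning_rates):
        agent = agent_factory(epsilon, learning_rate)
        history = agent.train(n, history=[])
        total = history[1][-1]  # final cumulative reward
        if total > best_reward:
            best_reward, best_e, best_lr = total, epsilon, learning_rate
    return best_reward, best_e, best_lr

For Chomp this would be called as grid_search(lambda e, lr: Agent(game, qtable={}, player='X', epsilon=e, learning_rate=lr), epsilons, learning_rates).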
Example 4
def play_connectfour(mode):
    """Start Connect Four game and training."""
    print('==CONNECT FOUR==')
    game = ConnectFour()

    if mode == 'train':
        agent = Agent(game)
        history = agent.train(10000)
        print('After 10000 Episodes')

        # Plot Reward Stats
        rfig, raxs = plt.subplots(nrows=3, ncols=1)
        rax_reward1 = raxs[0]
        rax_reward1.grid()
        rax_reward2 = raxs[1]
        rax_reward2.grid()
        rax_reward3 = raxs[2]
        rax_reward3.grid()

        rax_reward1.plot(history[0][:100], history[1][:100])
        rax_reward1.set(ylabel='Cumulative Reward', title='Connect Four Cumulative Reward (3 Column State)')

        rax_reward2.plot(history[0][:1000], history[1][:1000], color='g')
        rax_reward2.set(ylabel='Cumulative Reward')

        rax_reward3.plot(history[0][:10000], history[1][:10000], color='r')
        rax_reward3.set(xlabel='Episode', ylabel='Cumulative Reward')

        rfig.savefig('connectfour_reward.png')

        # Plot Qtable Memory Usage Stats
        memfig, memaxs = plt.subplots(nrows=3, ncols=1)
        memax_reward1 = memaxs[0]
        memax_reward1.grid()
        memax_reward2 = memaxs[1]
        memax_reward2.grid()
        memax_reward3 = memaxs[2]
        memax_reward3.grid()

        memax_reward1.plot(history[0][:100], history[2][:100])
        memax_reward1.set(ylabel='Size (KB)', title='Connect Four QTable Size (3 Column State)')

        memax_reward2.plot(history[0][:1000], history[2][:1000], color='g')
        memax_reward2.set(ylabel='Size (KB)')

        memax_reward3.plot(history[0][:10000], history[2][:10000], color='r')
        memax_reward3.set(xlabel='Episode', ylabel='Size (KB)')

        memfig.savefig('connectfour_memory.png')
        plt.show()

        agent.save_values(path='data/connectfour_qtable.json')
        agent.demo()

    elif mode == 'demo':
        with open('data/connectfour_qtable.json') as f:
            qtable = json.load(f)
        agent = Agent(game, qtable=qtable)
        agent.demo()

    else:
        print('Mode {} is invalid.'.format(mode))
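
Examples 2-4 repeat the same three-panel plotting block for both cumulative reward and Q-table size. A sketch of a shared helper that produces the 100/1,000/10,000-episode views; the name plot_history and the spans argument are illustrative, not part of the original code:

import matplotlib.pyplot as plt

def plot_history(episodes, values, ylabel, title, path, spans=(100, 1000, 10000)):
    # One stacked panel per zoom level, mirroring the repeated blocks above.
    fig, axs = plt.subplots(nrows=len(spans), ncols=1)
    for ax, span, color in zip(axs, spans, (None, 'g', 'r')):
        ax.grid()
        ax.plot(episodes[:span], values[:span], color=color)
        ax.set(ylabel=ylabel)
    axs[0].set(title=title)
    axs[-1].set(xlabel='Episode')
    fig.savefig(path)

With this helper, plot_history(history[0], history[1], 'Cumulative Reward', 'Connect Four Cumulative Reward (3 Column State)', 'connectfour_reward.png') would replace the first block above.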
Example 5
                        use_nearby_obs=True)

    train_env_sets = [env1, env2]
    # Scale the maximum possible returns to the same level to balance learning
    rew_scale_factor = [1.2, 1.4]

    agent = Agent('GridWorldKey',
                  envs=train_env_sets,
                  rew_scale=rew_scale_factor,
                  batch_size=20,
                  n_ways=0)

    print('----Initializing')
    agent.init_scaler(10)
    print("----Learning L0 on env1 and env2")
    agent.train(n_iter=3000, L='0', mask=[1, 1])

    print("----Learning L11 on env1")
    agent.add_module(L_name='11')
    agent.train(n_iter=1500, L='11', mask=[1, 0])

    print("----Learning L12 on env2")
    print("\tLearning combine-weights on env2 for several iteration")
    agent.train(n_iter=500, L='11', mask=[0, 1], trainWeight=True)
    print("\tLearning L12 module")
    agent.add_module(L_name='12')
    agent.train(n_iter=3500, L='12', mask=[0, 1])

    print('----Learning L13 on unseen env3')
    agent.addenv(env=env3, rew_scale=1.6)
    print('\tLearning combine-weights on env3 for several iterations')
Example 6
        idx_test = idx_test.to(args.device1)

    # Train model
    t_total = time.time()
    model = SGCNModel(K=2,
                      input_size=100,
                      hidden_size=args.hidden,
                      class_num=18,
                      pre_proj_num=2,
                      after_proj_num=2).to(args.device1)
    model.load_state_dict(torch.load('./saved/gcn.pth'))
    # Optional sanity check: evaluate the pretrained GCN before RL training.
    # with torch.no_grad():
    #     logits = model(features, edge_index, edge_weight)
    #     print(count_acc(logits[:len(labels)], labels))
    env = GCNEnv(args,
                 model,
                 labels,
                 features.size(0),
                 features=features,
                 edge_index=edge_index,
                 edge_weight=edge_weight)
    target_dict = torch.arange(features.size(0))
    # ----------------- rl code ------------------ #
    agent = Agent(args, env, target_dict, features.size(0))
    agent.train()
    agent.eval()
    # ----------------- rl code ------------------ #

    print("Optimization Finished!")
    print("Total time elapsed: {:.4f}s".format(time.time() - t_total))