def play_hexus(mode, episodes, board_level):
    print('<><><><>HEXUS<><><><>')
    if mode == 'train':
        # Train agent to go first
        agent = Agent(Hexus, epsilon=5e-1, learning_rate=25e-2, board_level=board_level)
        n = episodes
        history = agent.train(n)
        print('After {} Episodes'.format(n))
    elif mode == 'hyper':
        # Hyper parameter optimization
        max_e = 0.0
        max_lr = 0.0
        max_reward = 0.0
        epsilons = [1e-1, 2e-1, 9e-2, 1e-2, 9e-3]
        learning_rates = [1e-1, 2e-1, 3e-1, 25e-2, 9e-2]
        for epsilon in epsilons:
            for learning_rate in learning_rates:
                agent = Agent(Hexus, player='B', epsilon=epsilon, learning_rate=learning_rate)
                n = 10000
                history = agent.train(n, history=[])
                total = history[1][len(history[1]) - 1]
                print(total)
                if total > max_reward:
                    max_reward = total
                    max_e = epsilon
                    max_lr = learning_rate
        print('Max e: {}'.format(max_e))
        print('Max lr: {}'.format(max_lr))
        print('Max reward: {}'.format(max_reward))
    else:
        print('Mode {} is invalid.'.format(mode))
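# NOTE: the 'hyper' branches in play_hexus (above) and play_chomp (below) repeat the same
# epsilon/learning-rate grid search. The helper below is a hypothetical sketch of how that
# loop could be factored out; 'grid_search' and its 'make_agent' callback are not part of
# the original code and only illustrate the pattern under the Agent API used above.
def grid_search(make_agent, epsilons, learning_rates, n=10000):
    """Return (best_epsilon, best_lr, best_reward) for a simple grid search."""
    best_e, best_lr, best_reward = 0.0, 0.0, 0.0
    for epsilon in epsilons:
        for learning_rate in learning_rates:
            agent = make_agent(epsilon, learning_rate)
            history = agent.train(n, history=[])
            total = history[1][-1]  # final cumulative reward of the run
            if total > best_reward:
                best_e, best_lr, best_reward = epsilon, learning_rate, total
    return best_e, best_lr, best_reward

# Example (hypothetical) usage:
# best_e, best_lr, best_reward = grid_search(
#     lambda e, lr: Agent(Hexus, player='B', epsilon=e, learning_rate=lr),
#     epsilons=[1e-1, 2e-1, 9e-2, 1e-2, 9e-3],
#     learning_rates=[1e-1, 2e-1, 3e-1, 25e-2, 9e-2])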
def play_tictactoe(mode):
    """Start TicTacToe game with RL Agent."""
    print('==TIC TAC TOE==')
    game = TicTacToe()
    if mode == 'train':
        agent = Agent(game)
        history = agent.train(10000)
        print('After 10000 Episodes')
        # Plot Reward Stats
        rfig, raxs = plt.subplots(nrows=3, ncols=1)
        rax_reward1 = raxs[0]
        rax_reward1.grid()
        rax_reward2 = raxs[1]
        rax_reward2.grid()
        rax_reward3 = raxs[2]
        rax_reward3.grid()
        rax_reward1.plot(history[0][:100], history[1][:100])
        rax_reward1.set(ylabel='Cumulative Reward', title='Tic Tac Toe Cumulative Reward Episodes')
        rax_reward2.plot(history[0][:1000], history[1][:1000], color='g')
        rax_reward2.set(ylabel='Cumulative Reward')
        rax_reward3.plot(history[0][:10000], history[1][:10000], color='r')
        rax_reward3.set(xlabel='Episode', ylabel='Cumulative Reward')
        rfig.savefig('tictactoe_reward.png')
        # Plot Qtable Memory Usage Stats
        memfig, memaxs = plt.subplots(nrows=3, ncols=1)
        memax_reward1 = memaxs[0]
        memax_reward1.grid()
        memax_reward2 = memaxs[1]
        memax_reward2.grid()
        memax_reward3 = memaxs[2]
        memax_reward3.grid()
        memax_reward1.plot(history[0][:100], history[2][:100])
        memax_reward1.set(ylabel='Size (KB)', title='Tic Tac Toe QTable Size Episodes')
        memax_reward2.plot(history[0][:1000], history[2][:1000], color='g')
        memax_reward2.set(ylabel='Size (KB)')
        memax_reward3.plot(history[0][:10000], history[2][:10000], color='r')
        memax_reward3.set(xlabel='Episode', ylabel='Size (KB)')
        memfig.savefig('tictactoe_memory.png')
        plt.show()
        agent.save_values(path='data/tictactoe_qtable.json')
        agent.stats()
        agent.demo()
    elif mode == 'demo':
        qtable = json.load(open('data/tictactoe_qtable.json'))
        agent = Agent(game, qtable=qtable)
        agent.demo()
    else:
        print('Mode {} is invalid.'.format(mode))
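# NOTE: play_tictactoe (above), play_chomp and play_connectfour (below) all build the same
# three-panel matplotlib figure over the first 100 / 1,000 / all episodes. The helper below
# is a hypothetical refactor sketch of that repeated pattern; it is not part of the original
# code, and the name 'plot_history' is an assumption.
def plot_history(x, y, title, ylabel, path):
    """Plot a history series at three zoom levels (100, 1000, all episodes) and save it."""
    fig, axs = plt.subplots(nrows=3, ncols=1)
    for ax, limit, color in zip(axs, (100, 1000, len(x)), (None, 'g', 'r')):
        ax.grid()
        if color is None:
            ax.plot(x[:limit], y[:limit])  # default color for the first panel
        else:
            ax.plot(x[:limit], y[:limit], color=color)
        ax.set(ylabel=ylabel)
    axs[0].set(title=title)
    axs[2].set(xlabel='Episode')
    fig.savefig(path)

# Example (hypothetical) usage inside a 'train' branch:
# plot_history(history[0], history[1], 'Tic Tac Toe Cumulative Reward', 'Cumulative Reward',
#              'tictactoe_reward.png')
# plot_history(history[0], history[2], 'Tic Tac Toe QTable Size', 'Size (KB)',
#              'tictactoe_memory.png')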
def play_chomp(mode):
    """Start Chomp game and training."""
    print('=====CHOMP=====')
    # A square board has a known optimal strategy, which makes it easy to sanity-check that the agent is learning.
    game = Chomp(rows=4, cols=4)
    if mode == 'train':
        # Train agent to go first
        agent = Agent(game, epsilon=9e-3, learning_rate=25e-2)
        n = 10000
        history = agent.train(n)
        print('After {} Episodes'.format(n))
        # Plot Reward Stats
        rfig, raxs = plt.subplots(nrows=3, ncols=1)
        rax_reward1 = raxs[0]
        rax_reward1.grid()
        rax_reward2 = raxs[1]
        rax_reward2.grid()
        rax_reward3 = raxs[2]
        rax_reward3.grid()
        rax_reward1.plot(history[0][:100], history[1][:100])
        rax_reward1.set(ylabel='Cumulative Reward', title='Chomp 4x4 Cumulative Reward')
        rax_reward2.plot(history[0][:1000], history[1][:1000], color='g')
        rax_reward2.set(ylabel='Cumulative Reward')
        rax_reward3.plot(history[0][:n], history[1][:n], color='r')
        rax_reward3.set(xlabel='Episode', ylabel='Cumulative Reward')
        rfig.savefig('chomp_reward.png')
        # Plot Qtable Memory Usage Stats
        memfig, memaxs = plt.subplots(nrows=3, ncols=1)
        memax_reward1 = memaxs[0]
        memax_reward1.grid()
        memax_reward2 = memaxs[1]
        memax_reward2.grid()
        memax_reward3 = memaxs[2]
        memax_reward3.grid()
        memax_reward1.plot(history[0][:100], history[2][:100])
        memax_reward1.set(ylabel='Size (KB)', title='Chomp 4x4 QTable Size')
        memax_reward2.plot(history[0][:1000], history[2][:1000], color='g')
        memax_reward2.set(ylabel='Size (KB)')
        memax_reward3.plot(history[0][:n], history[2][:n], color='r')
        memax_reward3.set(xlabel='Episode', ylabel='Size (KB)')
        plt.show()
        agent.save_values(path='data/chomp_qtable.json')
        agent.demo()
    elif mode == 'hyper':
        # Hyper parameter optimization
        max_e = 0.0
        max_lr = 0.0
        max_reward = 0.0
        epsilons = [1e-1, 2e-1, 9e-2, 1e-2, 9e-3]
        learning_rates = [1e-1, 2e-1, 3e-1, 25e-2, 9e-2]
        for epsilon in epsilons:
            for learning_rate in learning_rates:
                agent = Agent(game, qtable={}, player='X', epsilon=epsilon, learning_rate=learning_rate)
                n = 10000
                history = agent.train(n, history=[])
                total = history[1][len(history[1]) - 1]
                print(total)
                if total > max_reward:
                    max_reward = total
                    max_e = epsilon
                    max_lr = learning_rate
        print('Max e: {}'.format(max_e))
        print('Max lr: {}'.format(max_lr))
        print('Max reward: {}'.format(max_reward))
    elif mode == 'demo':
        qtable = json.load(open('data/chomp_qtable.json'))
        agent = Agent(game, qtable=qtable)
        agent.demo()
    else:
        print('Mode {} is invalid.'.format(mode))
def play_connectfour(mode):
    """Start Connect Four game and training."""
    print('==CONNECT FOUR==')
    game = ConnectFour()
    if mode == 'train':
        agent = Agent(game)
        history = agent.train(10000)
        print('After 10000 Episodes')
        # Plot Reward Stats
        rfig, raxs = plt.subplots(nrows=3, ncols=1)
        rax_reward1 = raxs[0]
        rax_reward1.grid()
        rax_reward2 = raxs[1]
        rax_reward2.grid()
        rax_reward3 = raxs[2]
        rax_reward3.grid()
        rax_reward1.plot(history[0][:100], history[1][:100])
        rax_reward1.set(ylabel='Cumulative Reward', title='Connect Four Cumulative Reward (3 Column State)')
        rax_reward2.plot(history[0][:1000], history[1][:1000], color='g')
        rax_reward2.set(ylabel='Cumulative Reward')
        rax_reward3.plot(history[0][:10000], history[1][:10000], color='r')
        rax_reward3.set(xlabel='Episode', ylabel='Cumulative Reward')
        rfig.savefig('connectfour_reward.png')
        # Plot Qtable Memory Usage Stats
        memfig, memaxs = plt.subplots(nrows=3, ncols=1)
        memax_reward1 = memaxs[0]
        memax_reward1.grid()
        memax_reward2 = memaxs[1]
        memax_reward2.grid()
        memax_reward3 = memaxs[2]
        memax_reward3.grid()
        memax_reward1.plot(history[0][:100], history[2][:100])
        memax_reward1.set(ylabel='Size (KB)', title='Connect Four QTable Size (3 Column State)')
        memax_reward2.plot(history[0][:1000], history[2][:1000], color='g')
        memax_reward2.set(ylabel='Size (KB)')
        memax_reward3.plot(history[0][:10000], history[2][:10000], color='r')
        memax_reward3.set(xlabel='Episode', ylabel='Size (KB)')
        memfig.savefig('connectfour_memory.png')
        plt.show()
        agent.save_values(path='data/connectfour_qtable.json')
        agent.demo()
    elif mode == 'demo':
        qtable = json.load(open('data/connectfour_qtable.json'))
        agent = Agent(game, qtable=qtable)
        agent.demo()
    else:
        print('Mode {} is invalid.'.format(mode))
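# NOTE: a minimal, hypothetical command-line entry point for the play_* functions above.
# The original repository may wire these up differently; the CLI flags and defaults below
# are assumptions for illustration only.
def main():
    import argparse
    parser = argparse.ArgumentParser(description='Train or demo RL agents on simple games.')
    parser.add_argument('game', choices=['tictactoe', 'chomp', 'connectfour', 'hexus'])
    parser.add_argument('mode', choices=['train', 'demo', 'hyper'])
    parser.add_argument('--episodes', type=int, default=10000)    # only used by play_hexus
    parser.add_argument('--board-level', type=int, default=1)     # only used by play_hexus
    args = parser.parse_args()
    if args.game == 'tictactoe':
        play_tictactoe(args.mode)
    elif args.game == 'chomp':
        play_chomp(args.mode)
    elif args.game == 'connectfour':
        play_connectfour(args.mode)
    else:
        play_hexus(args.mode, args.episodes, args.board_level)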
                use_nearby_obs=True)
train_env_sets = [env1, env2]
# Scale the maximum possible returns to the same level to balance learning
rew_scale_factor = [1.2, 1.4]
agent = Agent('GridWorldKey', envs=train_env_sets, rew_scale=rew_scale_factor, batch_size=20, n_ways=0)
print('----Initializing')
agent.init_scaler(10)
print("----Learning L0 on env1 and env2")
agent.train(n_iter=3000, L='0', mask=[1, 1])
print("----Learning L11 on env1")
agent.add_module(L_name='11')
agent.train(n_iter=1500, L='11', mask=[1, 0])
print("----Learning L12 on env2")
print("\tLearning combine-weights on env2 for several iterations")
agent.train(n_iter=500, L='11', mask=[0, 1], trainWeight=True)
print("\tLearning L12 module")
agent.add_module(L_name='12')
agent.train(n_iter=3500, L='12', mask=[0, 1])
print('----Learning L13 on unseen env3')
agent.addenv(env=env3, rew_scale=1.6)
print('\tLearning combine-weights on env3 for several iterations')
idx_test = idx_test.to(args.device1)

# Train model
t_total = time.time()
model = SGCNModel(K=2, input_size=100, hidden_size=args.hidden, class_num=18,
                  pre_proj_num=2, after_proj_num=2).to(args.device1)
model.load_state_dict(torch.load('./saved/gcn.pth'))
# with torch.no_grad():
#     logits = model(features, edge_index, edge_weight)
#     print(count_acc(logits[:len(labels)], labels))

env = GCNEnv(args, model, labels, features.size(0), features=features,
             edge_index=edge_index, edge_weight=edge_weight)
target_dict = torch.arange(features.size(0))

# ----------------- rl code ------------------ #
agent = Agent(args, env, target_dict, features.size(0))
agent.train()
agent.eval()
# ----------------- rl code ------------------ #

print("Optimization Finished!")
print("Total time elapsed: {:.4f}s".format(time.time() - t_total))