def test():
    # saver = tf.train.Saver()
    # saver.restore(sess, tf.train.latest_checkpoint('chk/dqvrand'))
    e = 0
    epilog = []
    logs = []
    wins = []
    game = Game(verbose=False)
    while e <= 1000:
        e += 1
        if not game.game_over:
            # Player 1: random move.
            action = game.random_space()
            game.move(action, 1)
            game.step(1)
        if game.game_over:
            wins.append(game.game_over)
            log = game.setup()
            logs.append(log)
            if e % 100 == 0:
                # Rate both players every 100 episodes.
                win_p1, comp1, bloc1, win_p2, comp2, bloc2 = 0, 0, 0, 0, 0, 0
                c = Counter(wins)
                r = GameRate(verbose=False, list=logs, player=1, opponent=2)
                r2 = GameRate(verbose=False, list=logs, player=2, opponent=1)
                r.check_games()
                r2.check_games()
                win_p1 = c[1] / len(wins)
                print("P1 win percentage", win_p1)
                if r.completions + r.missed_completions > 0:
                    comp1 = r.completions / (r.completions + r.missed_completions)
                print("P1 immediate completions", comp1)
                if r.blocks + r.missed_blocks > 0:
                    bloc1 = r.blocks / (r.blocks + r.missed_blocks)
                win_p2 = c[2] / len(wins)
                print("P2 win percentage", win_p2)
                if r2.completions + r2.missed_completions > 0:
                    comp2 = r2.completions / (r2.completions + r2.missed_completions)
                print("P2 immediate completions", comp2)
                if r2.blocks + r2.missed_blocks > 0:
                    bloc2 = r2.blocks / (r2.blocks + r2.missed_blocks)
                epilog.append([e, win_p1, comp1, bloc1, win_p2, comp2, bloc2])
            continue
        # Player 2: random move.
        move = game.random_space()
        game.move(move, 2)
        game.step(2)
    return epilog
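# The per-player ratio computation above repeats the same guarded division
# four times. A hypothetical helper (not part of the original code) that
# factors it out, assuming the GameRate attributes used above:
def rate_ratios(r):
    """Return (immediate-completion rate, block rate) for a checked GameRate."""
    comp = (r.completions / (r.completions + r.missed_completions)
            if (r.completions + r.missed_completions) > 0 else 0)
    bloc = (r.blocks / (r.blocks + r.missed_blocks)
            if (r.blocks + r.missed_blocks) > 0 else 0)
    return comp, bloc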
def test(mainQN, sess):
    saver = tf.train.Saver()
    saver.restore(sess, tf.train.latest_checkpoint('chk/dqvrand'))
    e = 0
    logs = []
    wins = []
    game = Game(verbose=False)
    while e <= 1000:
        e += 1
        if not game.game_over:
            # Player 1: greedy over the Q-network's output, restricted to
            # the available spaces.
            state = game.space
            feed = {mainQN.inputs_: state.reshape((1, *state.shape))}
            As = sess.run(mainQN.output, feed_dict=feed)
            avail = game.avail()
            availQ = {k: As[0][k] for k in avail}
            action = max(availQ, key=availQ.get)
            game.move(action, 1)
            game.step(1)
        if game.game_over:
            wins.append(game.game_over)
            log = game.setup()
            logs.append(log)
            continue
        # Player 2: random move.
        move = game.random_space()
        game.move(move, 2)
        game.step(2)
    win, comp, bloc = 0, 0, 0
    c = Counter(wins)
    r = GameRate(verbose=False, list=logs, player=1, opponent=2)
    r.check_games()
    win = c[1] / len(wins)
    print("win percentage", win)
    if (r.completions + r.missed_completions) > 0:
        comp = r.completions / (r.completions + r.missed_completions)
    print("immediate completions", comp)
    if (r.blocks + r.missed_blocks) > 0:
        bloc = r.blocks / (r.blocks + r.missed_blocks)
    print("blocks", bloc)
    if win == 0.0:
        # A zero win rate against a random opponent means something is badly
        # wrong; dump the raw results and abort.
        print(wins)
        exit(1)
    return win, comp, bloc
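# The argmax-over-available-spaces pattern above also appears in the
# actor-critic test harness below. A hypothetical helper (an assumption,
# not in the original code) that captures it:
def greedy_available_action(q_values, avail):
    """Return the available action with the highest predicted Q-value."""
    return max(avail, key=lambda a: q_values[0][a])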
# Network parameters
hidden_size = 200             # number of units in each Q-network hidden layer
learning_rate = 0.0001        # Q-network learning rate

# Memory parameters
memory_size = 10000           # memory capacity
batch_size = 50               # experience mini-batch size
pretrain_length = batch_size  # number of experiences to pretrain the memory

tf.reset_default_graph()
mainQN = QNetwork(name='main', hidden_size=hidden_size, learning_rate=learning_rate)
# p2QN = QNetwork(name='p2', hidden_size=hidden_size, learning_rate=learning_rate)

game = Game(verbose=False)
memory = Memory(max_size=memory_size)
saver = tf.train.Saver()

# Seed the game with one opening move per player.
action = game.random_space()
game.move(action, 1)
state, reward = game.step()
space = game.random_space()
game.move(space, 2)

# Fill the replay memory with random-play experiences.
for ii in range(pretrain_length):
    action = game.random_space()
    game.move(action, 1)
    next_state, reward = game.step()
    if game.game_over:
        # The episode ended, so there is no next state.
        next_state = np.zeros(state.shape)
    memory.add((state, action, reward, next_state))
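# A minimal sketch of the replay buffer the pretraining loop assumes: a
# bounded deque with uniform random sampling. The project's real Memory
# class may differ in details.
from collections import deque
import random

class Memory:
    def __init__(self, max_size=1000):
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        # Uniform sampling without replacement from the stored experiences.
        return random.sample(list(self.buffer), batch_size)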
def train(sess, scaler, a1, c1, a2, c2):
    game = Game(verbose=False)
    player1 = PlayerTrainer(actor=a1, critic=c1, buffersize=BUFFER_SIZE,
                            game=game, player=1, batch_size=MINIBATCH_SIZE,
                            gamma=GAMMA)
    player2 = PlayerTrainer(actor=a2, critic=c2, buffersize=BUFFER_SIZE,
                            game=game, player=2, batch_size=MINIBATCH_SIZE,
                            gamma=GAMMA)

    # Set up summary ops
    summary_ops, summary_vars = build_summaries()
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights
    a1.update_target_network()
    c1.update_target_network()
    a2.update_target_network()
    c2.update_target_network()

    episode = 0
    all_wins = []
    all_logs = []
    win_p1, comp1, bloc1 = 0, 0, 0
    win_p2, comp2, bloc2 = 0, 0, 0
    stat = []

    for i in range(MAX_EPISODES):
        episode += 1
        game.setup()
        state = game.space
        ep_reward = 0
        ep_reward2 = 0
        terminal = False
        for j in range(MAX_EP_STEPS):
            if not terminal:
                # Player 1: random play for the first 5000 episodes, then
                # noisy greedy moves from the actor.
                if episode < 5000:
                    move = game.random_space()
                    game.move(move, 1)
                    state, reward = game.step(player=1)
                else:
                    state, reward = player1.noisyMaxQMove()
                _, reward2 = game.step(player=2)
                ep_reward += reward
                ep_reward2 += reward2
                terminal = game.game_over
            if terminal:
                all_wins.append(game.game_over)
                log = game.setup()
                s = game.space
                all_logs.append(log)
                print(scaler, win_p1, comp1, bloc1, win_p2, comp2, bloc2,
                      " | Episode", i, ep_reward, ep_reward2)
                if episode % 1000 == 0:
                    # Evaluate both actors and plot the running statistics.
                    win_p1, comp1, bloc1, win_p2, comp2, bloc2 = test(sess, a1, a2)
                    stat.append([episode, win_p1, comp1, bloc1,
                                 win_p2, comp2, bloc2])
                    df = pd.DataFrame(stat)
                    print(df)
                    plt.close('all')
                    plt.plot(df[0], df[1], label="P1 wins")
                    plt.plot(df[0], df[2], label="P1 imm. compl.")
                    plt.plot(df[0], df[3], label="P1 imm. blocks")
                    plt.plot(df[0], df[4], label="P2 wins")
                    plt.plot(df[0], df[5], label="P2 imm. compl.")
                    plt.plot(df[0], df[6], label="P2 imm. blocks")
                    plt.legend()
                    plt.ylim(0, 1)
                    plt.ylabel('percent')
                    plt.show(block=False)
                    # Possible early-stopping criterion (disabled):
                    # if (comp1 > .75 and win_p1 > .9) or episode >= 200000 \
                    #         or (episode == 30000 and (win_p1 < .50 or comp1 < .1)):
                    #     return win_p1, comp1, episode, stat, win_p2, comp2
                break
            else:
                # Player 2: random play from episode 5000 to 10000, then
                # noisy greedy moves.
                if 5000 <= episode < 10000:
                    move = game.random_space()
                    game.move(move, 2)
                    state, reward2 = game.step(player=2)
                else:
                    state, reward2 = player2.noisyMaxQMove()
                _, reward = game.step(player=1)
                terminal = game.game_over
                ep_reward2 += reward2
                ep_reward += reward
    return stat
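# A minimal sketch of the build_summaries() helper used above, in the TF1
# summary style this script targets; the project's real helper may track
# different values.
def build_summaries():
    episode_reward = tf.Variable(0.)
    tf.summary.scalar("Reward", episode_reward)
    summary_vars = [episode_reward]
    summary_ops = tf.summary.merge_all()
    return summary_ops, summary_vars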
def test(sess, actor1, actor2):
    game = Game(verbose=False)
    logs = []
    wins = []
    for i in range(TEST_EPISODES):
        game.setup()
        s = game.space
        terminal = False
        for j in range(MAX_EP_STEPS):
            if not terminal:
                # Player 1: greedy over actor1's output, restricted to the
                # available spaces.
                a = actor1.predict(np.reshape(game.space, (1, *s.shape)))
                avail = game.avail()
                availQ = {k: a[0][k] for k in avail}
                action = max(availQ, key=availQ.get)
                game.move(action, 1)
                s2, r = game.step(1)
                terminal = game.game_over
            if terminal:
                wins.append(game.game_over)
                log = game.setup()
                logs.append(log)
                s = game.space
                break
            else:
                # Player 2: greedy over actor2's output.
                a = actor2.predict(np.reshape(game.space, (1, *s.shape)))
                avail = game.avail()
                availQ = {k: a[0][k] for k in avail}
                action = max(availQ, key=availQ.get)
                game.move(action, 2)
                s2, r = game.step(2)
                terminal = game.game_over
    c = Counter(wins)
    r = GameRate(verbose=False, list=logs, player=1, opponent=2)
    r2 = GameRate(verbose=False, list=logs, player=2, opponent=1)
    comp1, bloc1, comp2, bloc2 = 0, 0, 0, 0
    r.check_games()
    r2.check_games()
    win_p1 = c[1] / (TEST_EPISODES - 1)
    print("P1 win percentage", win_p1)
    if r.completions + r.missed_completions > 0:
        comp1 = r.completions / (r.completions + r.missed_completions)
    print("P1 immediate completions", comp1)
    if r.blocks + r.missed_blocks > 0:
        bloc1 = r.blocks / (r.blocks + r.missed_blocks)
    win_p2 = c[2] / (TEST_EPISODES - 1)
    print("P2 win percentage", win_p2)
    if r2.completions + r2.missed_completions > 0:
        comp2 = r2.completions / (r2.completions + r2.missed_completions)
    print("P2 immediate completions", comp2)
    if r2.blocks + r2.missed_blocks > 0:
        bloc2 = r2.blocks / (r2.blocks + r2.missed_blocks)
    return win_p1, comp1, bloc1, win_p2, comp2, bloc2
def train(sess, a1, c1, scaler):
    game = Game(verbose=False)
    player1 = PlayerTrainer(actor=a1, critic=c1, buffersize=BUFFER_SIZE,
                            game=game, player=1, batch_size=MINIBATCH_SIZE,
                            gamma=GAMMA)
    sess.run(tf.global_variables_initializer())

    # Initialize target network weights
    a1.update_target_network()
    c1.update_target_network()

    episode = 0
    all_wins = []
    all_logs = []
    win_p1, comp1, bloc1 = 0, 0, 0
    win_p2, comp2, bloc2 = 0, 0, 0
    stat = []

    for i in range(MAX_EPISODES):
        episode += 1
        game.setup()
        ep_reward = 0
        ep_reward2 = 0
        reward2 = 0
        terminal = False
        for j in range(MAX_EP_STEPS):
            if not terminal:
                # Player 1: random play for the first 7500 episodes, then
                # noisy greedy moves from the actor.
                if episode < 7500:
                    move = game.random_space()
                    game.move(move, 1)
                    state, reward = game.step(player=1)
                else:
                    state, reward = player1.noisyMaxQMove()
                _, reward2 = game.step(player=2)
                ep_reward += reward
                ep_reward2 += reward2
                terminal = game.game_over
            if terminal:
                all_wins.append(game.game_over)
                log = game.setup()
                s = game.space
                all_logs.append(log)
                print(scaler, win_p1, comp1, bloc1, win_p2, comp2, bloc2,
                      " | Episode", i, ep_reward, ep_reward2)
                if episode % 1000 == 0:
                    # Evaluate the trained actor and plot the running statistics.
                    win_p1, comp1, bloc1, win_p2, comp2, bloc2 = test(sess, a1)
                    stat.append([episode, win_p1, comp1, bloc1,
                                 win_p2, comp2, bloc2])
                    df = pd.DataFrame(stat)
                    print(df)
                    plt.close('all')
                    plt.plot(df[0], df[1], label="P1 wins")
                    plt.plot(df[0], df[2], label="P1 imm. compl.")
                    plt.plot(df[0], df[3], label="P1 imm. blocks")
                    plt.plot(df[0], df[4], label="P2 wins")
                    plt.plot(df[0], df[5], label="P2 imm. compl.")
                    plt.plot(df[0], df[6], label="P2 imm. blocks")
                    plt.legend()
                    plt.ylim(0, 1)
                    plt.ylabel('percent')
                    plt.show(block=False)
                break
            else:
                # Player 2: always random.
                move = game.random_space()
                game.move(move, 2)
                _, reward = game.step(player=1)
                terminal = game.game_over
                ep_reward2 += reward2
                ep_reward += reward
    return stat
def train(mainQN, sess):
    winp, comp, blocp = 0, 0, 0
    saver = tf.train.Saver()
    game = Game(verbose=False)
    wins = []
    logs = []
    epi_log = []
    trainer = QPlayerTrainer(qnet=mainQN, buffersize=memory_size, game=game,
                             player=1, batch_size=batch_size, gamma=gamma,
                             sess=sess)

    # Now train with experiences
    rewards_list = []
    loss = False
    with tf.Session() as sess:  # this local session shadows the sess argument
        # Initialize variables
        sess.run(tf.global_variables_initializer())
        step = 0
        state = game.space  # initialize before the loop so state.shape is defined
        for ep in range(1, train_episodes + 1):
            total_reward = 0
            t = 0
            explore_p = 0
            while t < max_steps:
                if not game.game_over:
                    step += 1
                    # explore_p = explore_stop + (explore_start - explore_stop) \
                    #     * np.exp(-decay_rate * step)
                    # if explore_p > np.random.rand():
                    #     next_state, reward, loss = trainer.randomMove()
                    # else:
                    next_state, reward, loss = trainer.noisyMaxQMove()
                    total_reward += reward
                if game.game_over:
                    # The episode ends, so there is no next state.
                    next_state = np.zeros(state.shape)
                    t = max_steps
                    if loss:
                        print(winp, comp, blocp,
                              'Episode: {}'.format(ep),
                              'Total reward: {}'.format(total_reward),
                              'Training loss: {:.4f}'.format(loss), explore_p)
                    rewards_list.append((ep, total_reward))
                    wins.append(game.game_over)
                    log = game.setup()
                    logs.append(log)
                    if ep % 10000 == 0:
                        # Periodically checkpoint and evaluate.
                        time = str(localtime())
                        saver.save(sess, "chk/dqvrand/" + time + ".ckpt")
                        winp, comp, blocp = test(mainQN, sess)
                        epi_log.append([ep, winp, comp, blocp])
                    state = game.space
                else:
                    state = next_state
                    t += 1
                    # Player 2: random move.
                    space = game.random_space()
                    game.move(space, 2)
                    _, reward = game.step(player=2)
                    total_reward += reward
        time = str(localtime())
        saver.save(sess, "chk/dqvrand/" + time + ".ckpt")
    with open('data/epi2', 'wb') as f:
        p.dump(epi_log, f)
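# The commented-out schedule above is a standard exponential epsilon decay.
# A sketch with assumed hyperparameters (explore_start, explore_stop and
# decay_rate are not defined in this excerpt):
import numpy as np

explore_start = 1.0   # assumed initial exploration probability
explore_stop = 0.01   # assumed exploration floor
decay_rate = 0.0001   # assumed decay constant

def explore_probability(step):
    """Anneal epsilon from explore_start toward explore_stop over steps."""
    return explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * step)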
from collections import Counter

from common.game import Game
from common.rate import GameRate  # assumed import path for GameRate

game = Game(verbose=False)
game.setup()
logs = []
wins = []
test_episodes = 1000

for i in range(test_episodes):
    print(i)
    while not game.game_over:
        # Player 1: random move.
        move = game.random_space()
        game.move(move, 1)
        game.step()
        if not game.game_over:
            # Player 2: random move.
            move = game.random_space()
            game.move(move, 2)
            game.step()
    wins.append(game.game_over)
    log = game.setup()
    logs.append(log)

r = GameRate(verbose=False, list=logs)
r.check_games(1)
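# To rate this random baseline from both players' perspectives, the logs
# can be checked twice, mirroring the test harnesses above (a sketch
# assuming the same GameRate signature used there):
c = Counter(wins)
r1 = GameRate(verbose=False, list=logs, player=1, opponent=2)
r2 = GameRate(verbose=False, list=logs, player=2, opponent=1)
r1.check_games()
r2.check_games()
print("P1 win rate", c[1] / len(wins))
print("P2 win rate", c[2] / len(wins))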