def play_against_random(memory, episodes=10000, train=True):
    """Play (and optionally train) a tabular QPlayer against a random opponent."""
    game = Connect4Game().copy_state()
    epsilon = 0 if not train else 1
    p = QPlayer(memory, epsilon=epsilon)
    # An opponent backed by an empty memory falls back to random moves.
    sync = Synchronizer(game, p, QPlayer(Memory(name="data/empty.json")))
    win_rate = 0
    t = 0
    while t < episodes:
        winner, board_state, moves = sync.play()
        if board_state != 0:  # not a draw
            if winner[1] == p:
                win_rate += 1
            if train:
                memory.update(1, moves[winner[0]])      # reinforce the winner's moves
                memory.update(0, moves[winner[0] - 1])  # discourage the loser's moves
        game.reset_game()
        t += 1
        if train:
            p.update_epsilon(episodes, t)
        if t % 10000 == 0:
            print("{}: winning {:.2f}% of games".format(t, win_rate / t * 100))
        if t % 1000000 == 0:
            memory.save("data/data_r.json")
    print("WON: {:.2f}% of games".format(win_rate / episodes * 100))

def train_against_q(qplayer, episodes=10000):
    """Train a DQN Agent against a given tabular QPlayer opponent."""
    game = Connect4Game().copy_state()
    p = Agent()
    sync = Synchronizer(game, p, qplayer)
    batch_size = 32
    update_target_ctr = 0
    for t in range(episodes):
        winner, board_state, moves = sync.play()
        r = 0
        if board_state != 0:  # not a draw
            r = 1
        feed_moves(moves[winner[0]], r, p)       # winner's moves get reward r
        feed_moves(moves[winner[0] - 1], -r, p)  # loser's moves get reward -r
        p.update_epsilon(episodes, t)
        game.reset_game()
        if len(p.memory) > batch_size:
            p.replay(batch_size)
            update_target_ctr += 1
            if update_target_ctr == 10:
                p.update_target_model()
                update_target_ctr = 0
        if t % 10000 == 0:
            print(t)
    p.save("data/dqn_weights")

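# Hedged usage sketch (not from the original source): train_against_q expects an
# already-trained tabular opponent. The helper name and the "data/data_s.json"
# memory file are assumptions based on the paths used elsewhere in this module.
def example_train_dqn_against_q(episodes=50000):
    opponent = QPlayer(Memory(name="data/data_s.json"), epsilon=0)  # greedy opponent
    train_against_q(opponent, episodes=episodes)
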
def play_dqn_against_random(episodes=100):
    """Evaluate a trained DQN Agent (no exploration) against a random opponent."""
    game = Connect4Game().copy_state()
    a = Agent(e_init=0)
    a.load("data/dqn_weights")
    sync = Synchronizer(game, a, QPlayer(Memory(name="data/empty.json")))
    win_rate = 0
    t = 0
    while t < episodes:
        winner, board_state, moves = sync.play()
        if board_state != 0 and winner[1] == a:
            win_rate += 1
        game.reset_game()
        t += 1
    print("WON: {}/{} games".format(win_rate, episodes))

def train_self_play(memory: Memory, episodes=10000):
    """Train a QPlayer by playing both sides of the game against itself."""
    game = Connect4Game().copy_state()
    p1 = QPlayer(memory)
    sync = Synchronizer(game, p1, p1)
    t = 0
    while t < episodes:
        winner, board_state, moves = sync.play()
        if board_state != 0:  # not a draw
            memory.update(1, moves[winner[0]])      # reinforce the winning side's moves
            memory.update(0, moves[winner[0] - 1])  # discourage the losing side's moves
        game.reset_game()
        t += 1
        p1.update_epsilon(episodes, t)
        if t % 10000 == 0:
            print(t)
        if t % 1000000 == 0:
            memory.save("data/data_s.json")

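# Hedged driver sketch (assumption, not in the original source): wire the tabular
# routines together by training via self-play, saving the memory, and then
# evaluating against random play with training disabled.
def example_tabular_pipeline():
    memory = Memory(name="data/data_s.json")
    train_self_play(memory, episodes=1000000)
    memory.save("data/data_s.json")
    play_against_random(memory, episodes=10000, train=False)
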
def play_human(p: Player, callback=None):
    game = Connect4Game()
    view = Connect4Viewer(game=game)
    view.initialize()
    sync = Synchronizer(game, p, HumanPlayer())
    while True:
        winner, board_state, moves = sync.play(True)
        if callback:
            callback(winner, board_state, moves, p)
        while True:
            event = pygame.event.wait()
            if event.type == pygame.QUIT:
                pygame.quit()
                return
            if event.type == pygame.MOUSEBUTTONUP and event.button == 1:
                game.reset_game()
                break

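# Hedged usage sketch (assumption): open the pygame viewer and let a greedy tabular
# player face a human. Clicking after a finished game restarts; closing the window
# exits. The memory file name is a placeholder.
def example_play_human():
    play_human(QPlayer(Memory(name="data/data_s.json"), epsilon=0))
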
def train_against_fixed(memory: Memory, episodes=10000):
    """Train a QPlayer against a periodically refreshed frozen copy of its own memory."""
    game = Connect4Game().copy_state()
    p1 = QPlayer(memory, epsilon=0.2)
    t = 0
    while t < episodes:
        # Deep-copy the current memory so the opponent stays fixed for this batch.
        p2 = QPlayer(pickle.loads(pickle.dumps(memory)), epsilon=0.2)
        sync = Synchronizer(game, p1, p2)
        for i in range(t, t + 101):
            winner, board_state, moves = sync.play()
            if board_state != 0 and winner[1] == p1:
                memory.update(1, moves[winner[0]])
                memory.update(0, moves[winner[0] - 1])
            game.reset_game()
            t = i
            if t % 10000 == 0:
                print(t)
            if t % 1000000 == 0:
                memory.save("data/data_f.json")

def arena(players):
    game = Connect4Game().copy_state()
    scores = [[] for _ in range(len(players))]
    draws = [0] * len(players)
    for i in range(len(players) - 1):
        p1 = players[i]
        for j in range(i + 1, len(players)):
            p2 = players[j]
            sync = Synchronizer(game, p1, p2)
            wins = [0, 0]
            for _ in range(100):
                winner, board_state, moves = sync.play()
                if board_state != 0:
                    if winner[1] == p1:
                        wins[0] += 1
                    if winner[1] == p2:
                        wins[1] += 1
                else:
                    draws[i] += 1
                    draws[j] += 1
                game.reset_game()
            scores[i].append(wins[0])
            scores[j].append(wins[1])
    return scores, draws

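# Hedged evaluation sketch (assumption): arena() returns, per player, a list of win
# counts against each opponent plus a per-player draw total. The memory file names
# below are placeholders taken from the save paths used above.
def example_arena():
    players = [
        QPlayer(Memory(name="data/data_r.json"), epsilon=0),
        QPlayer(Memory(name="data/data_s.json"), epsilon=0),
        QPlayer(Memory(name="data/data_f.json"), epsilon=0),
    ]
    scores, draws = arena(players)
    for i, (wins, d) in enumerate(zip(scores, draws)):
        print("player {}: wins per opponent {}, draws {}".format(i, wins, d))
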
def main(unused_argv):
    ''' check paths '''
    if FLAGS.data_dir == '' or not os.path.exists(FLAGS.data_dir):
        raise ValueError('invalid data directory {}'.format(FLAGS.data_dir))
    if FLAGS.output_dir == '':
        raise ValueError('invalid output directory {}'.format(FLAGS.output_dir))
    elif not os.path.exists(FLAGS.output_dir):
        os.makedirs(FLAGS.output_dir)
    event_log_dir = os.path.join(FLAGS.output_dir, '')
    checkpoint_path = os.path.join(FLAGS.output_dir, 'model.ckpt')

    ''' setup summaries '''
    summ = Summaries()

    ''' setup the game environment '''
    filenames_train = glob.glob(
        os.path.join(FLAGS.data_dir, 'train-{}'.format(FLAGS.sampling_rate), '*.mat'))
    filenames_val = glob.glob(
        os.path.join(FLAGS.data_dir, 'val-{}'.format(FLAGS.sampling_rate), '*.mat'))
    game_env_train = Env(decay=FLAGS.decay)
    game_env_val = Env(decay=FLAGS.decay)
    game_actions = list(game_env_train.actions.keys())

    ''' setup the transition table for experience replay '''
    stateDim = [FLAGS.num_chans, FLAGS.num_points]
    transition_args = {
        'batchSize': FLAGS.batch_size,
        'stateDim': stateDim,
        'numActions': len(game_actions),
        'maxSize': FLAGS.replay_memory,
    }
    transitions = TransitionMemory(transition_args)

    ''' setup agent '''
    s_placeholder = tf.placeholder(tf.float32, [FLAGS.batch_size] + stateDim, 's_placeholder')
    s2_placeholder = tf.placeholder(tf.float32, [FLAGS.batch_size] + stateDim, 's2_placeholder')
    a_placeholder = tf.placeholder(tf.int32, [FLAGS.batch_size], 'a_placeholder')
    r_placeholder = tf.placeholder(tf.float32, [FLAGS.batch_size], 'r_placeholder')
    pcont_t = tf.constant(FLAGS.discount, tf.float32, [FLAGS.batch_size])

    network = Model(FLAGS.batch_size, len(game_actions), FLAGS.num_chans, FLAGS.sampling_rate,
                    FLAGS.num_filters, FLAGS.num_recurs, FLAGS.pooling_stride, name="network")
    target_network = Model(FLAGS.batch_size, len(game_actions), FLAGS.num_chans, FLAGS.sampling_rate,
                           FLAGS.num_filters, FLAGS.num_recurs, FLAGS.pooling_stride, name="target_n")

    q = network(s_placeholder)
    q2 = target_network(s2_placeholder)
    q_selector = network(s2_placeholder)
    loss, q_learning = trfl.double_qlearning(q, a_placeholder, r_placeholder, pcont_t, q2, q_selector)

    synchronizer = Synchronizer(network, target_network)
    synchronize_ops = synchronizer()
    training_variables = network.variables

    opt = Adam(FLAGS.learning_rate,
               lr_decay=FLAGS.lr_decay,
               lr_decay_steps=FLAGS.lr_decay_steps,
               lr_decay_factor=FLAGS.lr_decay_factor,
               clip=True)
    reduced_loss = tf.reduce_mean(loss)
    graph_regularizers = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    total_regularization_loss = tf.reduce_sum(graph_regularizers)
    total_loss = reduced_loss + total_regularization_loss
    update_op = opt(total_loss, var_list=training_variables)
    summ_loss_op = tf.summary.scalar('loss', total_loss)

    state_placeholder = tf.placeholder(tf.float32, [1] + stateDim, 'state_placeholder')
    decayed_ep_placeholder = tf.placeholder(tf.float32, [], 'decayed_ep_placeholder')
    action_tensor_egreedy = eGreedy(state_placeholder, network, len(game_actions),
                                    decayed_ep_placeholder, FLAGS.debug)
    action_tensor_greedy = greedy(state_placeholder, network)

    ''' setup the training summaries '''
    episode_reward_placeholder = tf.placeholder(tf.float32, [], "episode_reward_placeholder")
    average_reward_placeholder = tf.placeholder(tf.float32, [], "average_reward_placeholder")
    summ.register('train', 'episode_reward_train', episode_reward_placeholder)
    summ.register('train', 'average_reward_train', average_reward_placeholder)
    summ.register('val', 'episode_reward_val', episode_reward_placeholder)
    summ.register('val', 'average_reward_val', average_reward_placeholder)

    total_reward_train = 0
    average_reward_train = 0
    total_reward_val = 0
    average_reward_val = 0

    ''' gather summary operators '''
    train_summ_op = summ('train')
    val_summ_op = summ('val')

    ''' setup the training process '''
    transitions.empty()

    writer = tf.summary.FileWriter(event_log_dir, tf.get_default_graph())
    saver = tf.train.Saver(training_variables)

    config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    assert (FLAGS.gpus != ''), 'invalid GPU specification'
    config.gpu_options.visible_device_list = FLAGS.gpus

    with tf.Session(config=config) as sess:
        sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
        val_step = 0
        for step in range(FLAGS.steps):
            print("Iteration: {}".format(step))
            game_env_train.reset(filenames_train[np.random.randint(0, len(filenames_train))])
            last_state = None
            last_state_assigned = False
            episode_reward = 0
            action_index = (len(game_actions) >> 2)
            for estep in range(FLAGS.eval_steps):
                state, reward, terminal = game_env_train.step(game_actions[action_index])
                # game over?
                if terminal:
                    break
                episode_reward += reward

                # Store transition (s, a, r, t)
                if last_state_assigned:
                    transitions.add(last_state, last_action, reward, last_terminal)

                # Select action: epsilon decays linearly with training progress, floored at 0.1
                # (FLAGS.testing_ep can be substituted for a fixed evaluation epsilon).
                decayed_ep = max(0.1, (FLAGS.steps - step) / FLAGS.steps * FLAGS.ep)
                if not terminal:
                    action_index = sess.run(action_tensor_egreedy,
                                            feed_dict={
                                                state_placeholder: np.expand_dims(state, axis=0),
                                                decayed_ep_placeholder: decayed_ep
                                            })
                else:
                    action_index = 0

                # Do some Q-learning updates
                if estep > FLAGS.learn_start and estep % FLAGS.update_freq == 0:
                    summ_str = None
                    for _ in range(FLAGS.n_replay):
                        if transitions.size > FLAGS.batch_size:
                            s, a, r, s2 = transitions.sample()
                            summ_str, _ = sess.run([summ_loss_op, update_op],
                                                   feed_dict={
                                                       s_placeholder: s,
                                                       a_placeholder: a,
                                                       r_placeholder: r,
                                                       s2_placeholder: s2
                                                   })
                    if summ_str:
                        writer.add_summary(summ_str, step * FLAGS.eval_steps + estep)

                last_state = state
                last_state_assigned = True
                last_action = action_index
                last_terminal = terminal

                if estep > FLAGS.learn_start and estep % FLAGS.target_q == 0:
                    # Copy the online network's parameters into the target network.
                    sess.run(synchronize_ops)

            total_reward_train += episode_reward
            average_reward_train = total_reward_train / (step + 1)
            train_summ_str = sess.run(train_summ_op,
                                      feed_dict={
                                          episode_reward_placeholder: episode_reward,
                                          average_reward_placeholder: average_reward_train
                                      })
            writer.add_summary(train_summ_str, step)

            if FLAGS.validation and step % FLAGS.validation_interval == 0:
                game_env_val.reset(filenames_val[0])
                episode_reward = 0
                count = 0
                action_index = (len(game_actions) >> 2)
                while True:
                    state, reward, terminal = game_env_val.step(game_actions[action_index])
                    # game over?
                    if terminal:
                        break
                    episode_reward += reward
                    if not terminal:
                        action_index = sess.run(action_tensor_greedy,
                                                feed_dict={
                                                    state_placeholder: np.expand_dims(state, axis=0)
                                                })
                        action_index = np.squeeze(action_index)
                    else:
                        action_index = 0
                    count += 1

                total_reward_val += episode_reward
                average_reward_val = total_reward_val / (val_step + 1)
                val_step += 1
                val_summ_str = sess.run(val_summ_op,
                                        feed_dict={
                                            episode_reward_placeholder: episode_reward,
                                            average_reward_placeholder: average_reward_val
                                        })
                writer.add_summary(val_summ_str, step)

        tf.logging.info('Saving model.')
        saver.save(sess, checkpoint_path)
        tf.logging.info('Training complete')
        writer.close()

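# Hedged entry-point sketch (assumption): main(unused_argv) follows the TF1
# tf.app.run convention, with FLAGS presumably defined elsewhere in this module.
if __name__ == '__main__':
    tf.app.run(main)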