def test_save_and_load_network(self):
    try:
        file_name = 'test.p'
        input_nodes = 20
        hidden_nodes = (50, 40, 30)
        _, _, variables1 = create_network(input_nodes, hidden_nodes)
        _, _, variables2 = create_network(input_nodes, hidden_nodes)

        with tf.Session() as session:
            session.run(tf.initialize_all_variables())

            save_network(session, variables1, file_name)
            load_network(session, variables2, file_name)

            for var1, var2 in zip(variables1, variables2):
                np.testing.assert_array_almost_equal(
                    session.run(var1), session.run(var2))
    finally:
        try:
            os.remove(file_name)
        except OSError:
            pass
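# For context only: a minimal sketch of what the save_network/load_network
# helpers exercised by this test could look like. It assumes they simply
# pickle the evaluated variable values to the given path (consistent with the
# '.p'/'.pickle' file names used in this project); the repository's actual
# implementation may differ.
import pickle

def save_network_sketch(session, variables, file_path):
    # evaluate every tf.Variable and pickle the resulting list of numpy arrays
    variable_values = session.run(variables)
    with open(file_path, 'wb') as f:
        pickle.dump(variable_values, f)

def load_network_sketch(session, variables, file_path):
    # unpickle the stored arrays and assign them back onto the variables
    with open(file_path, 'rb') as f:
        variable_values = pickle.load(f)
    for variable, value in zip(variables, variable_values):
        session.run(variable.assign(value))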
reinforcement_input_layer, reinforcement_output_layer, reinforcement_variables = create_network(
    game_spec.board_squares(), HIDDEN_NODES_REINFORCEMENT, game_spec.outputs())

value_input_layer, value_output_layer, value_variables = create_network(
    game_spec.board_squares(), HIDDEN_NODES_VALUE, output_nodes=1, output_softmax=False)

target_placeholder = tf.placeholder("float", (None, 1))
error = tf.reduce_sum(tf.square(target_placeholder - value_output_layer))
train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(error)

with tf.Session() as session:
    session.run(tf.initialize_all_variables())

    load_network(session, reinforcement_variables, REINFORCEMENT_NETWORK_PATH)

    if os.path.isfile(VALUE_NETWORK_PATH):
        print("loading previous version of value network")
        load_network(session, value_variables, VALUE_NETWORK_PATH)

    def make_move(board_state, side):
        move = get_deterministic_network_move(session, reinforcement_input_layer,
                                              reinforcement_output_layer, board_state, side)
        return game_spec.flat_move_to_tuple(np.argmax(move))

    board_states_training = {}
    board_states_test = []
NETWORK_FILE_PATH = 'current_network.p'

game_spec = TicTacToeGameSpec()

input_layer, output_layer, variables = create_network(
    game_spec.board_squares(), HIDDEN_NODES, output_nodes=game_spec.outputs())

actual_move_placeholder = tf.placeholder("float", (None, game_spec.outputs()))
error = tf.reduce_sum(tf.square(actual_move_placeholder - output_layer))
train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(error)

with tf.Session() as session:
    session.run(tf.initialize_all_variables())

    if os.path.isfile(NETWORK_FILE_PATH):
        print("loading existing network")
        load_network(session, variables, NETWORK_FILE_PATH)

    episode_number = 1

    positions_train, positions_test = load_games()

    test_error = session.run(error, feed_dict={
        input_layer: [x[0] for x in positions_test],
        actual_move_placeholder: [[x[1]] for x in positions_test]})

    while True:
        np.random.shuffle(positions_train)
        train_error = 0
def train_policy_gradient(network_file_path,
                          save_network_file_path=None,
                          learn_rate=1e-3,
                          number_of_games=50000,
                          print_results_every=1000,
                          batch_size=100):
    print 'parameters => LR : ', learn_rate, ' Batch Size : ', batch_size
    save_network_file_path = save_network_file_path or network_file_path

    actual_move_placeholder = tf.placeholder("float", shape=(None, 100))
    input_layer, output_layer, variables = create_network(100, (100, 100, 100),
                                                          output_softmax=False)
    error = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=output_layer,
                                                labels=actual_move_placeholder))
    # alternative squared-error loss:
    # error = tf.reduce_sum(tf.square(tf.subtract(actual_move_placeholder, output_layer)), reduction_indices=1)
    train_step = tf.train.AdamOptimizer(learn_rate).minimize(error)

    with tf.Session() as session:
        session.run(tf.initialize_all_variables())

        if network_file_path and os.path.isfile(network_file_path):
            print("loading pre-existing network")
            load_network(session, variables, network_file_path)

        mini_batch_board_states, mini_batch_moves = [], []

        def my_player(board_state, side):
            # record the board state (from this player's perspective) and the
            # greedy agent's move as a one-hot training target
            mini_batch_board_states.append(np.ravel(board_state) * side)
            a1 = Agent(side, lossval=-1)
            move_tuple = a1.random_greedy(board_state)
            move = np.zeros(100)
            move[move_tuple[0] * 10 + move_tuple[1]] = 1.
            mini_batch_moves.append(move)
            return move_tuple

        def make_training_move(board_state, side):
            a1 = Agent(side, lossval=-1)
            move = a1.action(board_state)
            return move

        game_length = 0
        for episode_number in range(1, number_of_games):
            # randomly decide which side moves first
            player_turn = 1 if bool(random.getrandbits(1)) else -1
            board_state = emptyboard()
            while True:
                _available_moves = list(available_moves(board_state))
                if len(_available_moves) == 0:
                    break
                if player_turn > 0:
                    move = make_training_move(board_state, 1)
                else:
                    move = my_player(board_state, -1)
                if move not in _available_moves:
                    print 'illegal move'
                    break
                board_state = apply_move(board_state, move, player_turn)
                winner = gameover(board_state)
                if winner != 0 and winner != 2:
                    break
                player_turn = -player_turn

            last_game_length = len(mini_batch_board_states) - game_length
            game_length += last_game_length

            if episode_number % batch_size == 0:
                np_mini_batch_board_states = np.array(mini_batch_board_states).reshape(
                    game_length, *input_layer.get_shape().as_list()[1:])

                ol, _ = session.run(
                    [output_layer, train_step],
                    feed_dict={input_layer: np_mini_batch_board_states,
                               actual_move_placeholder: mini_batch_moves})

                correct = np.sum(
                    np.argmax(ol, axis=1) == np.argmax(mini_batch_moves, axis=1))

                del mini_batch_board_states[:]
                del mini_batch_moves[:]

                print episode_number, ': ', 'accuracy ', correct / float(game_length)
                game_length = 0

            if episode_number % print_results_every == 0:
                if network_file_path:
                    save_network(session, variables, save_network_file_path)

        if network_file_path:
            print 'saving final network'
            save_network(session, variables, save_network_file_path)

        return variables
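# Hypothetical driver, not part of the repository: train the policy network by
# imitating the greedy Agent and periodically checkpoint it. The file name and
# hyper-parameter values below are illustrative only.
if __name__ == '__main__':
    train_policy_gradient('MoonGo_supervised_cross_prob.pickle',
                          learn_rate=1e-3,
                          number_of_games=50000,
                          print_results_every=1000,
                          batch_size=100)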
def train_policy_gradient(network_file_path,
                          save_network_file_path=None,
                          learn_rate=1e-3,
                          number_of_games=50000,
                          print_results_every=1000,
                          batch_size=100):
    print 'parameters => LR : ', learn_rate, ' Batch Size : ', batch_size
    save_network_file_path = save_network_file_path or network_file_path

    target_placeholder = tf.placeholder("float", shape=(None, 1))
    input_layer, output_layer, variables = create_network(
        10, (100, 100, 100, 100, 100), output_nodes=1, output_softmax=False)
    error = tf.reduce_sum(tf.square(target_placeholder - output_layer))
    train_step = tf.train.RMSPropOptimizer(learn_rate).minimize(error)

    with tf.Session() as session:
        session.run(tf.initialize_all_variables())

        if network_file_path and os.path.isfile(network_file_path):
            print("loading pre-existing network")
            load_network(session, variables, network_file_path)

        def make_training_move(board_state, side):
            a1 = Agent(side, lossval=-1)
            move = a1.action(board_state)
            return move

        def make_move(board_state, side):
            a1 = Agent(side, lossval=-1)
            move = a1.random_greedy(board_state)
            return move

        board_states_training = {}
        board_states_test = []
        episode_number = 0
        board_states_training_input = {}

        while len(board_states_training_input) < TRAIN_SAMPLES + TEST_SAMPLES:
            #if len(board_states_training_input) % 100 == 0:
            print 'total games ', len(board_states_training_input)

            board_state = emptyboard()
            current_board_states_test = []
            if bool(random.getrandbits(1)):
                side = 1
            else:
                side = -1

            while True:
                board_state = apply_move(board_state,
                                         make_training_move(board_state, side), side)
                current_board_states_test.append(deepcopy(board_state))
                winner = gameover(board_state)
                if winner != 0:
                    if winner == 2:
                        winner = 0
                    break
                side = -side

            for i in range(len(current_board_states_test)):
                board_state_flat = tuple(np.ravel(current_board_states_test[i]))
                # only accept the board_state if not already in the dict
                if board_state_flat not in board_states_training_input:
                    board_states_training[state_key(current_board_states_test[i])] = float(winner)
                    board_states_training_input[board_state_flat] = 1

        # take a random selection from training into a test set
        for _ in range(TEST_SAMPLES):
            sample = random.choice(list(board_states_training.keys()))
            board_states_test.append((sample, board_states_training[sample]))
            del board_states_training[sample]

        board_states_training = list(board_states_training.items())

        test_error = session.run(error, feed_dict={
            input_layer: [x[0] for x in board_states_test],
            target_placeholder: [[x[1]] for x in board_states_test]})

        while True:
            np.random.shuffle(board_states_training)
            train_error = 0

            for start_index in range(0, len(board_states_training) - batch_size + 1, batch_size):
                mini_batch = board_states_training[start_index:start_index + batch_size]

                batch_error, _ = session.run(
                    [error, train_step],
                    feed_dict={input_layer: [x[0] for x in mini_batch],
                               target_placeholder: [[x[1]] for x in mini_batch]})
                train_error += batch_error

            new_test_error = session.run(error, feed_dict={
                input_layer: [x[0] for x in board_states_test],
                target_placeholder: [[x[1]] for x in board_states_test]})

            print("episode: %s train_error: %s new_test_error: %s test_error: %s" %
                  (episode_number, train_error, new_test_error, test_error))

            if new_test_error > test_error:
                print("train error went up, stopping training")
                break

            test_error = new_test_error
            episode_number += 1

        if network_file_path:
            print 'saving final network'
            save_network(session, variables, save_network_file_path)

        return variables
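# Hypothetical helper, not from the repository: once the value network above
# has been trained, a single position can be scored like this. It assumes
# `encoded_state` is the same 10-element feature vector that state_key()
# produces for the training data.
def evaluate_position(session, input_layer, output_layer, encoded_state):
    # the placeholders expect a batch dimension, so wrap the single example
    predicted_value = session.run(output_layer,
                                  feed_dict={input_layer: [encoded_state]})
    return predicted_value[0][0]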
    for i in range(BOARD_SIZE):
        print '{0}'.format(str(i).center(5)),
    print '\n'
    for i in range(BOARD_SIZE):
        print i,
        for j in range(BOARD_SIZE):
            print '{0}'.format(NAMES[state[i][j]].center(5)),
        print('\n')


if __name__ == '__main__':
    input_layer, output_layer, variables = create_network(100, (100, 100, 100))

    with tf.Session() as session:
        session.run(tf.initialize_all_variables())
        # MoonGo_supervised_cross_prob MoonGo_reinforcement
        load_network(session, variables, 'MoonGo_supervised_cross_prob.pickle')

        while 1:
            board_state = emptyboard()
            player_turn = 1

            while True:
                printboard(board_state)
                _available_moves = list(available_moves(board_state))

                if len(_available_moves) == 0:
                    print("no moves left, game ended a draw")
                    break
                if player_turn > 0:
                    action = raw_input('your move? ')
                    move = (int(action.split(',')[0]), int(action.split(',')[1]))
                else:
        print i,
        for j in range(BOARD_SIZE):
            print '{0}'.format(NAMES[state[i][j]].center(5)),
        print('\n')


if __name__ == '__main__':
    input_layer, output_layer, variables = create_network(
        10, (10, 10, 10, 10, 10), output_nodes=1, output_softmax=False)

    with tf.Session() as session:
        session.run(tf.initialize_all_variables())
        # MoonGo_supervised_cross_prob MoonGo_reinforcement
        load_network(session, variables, 'MoonGo_reinforcement.pickle')

        while 1:
            board_state = emptyboard()
            player_turn = 1

            while True:
                printboard(board_state)
                _available_moves = list(available_moves(board_state))

                if len(_available_moves) == 0:
                    print("no moves left, game ended a draw")
                    break
                if player_turn > 0:
                    action = raw_input('your move? ')
                    move = (int(action.split(',')[0]), int(action.split(',')[1]))
    def make_move_historical(historical_network_index, board_state, side):
        net = historical_networks[historical_network_index]
        move = get_stochastic_network_move(session, net[0], net[1], board_state, side)
        return game_spec.flat_move_to_tuple(move.argmax())

    def make_training_move(board_state, side):
        mini_batch_board_states.append(np.ravel(board_state) * side)
        move = get_stochastic_network_move(session, input_layer, output_layer,
                                           board_state, side)
        mini_batch_moves.append(move)
        return game_spec.flat_move_to_tuple(move.argmax())

    if os.path.isfile(STARTING_NETWORK_WEIGHTS):
        print("loading pre-existing weights")
        load_network(session, variables, STARTING_NETWORK_WEIGHTS)
    else:
        print("could not find previous weights so initialising randomly")

    for i in range(NUMBER_OF_HISTORICAL_COPIES_TO_KEEP):
        if os.path.isfile(BASE_HISTORICAL_NETWORK_PATH + str(i) + '.p'):
            load_network(session, historical_networks[i][2],
                         BASE_HISTORICAL_NETWORK_PATH + str(i) + '.p')
        elif os.path.isfile(STARTING_NETWORK_WEIGHTS):
            # if we can't load a historical file use the current network weights
            load_network(session, historical_networks[i][2], STARTING_NETWORK_WEIGHTS)

    for episode_number in range(1, NUMBER_OF_GAMES_TO_PLAY):
        opponent_index = random.randint(0, NUMBER_OF_HISTORICAL_COPIES_TO_KEEP - 1)
        make_move_historical_for_index = functools.partial(make_move_historical,
                                                           opponent_index)

        # randomize if going first or second