def test_game_decrease(game):
    # Create "mock" state that ends after 20 moves with the learner losing
    lose_state = [MockState(go.WHITE, 20, size=19)]
    policy1 = CNNPolicy.load_model(
        os.path.join('tests', 'test_data', 'minimodel_policy.json'))
    policy2 = CNNPolicy.load_model(
        os.path.join('tests', 'test_data', 'minimodel_policy.json'))
    learner = MockPlayer(policy1, game)
    opponent = MockPlayer(policy2, game)
    optimizer = SGD(lr=0.001)
    policy1.model.compile(loss=log_loss, optimizer=optimizer)

    # Get initial (before learning) move probabilities for all moves made by black
    init_move_probs = get_sgf_move_probs(game, policy1, go.BLACK)
    init_probs = [prob for (mv, prob) in init_move_probs]

    # Run RL training
    run_n_games(optimizer, 0.001, learner, opponent, 1, mock_states=lose_state)

    # Get new move probabilities for black's moves having finished 1 round of training
    new_move_probs = get_sgf_move_probs(game, policy1, go.BLACK)
    new_probs = [prob for (mv, prob) in new_move_probs]

    # Assert that, on average, move probabilities for black decreased having lost.
    self.assertTrue(sum(new_probs[i] - init_probs[i] for i in range(10)) < 0)
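# Note: `log_loss` above is imported from the RL trainer rather than defined in
# this file. A minimal sketch of such a policy-gradient log loss, assuming the
# Keras backend and that y_true is a one-hot mask over the move actually played
# (an assumption for illustration, not the repository's exact implementation):
from keras import backend as K


def log_loss(y_true, y_pred):
    # Negative log-likelihood of the selected move; the one-hot mask picks out
    # -log(p(action)), and clipping guards against log(0).
    return -K.sum(y_true * K.log(K.clip(y_pred, K.epsilon(), 1.0)), axis=-1)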
def run_and_get_new_weights(init_weights, winners, game):
    # Create "mock" states that end after 2 moves with a predetermined winner.
    states = [MockState(winner, 2, size=19) for winner in winners]
    policy1 = CNNPolicy.load_model(
        os.path.join('tests', 'test_data', 'minimodel_policy.json'))
    policy2 = CNNPolicy.load_model(
        os.path.join('tests', 'test_data', 'minimodel_policy.json'))
    policy1.model.set_weights(init_weights)
    optimizer = SGD(lr=0.001)
    policy1.model.compile(loss=log_loss, optimizer=optimizer)
    learner = MockPlayer(policy1, game)
    opponent = MockPlayer(policy2, game)

    # Run RL training
    run_n_games(optimizer, 0.001, learner, opponent, 2, mock_states=states)

    return policy1.model.get_weights()
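# `MockState` and `MockPlayer` are test doubles defined elsewhere in the test
# suite. The behavior these tests rely on is roughly the following (a
# hypothetical sketch for orientation, not the actual implementation):
class MockState(object):
    # A GameState stand-in that is declared over, with `winner` as the result,
    # once `terminal_after` moves have been played.
    def __init__(self, winner, terminal_after, size=19):
        self.winner = winner
        self.terminal_after = terminal_after
        self.size = size


class MockPlayer(object):
    # Replays the moves of a fixed SGF `game` instead of sampling from the
    # policy, so the (state, action) pairs seen during training are
    # deterministic across runs.
    def __init__(self, policy, game):
        self.policy = policy
        self.game = game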
def test_batch_eval_state(self):
    policy = CNNPolicy(["board", "liberties", "sensibleness", "capture_size"])
    results = policy.batch_eval_state([GameState(), GameState()])
    self.assertEqual(len(results), 2)       # one result per GameState
    self.assertEqual(len(results[0]), 361)  # each one has 361 (move, prob) pairs
def test_output_size(self):
    policy19 = CNNPolicy(["board", "liberties", "sensibleness", "capture_size"], board=19)
    output = policy19.forward(policy19.preprocessor.state_to_tensor(GameState(19)))
    self.assertEqual(output.shape, (1, 19 * 19))

    policy13 = CNNPolicy(["board", "liberties", "sensibleness", "capture_size"], board=13)
    output = policy13.forward(policy13.preprocessor.state_to_tensor(GameState(13)))
    self.assertEqual(output.shape, (1, 13 * 13))
def test_save_load(self):
    policy = CNNPolicy(["board", "liberties", "sensibleness", "capture_size"])

    model_file = 'TESTPOLICY.json'
    weights_file = 'TESTWEIGHTS.h5'
    model_file2 = 'TESTPOLICY2.json'
    weights_file2 = 'TESTWEIGHTS2.h5'

    # test saving model/weights separately
    policy.save_model(model_file)
    policy.model.save_weights(weights_file, overwrite=True)
    # test saving them together
    policy.save_model(model_file2, weights_file2)

    copypolicy = CNNPolicy.load_model(model_file)
    copypolicy.model.load_weights(weights_file)

    copypolicy2 = CNNPolicy.load_model(model_file2)

    for w1, w2 in zip(copypolicy.model.get_weights(), copypolicy2.model.get_weights()):
        self.assertTrue(np.all(w1 == w2))

    os.remove(model_file)
    os.remove(weights_file)
    os.remove(model_file2)
    os.remove(weights_file2)
def test_save_load(self):
    policy = CNNPolicy(["board", "liberties", "sensibleness", "capture_size"])

    model_file = 'TESTPOLICY.json'
    weights_file = 'TESTWEIGHTS.h5'

    policy.save_model(model_file)
    policy.model.save_weights(weights_file)

    copypolicy = CNNPolicy.load_model(model_file)
    copypolicy.model.load_weights(weights_file)

    os.remove(model_file)
    os.remove(weights_file)
def test_game_gradient(game):
    policy = CNNPolicy.load_model(
        os.path.join('tests', 'test_data', 'minimodel_policy.json'))
    initial_parameters = policy.model.get_weights()

    # Cases 1 and 2 have identical starting models and identical (state, action)
    # pairs, but they differ in who won the games.
    parameters1 = run_and_get_new_weights(initial_parameters, [go.BLACK, go.WHITE], game)
    parameters2 = run_and_get_new_weights(initial_parameters, [go.WHITE, go.BLACK], game)

    # Assert that some parameters changed.
    any_change_1 = any(not np.array_equal(i, p1)
                       for (i, p1) in zip(initial_parameters, parameters1))
    any_change_2 = any(not np.array_equal(i, p2)
                       for (i, p2) in zip(initial_parameters, parameters2))
    self.assertTrue(any_change_1)
    self.assertTrue(any_change_2)

    # Changes in case 1 should be equal and opposite to changes in case 2,
    # allowing 0.1% relative difference in precision.
    for (i, p1, p2) in zip(initial_parameters, parameters1, parameters2):
        diff1 = p1 - i
        diff2 = p2 - i
        npt.assert_allclose(diff1, -diff2, rtol=1e-3, atol=1e-11)
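# The equal-and-opposite check above follows from the REINFORCE-style update
# applied by run_n_games: each log-likelihood gradient is scaled by the game
# outcome z in {+1, -1}, so flipping every winner negates every parameter
# update. A self-contained numeric illustration (assumed update rule with
# made-up gradient values, not the trainer's code):
import numpy as np


def reinforce_update(theta, grad_log_pi, z, lr=0.001):
    # One REINFORCE step: move parameters along the log-likelihood gradient
    # of the taken action, scaled by outcome z and learning rate lr.
    return theta + lr * z * grad_log_pi


theta = np.zeros(3)
g = np.array([0.5, -0.2, 0.1])
delta_win = reinforce_update(theta, g, z=+1) - theta
delta_loss = reinforce_update(theta, g, z=-1) - theta
assert np.allclose(delta_win, -delta_loss)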
def test_probabilistic_player(self):
    gs = GameState()
    policy = CNNPolicy(["board", "ones", "turns_since"])
    player = ProbabilisticPolicyPlayer(policy)
    for i in range(20):
        move = player.get_move(gs)
        self.assertIsNotNone(move)
        gs.do_move(move)
def test_greedy_player(self):
    gs = GameState()
    policy = CNNPolicy(["board", "ones", "turns_since"])
    player = GreedyPolicyPlayer(policy)
    for _ in range(20):
        move = player.get_move(gs)
        self.assertNotEqual(move, go.PASS)
        gs.do_move(move)
def test_game_run_N(game):
    policy1 = CNNPolicy.load_model(os.path.join('tests', 'test_data', 'minimodel.json'))
    policy2 = CNNPolicy.load_model(os.path.join('tests', 'test_data', 'minimodel.json'))
    learner = MockPlayer(policy1, game)
    opponent = MockPlayer(policy2, game)
    optimizer = SGD()
    init_weights = policy1.model.get_weights()
    policy1.model.compile(loss=log_loss, optimizer=optimizer)

    # Run RL training
    run_n_games(optimizer, learner, opponent, 2)

    # Get new weights for comparison
    trained_weights = policy1.model.get_weights()

    # Assert that some parameters changed.
    any_change = any(not np.array_equal(i, t)
                     for (i, t) in zip(init_weights, trained_weights))
    self.assertTrue(any_change)
def test_sensible_greedy(self):
    gs = GameState()
    policy = CNNPolicy(["board", "ones", "turns_since"])
    player = GreedyPolicyPlayer(policy)

    # Fill the board except for one point, leaving no sensible move.
    empty = (10, 10)
    for x in range(19):
        for y in range(19):
            if (x, y) != empty:
                gs.do_move((x, y), go.BLACK)
    gs.current_player = go.BLACK
    self.assertIsNone(player.get_move(gs))
def test_sensible_probabilistic(self):
    gs = GameState()
    policy = CNNPolicy(["board", "ones", "turns_since"])
    player = ProbabilisticPolicyPlayer(policy)

    # Fill the board except for one point, leaving no sensible move.
    empty = (10, 10)
    for x in range(19):
        for y in range(19):
            if (x, y) != empty:
                gs.do_move((x, y), go.BLACK)
    gs.set_current_player(go.BLACK)
    self.assertEqual(player.get_move(gs), go.PASS)
def test_default_policy(self):
    policy = CNNPolicy(["board", "liberties", "sensibleness", "capture_size"])
    policy.eval_state(GameState())
def run_training(cmd_line_args=None):
    """Run training. command-line args may be passed in as a list
    """
    import argparse
    parser = argparse.ArgumentParser(description='Perform supervised training on a policy network.')
    # required args
    parser.add_argument("model", help="Path to a JSON model file (i.e. from CNNPolicy.save_model())")
    parser.add_argument("train_data", help="A .h5 file of training data")
    parser.add_argument("out_directory", help="directory where metadata and weights will be saved")
    # frequently used args
    parser.add_argument("--minibatch", "-B", help="Size of training data minibatches. Default: 16",
                        type=int, default=16)
    parser.add_argument("--epochs", "-E", help="Total number of iterations on the data. Default: 10",
                        type=int, default=10)
    parser.add_argument("--epoch-length", "-l",
                        help="Number of training examples considered 'one epoch'. Default: # training data",
                        type=int, default=None)
    parser.add_argument("--learning-rate", "-r",
                        help="Learning rate - how quickly the model learns at first. Default: .03",
                        type=float, default=.03)
    parser.add_argument("--decay", "-d", help="The rate at which learning decreases. Default: .0001",
                        type=float, default=.0001)
    parser.add_argument("--verbose", "-v", help="Turn on verbose mode",
                        default=False, action="store_true")
    # slightly fancier args
    parser.add_argument("--weights",
                        help="Name of a .h5 weights file (in the output directory) to load to resume training",
                        default=None)
    parser.add_argument("--train-val-test",
                        help="Fraction of data to use for training/val/test. Must sum to 1. Invalid if restarting training",
                        nargs=3, type=float, default=[0.93, .05, .02])
    # TODO - an argument to specify which transformations to use, put it in metadata

    if cmd_line_args is None:
        args = parser.parse_args()
    else:
        args = parser.parse_args(cmd_line_args)

    # TODO - what follows here should be refactored into a series of small functions

    resume = args.weights is not None

    if args.verbose:
        if resume:
            print "trying to resume from %s with weights %s" % \
                (args.out_directory, os.path.join(args.out_directory, args.weights))
        else:
            if os.path.exists(args.out_directory):
                print "directory %s exists. any previous data will be overwritten" % args.out_directory
            else:
                print "starting fresh output directory %s" % args.out_directory

    # load model from json spec
    model = CNNPolicy.load_model(args.model).model
    if resume:
        model.load_weights(os.path.join(args.out_directory, args.weights))

    # TODO - (waiting on game_converter) verify that features of model match features of training data
    dataset = h5.File(args.train_data)
    n_total_data = len(dataset["states"])
    n_train_data = int(args.train_val_test[0] * n_total_data)
    n_val_data = int(args.train_val_test[1] * n_total_data)
    # n_test_data = n_total_data - (n_train_data + n_val_data)

    if args.verbose:
        print "dataset loaded"
        print "\t%d total samples" % n_total_data
        print "\t%d training samples" % n_train_data
        print "\t%d validation samples" % n_val_data

    # ensure output directory is available
    if not os.path.exists(args.out_directory):
        os.makedirs(args.out_directory)

    # create metadata file and the callback object that will write to it
    meta_file = os.path.join(args.out_directory, "metadata.json")
    meta_writer = MetadataWriterCallback(meta_file)

    # load prior data if it already exists
    if os.path.exists(meta_file) and resume:
        with open(meta_file, "r") as f:
            meta_writer.metadata = json.load(f)
        if args.verbose:
            print "previous metadata loaded: %d epochs. new epochs will be appended." % \
                len(meta_writer.metadata["epochs"])
    elif args.verbose:
        print "starting with empty metadata"

    # the MetadataWriterCallback only sets 'epoch' and 'best_epoch'. We can add in anything else we like here
    # TODO - model and train_data are saved in meta_file; check that they match (and make args optional when restarting?)
    meta_writer.metadata["training_data"] = args.train_data
    meta_writer.metadata["model_file"] = args.model

    # create ModelCheckpoint to save weights every epoch
    checkpoint_template = os.path.join(args.out_directory, "weights.{epoch:05d}.hdf5")
    checkpointer = ModelCheckpoint(checkpoint_template)

    # load precomputed random-shuffle indices or create them
    # TODO - save each train/val/test indices separately so there's no danger of
    # changing args.train_val_test when resuming
    shuffle_file = os.path.join(args.out_directory, "shuffle.npz")
    if os.path.exists(shuffle_file) and resume:
        with open(shuffle_file, "r") as f:
            shuffle_indices = np.load(f)
        if args.verbose:
            print "loading previous data shuffling indices"
    else:
        # create shuffled indices
        shuffle_indices = np.random.permutation(n_total_data)
        with open(shuffle_file, "w") as f:
            np.save(f, shuffle_indices)
        if args.verbose:
            print "created new data shuffling indices"
    # training indices are the first consecutive set of shuffled indices, val next,
    # then test gets the remainder
    train_indices = shuffle_indices[0:n_train_data]
    val_indices = shuffle_indices[n_train_data:n_train_data + n_val_data]
    # test_indices = shuffle_indices[n_train_data + n_val_data:]

    # create dataset generators
    train_data_generator = shuffled_hdf5_batch_generator(
        dataset["states"], dataset["actions"], train_indices, args.minibatch,
        BOARD_TRANSFORMATIONS)
    val_data_generator = shuffled_hdf5_batch_generator(
        dataset["states"], dataset["actions"], val_indices, args.minibatch,
        BOARD_TRANSFORMATIONS)

    sgd = SGD(lr=args.learning_rate, decay=args.decay)
    model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=["accuracy"])

    samples_per_epoch = args.epoch_length or n_train_data

    if args.verbose:
        print "STARTING TRAINING"

    model.fit_generator(
        generator=train_data_generator,
        samples_per_epoch=samples_per_epoch,
        nb_epoch=args.epochs,
        callbacks=[checkpointer, meta_writer],
        validation_data=val_data_generator,
        nb_val_samples=n_val_data)
def train(metadata, out_directory, verbose, weight_file, meta_file):
    # set resume
    resume = weight_file is not None

    # load model from json spec
    policy = CNNPolicy.load_model(metadata["model_file"])
    model_features = policy.preprocessor.get_feature_list()
    model = policy.model

    # load weights
    if resume:
        model.load_weights(os.path.join(out_directory, FOLDER_WEIGHT, weight_file))

    # features of training data
    dataset = h5.File(metadata["training_data"])

    # Verify that dataset's features match the model's expected features.
    validate_feature_planes(verbose, dataset, model_features)

    # create metadata file and the callback object that will write to it
    # and saves model at the same time
    # the MetadataWriterCallback only sets 'epoch', 'best_epoch' and 'current_batch'.
    # We can add in anything else we like here
    meta_writer = EpochDataSaverCallback(meta_file, out_directory, metadata)

    # get train/validation/test indices
    train_indices, val_indices, test_indices = \
        load_train_val_test_indices(verbose, metadata['symmetries'],
                                    len(dataset["states"]),
                                    metadata["batch_size"], out_directory)

    # create dataset generators
    train_data_generator = threading_shuffled_hdf5_batch_generator(
        dataset["states"], dataset["actions"], train_indices,
        metadata["batch_size"], metadata)
    val_data_generator = threading_shuffled_hdf5_batch_generator(
        dataset["states"], dataset["actions"], val_indices,
        metadata["batch_size"], validation=True)

    # check if step decay has to be applied
    if metadata["decay_every"] is None:
        # use normal decay without momentum
        lr_scheduler_callback = LrDecayCallback(metadata)
    else:
        # use step decay
        lr_scheduler_callback = LrStepDecayCallback(metadata, verbose)

    sgd = SGD(lr=metadata["learning_rate"])
    model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=["accuracy"])

    if verbose:
        print("STARTING TRAINING")

    # check that remaining epochs > 0
    if metadata["epochs"] <= len(metadata["epoch_logs"]):
        raise ValueError("No more epochs to train!")

    model.fit_generator(
        generator=train_data_generator,
        steps_per_epoch=(metadata["epoch_length"] / metadata["batch_size"]),
        epochs=(metadata["epochs"] - len(metadata["epoch_logs"])),
        callbacks=[meta_writer, lr_scheduler_callback],
        validation_data=val_data_generator,
        validation_steps=(len(val_indices) / metadata["batch_size"]))
def run_training(cmd_line_args=None):
    import argparse
    parser = argparse.ArgumentParser(
        description='Perform reinforcement learning to improve given policy '
                    'network. Second phase of pipeline.')
    parser.add_argument("model_json", help="Path to policy model JSON.")
    parser.add_argument("initial_weights",
                        help="Path to HDF5 file with initial weights (i.e. result of supervised training).")
    parser.add_argument("out_directory",
                        help="Path to folder where the model params and metadata will be saved after each epoch.")
    parser.add_argument("--learning-rate", help="Keras learning rate (Default: 0.001)",
                        type=float, default=0.001)
    parser.add_argument("--policy-temp",
                        help="Distribution temperature of players using policies (Default: 0.67)",
                        type=float, default=0.67)
    parser.add_argument("--save-every",
                        help="Save policy as a new opponent every n batches (Default: 500)",
                        type=int, default=500)
    parser.add_argument("--record-every",
                        help="Save learner's weights every n batches (Default: 1)",
                        type=int, default=1)
    parser.add_argument("--game-batch", help="Number of games per mini-batch (Default: 20)",
                        type=int, default=20)
    parser.add_argument("--move-limit", help="Maximum number of moves per game",
                        type=int, default=500)
    parser.add_argument("--iterations",
                        help="Number of training batches/iterations (Default: 10000)",
                        type=int, default=10000)
    parser.add_argument("--resume", help="Load latest weights in out_directory and resume",
                        default=False, action="store_true")
    parser.add_argument("--verbose", "-v", help="Turn on verbose mode",
                        default=False, action="store_true")
    # Baseline function (TODO) default lambda state: 0 (receives either file
    # paths to JSON and weights or None, in which case it uses default baseline 0)
    if cmd_line_args is None:
        args = parser.parse_args()
    else:
        args = parser.parse_args(cmd_line_args)

    ZEROTH_FILE = "weights.00000.hdf5"

    if args.resume:
        if not os.path.exists(os.path.join(args.out_directory, "metadata.json")):
            raise ValueError("Cannot resume without existing output directory")

    if not os.path.exists(args.out_directory):
        if args.verbose:
            print("creating output directory {}".format(args.out_directory))
        os.makedirs(args.out_directory)

    if not args.resume:
        # make a copy of weights file, "weights.00000.hdf5" in the output directory
        copyfile(args.initial_weights, os.path.join(args.out_directory, ZEROTH_FILE))
        if args.verbose:
            print("copied {} to {}".format(
                args.initial_weights, os.path.join(args.out_directory, ZEROTH_FILE)))
        player_weights = ZEROTH_FILE
        iter_start = 1
    else:
        # if resuming, we expect initial_weights to be just a
        # "weights.#####.hdf5" file, not a full path
        if not re.match(r"weights\.\d{5}\.hdf5", args.initial_weights):
            raise ValueError(
                "Expected to resume from weights file with name 'weights.#####.hdf5'")
        args.initial_weights = os.path.join(args.out_directory,
                                            os.path.basename(args.initial_weights))
        if not os.path.exists(args.initial_weights):
            raise ValueError("Cannot resume; weights {} do not exist".format(
                args.initial_weights))
        elif args.verbose:
            print("Resuming with weights {}".format(args.initial_weights))
        player_weights = os.path.basename(args.initial_weights)
        iter_start = 1 + int(player_weights[8:13])

    # Set initial conditions
    policy = CNNPolicy.load_model(args.model_json)
    policy.model.load_weights(args.initial_weights)
    player = ProbabilisticPolicyPlayer(policy, temperature=args.policy_temp,
                                       move_limit=args.move_limit)

    # different opponents come from simply changing the weights of
    # 'opponent.policy.model'. That is, only 'opp_policy' needs to be changed,
    # and 'opponent' will change.
    opp_policy = CNNPolicy.load_model(args.model_json)
    opponent = ProbabilisticPolicyPlayer(opp_policy, temperature=args.policy_temp,
                                         move_limit=args.move_limit)

    if args.verbose:
        print("created player and opponent with temperature {}".format(args.policy_temp))

    if not args.resume:
        metadata = {
            "model_file": args.model_json,
            "init_weights": args.initial_weights,
            "learning_rate": args.learning_rate,
            "temperature": args.policy_temp,
            "game_batch": args.game_batch,
            # which weights from which to sample an opponent each batch
            "opponents": [ZEROTH_FILE],
            # map from player to tuple of (opponent, win ratio); useful for
            # validating in lieu of 'accuracy/loss'
            "win_ratio": {}
        }
    else:
        with open(os.path.join(args.out_directory, "metadata.json"), "r") as f:
            metadata = json.load(f)

    # Append args of current run to history of full command args.
    metadata["cmd_line_args"] = metadata.get("cmd_line_args", [])
    metadata["cmd_line_args"].append(vars(args))

    def save_metadata():
        with open(os.path.join(args.out_directory, "metadata.json"), "w") as f:
            json.dump(metadata, f, sort_keys=True, indent=2)

    optimizer = SGD(lr=args.learning_rate)
    player.policy.model.compile(loss=log_loss, optimizer=optimizer)
    for i_iter in range(iter_start, args.iterations + 1):
        # Note that player_weights will only be saved as a file every
        # args.record_every iterations. Regardless, player_weights enters into
        # the metadata to keep track of the win ratio over time.
        player_weights = "weights.%05d.hdf5" % i_iter

        # Randomly choose opponent from pool (possibly self), and play
        # game_batch games against them.
        opp_weights = np.random.choice(metadata["opponents"])
        opp_path = os.path.join(args.out_directory, opp_weights)

        # Load new weights into opponent's network, but keep the same opponent object.
        opponent.policy.model.load_weights(opp_path)
        if args.verbose:
            print("Batch {}\tsampled opponent is {}".format(i_iter, opp_weights))

        # Run games (and learn from results). Keep track of the win ratio vs
        # each opponent over time.
        win_ratio = run_n_games(optimizer, args.learning_rate, player, opponent,
                                args.game_batch)
        metadata["win_ratio"][player_weights] = (opp_weights, win_ratio)

        # Save intermediate models.
        if i_iter % args.record_every == 0:
            player.policy.model.save_weights(
                os.path.join(args.out_directory, player_weights))

        # Add player to batch of opponents once in a while.
        if i_iter % args.save_every == 0:
            metadata["opponents"].append(player_weights)

        save_metadata()
def run_training(cmd_line_args=None):
    parser = argparse.ArgumentParser(
        description='Perform reinforcement learning to improve given policy '
                    'network. Second phase of pipeline.')
    parser.add_argument("model_json", help="Path to policy model JSON.")
    parser.add_argument("initial_weights",
                        help="Path to HDF5 file with initial weights (i.e. result of supervised training).")
    parser.add_argument("out_directory",
                        help="Path to folder where the model params and metadata will be saved after each epoch.")
    parser.add_argument("--learning-rate", help="Keras learning rate (Default: .03)",
                        type=float, default=.03)
    parser.add_argument("--policy-temp",
                        help="Distribution temperature of players using policies (Default: 0.67)",
                        type=float, default=0.67)
    parser.add_argument("--save-every",
                        help="Save policy as a new opponent every n batches (Default: 500)",
                        type=int, default=500)
    parser.add_argument("--game-batch", help="Number of games per mini-batch (Default: 20)",
                        type=int, default=20)
    parser.add_argument("--iterations",
                        help="Number of training batches/iterations (Default: 10000)",
                        type=int, default=10000)
    parser.add_argument("--resume", help="Load latest weights in out_directory and resume",
                        default=False, action="store_true")
    parser.add_argument("--verbose", "-v", help="Turn on verbose mode",
                        default=False, action="store_true")
    # Baseline function (TODO) default lambda state: 0 (receives either file
    # paths to JSON and weights or None, in which case it uses default baseline 0)
    if cmd_line_args is None:
        args = parser.parse_args()
    else:
        args = parser.parse_args(cmd_line_args)

    ZEROTH_FILE = "weights.00000.hdf5"

    if args.resume:
        if not os.path.exists(os.path.join(args.out_directory, "metadata.json")):
            raise ValueError("Cannot resume without existing output directory")

    if not os.path.exists(args.out_directory):
        if args.verbose:
            print("creating output directory {}".format(args.out_directory))
        os.makedirs(args.out_directory)

    if not args.resume:
        # make a copy of weights file, "weights.00000.hdf5" in the output directory
        copyfile(args.initial_weights, os.path.join(args.out_directory, ZEROTH_FILE))
        if args.verbose:
            print("copied {} to {}".format(
                args.initial_weights, os.path.join(args.out_directory, ZEROTH_FILE)))
        player_weights = ZEROTH_FILE
    else:
        # if resuming, we expect initial_weights to be just a
        # "weights.#####.hdf5" file, not a full path
        args.initial_weights = os.path.join(args.out_directory,
                                            os.path.basename(args.initial_weights))
        if not os.path.exists(args.initial_weights):
            raise ValueError("Cannot resume; weights {} do not exist".format(
                args.initial_weights))
        elif args.verbose:
            print("Resuming with weights {}".format(args.initial_weights))
        player_weights = os.path.basename(args.initial_weights)

    # Set initial conditions
    policy = CNNPolicy.load_model(args.model_json)
    policy.model.load_weights(args.initial_weights)
    player = ProbabilisticPolicyPlayer(policy, temperature=args.policy_temp)
    features = policy.preprocessor.feature_list

    # different opponents come from simply changing the weights of
    # opponent.policy.model "behind the scenes"
    opp_policy = CNNPolicy.load_model(args.model_json)
    opponent = ProbabilisticPolicyPlayer(opp_policy, temperature=args.policy_temp)

    if args.verbose:
        print("created player and opponent with temperature {}".format(args.policy_temp))

    if not args.resume:
        metadata = {
            "model_file": args.model_json,
            "init_weights": args.initial_weights,
            "learning_rate": args.learning_rate,
            "temperature": args.policy_temp,
            "game_batch": args.game_batch,
            # which weights from which to sample an opponent each batch
            "opponents": [ZEROTH_FILE],
            # map from player to tuple of (opponent, win ratio); useful for
            # validating in lieu of 'accuracy/loss'
            "win_ratio": {}
        }
    else:
        with open(os.path.join(args.out_directory, "metadata.json"), "r") as f:
            metadata = json.load(f)

    def save_metadata():
        with open(os.path.join(args.out_directory, "metadata.json"), "w") as f:
            json.dump(metadata, f)

    # Set SGD and compile
    sgd = SGD(lr=args.learning_rate)
    player.policy.model.compile(loss='binary_crossentropy', optimizer=sgd)
    board_size = player.policy.model.input_shape[-1]
    for i_iter in xrange(1, args.iterations + 1):
        # Train mini-batches by randomly choosing opponent from pool
        # (possibly self) and playing game_batch games against them
        opp_weights = np.random.choice(metadata["opponents"])
        opp_path = os.path.join(args.out_directory, opp_weights)
        # load new weights into opponent, but otherwise it's the same object
        opponent.policy.model.load_weights(opp_path)
        if args.verbose:
            print("Batch {}\tsampled opponent is {}".format(i_iter, opp_weights))
        # Make training pairs and do RL
        X_list, y_list, winners = make_training_pairs(
            player, opponent, features, args.game_batch, board_size)
        win_ratio = np.sum(np.array(winners) == 1) / float(args.game_batch)
        metadata["win_ratio"][player_weights] = (opp_weights, win_ratio)
        train_batch(player, X_list, y_list, winners, args.learning_rate)
        # Save intermediate models
        player_weights = "weights.%05d.hdf5" % i_iter
        player.policy.model.save_weights(
            os.path.join(args.out_directory, player_weights))
        # add player to batch of opponents once in a while
        if i_iter % args.save_every == 0:
            metadata["opponents"].append(player_weights)
        save_metadata()
from AlphaGo.training.reinforcement_policy_trainer import run_training
from AlphaGo.models.policy import CNNPolicy
import os
from cProfile import Profile

# make a miniature model for playing on a miniature 7x7 board
architecture = {'filters_per_layer': 32, 'layers': 4, 'board': 7}
features = ['board', 'ones', 'turns_since', 'liberties', 'capture_size',
            'self_atari_size', 'liberties_after', 'sensibleness']
policy = CNNPolicy(features, **architecture)

datadir = os.path.join('benchmarks', 'data')
modelfile = os.path.join(datadir, 'mini_rl_model.json')
weights = os.path.join(datadir, 'init_weights.hdf5')
outdir = os.path.join(datadir, 'rl_output')
stats_file = os.path.join(datadir, 'reinforcement_policy_trainer.prof')

if not os.path.exists(datadir):
    os.makedirs(datadir)
if not os.path.exists(weights):
    policy.model.save_weights(weights)
policy.save_model(modelfile)

profile = Profile()
arguments = (modelfile, weights, outdir, '--learning-rate', '0.001',
             '--save-every', '2', '--game-batch', '20', '--iterations', '10',
             '--verbose')
profile.runcall(run_training, arguments)
profile.dump_stats(stats_file)
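# The dumped .prof file can then be inspected with the standard-library pstats
# module, e.g. to list the ten most expensive calls by cumulative time:
import pstats

stats = pstats.Stats(stats_file)
stats.sort_stats('cumulative').print_stats(10)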
def run_training(cmd_line_args=None):
    """Run training. command-line args may be passed in as a list
    """
    import argparse
    parser = argparse.ArgumentParser(
        description='Perform supervised training on a policy network.')
    # required args
    parser.add_argument("model",
                        help="Path to a JSON model file (i.e. from CNNPolicy.save_model())")
    parser.add_argument("train_data", help="A .h5 file of training data")
    parser.add_argument("out_directory",
                        help="directory where metadata and weights will be saved")
    # frequently used args
    parser.add_argument("--minibatch", "-B", help="Size of training data minibatches. Default: 16",
                        type=int, default=16)
    parser.add_argument("--epochs", "-E", help="Total number of iterations on the data. Default: 10",
                        type=int, default=10)
    parser.add_argument("--epoch-length", "-l",
                        help="Number of training examples considered 'one epoch'. Default: # training data",
                        type=int, default=None)
    parser.add_argument("--learning-rate", "-r",
                        help="Learning rate - how quickly the model learns at first. Default: .03",
                        type=float, default=.03)
    parser.add_argument("--decay", "-d",
                        help="The rate at which learning decreases. Default: .0001",
                        type=float, default=.0001)
    parser.add_argument("--verbose", "-v", help="Turn on verbose mode",
                        default=False, action="store_true")
    # slightly fancier args
    parser.add_argument("--weights",
                        help="Name of a .h5 weights file (in the output directory) to load to resume training",
                        default=None)
    parser.add_argument("--train-val-test",
                        help="Fraction of data to use for training/val/test. Must sum to 1. "
                             "Invalid if restarting training",
                        nargs=3, type=float, default=[0.93, .05, .02])
    parser.add_argument("--symmetries",
                        help="Comma-separated list of transforms, subset of "
                             "noop,rot90,rot180,rot270,fliplr,flipud,diag1,diag2",
                        default='noop,rot90,rot180,rot270,fliplr,flipud,diag1,diag2')
    # TODO - an argument to specify which transformations to use, put it in metadata

    if cmd_line_args is None:
        args = parser.parse_args()
    else:
        args = parser.parse_args(cmd_line_args)

    # TODO - what follows here should be refactored into a series of small functions

    resume = args.weights is not None

    if args.verbose:
        if resume:
            print("trying to resume from %s with weights %s" %
                  (args.out_directory, os.path.join(args.out_directory, args.weights)))
        else:
            if os.path.exists(args.out_directory):
                print("directory %s exists. any previous data will be overwritten" %
                      args.out_directory)
            else:
                print("starting fresh output directory %s" % args.out_directory)

    # load model from json spec
    model = CNNPolicy.load_model(args.model).model
    if resume:
        model.load_weights(os.path.join(args.out_directory, args.weights))

    # TODO - (waiting on game_converter) verify that features of model match features of training data
    dataset = h5.File(args.train_data)
    n_total_data = len(dataset["states"])
    n_train_data = int(args.train_val_test[0] * n_total_data)
    n_val_data = int(args.train_val_test[1] * n_total_data)
    # n_test_data = n_total_data - (n_train_data + n_val_data)

    if args.verbose:
        print("dataset loaded")
        print("\t%d total samples" % n_total_data)
        print("\t%d training samples" % n_train_data)
        print("\t%d validation samples" % n_val_data)

    # ensure output directory is available
    if not os.path.exists(args.out_directory):
        os.makedirs(args.out_directory)

    # create metadata file and the callback object that will write to it
    meta_file = os.path.join(args.out_directory, "metadata.json")
    meta_writer = MetadataWriterCallback(meta_file)

    # load prior data if it already exists
    if os.path.exists(meta_file) and resume:
        with open(meta_file, "r") as f:
            meta_writer.metadata = json.load(f)
        if args.verbose:
            print("previous metadata loaded: %d epochs. new epochs will be appended." %
                  len(meta_writer.metadata["epochs"]))
    elif args.verbose:
        print("starting with empty metadata")

    # the MetadataWriterCallback only sets 'epoch' and 'best_epoch'. We can add in anything else we like here
    # TODO - model and train_data are saved in meta_file; check that they match (and make args optional when restarting?)
    meta_writer.metadata["training_data"] = args.train_data
    meta_writer.metadata["model_file"] = args.model

    # create ModelCheckpoint to save weights every epoch
    checkpoint_template = os.path.join(args.out_directory, "weights.{epoch:05d}.hdf5")
    checkpointer = ModelCheckpoint(checkpoint_template)

    # load precomputed random-shuffle indices or create them
    # TODO - save each train/val/test indices separately so there's no danger of
    # changing args.train_val_test when resuming
    shuffle_file = os.path.join(args.out_directory, "shuffle.npz")
    if os.path.exists(shuffle_file) and resume:
        with open(shuffle_file, "r") as f:
            shuffle_indices = np.load(f)
        if args.verbose:
            print("loading previous data shuffling indices")
    else:
        # create shuffled indices
        shuffle_indices = np.random.permutation(n_total_data)
        with open(shuffle_file, "w") as f:
            np.save(f, shuffle_indices)
        if args.verbose:
            print("created new data shuffling indices")
    # training indices are the first consecutive set of shuffled indices, val next,
    # then test gets the remainder
    train_indices = shuffle_indices[0:n_train_data]
    val_indices = shuffle_indices[n_train_data:n_train_data + n_val_data]
    # test_indices = shuffle_indices[n_train_data + n_val_data:]

    symmetries = [BOARD_TRANSFORMATIONS[name]
                  for name in args.symmetries.strip().split(",")]

    # create dataset generators
    train_data_generator = shuffled_hdf5_batch_generator(
        dataset["states"], dataset["actions"], train_indices, args.minibatch, symmetries)
    val_data_generator = shuffled_hdf5_batch_generator(
        dataset["states"], dataset["actions"], val_indices, args.minibatch, symmetries)

    sgd = SGD(lr=args.learning_rate, decay=args.decay)
    model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=["accuracy"])

    samples_per_epoch = args.epoch_length or n_train_data

    if args.verbose:
        print("STARTING TRAINING")

    model.fit_generator(
        generator=train_data_generator,
        samples_per_epoch=samples_per_epoch,
        nb_epoch=args.epochs,
        callbacks=[checkpointer, meta_writer],
        validation_data=val_data_generator,
        nb_val_samples=n_val_data)
from AlphaGo.training.supervised_policy_trainer import run_training
from AlphaGo.models.policy import CNNPolicy
from cProfile import Profile

architecture = {'filters_per_layer': 128, 'layers': 12}
features = ['board', 'ones', 'turns_since']
policy = CNNPolicy(features, **architecture)
policy.save_model('model.json')

profile = Profile()

# --epochs 5 --minibatch 32 --learning-rate 0.01
arguments = ('model.json', 'debug_feature_planes.hdf5', 'training_results/', 5, 32, .01)


def run_supervised_policy_training():
    run_training(*arguments)


profile.runcall(run_supervised_policy_training)
profile.dump_stats('supervised_policy_training_bench_results.prof')
def handle_arguments(cmd_line_args=None):
    """Run data generation. command-line args may be passed in as a list
    """
    import argparse
    parser = argparse.ArgumentParser(
        description='Play games used for training the '
                    'value network (third phase of pipeline). '
                    'The final policy from the RL phase plays '
                    'against itself, and training pairs for the value '
                    'network are generated from the outcome of each '
                    'game, following an off-policy, uniform random move.')
    # required arguments
    parser.add_argument("SL_weights_path",
                        help="Path to file with supervised learning policy weights.")
    parser.add_argument("RL_weights_path",
                        help="Path to file with reinforcement learning policy weights.")
    parser.add_argument("model_path", help="Path to network architecture file.")
    # optional arguments
    parser.add_argument("--verbose", "-v", help="Turn on verbose mode",
                        default=False, action="store_true")
    parser.add_argument("--outfile", "-o",
                        help="Destination to write data (hdf5 file). Default: " + DEFAULT_FILE_NAME,
                        default=DEFAULT_FILE_NAME)
    parser.add_argument("--sgf-path",
                        help="If set, all SGF files will be saved here. Default: None",
                        default=None)
    parser.add_argument("--n-training-pairs",
                        help="Number of training pairs to generate. Default: " + str(DEFAULT_N_TRAINING_PAIRS),
                        type=int, default=DEFAULT_N_TRAINING_PAIRS)
    parser.add_argument("--batch-size",
                        help="Number of games to run in parallel. Default: " + str(DEFAULT_BATCH_SIZE),
                        type=int, default=DEFAULT_BATCH_SIZE)
    parser.add_argument("--features", "-f",
                        help="Comma-separated list of features to compute and store or 'all'. Default: all",
                        default='all')
    parser.add_argument("--sl-temperature",
                        help="Distribution temperature of players using SL policies. Default: " + str(DEFAULT_TEMPERATURE_SL),
                        type=float, default=DEFAULT_TEMPERATURE_SL)
    parser.add_argument("--rl-temperature",
                        help="Distribution temperature of players using RL policies. Default: " + str(DEFAULT_TEMPERATURE_RL),
                        type=float, default=DEFAULT_TEMPERATURE_RL)

    # show help or parse arguments
    if cmd_line_args is None:
        args = parser.parse_args()
    else:
        args = parser.parse_args(cmd_line_args)

    # list with features used for value network
    # features = policy_SL.preprocessor.feature_list
    if args.features.lower() == 'all':
        features = ["board", "ones", "turns_since", "liberties", "capture_size",
                    "self_atari_size", "liberties_after", "ladder_capture",
                    "ladder_escape", "sensibleness", "color"]
    else:
        features = args.features.split(",")

    # always add color feature
    if "color" not in features:
        features.append("color")

    # Load SL architecture and weights from file
    policy_SL = CNNPolicy.load_model(args.model_path)
    policy_SL.model.load_weights(args.SL_weights_path)
    # create SL player
    player_SL = ProbabilisticPolicyPlayer(policy_SL,
                                          temperature=args.sl_temperature,
                                          move_limit=DEFAULT_MAX_GAME_DEPTH)

    # Load RL architecture and weights from file
    policy_RL = CNNPolicy.load_model(args.model_path)
    policy_RL.model.load_weights(args.RL_weights_path)
    # Create RL player
    # TODO is it better to use greedy player?
    player_RL = ProbabilisticPolicyPlayer(policy_RL,
                                          temperature=args.rl_temperature,
                                          move_limit=DEFAULT_MAX_GAME_DEPTH)

    # check if folder exists
    if args.sgf_path is not None and not os.path.exists(args.sgf_path):
        os.makedirs(args.sgf_path)

    # generate data
    generate_data(player_RL, player_SL, args.outfile, args.n_training_pairs,
                  args.batch_size, policy_SL.model.input_shape[-1], features,
                  args.verbose, args.sgf_path)
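# A typical invocation of the script above, passing command-line args in as a
# list; all file paths here are hypothetical placeholders:
handle_arguments([
    'sl_weights.hdf5',              # supervised-learning policy weights
    'rl_weights.hdf5',              # reinforcement-learning policy weights
    'model.json',                   # network architecture JSON
    '--n-training-pairs', '1000',
    '--batch-size', '16',
    '--verbose',
])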