def test_probabilistic_player(self):
    gs = GameState()
    policy = CNNPolicy(["board", "ones", "turns_since"])
    player = ProbabilisticPolicyPlayer(policy)
    for i in range(20):
        move = player.get_move(gs)
        self.assertIsNotNone(move)
        gs.do_move(move)
def test_sensible_probabilistic(self):
    gs = GameState()
    policy = CNNPolicy(["board", "ones", "turns_since"])
    player = ProbabilisticPolicyPlayer(policy)
    # Fill the whole board except one point, leaving a single-point eye at (10, 10).
    empty = (10, 10)
    for x in range(19):
        for y in range(19):
            if (x, y) != empty:
                gs.do_move((x, y), go.BLACK)
    gs.current_player = go.BLACK
    # The only remaining move fills Black's own eye, so a sensible player
    # declines to play it and returns None.
    self.assertIsNone(player.get_move(gs))
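# For context, a minimal sketch of how a probabilistic player can pick a move.
# Assumptions (not confirmed by this excerpt): the state exposes
# get_legal_moves(include_eyes=False) to exclude single-point eyes, and the
# policy's eval_state(state, moves) returns (move, probability) pairs.
def sample_move(player, state):
    sensible_moves = state.get_legal_moves(include_eyes=False)
    if len(sensible_moves) == 0:
        # No sensible move left (e.g. only an eye-fill remains), as in the
        # test above: signal a pass by returning None.
        return None
    move_probs = player.policy.eval_state(state, sensible_moves)
    moves, probs = zip(*move_probs)
    probs = np.asarray(probs) / np.sum(probs)  # renormalize over the legal subset
    return moves[np.random.choice(len(moves), p=probs)]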
def test_extreme_temperature_is_numerically_stable(self):
    player_low = ProbabilisticPolicyPlayer(None, temperature=1e-12)
    player_high = ProbabilisticPolicyPlayer(None, temperature=1e+12)
    distribution = np.random.random(361)
    distribution = distribution / distribution.sum()
    # Neither a near-zero nor a huge temperature should produce NaNs.
    self.assertFalse(any(np.isnan(player_low.apply_temperature(distribution))))
    self.assertFalse(any(np.isnan(player_high.apply_temperature(distribution))))
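# A minimal sketch of a numerically stable apply_temperature, assuming the
# intended transform is p_i ** (1/T) followed by renormalization (an
# illustration, not the class's confirmed implementation). Working in the log
# domain and shifting by the max avoids overflow/NaN at extreme temperatures.
def apply_temperature_stable(distribution, temperature):
    log_probs = np.log(distribution) / temperature
    log_probs -= log_probs.max()   # largest exponent becomes 0, so exp() cannot overflow
    probs = np.exp(log_probs)
    return probs / probs.sum()     # zeros stay zero; no NaN for any positive T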
def test_temperature_increases_entropy(self):
    # Helper to compute the Shannon entropy of a distribution.
    def entropy(distribution):
        distribution = np.array(distribution).flatten()
        return -np.dot(np.log(distribution), distribution.T)

    player_low = ProbabilisticPolicyPlayer(None, temperature=0.9)
    player_high = ProbabilisticPolicyPlayer(None, temperature=1.1)
    distribution = np.random.random(361)
    distribution = distribution / distribution.sum()
    base_entropy = entropy(distribution)
    high_entropy = entropy(player_high.apply_temperature(distribution))
    low_entropy = entropy(player_low.apply_temperature(distribution))
    # Temperature above 1 flattens the distribution (more entropy);
    # temperature below 1 sharpens it (less entropy).
    self.assertGreater(high_entropy, base_entropy)
    self.assertLess(low_entropy, base_entropy)
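# A tiny worked example of the property the test above asserts, using the
# apply_temperature_stable sketch from earlier (values approximate):
#
#   p = np.array([0.7, 0.2, 0.1])
#   apply_temperature_stable(p, 0.5)  # ~[0.91, 0.07, 0.02] -> sharper, lower entropy
#   apply_temperature_stable(p, 2.0)  # ~[0.52, 0.28, 0.20] -> flatter, higher entropy
#
# Exponent 1/T > 1 (T < 1) widens the gaps between probabilities; 1/T < 1
# (T > 1) compresses them toward uniform, which raises the Shannon entropy.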
def run_training(cmd_line_args=None):
    parser = argparse.ArgumentParser(
        description='Perform reinforcement learning to improve given policy network. '
                    'Second phase of pipeline.')
    parser.add_argument("model_json", help="Path to policy model JSON.")
    parser.add_argument("initial_weights",
                        help="Path to HDF5 file with initial weights (i.e. result of supervised training).")
    parser.add_argument("out_directory",
                        help="Path to folder where the model params and metadata will be saved after each epoch.")
    parser.add_argument("--learning-rate", help="Keras learning rate (Default: 0.03)",
                        type=float, default=0.03)
    parser.add_argument("--policy-temp",
                        help="Distribution temperature of players using policies (Default: 0.67)",
                        type=float, default=0.67)
    parser.add_argument("--save-every",
                        help="Save policy as a new opponent every n batches (Default: 500)",
                        type=int, default=500)
    parser.add_argument("--game-batch", help="Number of games per mini-batch (Default: 20)",
                        type=int, default=20)
    parser.add_argument("--iterations",
                        help="Number of training batches/iterations (Default: 10000)",
                        type=int, default=10000)
    parser.add_argument("--resume", help="Load latest weights in out_directory and resume",
                        default=False, action="store_true")
    parser.add_argument("--verbose", "-v", help="Turn on verbose mode",
                        default=False, action="store_true")
    # Baseline function (TODO) default lambda state: 0 (receives either file
    # paths to JSON and weights or None, in which case it uses default baseline 0)
    if cmd_line_args is None:
        args = parser.parse_args()
    else:
        args = parser.parse_args(cmd_line_args)

    ZEROTH_FILE = "weights.00000.hdf5"

    if args.resume:
        if not os.path.exists(os.path.join(args.out_directory, "metadata.json")):
            raise ValueError("Cannot resume without existing output directory")

    if not os.path.exists(args.out_directory):
        if args.verbose:
            print("creating output directory {}".format(args.out_directory))
        os.makedirs(args.out_directory)

    if not args.resume:
        # Make a copy of the weights file as "weights.00000.hdf5" in the output directory.
        copyfile(args.initial_weights, os.path.join(args.out_directory, ZEROTH_FILE))
        if args.verbose:
            print("copied {} to {}".format(args.initial_weights,
                                           os.path.join(args.out_directory, ZEROTH_FILE)))
        player_weights = ZEROTH_FILE
    else:
        # If resuming, we expect initial_weights to be just a
        # "weights.#####.hdf5" file, not a full path.
        args.initial_weights = os.path.join(args.out_directory,
                                            os.path.basename(args.initial_weights))
        if not os.path.exists(args.initial_weights):
            raise ValueError("Cannot resume; weights {} do not exist".format(
                args.initial_weights))
        elif args.verbose:
            print("Resuming with weights {}".format(args.initial_weights))
        player_weights = os.path.basename(args.initial_weights)

    # Set initial conditions.
    policy = CNNPolicy.load_model(args.model_json)
    policy.model.load_weights(args.initial_weights)
    player = ProbabilisticPolicyPlayer(policy, temperature=args.policy_temp)
    features = policy.preprocessor.feature_list

    # Different opponents come from simply changing the weights of
    # opponent.policy.model "behind the scenes".
    opp_policy = CNNPolicy.load_model(args.model_json)
    opponent = ProbabilisticPolicyPlayer(opp_policy, temperature=args.policy_temp)

    if args.verbose:
        print("created player and opponent with temperature {}".format(args.policy_temp))

    if not args.resume:
        metadata = {
            "model_file": args.model_json,
            "init_weights": args.initial_weights,
            "learning_rate": args.learning_rate,
            "temperature": args.policy_temp,
            "game_batch": args.game_batch,
            # Weights from which to sample an opponent each batch.
            "opponents": [ZEROTH_FILE],
            # Map from player to tuple of (opponent, win ratio). Useful for
            # validating in lieu of 'accuracy/loss'.
            "win_ratio": {}
        }
    else:
        with open(os.path.join(args.out_directory, "metadata.json"), "r") as f:
            metadata = json.load(f)

    def save_metadata():
        with open(os.path.join(args.out_directory, "metadata.json"), "w") as f:
            json.dump(metadata, f)

    # Set up SGD and compile.
    sgd = SGD(lr=args.learning_rate)
    player.policy.model.compile(loss='binary_crossentropy', optimizer=sgd)
    board_size = player.policy.model.input_shape[-1]
    for i_iter in range(1, args.iterations + 1):
        # Train mini-batches by randomly choosing an opponent from the pool
        # (possibly self) and playing game_batch games against them.
        opp_weights = np.random.choice(metadata["opponents"])
        opp_path = os.path.join(args.out_directory, opp_weights)
        # Load new weights into the opponent; otherwise it's the same object.
        opponent.policy.model.load_weights(opp_path)
        if args.verbose:
            print("Batch {}\tsampled opponent is {}".format(i_iter, opp_weights))
        # Make training pairs and do RL.
        X_list, y_list, winners = make_training_pairs(player, opponent, features,
                                                      args.game_batch, board_size)
        win_ratio = np.sum(np.array(winners) == 1) / float(args.game_batch)
        metadata["win_ratio"][player_weights] = (opp_weights, win_ratio)
        train_batch(player, X_list, y_list, winners, args.learning_rate)
        # Save intermediate models.
        player_weights = "weights.%05d.hdf5" % i_iter
        player.policy.model.save_weights(
            os.path.join(args.out_directory, player_weights))
        # Add player to the pool of opponents once in a while.
        if i_iter % args.save_every == 0:
            metadata["opponents"].append(player_weights)
        save_metadata()
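# Usage sketch: because the parser accepts an explicit argument list, training
# can also be driven from another script. The paths below are placeholders,
# not files shipped with the project.
run_training([
    "policy_model.json",    # model_json
    "sl_weights.hdf5",      # initial_weights (result of supervised training)
    "training_results/",    # out_directory
    "--game-batch", "20",
    "--save-every", "500",
    "--verbose",
])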
def handle_arguments(cmd_line_args=None):
    """Run data generation. Command-line args may be passed in as a list."""
    import argparse
    parser = argparse.ArgumentParser(
        description='Play games used for training the '
                    'value network (third phase of pipeline). '
                    'The final policy from the RL phase plays '
                    'against itself, and training pairs for the value '
                    'network are generated from the outcome of each '
                    'game, following an off-policy, uniform random move.')
    # required arguments
    parser.add_argument("SL_weights_path",
                        help="Path to file with supervised learning policy weights.")  # noqa: E501
    parser.add_argument("RL_weights_path",
                        help="Path to file with reinforcement learning policy weights.")  # noqa: E501
    parser.add_argument("model_path", help="Path to network architecture file.")
    # optional arguments
    parser.add_argument("--verbose", "-v", help="Turn on verbose mode",
                        default=False, action="store_true")  # noqa: E501
    parser.add_argument("--outfile", "-o",
                        help="Destination to write data (hdf5 file). Default: " + DEFAULT_FILE_NAME,
                        default=DEFAULT_FILE_NAME)  # noqa: E501
    parser.add_argument("--sgf-path",
                        help="If set, all SGF files will be saved here. Default: None",
                        default=None)  # noqa: E501
    parser.add_argument("--n-training-pairs",
                        help="Number of training pairs to generate. Default: " +
                             str(DEFAULT_N_TRAINING_PAIRS),
                        type=int, default=DEFAULT_N_TRAINING_PAIRS)  # noqa: E501
    parser.add_argument("--batch-size",
                        help="Number of games to run in parallel. Default: " +
                             str(DEFAULT_BATCH_SIZE),
                        type=int, default=DEFAULT_BATCH_SIZE)  # noqa: E501
    parser.add_argument("--features", "-f",
                        help="Comma-separated list of features to compute and store, "
                             "or 'all'. Default: all",
                        default='all')  # noqa: E501
    parser.add_argument("--sl-temperature",
                        help="Distribution temperature of players using SL policies. Default: " +
                             str(DEFAULT_TEMPERATURE_SL),
                        type=float, default=DEFAULT_TEMPERATURE_SL)  # noqa: E501
    parser.add_argument("--rl-temperature",
                        help="Distribution temperature of players using RL policies. Default: " +
                             str(DEFAULT_TEMPERATURE_RL),
                        type=float, default=DEFAULT_TEMPERATURE_RL)  # noqa: E501

    # show help or parse arguments
    if cmd_line_args is None:
        args = parser.parse_args()
    else:
        args = parser.parse_args(cmd_line_args)

    # list of features used for the value network
    # features = policy_SL.preprocessor.feature_list
    if args.features.lower() == 'all':
        features = ["board", "ones", "turns_since", "liberties", "capture_size",
                    "self_atari_size", "liberties_after", "ladder_capture",
                    "ladder_escape", "sensibleness", "color"]
    else:
        features = args.features.split(",")
    # always add color feature
    if "color" not in features:
        features.append("color")

    # Load SL architecture and weights from file.
    policy_SL = CNNPolicy.load_model(args.model_path)
    policy_SL.model.load_weights(args.SL_weights_path)
    # create SL player
    player_SL = ProbabilisticPolicyPlayer(policy_SL,
                                          temperature=args.sl_temperature,
                                          move_limit=DEFAULT_MAX_GAME_DEPTH)

    # Load RL architecture and weights from file.
    policy_RL = CNNPolicy.load_model(args.model_path)
    policy_RL.model.load_weights(args.RL_weights_path)
    # Create RL player.
    # TODO: is it better to use a greedy player?
    player_RL = ProbabilisticPolicyPlayer(policy_RL,
                                          temperature=args.rl_temperature,
                                          move_limit=DEFAULT_MAX_GAME_DEPTH)

    # Create the SGF folder if it does not exist yet.
    if args.sgf_path is not None and not os.path.exists(args.sgf_path):
        os.makedirs(args.sgf_path)

    # Generate the data.
    generate_data(player_RL, player_SL, args.outfile, args.n_training_pairs,
                  args.batch_size, policy_SL.model.input_shape[-1], features,
                  args.verbose, args.sgf_path)
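# A hedged sketch of the sampling scheme the description above refers to (the
# AlphaGo-style off-policy random move): play to a random depth U with the SL
# player, make one uniformly random move, then let the RL player finish the
# game; the training pair is (state after the random move, final outcome).
# Helper names and sign conventions here are assumptions for illustration.
def one_training_pair(player_SL, player_RL, max_depth):
    gs = GameState()
    U = np.random.randint(1, max_depth)
    for _ in range(U - 1):                  # moves 1 .. U-1 come from the SL policy
        gs.do_move(player_SL.get_move(gs))
    legal = gs.get_legal_moves()            # move U is off-policy: uniform random
    gs.do_move(legal[np.random.choice(len(legal))])
    training_state = gs.copy()              # input to the value network
    while not gs.is_end_of_game:            # RL policy self-plays to the end
        gs.do_move(player_RL.get_move(gs))
    return training_state, gs.get_winner()  # target: game outcome (sign handling omitted)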
def run_training(cmd_line_args=None):
    import argparse
    parser = argparse.ArgumentParser(
        description='Perform reinforcement learning to improve given policy network. '
                    'Second phase of pipeline.')  # noqa: E501
    parser.add_argument("model_json", help="Path to policy model JSON.")
    parser.add_argument("initial_weights",
                        help="Path to HDF5 file with initial weights (i.e. result of supervised training).")  # noqa: E501
    parser.add_argument("out_directory",
                        help="Path to folder where the model params and metadata will be saved after each epoch.")  # noqa: E501
    parser.add_argument("--learning-rate", help="Keras learning rate (Default: 0.001)",
                        type=float, default=0.001)  # noqa: E501
    parser.add_argument("--policy-temp",
                        help="Distribution temperature of players using policies (Default: 0.67)",
                        type=float, default=0.67)  # noqa: E501
    parser.add_argument("--save-every",
                        help="Save policy as a new opponent every n batches (Default: 500)",
                        type=int, default=500)  # noqa: E501
    parser.add_argument("--record-every",
                        help="Save learner's weights every n batches (Default: 1)",
                        type=int, default=1)  # noqa: E501
    parser.add_argument("--game-batch",
                        help="Number of games per mini-batch (Default: 20)",
                        type=int, default=20)  # noqa: E501
    parser.add_argument("--move-limit",
                        help="Maximum number of moves per game (Default: 500)",
                        type=int, default=500)  # noqa: E501
    parser.add_argument("--iterations",
                        help="Number of training batches/iterations (Default: 10000)",
                        type=int, default=10000)  # noqa: E501
    parser.add_argument("--resume",
                        help="Load latest weights in out_directory and resume",
                        default=False, action="store_true")  # noqa: E501
    parser.add_argument("--verbose", "-v", help="Turn on verbose mode",
                        default=False, action="store_true")  # noqa: E501
    # Baseline function (TODO) default lambda state: 0 (receives either file
    # paths to JSON and weights or None, in which case it uses default baseline 0)
    if cmd_line_args is None:
        args = parser.parse_args()
    else:
        args = parser.parse_args(cmd_line_args)

    ZEROTH_FILE = "weights.00000.hdf5"

    if args.resume:
        if not os.path.exists(os.path.join(args.out_directory, "metadata.json")):
            raise ValueError("Cannot resume without existing output directory")

    if not os.path.exists(args.out_directory):
        if args.verbose:
            print("creating output directory {}".format(args.out_directory))
        os.makedirs(args.out_directory)

    if not args.resume:
        # Make a copy of the weights file as "weights.00000.hdf5" in the output directory.
        copyfile(args.initial_weights, os.path.join(args.out_directory, ZEROTH_FILE))
        if args.verbose:
            print("copied {} to {}".format(args.initial_weights,
                                           os.path.join(args.out_directory, ZEROTH_FILE)))
        player_weights = ZEROTH_FILE
        iter_start = 1
    else:
        # If resuming, we expect initial_weights to be just a
        # "weights.#####.hdf5" file, not a full path.
        if not re.match(r"weights\.\d{5}\.hdf5", args.initial_weights):
            raise ValueError(
                "Expected to resume from weights file with name 'weights.#####.hdf5'")
        args.initial_weights = os.path.join(args.out_directory,
                                            os.path.basename(args.initial_weights))
        if not os.path.exists(args.initial_weights):
            raise ValueError("Cannot resume; weights {} do not exist".format(
                args.initial_weights))
        elif args.verbose:
            print("Resuming with weights {}".format(args.initial_weights))
        player_weights = os.path.basename(args.initial_weights)
        # Resume iteration numbering from the digits in "weights.#####.hdf5".
        iter_start = 1 + int(player_weights[8:13])

    # Set initial conditions.
    policy = CNNPolicy.load_model(args.model_json)
    policy.model.load_weights(args.initial_weights)
    player = ProbabilisticPolicyPlayer(policy, temperature=args.policy_temp,
                                       move_limit=args.move_limit)

    # Different opponents come from simply changing the weights of 'opponent.policy.model'.
    # That is, only 'opp_policy' needs to be changed, and 'opponent' will change.
    opp_policy = CNNPolicy.load_model(args.model_json)
    opponent = ProbabilisticPolicyPlayer(opp_policy, temperature=args.policy_temp,
                                         move_limit=args.move_limit)

    if args.verbose:
        print("created player and opponent with temperature {}".format(args.policy_temp))

    if not args.resume:
        metadata = {
            "model_file": args.model_json,
            "init_weights": args.initial_weights,
            "learning_rate": args.learning_rate,
            "temperature": args.policy_temp,
            "game_batch": args.game_batch,
            # Weights from which to sample an opponent each batch.
            "opponents": [ZEROTH_FILE],
            # Map from player to tuple of (opponent, win ratio). Useful for
            # validating in lieu of 'accuracy/loss'.
            "win_ratio": {}
        }
    else:
        with open(os.path.join(args.out_directory, "metadata.json"), "r") as f:
            metadata = json.load(f)

    # Append args of current run to history of full command args.
    metadata["cmd_line_args"] = metadata.get("cmd_line_args", [])
    metadata["cmd_line_args"].append(vars(args))

    def save_metadata():
        with open(os.path.join(args.out_directory, "metadata.json"), "w") as f:
            json.dump(metadata, f, sort_keys=True, indent=2)

    optimizer = SGD(lr=args.learning_rate)
    player.policy.model.compile(loss=log_loss, optimizer=optimizer)
    for i_iter in range(iter_start, args.iterations + 1):
        # Note that player_weights will only be saved as a file every args.record_every
        # iterations. Regardless, player_weights enters into the metadata to keep track
        # of the win ratio over time.
        player_weights = "weights.%05d.hdf5" % i_iter

        # Randomly choose an opponent from the pool (possibly self), and play
        # game_batch games against them.
        opp_weights = np.random.choice(metadata["opponents"])
        opp_path = os.path.join(args.out_directory, opp_weights)

        # Load new weights into the opponent's network, but keep the same opponent object.
        opponent.policy.model.load_weights(opp_path)
        if args.verbose:
            print("Batch {}\tsampled opponent is {}".format(i_iter, opp_weights))

        # Run games (and learn from the results). Keep track of the win ratio vs
        # each opponent over time.
        win_ratio = run_n_games(optimizer, args.learning_rate, player, opponent,
                                args.game_batch)
        metadata["win_ratio"][player_weights] = (opp_weights, win_ratio)

        # Save intermediate models.
        if i_iter % args.record_every == 0:
            player.policy.model.save_weights(
                os.path.join(args.out_directory, player_weights))

        # Add player to the pool of opponents once in a while.
        if i_iter % args.save_every == 0:
            metadata["opponents"].append(player_weights)

        save_metadata()
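# The model above is compiled with a 'log_loss' that is referenced but not
# defined in this excerpt. A minimal sketch of what such a loss might look
# like for REINFORCE-style policy-gradient updates, assuming the Keras backend
# API of that era; this is an assumption, not the confirmed definition.
from keras import backend as K

def log_loss(y_true, y_pred):
    # Negative log-likelihood of the actions actually taken; y_true is a
    # one-hot encoding of the chosen move. Clipping guards against log(0).
    return -y_true * K.log(K.clip(y_pred, K.epsilon(), 1.0 - K.epsilon()))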