    def test_probabilistic_player(self):
        gs = GameState()
        policy = CNNPolicy(["board", "ones", "turns_since"])
        player = ProbabilisticPolicyPlayer(policy)
        for i in range(20):
            move = player.get_move(gs)
            self.assertIsNotNone(move)
            gs.do_move(move)

    def test_sensible_probabilistic(self):
        gs = GameState()
        policy = CNNPolicy(["board", "ones", "turns_since"])
        player = ProbabilisticPolicyPlayer(policy)
        empty = (10, 10)
        for x in range(19):
            for y in range(19):
                if (x, y) != empty:
                    gs.do_move((x, y), go.BLACK)
        gs.current_player = go.BLACK
        self.assertIsNone(player.get_move(gs))

    def test_extreme_temperature_is_numerically_stable(self):
        player_low = ProbabilisticPolicyPlayer(None, temperature=1e-12)
        player_high = ProbabilisticPolicyPlayer(None, temperature=1e+12)

        distribution = np.random.random(361)
        distribution = distribution / distribution.sum()

        self.assertFalse(any(np.isnan(player_low.apply_temperature(distribution))))
        self.assertFalse(any(np.isnan(player_high.apply_temperature(distribution))))
    def test_temperature_increases_entropy(self):
        # helper function to get the entropy of a distribution
        def entropy(distribution):
            distribution = np.array(distribution).flatten()
            return -np.dot(np.log(distribution), distribution.T)
        player_low = ProbabilisticPolicyPlayer(None, temperature=0.9)
        player_high = ProbabilisticPolicyPlayer(None, temperature=1.1)

        distribution = np.random.random(361)
        distribution = distribution / distribution.sum()

        base_entropy = entropy(distribution)
        high_entropy = entropy(player_high.apply_temperature(distribution))
        low_entropy = entropy(player_low.apply_temperature(distribution))

        self.assertGreater(high_entropy, base_entropy)
        self.assertLess(low_entropy, base_entropy)
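
The two tests above pin down the behaviour of ProbabilisticPolicyPlayer.apply_temperature: it must stay finite for extreme temperatures and must move entropy in the expected direction. A minimal sketch that satisfies both, assuming the player stores beta = 1.0 / temperature and raises each probability to the power beta in log space (beta and the standalone signature are assumptions here; the project's actual method may differ):

import numpy as np

def apply_temperature(distribution, beta):
    # Sketch only. beta = 1.0 / temperature: beta > 1 sharpens the
    # distribution (lower entropy), beta < 1 flattens it (higher entropy).
    log_probabilities = beta * np.log(distribution)
    # Shift so the maximum is 0 before exponentiating; this keeps np.exp
    # from overflowing or producing NaN even for extreme beta.
    log_probabilities = log_probabilities - log_probabilities.max()
    probabilities = np.exp(log_probabilities)
    # Re-normalize so the result sums to 1.
    return probabilities / probabilities.sum()
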
Example #7
def run_training(cmd_line_args=None):
    parser = argparse.ArgumentParser(
        description=
        'Perform reinforcement learning to improve given policy network. Second phase of pipeline.'
    )
    parser.add_argument("model_json", help="Path to policy model JSON.")
    parser.add_argument(
        "initial_weights",
        help=
        "Path to HDF5 file with inital weights (i.e. result of supervised training)."
    )
    parser.add_argument(
        "out_directory",
        help=
        "Path to folder where the model params and metadata will be saved after each epoch."
    )
    parser.add_argument("--learning-rate",
                        help="Keras learning rate (Default: .03)",
                        type=float,
                        default=.03)
    parser.add_argument(
        "--policy-temp",
        help=
        "Distribution temperature of players using policies (Default: 0.67)",
        type=float,
        default=0.67)
    parser.add_argument(
        "--save-every",
        help="Save policy as a new opponent every n batches (Default: 500)",
        type=int,
        default=500)
    parser.add_argument("--game-batch",
                        help="Number of games per mini-batch (Default: 20)",
                        type=int,
                        default=20)
    parser.add_argument(
        "--iterations",
        help="Number of training batches/iterations (Default: 10000)",
        type=int,
        default=10000)
    parser.add_argument("--resume",
                        help="Load latest weights in out_directory and resume",
                        default=False,
                        action="store_true")
    parser.add_argument("--verbose",
                        "-v",
                        help="Turn on verbose mode",
                        default=False,
                        action="store_true")
    # Baseline function (TODO) default lambda state: 0  (receives either file
    # paths to JSON and weights or None, in which case it uses default baseline 0)
    if cmd_line_args is None:
        args = parser.parse_args()
    else:
        args = parser.parse_args(cmd_line_args)

    ZEROTH_FILE = "weights.00000.hdf5"

    if args.resume:
        if not os.path.exists(os.path.join(args.out_directory,
                                           "metadata.json")):
            raise ValueError("Cannot resume without existing output directory")

    if not os.path.exists(args.out_directory):
        if args.verbose:
            print("creating output directory {}".format(args.out_directory))
        os.makedirs(args.out_directory)

    if not args.resume:
        # make a copy of the weights file as "weights.00000.hdf5" in the output directory
        copyfile(args.initial_weights,
                 os.path.join(args.out_directory, ZEROTH_FILE))
        if args.verbose:
            print("copied {} to {}".format(
                args.initial_weights,
                os.path.join(args.out_directory, ZEROTH_FILE)))
        player_weights = ZEROTH_FILE
    else:
        # if resuming, we expect initial_weights to be just a "weights.#####.hdf5" file, not a full path
        args.initial_weights = os.path.join(
            args.out_directory, os.path.basename(args.initial_weights))
        if not os.path.exists(args.initial_weights):
            raise ValueError("Cannot resume; weights {} do not exist".format(
                args.initial_weights))
        elif args.verbose:
            print("Resuming with weights {}".format(args.initial_weights))
        player_weights = os.path.basename(args.initial_weights)

    # Set initial conditions
    policy = CNNPolicy.load_model(args.model_json)
    policy.model.load_weights(args.initial_weights)
    player = ProbabilisticPolicyPlayer(policy, temperature=args.policy_temp)
    features = policy.preprocessor.feature_list

    # different opponents come from simply changing the weights of
    # opponent.policy.model "behind the scenes"
    opp_policy = CNNPolicy.load_model(args.model_json)
    opponent = ProbabilisticPolicyPlayer(opp_policy,
                                         temperature=args.policy_temp)

    if args.verbose:
        print("created player and opponent with temperature {}".format(
            args.policy_temp))

    if not args.resume:
        metadata = {
            "model_file": args.model_json,
            "init_weights": args.initial_weights,
            "learning_rate": args.learning_rate,
            "temperature": args.policy_temp,
            "game_batch": args.game_batch,
            "opponents":
            [ZEROTH_FILE
             ],  # which weights from which to sample an opponent each batch
            "win_ratio": {
            }  # map from player to tuple of (opponent, win ratio) Useful for validating in lieu of 'accuracy/loss'
        }
    else:
        with open(os.path.join(args.out_directory, "metadata.json"), "r") as f:
            metadata = json.load(f)

    def save_metadata():
        with open(os.path.join(args.out_directory, "metadata.json"), "w") as f:
            json.dump(metadata, f)

    # Set SGD and compile
    sgd = SGD(lr=args.learning_rate)
    player.policy.model.compile(loss='binary_crossentropy', optimizer=sgd)
    board_size = player.policy.model.input_shape[-1]
    for i_iter in range(1, args.iterations + 1):
        # Train mini-batches by randomly choosing an opponent from the pool
        # (possibly self) and playing game_batch games against it
        opp_weights = np.random.choice(metadata["opponents"])
        opp_path = os.path.join(args.out_directory, opp_weights)
        # load new weights into the opponent, but otherwise it's the same player
        opponent.policy.model.load_weights(opp_path)
        if args.verbose:
            print("Batch {}\tsampled opponent is {}".format(
                i_iter, opp_weights))
        # Make training pairs and do RL
        X_list, y_list, winners = make_training_pairs(player, opponent,
                                                      features,
                                                      args.game_batch,
                                                      board_size)
        win_ratio = np.sum(np.array(winners) == 1) / float(args.game_batch)
        metadata["win_ratio"][player_weights] = (opp_weights, win_ratio)
        train_batch(player, X_list, y_list, winners, args.learning_rate)
        # Save intermediate models
        player_weights = "weights.%05d.hdf5" % i_iter
        player.policy.model.save_weights(
            os.path.join(args.out_directory, player_weights))
        # add the current player to the pool of opponents once in a while
        if i_iter % args.save_every == 0:
            metadata["opponents"].append(player_weights)
        save_metadata()
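
The loop above delegates game generation to make_training_pairs and the actual policy-gradient update to train_batch, both defined elsewhere in the project. As a rough sketch of the REINFORCE-style update train_batch is expected to perform, assuming Keras-style optimizers and winners encoded as +1/-1 from the learner's perspective (an illustration only, not the project's exact implementation):

from keras import backend as K

def train_batch(player, X_list, y_list, winners, learning_rate):
    # Sketch only: reinforce moves from won games with a positive learning
    # rate and penalize moves from lost games by flipping its sign.
    for X, y, winner in zip(X_list, y_list, winners):
        K.set_value(player.policy.model.optimizer.lr,
                    abs(learning_rate) * winner)
        player.policy.model.train_on_batch(X, y)
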
def handle_arguments(cmd_line_args=None):
    """Run generate data. command-line args may be passed in as a list
    """

    import argparse
    parser = argparse.ArgumentParser(
        description='Play games used for training the '
        'value network (third phase of pipeline). '
        'The final policy from the RL phase plays '
        'against itself, and training pairs for the value '
        'network are generated from the outcome of each '
        'game, following an off-policy, uniform-random move.')
    # required arguments
    parser.add_argument(
        "SL_weights_path",
        help="Path to file with supervised learning policy weights."
    )  # noqa: E501
    parser.add_argument(
        "RL_weights_path",
        help="Path to file with reinforcement learning policy weights."
    )  # noqa: E501
    parser.add_argument("model_path",
                        help="Path to network architecture file.")
    # optional arguments
    parser.add_argument("--verbose",
                        "-v",
                        help="Turn on verbose mode",
                        default=False,
                        action="store_true")  # noqa: E501
    parser.add_argument(
        "--outfile",
        "-o",
        help="Destination to write data (hdf5 file) Default: " +
        DEFAULT_FILE_NAME,
        default=DEFAULT_FILE_NAME)  # noqa: E501
    parser.add_argument(
        "--sgf-path",
        help="If set all sgf will be saved here. Default: None",
        default=None)  # noqa: E501
    parser.add_argument(
        "--n-training-pairs",
        help="Number of training pairs to generate. Default: " +
        str(DEFAULT_N_TRAINING_PAIRS),
        type=int,
        default=DEFAULT_N_TRAINING_PAIRS)  # noqa: E501
    parser.add_argument("--batch-size",
                        help="Number of games to run in parallel. Default: " +
                        str(DEFAULT_BATCH_SIZE),
                        type=int,
                        default=DEFAULT_BATCH_SIZE)  # noqa: E501
    parser.add_argument(
        "--features",
        "-f",
        help=
        "Comma-separated list of features to compute and store or 'all'. Default: all",
        default='all')  # noqa: E501
    parser.add_argument(
        "--sl-temperature",
        help="Distribution temperature of players using SL policies. Default: "
        + str(DEFAULT_TEMPERATURE_SL),
        type=float,
        default=DEFAULT_TEMPERATURE_SL)  # noqa: E501
    parser.add_argument(
        "--rl-temperature",
        help="Distribution temperature of players using RL policies. Default: "
        + str(DEFAULT_TEMPERATURE_RL),
        type=float,
        default=DEFAULT_TEMPERATURE_RL)  # noqa: E501

    # show help or parse arguments
    if cmd_line_args is None:
        args = parser.parse_args()
    else:
        args = parser.parse_args(cmd_line_args)

    # list with features used for value network
    # features = policy_SL.preprocessor.feature_list
    if args.features.lower() == 'all':
        features = [
            "board", "ones", "turns_since", "liberties", "capture_size",
            "self_atari_size", "liberties_after", "ladder_capture",
            "ladder_escape", "sensibleness", "color"
        ]
    else:
        features = args.features.split(",")

    # always add color feature
    if "color" not in features:
        features.append("color")

    # Load SL architecture and weights from file
    policy_SL = CNNPolicy.load_model(args.model_path)
    policy_SL.model.load_weights(args.SL_weights_path)
    # create SL player
    player_SL = ProbabilisticPolicyPlayer(policy_SL,
                                          temperature=args.sl_temperature,
                                          move_limit=DEFAULT_MAX_GAME_DEPTH)

    # Load RL architecture and weights from file
    policy_RL = CNNPolicy.load_model(args.model_path)
    policy_RL.model.load_weights(args.RL_weights_path)
    # Create RL player
    # TODO is it better to use greedy player?
    player_RL = ProbabilisticPolicyPlayer(policy_RL,
                                          temperature=args.rl_temperature,
                                          move_limit=DEFAULT_MAX_GAME_DEPTH)

    # check if folder exists
    if args.sgf_path is not None and not os.path.exists(args.sgf_path):
        os.makedirs(args.sgf_path)

    # generate data
    generate_data(player_RL, player_SL, args.outfile, args.n_training_pairs,
                  args.batch_size, policy_SL.model.input_shape[-1], features,
                  args.verbose, args.sgf_path)
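
Since handle_arguments accepts the argument list directly, it can also be driven from Python. A hypothetical invocation (the file names below are placeholders; only the flags defined above are real):

handle_arguments([
    "sl_policy_weights.hdf5",   # SL_weights_path (placeholder file name)
    "rl_policy_weights.hdf5",   # RL_weights_path (placeholder file name)
    "model.json",               # model_path (placeholder file name)
    "--outfile", "value_net_data.hdf5",
    "--n-training-pairs", "1000",
    "--batch-size", "16",
    "--verbose",
])
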
Example #9
def run_training(cmd_line_args=None):
    import argparse
    parser = argparse.ArgumentParser(
        description=
        'Perform reinforcement learning to improve given policy network. Second phase of pipeline.'
    )  # noqa: E501
    parser.add_argument("model_json", help="Path to policy model JSON.")
    parser.add_argument(
        "initial_weights",
        help=
        "Path to HDF5 file with inital weights (i.e. result of supervised training)."
    )  # noqa: E501
    parser.add_argument(
        "out_directory",
        help=
        "Path to folder where the model params and metadata will be saved after each epoch."
    )  # noqa: E501
    parser.add_argument("--learning-rate",
                        help="Keras learning rate (Default: 0.001)",
                        type=float,
                        default=0.001)  # noqa: E501
    parser.add_argument(
        "--policy-temp",
        help=
        "Distribution temperature of players using policies (Default: 0.67)",
        type=float,
        default=0.67)  # noqa: E501
    parser.add_argument(
        "--save-every",
        help="Save policy as a new opponent every n batches (Default: 500)",
        type=int,
        default=500)  # noqa: E501
    parser.add_argument(
        "--record-every",
        help="Save learner's weights every n batches (Default: 1)",
        type=int,
        default=1)  # noqa: E501
    parser.add_argument("--game-batch",
                        help="Number of games per mini-batch (Default: 20)",
                        type=int,
                        default=20)  # noqa: E501
    parser.add_argument("--move-limit",
                        help="Maximum number of moves per game",
                        type=int,
                        default=500)  # noqa: E501
    parser.add_argument(
        "--iterations",
        help="Number of training batches/iterations (Default: 10000)",
        type=int,
        default=10000)  # noqa: E501
    parser.add_argument("--resume",
                        help="Load latest weights in out_directory and resume",
                        default=False,
                        action="store_true")  # noqa: E501
    parser.add_argument("--verbose",
                        "-v",
                        help="Turn on verbose mode",
                        default=False,
                        action="store_true")  # noqa: E501
    # Baseline function (TODO) default lambda state: 0  (receives either file
    # paths to JSON and weights or None, in which case it uses default baseline 0)
    if cmd_line_args is None:
        args = parser.parse_args()
    else:
        args = parser.parse_args(cmd_line_args)

    ZEROTH_FILE = "weights.00000.hdf5"

    if args.resume:
        if not os.path.exists(os.path.join(args.out_directory,
                                           "metadata.json")):
            raise ValueError("Cannot resume without existing output directory")

    if not os.path.exists(args.out_directory):
        if args.verbose:
            print("creating output directory {}".format(args.out_directory))
        os.makedirs(args.out_directory)

    if not args.resume:
        # make a copy of the weights file as "weights.00000.hdf5" in the output directory
        copyfile(args.initial_weights,
                 os.path.join(args.out_directory, ZEROTH_FILE))
        if args.verbose:
            print("copied {} to {}".format(
                args.initial_weights,
                os.path.join(args.out_directory, ZEROTH_FILE)))
        player_weights = ZEROTH_FILE
        iter_start = 1
    else:
        # if resuming, we expect initial_weights to be just a
        # "weights.#####.hdf5" file, not a full path
        if not re.match(r"weights\.\d{5}\.hdf5", args.initial_weights):
            raise ValueError(
                "Expected to resume from weights file with name 'weights.#####.hdf5'"
            )
        args.initial_weights = os.path.join(
            args.out_directory, os.path.basename(args.initial_weights))
        if not os.path.exists(args.initial_weights):
            raise ValueError("Cannot resume; weights {} do not exist".format(
                args.initial_weights))
        elif args.verbose:
            print("Resuming with weights {}".format(args.initial_weights))
        player_weights = os.path.basename(args.initial_weights)
        iter_start = 1 + int(player_weights[8:13])

    # Set initial conditions
    policy = CNNPolicy.load_model(args.model_json)
    policy.model.load_weights(args.initial_weights)
    player = ProbabilisticPolicyPlayer(policy,
                                       temperature=args.policy_temp,
                                       move_limit=args.move_limit)

    # different opponents come from simply changing the weights of 'opponent.policy.model'. That
    # is, only 'opp_policy' needs to be changed, and 'opponent' will change.
    opp_policy = CNNPolicy.load_model(args.model_json)
    opponent = ProbabilisticPolicyPlayer(opp_policy,
                                         temperature=args.policy_temp,
                                         move_limit=args.move_limit)

    if args.verbose:
        print("created player and opponent with temperature {}".format(
            args.policy_temp))

    if not args.resume:
        metadata = {
            "model_file": args.model_json,
            "init_weights": args.initial_weights,
            "learning_rate": args.learning_rate,
            "temperature": args.policy_temp,
            "game_batch": args.game_batch,
            "opponents":
            [ZEROTH_FILE
             ],  # which weights from which to sample an opponent each batch
            "win_ratio":
            {}  # map from player to tuple of (opponent, win ratio) Useful for
            # validating in lieu of 'accuracy/loss'
        }
    else:
        with open(os.path.join(args.out_directory, "metadata.json"), "r") as f:
            metadata = json.load(f)

    # Append args of current run to history of full command args.
    metadata["cmd_line_args"] = metadata.get("cmd_line_args", [])
    metadata["cmd_line_args"].append(vars(args))

    def save_metadata():
        with open(os.path.join(args.out_directory, "metadata.json"), "w") as f:
            json.dump(metadata, f, sort_keys=True, indent=2)

    optimizer = SGD(lr=args.learning_rate)
    player.policy.model.compile(loss=log_loss, optimizer=optimizer)
    for i_iter in range(iter_start, args.iterations + 1):
        # Note that player_weights will only be saved as a file every args.record_every iterations.
        # Regardless, player_weights enters into the metadata to keep track of the win ratio over
        # time.
        player_weights = "weights.%05d.hdf5" % i_iter

        # Randomly choose an opponent from the pool (possibly self) and play
        # game_batch games against it.
        opp_weights = np.random.choice(metadata["opponents"])
        opp_path = os.path.join(args.out_directory, opp_weights)

        # Load new weights into opponent's network, but keep the same opponent object.
        opponent.policy.model.load_weights(opp_path)
        if args.verbose:
            print("Batch {}\tsampled opponent is {}".format(
                i_iter, opp_weights))

        # Run games (and learn from results). Keep track of the win ratio vs each opponent over
        # time.
        win_ratio = run_n_games(optimizer, args.learning_rate, player,
                                opponent, args.game_batch)
        metadata["win_ratio"][player_weights] = (opp_weights, win_ratio)

        # Save intermediate models.
        if i_iter % args.record_every == 0:
            player.policy.model.save_weights(
                os.path.join(args.out_directory, player_weights))

        # Add the current player to the pool of opponents once in a while.
        if i_iter % args.save_every == 0:
            metadata["opponents"].append(player_weights)
        save_metadata()
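
As with handle_arguments above, run_training can be invoked with an explicit argument list. Two hypothetical calls (file and directory names are placeholders), the second showing a resume where initial_weights must match the weights.#####.hdf5 pattern checked above:

# Fresh start from supervised-learning weights (placeholder paths):
run_training(["model.json", "sl_weights.hdf5", "training_output",
              "--game-batch", "2", "--iterations", "10", "--verbose"])

# Resume from a checkpoint previously written to training_output:
run_training(["model.json", "weights.00010.hdf5", "training_output",
              "--resume", "--verbose"])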