Example #1
def muzero(config: MuZeroConfig):
    """
    MuZero training is split into two independent parts: Network training and
    self-play data generation.
    These two parts only communicate by transferring the latest network checkpoint
    from training to self-play, and the finished games from self-play to training.
    In contrast to the original MuZero algorithm, this version does not work with
    multiple threads; therefore, training and self-play are done alternately.
    """
    storage = SharedStorage(config.new_network(), config.uniform_network(),
                            config.new_optimizer())
    replay_buffer = ReplayBuffer(config)

    for loop in range(config.nb_training_loop):
        print("Training loop", loop)
        score_train = run_selfplay(config, storage, replay_buffer,
                                   config.nb_episodes)
        train_network(config, storage, replay_buffer, config.nb_epochs)

        print("Train score:", score_train)
        print("Eval score:", run_eval(config, storage, NUM_EVAL_EPISODES))
        print(
            f"MuZero played {config.nb_episodes * (loop + 1)} "
            f"episodes and trained for {config.nb_epochs * (loop + 1)} epochs.\n"
        )

    return storage.latest_network()
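run_selfplay and train_network are referenced here but not shown. A minimal sketch of what run_selfplay might look like under these examples' conventions, assuming play_game as in Example #6, a latest_network() accessor on SharedStorage, and a save_game() method on ReplayBuffer (all assumptions, not code from the original repository):

def run_selfplay(config: MuZeroConfig, storage: SharedStorage,
                 replay_buffer: ReplayBuffer, episodes: int) -> float:
    """Hypothetical sketch: play `episodes` games with the latest network,
    store them in the replay buffer and return the mean total reward."""
    returns = []
    for _ in range(episodes):
        # train=True enables exploration noise and softmax action selection
        game = play_game(config, storage.latest_network(), train=True)
        replay_buffer.save_game(game)  # assumed ReplayBuffer API
        returns.append(sum(game.rewards))
    return sum(returns) / max(len(returns), 1)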
Example #2
def multiprocess_play_game_helper(config: MuZeroConfig,
                                  initial: bool,
                                  train: bool,
                                  result_queue: Queue = None,
                                  sema=None):
    sema.acquire()
    # Prevent child processes from overallocating GPU
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    pretrained = True
    if initial:
        if config.load_directory is not None:
            # User specified directory to load network from
            network = config.old_network(config.load_directory)
        else:
            network = config.old_network('blank_network')
            pretrained = False
    else:
        network = config.old_network('checkpoint')

    storage = SharedStorage(network=network,
                            uniform_network=config.uniform_network(),
                            optimizer=config.new_optimizer(),
                            save_directory=config.save_directory,
                            config=config,
                            pretrained=pretrained)

    play_game(config=config,
              storage=storage,
              train=train,
              visual=False,
              queue=result_queue)
    sema.release()
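The driver that fans this helper out over worker processes is not shown in the example. A minimal sketch of how it could be wired together with the standard multiprocessing primitives; config.max_processes and replay_buffer.save_game() are assumptions, not part of the original code:

from multiprocessing import Process, Queue, Semaphore

def multiprocess_play_game(config: MuZeroConfig, initial: bool, episodes: int,
                           train: bool, replay_buffer: ReplayBuffer) -> float:
    """Hypothetical sketch: run one self-play episode per worker process,
    bounded by a semaphore, and collect finished games through a queue."""
    result_queue = Queue()
    sema = Semaphore(config.max_processes)  # assumed config attribute
    workers = [Process(target=multiprocess_play_game_helper,
                       args=(config, initial, train, result_queue, sema))
               for _ in range(episodes)]
    for worker in workers:
        worker.start()
    scores = []
    for _ in range(episodes):
        game = result_queue.get()      # blocks until a worker finishes a game
        replay_buffer.save_game(game)  # assumed ReplayBuffer API
        scores.append(sum(game.rewards))
    for worker in workers:
        worker.join()
    return sum(scores) / max(len(scores), 1)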
Example #3
def muzero(config: MuZeroConfig):
    """
    MuZero training is split into two independent parts: Network training and
    self-play data generation.
    These two parts only communicate by transferring the latest network checkpoint
    from training to self-play, and the finished games from self-play to training.
    In contrast to the original MuZero algorithm, this version does not work with
    multiple threads; therefore, training and self-play are done alternately.
    """
    network = config.new_network()
    storage = SharedStorage(network, config.uniform_network(),
                            config.new_optimizer(network))
    replay_buffer = ReplayBuffer(config)

    train_scores = []
    eval_scores = []
    train_losses = []
    for loop in range(config.nb_training_loop):
        print("Training loop", loop)
        score_train = run_selfplay(config, storage, replay_buffer,
                                   config.nb_episodes)
        train_losses += train_network(config, storage, replay_buffer,
                                      config.nb_epochs)
        print("Train score:", score_train)
        score_eval = run_eval(config, storage, 50)
        print("Eval score:", score_eval)
        print(
            f"MuZero played {config.nb_episodes * (loop + 1)} "
            f"episodes and trained for {config.nb_epochs * (loop + 1)} epochs.\n"
        )
        train_scores.append(score_train)
        eval_scores.append(score_eval)

    plt.figure(1)
    plt.plot(train_scores)
    plt.plot(eval_scores)
    plt.title('MuZero Average Rewards')
    plt.xlabel('MuZero Iterations (Train/Eval)')
    plt.ylabel('Reward Score')
    plt.legend(['Train score', 'Eval score'])

    plt.figure(2)
    plt.plot(train_losses, color='green')
    plt.title('MuZero Training Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.show()

    return storage.latest_network()
Example #4
def train_network_helper(config: MuZeroConfig, replay_buffer: ReplayBuffer, epochs: int):
    try:
        network = config.old_network('checkpoint')
        optimizer = config.old_optimizer('checkpoint')
        print('Loaded optimizer')
    except FileNotFoundError:
        print('No checkpoint. Loading blank')
        network = config.old_network('blank_network')
        optimizer = config.new_optimizer()

    for epoch in range(epochs):
        print(f'Epoch {epoch}', end='\r')
        batch = replay_buffer.sample_batch(config.num_unroll_steps, config.td_steps)
        update_weights(optimizer, network, batch)
    SharedStorage.save_network_to_disk(network, config, optimizer)
Example #5
def make_cartpole_config():
    def visit_softmax_temperature(num_moves, training_steps):
        return 1.0

    return MuZeroConfig(game=CartPole,
                        action_space_size=2,
                        max_moves=500,
                        discount=0.997,
                        dirichlet_alpha=0.25,
                        num_simulations=50,
                        num_training_loop=50,
                        num_epochs=5000,
                        batch_size=128,
                        td_steps=50,
                        num_train_episodes=20,
                        num_eval_episodes=1,
                        lr_init=0.05,
                        lr_decay_steps=1000,
                        max_priority=False,
                        visit_softmax_temperature_fn=visit_softmax_temperature,
                        network_args={
                            'support_size': 10,
                            'encoding_size': 8,
                            'rep_hidden': [],
                            'dyn_hidden': [16],
                            'rew_hidden': [16],
                            'val_hidden': [],
                            'pol_hidden': [],
                            'observation_shape': (1, 1, 4),
                        },
                        result_path="cartpole.weights")
Example #6
def play_game(config: MuZeroConfig,
              network: AbstractNetwork,
              train: bool = True) -> AbstractGame:
    """
    Each game is produced by starting at the initial board position, then
    repeatedly executing a Monte Carlo Tree Search to generate moves until the end
    of the game is reached.
    """
    game = config.new_game()
    mode_action_select = 'softmax' if train else 'max'

    while not game.terminal() and len(game.history) < config.max_moves:
        # At the root of the search tree we use the representation function to
        # obtain a hidden state given the current observation.
        root = Node(0)
        current_observation = game.make_image(-1)
        expand_node(root, game.to_play(), game.legal_actions(),
                    network.initial_inference(current_observation))
        if train:
            add_exploration_noise(config, root)

        # We then run a Monte Carlo Tree Search using only action sequences and the
        # model learned by the networks.
        run_mcts(config, root, game.action_history(), network)
        action = select_action(config,
                               len(game.history),
                               root,
                               network,
                               mode=mode_action_select)
        game.apply(action)
        game.store_search_statistics(root)
    return game
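add_exploration_noise is referenced but not defined in this listing. In the published MuZero pseudocode it mixes Dirichlet noise into the root priors during training; a self-contained sketch with stand-in objects (the config/Node attribute names are assumptions):

from types import SimpleNamespace

import numpy as np

def add_exploration_noise(config, node) -> None:
    """Blend the root children's priors with Dirichlet noise so self-play still
    explores actions the current network considers unlikely."""
    actions = list(node.children.keys())
    noise = np.random.dirichlet([config.root_dirichlet_alpha] * len(actions))
    frac = config.root_exploration_fraction
    for action, n in zip(actions, noise):
        child = node.children[action]
        child.prior = child.prior * (1 - frac) + n * frac

# Tiny usage example with stand-in objects:
cfg = SimpleNamespace(root_dirichlet_alpha=0.25, root_exploration_fraction=0.25)
root = SimpleNamespace(children={a: SimpleNamespace(prior=1 / 3) for a in range(3)})
add_exploration_noise(cfg, root)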
Example #7
def muzero(config: MuZeroConfig):
    storage = SharedStorage(config.new_network(), config.uniform_network(), config.new_optimizer())
    replay_buffer = ReplayBuffer(config)

    for loop in range(config.nb_training_loop):
        print("Training loop", loop)
        score_train = run_selfplay(config, storage, replay_buffer, config.nb_episodes)
        train_network(config, storage, replay_buffer, config.nb_epochs)

        print("Train score:", score_train)
        print("Eval score:", run_eval(config, storage, 50))
        print(f"MuZero played {config.nb_episodes * (loop + 1)} "
              f"episodes and trained for {config.nb_epochs * (loop + 1)} epochs.\n")

    storage.save_network_dir(config.nb_training_loop)

    return storage.latest_network()
Example #8
def make_config() -> MuZeroConfig:
    game_config = GameConfig(name='TicTacToe',
                             environment_class=TicTacToeEnvironment,
                             environment_parameters={},
                             action_space_size=9,
                             num_players=2,
                             discount=1.0
                             )

    replay_buffer_config = ReplayBufferConfig(window_size=int(1e4),
                                              prefetch_buffer_size=10
                                              )

    mcts_config = MCTSConfig(max_moves=9,
                             root_dirichlet_alpha=1.0,
                             root_exploration_fraction=0.25,
                             num_simulations=20,
                             temperature=1.0,
                             freezing_moves=9,
                             default_value=Value(0.0)
                             )

    network_config = NetworkConfig(network_class=TicTacToeNetwork,
                                   regularizer=tf.keras.regularizers.l2(l=1e-4),
                                   hidden_state_size=128,
                                   hidden_size=128
                                   )

    training_config = TrainingConfig(optimizer=tf.keras.optimizers.Adam(),
                                     batch_size=128,
                                     training_steps=int(5e4),
                                     checkpoint_interval=int(5e2),
                                     replay_buffer_loginterval=50,
                                     num_unroll_steps=2,
                                     td_steps=9,
                                     steps_per_execution=1
                                     )

    reward_config = ScalarConfig(known_bounds=KnownBounds(minv=Value(0.0), maxv=Value(1.0)),
                                 support_size=None,
                                 loss_decay=0.0)

    value_config = ScalarConfig(known_bounds=KnownBounds(minv=None, maxv=Value(1.0)),
                                support_size=None,
                                loss_decay=4.0)

    return MuZeroConfig(game_config=game_config,
                        replay_buffer_config=replay_buffer_config,
                        mcts_config=mcts_config,
                        training_config=training_config,
                        network_config=network_config,
                        value_config=value_config,
                        reward_config=reward_config)
Example #9
def play_game(config: MuZeroConfig,
              storage: SharedStorage,
              train: bool = True,
              visual: bool = False,
              queue: Queue = None) -> AbstractGame:
    """
    Each game is produced by starting at the initial board position, then
    repeatedly executing a Monte Carlo Tree Search to generate moves until the end
    of the game is reached.
    """
    if queue:
        network = storage.latest_network_for_process()
    else:
        network = storage.current_network

    start = time()
    game = config.new_game()
    mode_action_select = 'softmax' if train else 'max'
    while not game.terminal() and len(game.history) < config.max_moves:
        # At the root of the search tree we use the representation function to
        # obtain a hidden state given the current observation.
        root = Node(0)
        current_observation = game.make_image(-1)
        expand_node(root, game.to_play(), game.legal_actions(),
                    network.initial_inference(current_observation))
        if train:
            add_exploration_noise(config, root)

        # We then run a Monte Carlo Tree Search using only action sequences and the
        # model learned by the networks.
        run_mcts(config, root, game.action_history(), network)
        action = select_action(config,
                               len(game.history),
                               root,
                               network,
                               mode=mode_action_select)
        game.apply(action)
        game.store_search_statistics(root)
        if visual:
            game.env.render()
    if visual:
        if game.terminal():
            print('Model lost game')
        else:
            print('Exceeded max moves')
        game.env.close()

    if queue:
        queue.put(game)
    print("Finished game episode after " + str(time() - start) +
          " seconds. Exceeded max moves? " + str(not game.terminal()))
    print("Score: ", sum(game.rewards))
    return game
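run_mcts itself does not appear in these examples. Inside it, children are usually ranked with the pUCT score from the MuZero pseudocode; a sketch under assumed attribute names (pb_c_base and pb_c_init on the config; visit_count, prior, and value() on the nodes), without the min-max value normalization the full implementation adds:

import math
from types import SimpleNamespace

def ucb_score(config, parent, child) -> float:
    """pUCT: a prior-weighted exploration term that shrinks as the child is
    visited, plus the child's current value estimate."""
    pb_c = math.log((parent.visit_count + config.pb_c_base + 1) / config.pb_c_base)
    pb_c += config.pb_c_init
    pb_c *= math.sqrt(parent.visit_count) / (child.visit_count + 1)
    return pb_c * child.prior + child.value()

# Stand-in objects just to show the call shape (19652 / 1.25 are the published defaults):
cfg = SimpleNamespace(pb_c_base=19652, pb_c_init=1.25)
parent = SimpleNamespace(visit_count=10)
child = SimpleNamespace(visit_count=2, prior=0.3, value=lambda: 0.5)
print(ucb_score(cfg, parent, child))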
Example #10
def select_action(config: MuZeroConfig, num_moves: int, node: Node):
    visit_counts = [(child.visit_count, action)
                    for action, child in node.children.items()]
    temperature = config.visit_softmax_temperature_fn(num_moves=num_moves)
    if temperature == 0:
        action_pos = np.argmax([v for v, _ in visit_counts])
    else:
        action_probs = [
            visit_count_i**(1 / temperature)
            for visit_count_i, _ in visit_counts
        ]
        total_count = sum(action_probs)
        action_probs = [x / total_count for x in action_probs]
        action_pos = np.random.choice(len(visit_counts), p=action_probs)
    return visit_counts[action_pos][1]
Example #11
def select_action(config: MuZeroConfig, num_moves: int, node: Node, network: BaseNetwork, mode: str = 'softmax'):
    """
    After running simulations inside in MCTS, we select an action based on the root's children visit counts.
    During training we use a softmax sample for exploration.
    During evaluation we select the most visited child.
    """
    visit_counts = [child.visit_count for child in node.children.values()]
    actions = [action for action in node.children.keys()]
    action = None
    if mode == 'softmax':
        t = config.visit_softmax_temperature_fn(
            num_moves=num_moves, training_steps=network.training_steps)
        action = softmax_sample(visit_counts, actions, t)
    elif mode == 'max':
        action, _ = max(node.children.items(), key=lambda item: item[1].visit_count)
    return action
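softmax_sample is not defined in this snippet. Consistent with the temperature-scaled sampling written out inline in Example #10, it could look like the following (a sketch, not the repository's exact helper):

import numpy as np

def softmax_sample(visit_counts, actions, t):
    """Sample an action with probability proportional to visit_count ** (1 / t);
    t == 0 degenerates to picking the most-visited action."""
    counts = np.asarray(visit_counts, dtype=np.float64)
    if t == 0:
        return actions[int(np.argmax(counts))]
    probs = counts ** (1.0 / t)
    probs /= probs.sum()
    return actions[np.random.choice(len(actions), p=probs)]

# e.g. softmax_sample([10, 30, 60], actions=['left', 'noop', 'right'], t=1.0)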
Example #12
def play_game(config: MuZeroConfig, network: AbstractNetwork, train: bool = True) -> AbstractGame:
  
    game = config.new_game()
    mode_action_select = 'softmax' if train else 'max'

    while not game.terminal() and len(game.history) < config.max_moves:
        root = Node(0)
        current_observation = game.make_image(-1)
        expand_node(root, game.to_play(), game.legal_actions(), network.initial_inference(current_observation))
        if train:
            add_exploration_noise(config, root)

        run_mcts(config, root, game.action_history(), network)
        action = select_action(config, len(game.history), root, network, mode=mode_action_select)
        game.apply(action)
        game.store_search_statistics(root)
    return game
Example #13
def play_game(config: MuZeroConfig, network: Network) -> Game:
    game = config.new_game()

    while not game.terminal and len(game.history) < config.max_moves:
        # At the root of the search tree we use the representation function to
        # obtain a hidden state given the current observation.
        root = Node(0)
        current_observation = game.make_image(-1, network.device)
        expand_node(root, game.to_play(), game.legal_actions(),
                    network.initial_inference(current_observation))
        add_exploration_noise(config, root)

        # We then run a Monte Carlo Tree Search using only action sequences and the
        # model learned by the network.
        run_mcts(config, root, game.action_history(), network)
        action = select_action(config, len(game.history), root)
        game.apply(action)
        game.store_search_statistics(root)
    return game
Example #14
def make_config(environment: Env) -> MuZeroConfig:
    return MuZeroConfig(
        env=environment,
        state_space_size=int(np.prod(environment.observation_space.shape)),
        action_space_size=environment.action_space.n,
        max_moves=500,  # Half an hour at action repeat 4.
        discount=0.997,
        dirichlet_alpha=0.25,
        num_simulations=15,  # Number of future moves self-simulated
        batch_size=64,
        td_steps=10,  # Number of steps in the future to take into account for calculating the target value
        num_actors=4,
        training_steps=int(1e8),  # Max number of training steps
        checkpoint_interval=100,
        save_interval=10000,
        lr_init=1e-4,
        lr_decay_steps=1000,
        lr_decay_rate=0.9)
Example #15
def make_lunarlander_config():
    def visit_softmax_temperature(num_moves, training_steps):
        if num_moves < 0.5 * training_steps:
            return 1.0
        elif num_moves < 0.75 * training_steps:
            return 0.5
        else:
            return 0.25

    return MuZeroConfig(
        game=LunarLander,
        action_space_size=4,
        max_moves=500,
        discount=0.997,
        dirichlet_alpha=0.25,
        num_simulations=50,
        num_training_loop=50,
        num_epochs=200000,
        batch_size=32,
        td_steps=50,
        num_train_episodes=30,
        num_eval_episodes=10,
        lr_init=0.05,
        lr_decay_steps=1000,
        max_priority=False,
        visit_softmax_temperature_fn=visit_softmax_temperature,
        network_args={'support_size': 10,
                      'encoding_size': 10,
                      'rep_hidden': [],
                      'dyn_hidden': [64],
                      'rew_hidden': [64],
                      'val_hidden': [64],
                      'pol_hidden': [],
                      'observation_shape': (1, 1, 8),
                      },
        result_path="lunarlander.weights"
        )
Example #16
def muzero(config: MuZeroConfig, save_directory: str, load_directory: str,
           test: bool, visual: bool, new_config: bool):
    """
    MuZero training is split into two independent parts: Network training and
    self-play data generation.
    These two parts only communicate by transferring the latest network checkpoint
    from training to self-play, and the finished games from self-play to training.
    In contrast to the original MuZero algorithm, this version does not work with
    multiple threads; therefore, training and self-play are done alternately.
    """
    config.load_directory = load_directory
    config.save_directory = save_directory
    replay_buffer = ReplayBuffer(config)
    # Remove old checkpoint network
    base_dir = os.path.dirname(os.path.realpath(__file__))

    d = os.path.join(base_dir, 'checkpoint')
    to_remove = [os.path.join(d, f) for f in os.listdir(d)]
    for f in to_remove:
        if os.path.basename(f) != '.gitignore':
            os.remove(f)

    if load_directory:
        # Copy load directory to checkpoint directory
        copy_tree(src=load_directory, dst=d)

    if new_config:
        network = config.new_network()
        SharedStorage.save_network_to_disk(network, config, None,
                                           'blank_network')
        exit(0)

    if test:
        if load_directory is not None:
            # User specified directory to load network from
            network = config.old_network(load_directory)
        else:
            network = config.new_network()
        storage = SharedStorage(network, config.uniform_network(),
                                config.new_optimizer(), save_directory, config,
                                load_directory is not None)
        # Single process for simple testing, can refactor later
        print("Eval score:", run_eval(config, storage, 5, visual=visual))
        print(f"MuZero played {5} " f"episodes.\n")
        return storage.latest_network()

    for loop in range(config.nb_training_loop):
        initial = loop == 0
        start = time()
        o_start = time()
        print("Training loop", loop)
        episodes = config.nb_episodes

        score_train = multiprocess_play_game(config,
                                             initial=initial,
                                             episodes=episodes,
                                             train=True,
                                             replay_buffer=replay_buffer)
        print("Self play took " + str(time() - start) + " seconds")
        print("Train score: " + str(score_train) + " after " +
              str(time() - start) + " seconds")

        start = time()
        print("Training network...")
        train_network(config, replay_buffer, config.nb_epochs)
        print("Network weights updated after " + str(time() - start) +
              " seconds")
        """