Code example #1
def muzero(config: MuZeroConfig):
    """
    MuZero training is split into two independent parts: Network training and
    self-play data generation.
    These two parts only communicate by transferring the latest network checkpoint
    from training to self-play, and the finished games from self-play
    to training.
    In contrast to the original MuZero algorithm, this version does not use
    multiple threads, so training and self-play are run alternately.
    """
    storage = SharedStorage(config.new_network(), config.uniform_network(),
                            config.new_optimizer())
    replay_buffer = ReplayBuffer(config)

    for loop in range(config.nb_training_loop):
        print("Training loop", loop)
        score_train = run_selfplay(config, storage, replay_buffer,
                                   config.nb_episodes)
        train_network(config, storage, replay_buffer, config.nb_epochs)

        print("Train score:", score_train)
        print("Eval score:", run_eval(config, storage, NUM_EVAL_EPISODES))
        print(
            f"MuZero played {config.nb_episodes * (loop + 1)} "
            f"episodes and trained for {config.nb_epochs * (loop + 1)} epochs.\n"
        )

    return storage.latest_network()
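
The examples on this page treat SharedStorage as the only channel between training and self-play. A minimal sketch of the interface they assume is below; the internal dictionary and the uniform-network fallback are illustrative, not the implementation of any one of the referenced projects.

class SharedStorage(object):
    """Holds the current network/optimizer and the saved checkpoints."""

    def __init__(self, network, uniform_network, optimizer):
        self.current_network = network
        self.uniform_network = uniform_network
        self.optimizer = optimizer
        self._networks = {}  # training step -> network checkpoint

    def latest_network(self):
        if self._networks:
            return self._networks[max(self._networks.keys())]
        # Before any checkpoint exists, self-play falls back to a uniform
        # (untrained) network.
        return self.uniform_network

    def save_network(self, step, network):
        self._networks[step] = network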
Code example #2
def train_network(config: MuZeroConfig, storage: SharedStorage, replay_buffer: ReplayBuffer, epochs: int):
    network = storage.current_network
    optimizer = storage.optimizer

    for _ in range(epochs):
        batch = replay_buffer.sample_batch(config.num_unroll_steps, config.td_steps)
        update_weights(optimizer, network, batch)
        storage.save_network(network.training_steps, network)
Code example #3
def train_network(config: MuZeroConfig, storage: SharedStorage,
                  replay_buffer: ReplayBuffer, epochs: int):
    losses = []
    network = storage.current_network
    optimizer = storage.optimizer
    optimizer.zero_grad()

    for _ in range(epochs):
        batch = replay_buffer.sample_batch(config.num_unroll_steps,
                                           config.td_steps)
        losses.append(update_weights(optimizer, network, batch))
        storage.save_network(network.training_steps, network)
    return losses
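
The `update_weights` call that both `train_network` variants rely on is not shown on this page. Below is a rough PyTorch sketch of what it typically does in MuZero implementations; the loss terms (value, reward, policy) follow the MuZero paper, while the tensor shapes, the target format, and the `training_steps` increment are assumptions about the surrounding project.

import torch
import torch.nn.functional as F

def update_weights(optimizer, network, batch):
    """Run one gradient step on a batch of (observation, actions, targets)."""
    optimizer.zero_grad()
    loss = torch.zeros(1)

    for observation, actions, targets in batch:
        # Initial step: representation + prediction from the real observation.
        value, reward, policy_logits, hidden_state = network.initial_inference(observation)
        predictions = [(value, reward, policy_logits)]

        # Recurrent steps: unroll the learned dynamics model along the stored actions.
        for action in actions:
            value, reward, policy_logits, hidden_state = network.recurrent_inference(hidden_state, action)
            predictions.append((value, reward, policy_logits))

        for k, ((value, reward, policy_logits),
                (target_value, target_reward, target_policy)) in enumerate(zip(predictions, targets)):
            loss = loss + F.mse_loss(value, target_value)
            if k > 0:  # the paper does not train the reward prediction on the initial step
                loss = loss + F.mse_loss(reward, target_reward)
            loss = loss + torch.sum(-target_policy * F.log_softmax(policy_logits, dim=-1))

    loss.backward()
    optimizer.step()
    network.training_steps += 1  # assumed counter, consumed by storage.save_network above
    return loss.item()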
Code example #4
def muzero(config: MuZeroConfig):
    """
    MuZero training is split into two independent parts: Network training and
    self-play data generation.
    These two parts only communicate by transferring the latest network checkpoint
    from training to self-play, and the finished games from self-play
    to training.
    In contrast to the original MuZero algorithm, this version does not use
    multiple threads, so training and self-play are run alternately.
    """
    network = config.new_network()
    storage = SharedStorage(network, config.uniform_network(),
                            config.new_optimizer(network))
    replay_buffer = ReplayBuffer(config)

    train_scores = []
    eval_scores = []
    train_losses = []
    for loop in range(config.nb_training_loop):
        print("Training loop", loop)
        score_train = run_selfplay(config, storage, replay_buffer,
                                   config.nb_episodes)
        train_losses += train_network(config, storage, replay_buffer,
                                      config.nb_epochs)
        print("Train score:", score_train)
        score_eval = run_eval(config, storage, 50)
        print("Eval score:", score_eval)
        print(
            f"MuZero played {config.nb_episodes * (loop + 1)} "
            f"episodes and trained for {config.nb_epochs * (loop + 1)} epochs.\n"
        )
        train_scores.append(score_train)
        eval_scores.append(score_eval)

    plt.figure(1)
    plt.plot(train_scores)
    plt.plot(eval_scores)
    plt.title('MuZero Average Rewards')
    plt.xlabel('MuZero Iterations (Train/Eval)')
    plt.ylabel('Reward Score')
    plt.legend(['Train score', 'Eval score'])

    plt.figure(2)
    plt.plot(train_losses, color='green')
    plt.title('MuZero Training Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.show()

    return storage.latest_network()
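
The examples on this page read a handful of MuZeroConfig attributes without showing the class itself. The sketch below collects just those attributes in one place; the default values are placeholders, and the real configs in these projects carry many more fields (network sizes, MCTS constants, optimizer settings, and so on).

from dataclasses import dataclass

@dataclass
class MuZeroConfig:
    # Loop structure used by muzero() above.
    nb_training_loop: int = 50   # outer train / self-play iterations (placeholder)
    nb_episodes: int = 20        # self-play games generated per iteration
    nb_epochs: int = 20          # gradient steps per iteration

    # Replay-buffer sampling used by train_network() above.
    num_unroll_steps: int = 5    # how far the dynamics model is unrolled per sample
    td_steps: int = 10           # n-step horizon for the value targets

    # Self-play limit used by play_game() further down the page.
    max_moves: int = 500         # hard cap on moves per game

    # Factory hooks the examples call; the concrete implementations live in
    # each project (games, networks and optimizers differ between them).
    def new_game(self): ...
    def new_network(self): ...
    def uniform_network(self): ...
    def new_optimizer(self, network=None): ...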
Code example #5
File: training.py Project: zpear/FastCentipede
def train_network_helper(config: MuZeroConfig, replay_buffer: ReplayBuffer, epochs: int):
    try:
        network = config.old_network('checkpoint')
        optimizer = config.old_optimizer('checkpoint')
        print('Loaded optimizer')
    except FileNotFoundError:
        print('No checkpoint. Loading blank')
        network = config.old_network('blank_network')
        optimizer = config.new_optimizer()

    for epoch in range(epochs):
        print('Epoch {}'.format(epoch), end='\r')
        batch = replay_buffer.sample_batch(config.num_unroll_steps, config.td_steps)
        update_weights(optimizer, network, batch)
    SharedStorage.save_network_to_disk(network, config, optimizer)
Code example #6
def run_eval(config: MuZeroConfig, storage: SharedStorage, eval_episodes: int):
    network = storage.latest_network()
    returns = []
    for _ in range(eval_episodes):
        game = play_game(config, network, train=False)
        returns.append(sum(game.rewards))
    return sum(returns) / eval_episodes if eval_episodes else 0
Code example #7
File: self_play.py Project: zpear/FastCentipede
def multiprocess_play_game_helper(config: MuZeroConfig,
                                  initial: bool,
                                  train: bool,
                                  result_queue: Queue = None,
                                  sema=None):
    sema.acquire()
    # Prevent child processes from overallocating GPU
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    pretrained = True
    if initial:
        if config.load_directory is not None:
            # User specified directory to load network from
            network = config.old_network(config.load_directory)
        else:
            network = config.old_network('blank_network')
            pretrained = False
    else:
        network = config.old_network('checkpoint')

    storage = SharedStorage(network=network,
                            uniform_network=config.uniform_network(),
                            optimizer=config.new_optimizer(),
                            save_directory=config.save_directory,
                            config=config,
                            pretrained=pretrained)

    play_game(config=config,
              storage=storage,
              train=train,
              visual=False,
              queue=result_queue)
    sema.release()
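
Code example #15 below calls a `multiprocess_play_game` function that is not shown on this page. A plausible driver for the helper above might look like the following sketch; the process-per-episode layout, the semaphore bound, and the averaged return value are guesses rather than the FastCentipede implementation.

import multiprocessing as mp

def multiprocess_play_game(config, initial, episodes, train, replay_buffer):
    result_queue = mp.Queue()
    sema = mp.Semaphore(mp.cpu_count())   # limits how many games run at once
    workers = []
    for _ in range(episodes):
        p = mp.Process(target=multiprocess_play_game_helper,
                       args=(config, initial, train, result_queue, sema))
        p.start()
        workers.append(p)

    # Drain finished games into the replay buffer as they arrive.
    returns = []
    for _ in range(episodes):
        game = result_queue.get()
        replay_buffer.save_game(game)
        returns.append(sum(game.rewards))

    for p in workers:
        p.join()
    return sum(returns) / episodes if episodes else 0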
Code example #8
def muzero(config: MuZeroConfig):
    storage = SharedStorage(config.new_network(), config.uniform_network(), config.new_optimizer())
    replay_buffer = ReplayBuffer(config)

    for loop in range(config.nb_training_loop):
        print("Training loop", loop)
        score_train = run_selfplay(config, storage, replay_buffer, config.nb_episodes)
        train_network(config, storage, replay_buffer, config.nb_epochs)

        print("Train score:", score_train)
        print("Eval score:", run_eval(config, storage, 50))
        print(f"MuZero played {config.nb_episodes * (loop + 1)} "
              f"episodes and trained for {config.nb_epochs * (loop + 1)} epochs.\n")

    storage.save_network_dir(config.nb_training_loop)

    return storage.latest_network()
Code example #9
def run_selfplay(config: MuZeroConfig, storage: SharedStorage, replay_buffer: ReplayBuffer, train_episodes: int):
    network = storage.latest_network()
    returns = []
    for _ in range(train_episodes):
        game = play_game(config, network)
        replay_buffer.save_game(game)
        returns.append(sum(game.rewards))
    return sum(returns) / train_episodes
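
`run_selfplay` writes games into the buffer that `train_network` samples from. A minimal sketch of that interface, modelled on the DeepMind pseudocode, is below; `window_size`, `batch_size`, and the game's `make_image` / `make_target` methods are assumptions about the surrounding project.

import random

class ReplayBuffer(object):
    def __init__(self, config):
        self.window_size = config.window_size   # assumed config field
        self.batch_size = config.batch_size     # assumed config field
        self.buffer = []

    def save_game(self, game):
        if len(self.buffer) >= self.window_size:
            self.buffer.pop(0)
        self.buffer.append(game)

    def sample_batch(self, num_unroll_steps: int, td_steps: int):
        games = [random.choice(self.buffer) for _ in range(self.batch_size)]
        game_pos = [(g, random.randrange(len(g.history))) for g in games]
        # Each sample: observation at position i, the next K actions, and the
        # value / reward / policy targets built with an n-step (td_steps) return.
        return [(g.make_image(i),
                 g.history[i:i + num_unroll_steps],
                 g.make_target(i, num_unroll_steps, td_steps, g.to_play()))
                for g, i in game_pos]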
Code example #10
def run_eval(config: MuZeroConfig, storage: SharedStorage, eval_episodes: int):
    """Evaluate MuZero without noise added to the prior of the root and without softmax action selection"""
    network = storage.latest_network()
    returns = []
    for _ in range(eval_episodes):
        game = play_game(config, network, train=False)
        returns.append(sum(game.rewards))
    return sum(returns) / eval_episodes if eval_episodes else 0
Code example #11
def run_selfplay(config: MuZeroConfig, storage: SharedStorage,
                 replay_buffer: ReplayBuffer, train_episodes: int):
    """Take the latest network, produces multiple games and save them in the shared replay buffer"""
    network = storage.latest_network()
    returns = []
    for _ in range(train_episodes):
        game = play_game(config, network)
        replay_buffer.save_game(game)
        returns.append(sum(game.rewards))
    return sum(returns) / train_episodes
Code example #12
File: self_play.py Project: zpear/FastCentipede
def play_game(config: MuZeroConfig,
              storage: SharedStorage,
              train: bool = True,
              visual: bool = False,
              queue: Queue = None) -> AbstractGame:
    """
    Each game is produced by starting at the initial board position, then
    repeatedly executing a Monte Carlo Tree Search to generate moves until the end
    of the game is reached.
    """
    if queue:
        network = storage.latest_network_for_process()
    else:
        network = storage.current_network

    start = time()
    game = config.new_game()
    mode_action_select = 'softmax' if train else 'max'
    while not game.terminal() and len(game.history) < config.max_moves:
        # At the root of the search tree we use the representation function to
        # obtain a hidden state given the current observation.
        root = Node(0)
        current_observation = game.make_image(-1)
        expand_node(root, game.to_play(), game.legal_actions(),
                    network.initial_inference(current_observation))
        if train:
            add_exploration_noise(config, root)

        # We then run a Monte Carlo Tree Search using only action sequences and the
        # model learned by the networks.
        run_mcts(config, root, game.action_history(), network)
        action = select_action(config,
                               len(game.history),
                               root,
                               network,
                               mode=mode_action_select)
        game.apply(action)
        game.store_search_statistics(root)
        if visual:
            game.env.render()
    if visual:
        if game.terminal():
            print('Model lost game')
        else:
            print('Exceeded max moves')
        game.env.close()

    if queue:
        queue.put(game)
    print("Finished game episode after " + str(time() - start) +
          " seconds. Exceeded max moves? " + str(not game.terminal()))
    print("Score: ", sum(game.rewards))
    return game
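
`play_game` switches between 'softmax' and 'max' action selection depending on whether it is generating training data or being evaluated. A hypothetical `select_action` along those lines is sketched below; real implementations typically also anneal a visit-count temperature based on `num_moves`, which is omitted here.

import numpy as np

def select_action(config, num_moves, root, network, mode='softmax'):
    # config, num_moves and network are kept only to match the call in
    # play_game above; root.children is assumed to map action -> child Node
    # carrying a visit_count from the MCTS.
    actions = list(root.children.keys())
    visit_counts = np.array([root.children[a].visit_count for a in actions],
                            dtype=np.float64)
    if mode == 'softmax':
        # Training: sample proportionally to a softmax over visit counts,
        # which keeps some exploration in the generated games.
        probs = np.exp(visit_counts - visit_counts.max())
        probs /= probs.sum()
        return actions[np.random.choice(len(actions), p=probs)]
    # Evaluation ('max'): deterministic, pick the most visited child.
    return actions[int(np.argmax(visit_counts))]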
Code example #13
File: self_play.py Project: marioyc/MuZero
def run_eval(config: MuZeroConfig, storage: SharedStorage, eval_episodes: int):
    """Evaluate MuZero without noise added to the prior of the root and without softmax action selection"""
    network = storage.latest_network()
    returns = []
    for _ in range(eval_episodes):
        game = play_game(config, network, train=False)
        returns.append(sum(game.rewards))
    # Calculate statistics
    score_mean = np.mean(returns)
    score_std = np.std(returns)
    score_min = np.min(returns)
    score_max = np.max(returns)
    return score_mean, score_std, score_min, score_max
Code example #14
File: self_play.py Project: marioyc/MuZero
def run_selfplay(config: MuZeroConfig, storage: SharedStorage,
                 replay_buffer: ReplayBuffer, train_episodes: int):
    """Take the latest network, produces multiple games and save them in the shared replay buffer"""
    network = storage.latest_network()
    returns = []
    for _ in range(train_episodes):
        game = play_game(config, network)
        replay_buffer.save_game(game)
        returns.append(sum(game.rewards))
    # Calculate statistics
    score_mean = np.mean(returns)
    score_std = np.std(returns)
    score_min = np.min(returns)
    score_max = np.max(returns)
    return score_mean, score_std, score_min, score_max
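
Callers of these statistics-returning variants unpack the tuple instead of averaging a single score; an illustrative call (not taken from the marioyc/MuZero training loop):

mean, std, low, high = run_eval(config, storage, eval_episodes=50)
print(f"Eval return: {mean:.1f} +/- {std:.1f} (min {low:.1f}, max {high:.1f})")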
Code example #15
File: muzero.py Project: zpear/FastCentipede
def muzero(config: MuZeroConfig, save_directory: str, load_directory: str,
           test: bool, visual: bool, new_config: bool):
    """
    MuZero training is split into two independent parts: Network training and
    self-play data generation.
    These two parts only communicate by transferring the latest network checkpoint
    from training to self-play, and the finished games from self-play
    to training.
    In contrast to the original MuZero algorithm, this version does not use
    multiple threads, so training and self-play are run alternately.
    """
    config.load_directory = load_directory
    config.save_directory = save_directory
    replay_buffer = ReplayBuffer(config)
    # Remove old checkpoint network
    base_dir = os.path.dirname(os.path.realpath(__file__))

    d = os.path.join(base_dir, 'checkpoint')
    to_remove = [os.path.join(d, f) for f in os.listdir(d)]
    for f in to_remove:
        if os.path.basename(f) != '.gitignore':
            os.remove(f)

    if load_directory:
        # Copy load directory to checkpoint directory
        copy_tree(src=load_directory, dst=d)

    if new_config:
        network = config.new_network()
        SharedStorage.save_network_to_disk(network, config, None,
                                           'blank_network')
        exit(0)

    if test:
        if load_directory is not None:
            # User specified directory to load network from
            network = config.old_network(load_directory)
        else:
            network = config.new_network()
        storage = SharedStorage(network, config.uniform_network(),
                                config.new_optimizer(), save_directory, config,
                                load_directory is not None)
        # Single process for simple testing, can refactor later
        print("Eval score:", run_eval(config, storage, 5, visual=visual))
        print(f"MuZero played {5} " f"episodes.\n")
        return storage.latest_network()

    for loop in range(config.nb_training_loop):
        initial = (loop == 0)
        start = time()
        o_start = time()
        print("Training loop", loop)
        episodes = config.nb_episodes

        score_train = multiprocess_play_game(config,
                                             initial=initial,
                                             episodes=episodes,
                                             train=True,
                                             replay_buffer=replay_buffer)
        print("Self play took " + str(time() - start) + " seconds")
        print("Train score: " + str(score_train) + " after " +
              str(time() - start) + " seconds")

        start = time()
        print("Training network...")
        train_network(config, replay_buffer, config.nb_epochs)
        print("Network weights updated after " + str(time() - start) +
              " seconds")
        """