Example 1
def _worker_run(games_idxs):
    global _env_
    import asyncio
    import time
    import numpy as np
    import self_play
    from dots_boxes.dots_boxes_game import BoxesState
    from utils.utils import write_to_hdf

    loop = asyncio.get_event_loop()

    tick = time.time()
    try:
        # Play the assigned batch of games with the worker-local network and params.
        _env_.sp = self_play.SelfPlay(_env_.nnet, _env_.params)
        _env_.sp.set_player_change_callback(_player_change_callback)
        loop.run_until_complete(_env_.sp.play_games(BoxesState(), games_idxs, show_progress=False))
    except Exception as e:
        print(e, flush=True)
        raise
    tack = time.time()

    # Collect the generated samples; add a zeroed "training" flag column
    # when not comparing models.
    df = _env_.sp.get_datasets(_env_.generations, not _env_.compare_models)
    if not _env_.compare_models:
        df["training"] = np.zeros(len(df.index), dtype=np.int8)

    # Serialize HDF writes across workers.
    with _env_.hdf_lock:
        write_to_hdf(_env_.hdf_file_name, "fresh", df)

    tock = time.time()

    # logger is assumed to be a module-level logging.Logger.
    logger.warning("Worker %s played %d games (%d samples) in %.0fs (save=%.3fs)",
        _env_.name, len(games_idxs), len(df.index), tock - tick, tock - tack)
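Since _worker_run reads a chunk of game indices and a module-level _env_ namespace, it lends itself to being dispatched from a process pool. A minimal sketch, where _init_worker_env is a hypothetical initializer standing in for whatever populates _env_ in each worker process:

# Hypothetical dispatcher for _worker_run; _init_worker_env is an assumed pool
# initializer that builds the module-level _env_ each worker process reads.
import numpy as np
from multiprocessing import Pool

if __name__ == "__main__":
    n_games, n_workers = 256, 8
    chunks = np.array_split(np.arange(n_games), n_workers)  # one index chunk per worker
    with Pool(processes=n_workers, initializer=_init_worker_env) as pool:
        pool.map(_worker_run, chunks)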
Example 2
    def test(self, render, opponent, muzero_player):
        """
        Test the model in a dedicated thread.

        Args:
            render: Boolean, whether to render the environment.

            opponent: "self" for self-play, "human" for playing against MuZero, and
            "random" for a random agent.

            muzero_player: Integer, the player number MuZero controls in multiplayer
            games; None lets MuZero play every player in turn.
        """
        print("\nTesting...")
        # ray.init()
        # self_play_workers = self_play.SelfPlay.remote(
        self_play_workers = self_play.SelfPlay(
            copy.deepcopy(self.muzero_weights),
            self.Game(numpy.random.randint(1000)),
            self.config,
        )
        # history = ray.get(
        #     self_play_workers.play_game.remote(0, 0, render, opponent, muzero_player)
        # )
        history = self_play_workers.play_game(0, 0, render, opponent,
                                              muzero_player)
        # ray.shutdown()
        return sum(history.reward_history)
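For context, a hedged usage sketch of this method: the MuZero wrapper class and its constructor argument are assumptions, and only the test() signature and its documented argument values come from the snippet above.

# Hypothetical caller; only test()'s signature and argument values are taken from above.
muzero = MuZero("tictactoe")  # assumed wrapper class exposing test()

# Self-play: MuZero controls every player, rendering each move.
total_reward = muzero.test(render=True, opponent="self", muzero_player=None)

# Human vs. MuZero, with MuZero controlling player 0.
muzero.test(render=True, opponent="human", muzero_player=0)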
Example 3
    def selfplay_and_train(self):
        # Build the network, the DQN agent that wraps it, and the replay data container.
        self.model = network.NNmodel(self.config)
        self.DQNagent = agent.DQNagent(self.config, self.model)
        self.data = replay_buffer.DataContainer(self.config)

        # Generate games through self-play and store them in the data container.
        self.selfplay = self_play.SelfPlay(self.config)
        self.selfplay.play_games(self.DQNagent, self.data)

        # Fit the value model on (flattened observation, one-hot action) -> value pairs.
        observation, action, value = self.data.get_data()
        observation = tf.reshape(observation, [observation.shape[0], -1])
        action = tf.one_hot(action, 9)  # one-hot over an action space of size 9
        inputs = tf.concat([observation, action], axis=1)  # avoid shadowing the builtin input()
        self.model.fit(inputs, value, batch_size=1024, epochs=50)
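Because the model is fitted on flattened observations concatenated with a 9-way one-hot action, value queries at play time must use the same layout. A minimal sketch, assuming self.model behaves like a Keras model with predict(); the score_actions helper is hypothetical:

# Score all 9 actions for one observation using the same input layout as training.
import tensorflow as tf

def score_actions(model, observation):
    flat = tf.reshape(observation, [1, -1])           # flatten to a batch of one
    obs_batch = tf.repeat(flat, repeats=9, axis=0)    # (9, obs_dim), one row per action
    actions = tf.one_hot(tf.range(9), 9)              # (9, 9) one-hot action encodings
    inputs = tf.concat([obs_batch, actions], axis=1)  # matches the training-time concat
    return model.predict(inputs, verbose=0)           # predicted value for each action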
Example 4
    def train(self):
        os.makedirs(self.config.results_path, exist_ok=True)

        # Initialize workers
        training_worker = trainer.Trainer(copy.deepcopy(self.muzero_weights),
                                          self.config)
        shared_storage_worker = shared_storage.SharedStorage(
            copy.deepcopy(self.muzero_weights),
            self.game_name,
            self.config,
        )
        replay_buffer_worker = replay_buffer.ReplayBuffer(self.config)
        # Pre-load buffer if pulling from persistent storage
        if self.replay_buffer:
            for game_history_id in self.replay_buffer:
                replay_buffer_worker.save_game(
                    self.replay_buffer[game_history_id])
            print("\nLoaded {} games from replay buffer.".format(
                len(self.replay_buffer)))
        self_play_workers = [
            self_play.SelfPlay(
                copy.deepcopy(self.muzero_weights),
                self.Game(self.config.seed + seed),
                self.config,
            ) for seed in range(self.config.num_actors)
        ]

        # Launch workers (without ray these calls run sequentially, each blocking
        # until its self-play loop returns)
        [
            self_play_worker.continuous_self_play(shared_storage_worker,
                                                  replay_buffer_worker)
            for self_play_worker in self_play_workers
        ]
        training_worker.continuous_update_weights(replay_buffer_worker,
                                                  shared_storage_worker)

        # Save performance in TensorBoard
        print("Printing Logging info")
        self._logging_loop(shared_storage_worker, replay_buffer_worker)

        self.muzero_weights = shared_storage_worker.get_weights()
        self.replay_buffer = replay_buffer_worker.get_buffer()
        # Persist replay buffer to disk
        print("\n\nPersisting replay buffer games to disk...")
        with open(os.path.join(self.config.results_path, "replay_buffer.pkl"),
                  "wb") as buffer_file:
            pickle.dump(self.replay_buffer, buffer_file)
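A minimal sketch of the matching reload step, assuming it lives on the same class and reuses the module's os/pickle imports; the method name load_replay_buffer is hypothetical:

    def load_replay_buffer(self):
        # Restore the buffer written by train() above so the "Pre-load buffer"
        # branch has game histories to feed replay_buffer_worker.save_game().
        buffer_path = os.path.join(self.config.results_path, "replay_buffer.pkl")
        if os.path.exists(buffer_path):
            with open(buffer_path, "rb") as buffer_file:
                self.replay_buffer = pickle.load(buffer_file)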
Example 5
    def train(self):
        # Manage GPUs
        '''
        if 0 < self.num_gpus:
            num_gpus_per_worker = self.num_gpus / (
                self.config.train_on_gpu
                + self.config.num_workers * self.config.selfplay_on_gpu
                + log_in_tensorboard * self.config.selfplay_on_gpu
                + self.config.use_last_model_value * self.config.reanalyse_on_gpu
            )
            if 1 < num_gpus_per_worker:
                num_gpus_per_worker = math.floor(num_gpus_per_worker)
        else:
            num_gpus_per_worker = 0
        '''

        # Initialize Worker Threads
        for SP_worker_index in range(self.config.num_workers):
            self.self_play_workers.append(
                self_play.SelfPlay(self.checkpoint, self.Game, self.config,
                                   self.config.seed + SP_worker_index))
        self.training_worker = trainer.Trainer(self.checkpoint, self.config)

        self.replay_buffer_worker = replay_buffer.ReplayBuffer(
            self.checkpoint, self.replay_buffer, self.config)
        self.shared_storage_worker = shared_storage.SharedStorage(
            self.checkpoint, self.config)
        self.shared_storage_worker.set_info("terminate", False)
        # Launch workers: only the first self-play worker runs in its own thread here,
        # alongside the trainer thread.
        play_thread = threading.Thread(
            target=self.self_play_workers[0].continuous_self_play,
            args=(self.shared_storage_worker, self.replay_buffer_worker))
        train_thread = threading.Thread(
            target=self.training_worker.continuous_update_weights,
            args=(self.shared_storage_worker, self.replay_buffer_worker))
        play_thread.start()
        train_thread.start()
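Nothing here joins the threads or flips the terminate flag back. A hedged shutdown sketch, assuming both worker loops poll the "terminate" flag that train() initialises to False:

def request_shutdown(shared_storage_worker, threads, timeout=30.0):
    # Ask the worker loops to stop (assumes they poll the "terminate" flag),
    # then wait briefly for the play/train threads to exit.
    shared_storage_worker.set_info("terminate", True)
    for t in threads:
        t.join(timeout)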
Example 6
    def _logging_loop(self, shared_storage_worker, replay_buffer_worker):
        """
        Keep track of the training performance
        """
        # Launch the test worker to get performance metrics
        test_worker = self_play.SelfPlay(
            copy.deepcopy(self.muzero_weights),
            self.Game(self.config.seed + self.config.num_actors),
            self.config,
        )
        test_worker.continuous_self_play(shared_storage_worker, None, True)

        # Write everything in TensorBoard
        writer = SummaryWriter(self.config.results_path)

        print(
            "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n"
        )

        # Save hyperparameters to TensorBoard
        hp_table = [
            "| {} | {} |".format(key, value)
            for key, value in self.config.__dict__.items()
        ]
        writer.add_text(
            "Hyperparameters",
            "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table),
        )
        # Save model representation
        writer.add_text(
            "Model summary",
            str(models.MuZeroNetwork(self.config)).replace("\n", " \n\n"),
        )
        # Loop for updating the training performance
        counter = 0
        info = shared_storage_worker.get_info()
        try:
            while info["training_step"] < self.config.training_steps:
                info = shared_storage_worker.get_info()
                writer.add_scalar(
                    "1.Total reward/1.Total reward",
                    info["total_reward"],
                    counter,
                )
                writer.add_scalar(
                    "1.Total reward/2.Mean value",
                    info["mean_value"],
                    counter,
                )
                writer.add_scalar(
                    "1.Total reward/3.Episode length",
                    info["episode_length"],
                    counter,
                )
                writer.add_scalar(
                    "1.Total reward/4.MuZero reward",
                    info["muzero_reward"],
                    counter,
                )
                writer.add_scalar(
                    "1.Total reward/5.Opponent reward",
                    info["opponent_reward"],
                    counter,
                )
                writer.add_scalar(
                    "2.Workers/1.Self played games",
                    replay_buffer_worker.get_self_play_count(),
                    counter,
                )
                writer.add_scalar("2.Workers/2.Training steps",
                                  info["training_step"], counter)
                writer.add_scalar(
                    "2.Workers/3.Self played games per training step ratio",
                    replay_buffer_worker.get_self_play_count() /
                    max(1, info["training_step"]),
                    counter,
                )
                writer.add_scalar("2.Workers/4.Learning rate", info["lr"],
                                  counter)
                writer.add_scalar("3.Loss/1.Total weighted loss",
                                  info["total_loss"], counter)
                writer.add_scalar("3.Loss/Value loss", info["value_loss"],
                                  counter)
                writer.add_scalar("3.Loss/Reward loss", info["reward_loss"],
                                  counter)
                writer.add_scalar("3.Loss/Policy loss", info["policy_loss"],
                                  counter)
                print(
                    "Last test reward: {:.2f}. Training step: {}/{}. Played games: {}. Loss: {:.2f}"
                    .format(
                        info["total_reward"],
                        info["training_step"],
                        self.config.training_steps,
                        replay_buffer_worker.get_self_play_count(),
                        info["total_loss"],
                    ),
                    end="\r",
                )
                counter += 1
                time.sleep(0.5)
        except KeyboardInterrupt:
            # Swallow Ctrl+C so the logging loop stops while training keeps running;
            # re-raise here instead if Ctrl+C should abort the whole run.
            pass
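For reference, these are the keys the loop expects shared_storage_worker.get_info() to return; the zero values below are only placeholders for driving the loop in isolation.

# Stub of the info dict read by _logging_loop (placeholder values).
info_stub = {
    "training_step": 0,
    "total_reward": 0.0,
    "mean_value": 0.0,
    "episode_length": 0,
    "muzero_reward": 0.0,
    "opponent_reward": 0.0,
    "lr": 0.0,
    "total_loss": 0.0,
    "value_loss": 0.0,
    "reward_loss": 0.0,
    "policy_loss": 0.0,
}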
Example 7
    def train(self):
        # ray.init()
        os.makedirs(self.config.results_path, exist_ok=True)

        # Initialize workers
        # training_worker = trainer.Trainer.options(
        #     num_gpus=1 if "cuda" in self.config.training_device else 0
        # ).remote(copy.deepcopy(self.muzero_weights), self.config)
        training_worker = trainer.Trainer(copy.deepcopy(self.muzero_weights),
                                          self.config)
        # shared_storage_worker = shared_storage.SharedStorage.remote(
        #     copy.deepcopy(self.muzero_weights), self.game_name, self.config,
        # )
        shared_storage_worker = shared_storage.SharedStorage(
            copy.deepcopy(self.muzero_weights),
            self.game_name,
            self.config,
        )
        # replay_buffer_worker = replay_buffer.ReplayBuffer.remote(self.config)
        replay_buffer_worker = replay_buffer.ReplayBuffer(self.config)
        # Pre-load buffer if pulling from persistent storage
        if self.replay_buffer:
            for game_history_id in self.replay_buffer:
                # replay_buffer_worker.save_game.remote(
                replay_buffer_worker.save_game(
                    self.replay_buffer[game_history_id])
            print("\nLoaded {} games from replay buffer.".format(
                len(self.replay_buffer)))
        self_play_workers = [
            # self_play.SelfPlay.remote(
            self_play.SelfPlay(
                copy.deepcopy(self.muzero_weights),
                self.Game(self.config.seed + seed),
                self.config,
            ) for seed in range(self.config.num_actors)
        ]

        # # Launch workers
        # [
        #     # self_play_worker.continuous_self_play.remote(
        #     self_play_worker.continuous_self_play(
        #         shared_storage_worker, replay_buffer_worker
        #     )
        #     for self_play_worker in self_play_workers
        # ]
        # # training_worker.continuous_update_weights.remote(
        # training_worker.continuous_update_weights(
        #     replay_buffer_worker, shared_storage_worker
        # )
        # # Save performance in TensorBoard
        # self._logging_loop(shared_storage_worker, replay_buffer_worker)

        while True:
            # play a game
            [
                self_play_worker.joe_self_play(shared_storage_worker,
                                               replay_buffer_worker)
                for self_play_worker in self_play_workers
            ]
            self._joe_logging(shared_storage_worker, replay_buffer_worker)
            training_worker.joe_update_weights(replay_buffer_worker,
                                               shared_storage_worker)
            info = shared_storage_worker.get_info()
            if info["training_step"] >= self.config.training_steps:
                break

        # self.muzero_weights = ray.get(shared_storage_worker.get_weights.remote())
        self.muzero_weights = shared_storage_worker.get_weights()
        # self.replay_buffer = ray.get(replay_buffer_worker.get_buffer.remote())
        self.replay_buffer = replay_buffer_worker.get_buffer()
        # Persist replay buffer to disk
        print("\n\nPersisting replay buffer games to disk...")
        with open(os.path.join(self.config.results_path, "replay_buffer.pkl"),
                  "wb") as buffer_file:
            pickle.dump(self.replay_buffer, buffer_file)
Example 8
    def _joe_logging(self, shared_storage_worker, replay_buffer_worker):
        """
        Keep track of the training performance
        """

        if not hasattr(self, '_has_logged_one'):
            # Launch the test worker to get performance metrics
            self._test_worker = self_play.SelfPlay(
                copy.deepcopy(self.muzero_weights),
                self.Game(self.config.seed + self.config.num_actors),
                self.config,
            )

            # Write everything in TensorBoard
            writer = SummaryWriter(self.config.results_path)

            print(
                "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n"
            )

            # Save hyperparameters to TensorBoard
            hp_table = [
                "| {} | {} |".format(key, value)
                for key, value in self.config.__dict__.items()
            ]
            writer.add_text(
                "Hyperparameters",
                "| Parameter | Value |\n|-------|-------|\n" +
                "\n".join(hp_table),
            )
            # Save model representation
            writer.add_text(
                "Model summary",
                str(models.MuZeroNetwork(self.config)).replace("\n", " \n\n"),
            )
            self._has_logged_one = True
            self._writer = writer
            self._counter = 0
            self._last_game_played = 0
            return

        info = shared_storage_worker.get_info()
        writer = self._writer
        counter = info['training_step']
        # Only log on checkpoint boundaries.
        if info['training_step'] % self.config.checkpoint_interval != 0:
            return

        # Play a test game roughly every third self-played game to refresh the reward metrics.
        games_played = replay_buffer_worker.get_self_play_count()
        if games_played % 3 == 0 and games_played != self._last_game_played:
            self._test_worker.joe_self_play(shared_storage_worker, None, True)
            # self._test_worker.joe_self_play(shared_storage_worker, replay_buffer_worker, True)
            self._last_game_played = games_played

        writer.add_scalar(
            "1.Total reward/1.Total reward",
            info["total_reward"],
            counter,
        )
        writer.add_scalar(
            "1.Total reward/2.Mean value",
            info["mean_value"],
            counter,
        )
        writer.add_scalar(
            "1.Total reward/3.Episode length",
            info["episode_length"],
            counter,
        )
        writer.add_scalar(
            "1.Total reward/4.MuZero reward",
            info["muzero_reward"],
            counter,
        )
        writer.add_scalar(
            "1.Total reward/5.Opponent reward",
            info["opponent_reward"],
            counter,
        )
        writer.add_scalar(
            "2.Workers/1.Self played games",
            # ray.get(replay_buffer_worker.get_self_play_count.remote()),
            replay_buffer_worker.get_self_play_count(),
            counter,
        )
        writer.add_scalar("2.Workers/2.Training steps", info["training_step"],
                          counter)
        writer.add_scalar(
            "2.Workers/3.Self played games per training step ratio",
            # ray.get(replay_buffer_worker.get_self_play_count.remote())
            replay_buffer_worker.get_self_play_count() /
            max(1, info["training_step"]),
            counter,
        )
        writer.add_scalar("2.Workers/4.Learning rate", info["lr"], counter)
        writer.add_scalar("3.Loss/1.Total weighted loss", info["total_loss"],
                          counter)
        writer.add_scalar("3.Loss/Value loss", info["value_loss"], counter)
        writer.add_scalar("3.Loss/Reward loss", info["reward_loss"], counter)
        writer.add_scalar("3.Loss/Policy loss", info["policy_loss"], counter)
        print(
            "Last test reward: {:.2f}. Training step: {}/{}. Played games: {}. Loss: {:.2f}"
            .format(
                info["total_reward"],
                info["training_step"],
                self.config.training_steps,
                # ray.get(replay_buffer_worker.get_self_play_count.remote()),
                replay_buffer_worker.get_self_play_count(),
                info["total_loss"],
            ),
            end="\r",
        )