Example #1
    def __init__(self, policy, env, args, test_env=None):
        self._policy = policy
        self._env = env
        self._test_env = self._env if test_env is None else test_env
        self._set_from_args(args)

        # prepare log directory
        self._output_dir = prepare_output_dir(args=args,
                                              user_specified_dir="./results",
                                              suffix="{}_{}".format(
                                                  self._policy.policy_name,
                                                  args.dir_suffix))
        self.logger = initialize_logger(logging_level=logging.getLevelName(
            args.logging_level),
                                        output_dir=self._output_dir)

        # Save and restore model
        checkpoint = tf.train.Checkpoint(policy=self._policy)
        self.checkpoint_manager = tf.train.CheckpointManager(
            checkpoint, directory=self._output_dir, max_to_keep=5)
        if args.model_dir is not None:
            assert os.path.isdir(args.model_dir)
            path_ckpt = tf.train.latest_checkpoint(args.model_dir)
            checkpoint.restore(path_ckpt)
            self.logger.info("Restored {}".format(path_ckpt))

        # prepare TensorBoard output
        self.writer = tf.summary.create_file_writer(self._output_dir)
        self.writer.set_as_default()
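
A standalone sketch of the save/restore pattern used above (the Keras model and directory are illustrative stand-ins for the policy and output dir):

import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(4)])   # stand-in for the policy
checkpoint = tf.train.Checkpoint(policy=model)
manager = tf.train.CheckpointManager(checkpoint, directory="./results", max_to_keep=5)

manager.save()                                    # writes a numbered checkpoint
latest = tf.train.latest_checkpoint("./results")  # newest checkpoint path, or None
if latest is not None:
    checkpoint.restore(latest)
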
Example #2
    def __init__(
            self,
            policy,
            env,
            args,
            test_env=None):
        self._set_from_args(args)
        self._policy = policy
        self._env = env
        self._test_env = self._env if test_env is None else test_env
        if self._normalize_obs:
            assert isinstance(env.observation_space, Box)
            self._obs_normalizer = EmpiricalNormalizer(
                shape=env.observation_space.shape)

        # prepare log directory
        self._output_dir = prepare_output_dir(
            args=args, user_specified_dir=self._logdir,
            suffix="{}_{}".format(self._policy.policy_name, args.dir_suffix))
        self.logger = initialize_logger(
            logging_level=logging.getLevelName(args.logging_level),
            output_dir=self._output_dir)

        if args.evaluate:
            assert args.model_dir is not None
        self._set_check_point(args.model_dir)

        # prepare TensorBoard output
        self.writer = tf.summary.create_file_writer(self._output_dir)
        self.writer.set_as_default()
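
When observation normalization is enabled (`self._normalize_obs`), observations are whitened with running statistics. A rough stand-in for that idea (this is not tf2rl's actual `EmpiricalNormalizer`, just a minimal sketch):

import numpy as np

class RunningObsNormalizer:
    """Tracks a running mean/variance and whitens incoming observations."""
    def __init__(self, shape, eps=1e-8):
        self.mean = np.zeros(shape)
        self.var = np.ones(shape)
        self.count = eps

    def __call__(self, obs, update=True):
        if update:
            # Incremental (Welford-style) update of mean and variance.
            self.count += 1
            delta = obs - self.mean
            self.mean += delta / self.count
            self.var += (delta * (obs - self.mean) - self.var) / self.count
        return (obs - self.mean) / np.sqrt(self.var + 1e-8)
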
Example #3
    def __init__(
            self,
            policy,
            env,
            params,
            test_env=None):
        """Initializing the training instance."""

        self._params = params
        self._set_from_params()
        self._policy = policy
        self._env = env
        self._test_env = self._env if test_env is None else test_env
        args = self._get_args_from_params()

        # Convolutional Autoencoder:
        self._CAE = CAE(pooling=self._params["cae"]["pooling"],
                        latent_dim=self._params["cae"]["latent_dim"],
                        input_shape=self._env.workspace.shape,
                        conv_filters=self._params["cae"]["conv_filters"])
        self._CAE.build(input_shape=(
            1, self._env.workspace.shape[0], self._env.workspace.shape[1], 1))
        self._CAE.load_weights(filepath=self._params["cae"]["weights_path"])
        for layer, _ in self._CAE._get_trainable_state().items():
            layer.trainable = False

        # Initialize array for trajectory storage
        self.trajectory = []

        # Initialize workspace relabeler:
        self._relabeler = PointrobotRelabeler(
            ws_shape=(self._env.grid_size, self._env.grid_size),
            mode=params["trainer"]["relabeling_mode"],
            remove_zigzaging=params["trainer"]["remove_zigzaging"]
            )

        # prepare log directory
        self._output_dir = prepare_output_dir(
            args=args, user_specified_dir=self._logdir,
            suffix="{}_{}".format(self._policy.policy_name, params["trainer"]["dir_suffix"]))
        self.logger = initialize_logger(
            logging_level=logging.getLevelName(params["trainer"]["logging_level"]),
            output_dir=self._output_dir)
        if self._save_test_path_sep:
            sep_logdirs = ['successful_trajs', 'unsuccessful_trajs', 'unfinished_trajs']
            for logdir in sep_logdirs:
                if not os.path.exists(os.path.join(self._logdir, logdir)):
                    os.makedirs(os.path.join(self._logdir, logdir))

        if params["trainer"]["mode"] == "evaluate":
            assert glob.glob(os.path.join(params["trainer"]["model_dir"], '*'))
        self._set_check_point(params["trainer"]["model_dir"])

        # prepare TensorBoard output
        self.writer = tf.summary.create_file_writer(self._output_dir)
        self.writer.set_as_default()

        # relabeling visualization:
        self._relabel_fig = plt.figure(2)
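
The CAE block above loads pretrained weights and freezes them through the private `_get_trainable_state()`; a simpler equivalent sketch with a plain Keras model (architecture and weights path are assumptions):

import tensorflow as tf

encoder = tf.keras.Sequential([
    tf.keras.layers.Conv2D(8, 3, activation="relu", input_shape=(32, 32, 1)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(16),
])
encoder.load_weights("cae_weights.h5")  # hypothetical pretrained checkpoint
encoder.trainable = False               # freezes every layer in one step
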
Example #4
    def __init__(self, policy, env, args, test_env=None):
        """
        Initialize Trainer class

        Args:
            policy: Policy to be trained
            env (gym.Env): Environment used for training
            args (Namespace or dict): Config parameters specified via the command line
            test_env (gym.Env): Environment used for evaluation.
        """
        if isinstance(args, dict):
            _args = args
            args = policy.__class__.get_argument(Trainer.get_argument())
            args = args.parse_args([])
            for k, v in _args.items():
                if hasattr(args, k):
                    setattr(args, k, v)
                else:
                    raise ValueError(f"{k} is invalid parameter.")

        self._set_from_args(args)
        self._policy = policy
        self._env = env
        self._test_env = self._env if test_env is None else test_env
        if self._normalize_obs:
            assert isinstance(env.observation_space, Box)
            self._obs_normalizer = EmpiricalNormalizer(
                shape=env.observation_space.shape)

        # prepare log directory
        self._output_dir = prepare_output_dir(args=args,
                                              user_specified_dir=self._logdir,
                                              suffix="{}_{}".format(
                                                  self._policy.policy_name,
                                                  args.dir_suffix))
        self.logger = initialize_logger(logging_level=logging.getLevelName(
            args.logging_level),
                                        output_dir=self._output_dir)

        if args.evaluate:
            assert args.model_dir is not None
        self._set_check_point(args.model_dir)

        # prepare TensorBoard output
        self.writer = tf.summary.create_file_writer(self._output_dir)
        self.writer.set_as_default()
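
A hedged usage sketch for the dict branch above, assuming a tf2rl-style setup (import paths, algorithm, and argument names may differ between versions):

import gym
from tf2rl.algos.ddpg import DDPG
from tf2rl.experiments.trainer import Trainer

env = gym.make("Pendulum-v0")
policy = DDPG(state_shape=env.observation_space.shape,
              action_dim=env.action_space.high.size,
              gpu=-1)
# Only keys accepted by get_argument() are valid; unknown keys raise ValueError.
trainer = Trainer(policy, env, args={"max_steps": 10000, "logging_level": "INFO"})
trainer()
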
Example #5
    def __init__(self, policy, env, args, test_env=None):
        self._policy = policy
        self._env = env
        self._test_env = self._env if test_env is None else test_env
        self._set_from_args(args)

        # prepare log directory
        self._output_dir = prepare_output_dir(args=args,
                                              user_specified_dir="./results")
        logging.basicConfig(level=logging.getLevelName(args.logging_level))
        self.logger = logging.getLogger(__name__)

        # Save and restore model
        self.checkpoint_manager = tf.contrib.checkpoint.CheckpointManager(
            tf.train.Checkpoint(policy=self._policy),
            directory=self._output_dir,
            max_to_keep=5)

        # prepare TensorBoard output
        self.writer = tf.contrib.summary.create_file_writer(self._output_dir)
        self.writer.set_as_default()
        tf.contrib.summary.initialize()
Example #6
    def __init__(
            self,
            policy,
            env,
            args,
            test_env=None):
        self._set_from_args(args)
        self._policy = policy
        self._env = env
        self._test_env = self._env if test_env is None else test_env
        if self._normalize_obs:
            assert isinstance(env.observation_space, Box)
            self._obs_normalizer = EmpiricalNormalizer(
                shape=env.observation_space.shape)

        # prepare log directory
        self._output_dir = prepare_output_dir(
            args=args, user_specified_dir=self._logdir,
            suffix="{}_{}".format(self._policy.policy_name, args.dir_suffix))
        self.logger = initialize_logger(
            logging_level=logging.getLevelName(args.logging_level),
            output_dir=self._output_dir)

        # Save and restore model
        self._checkpoint = tf.train.Checkpoint(policy=self._policy)
        self.checkpoint_manager = tf.train.CheckpointManager(
            self._checkpoint, directory=self._output_dir, max_to_keep=5)
        if args.evaluate:
            assert args.model_dir is not None
        if args.model_dir is not None:
            assert os.path.isdir(args.model_dir)
            self._latest_path_ckpt = tf.train.latest_checkpoint(args.model_dir)
            self._checkpoint.restore(self._latest_path_ckpt)
            self.logger.info("Restored {}".format(self._latest_path_ckpt))

        # prepare TensorBoard output
        self.writer = tf.summary.create_file_writer(self._output_dir)
        self.writer.set_as_default()
Example #7
    opt.apply_gradients(zip(grads, actor.trainable_weights))
    return loss


print('Train...')
losses = []  # Keep track of the losses over time.
for epoch in range(5):
    # Iterate over the batches of a dataset.
    for step, x in enumerate(train_dataset):
        loss = training_step(x)
        # Logging.
        losses.append(float(loss))
        if step % 100 == 0:
            print("Epoch", epoch, "Step:", step, "Loss:",
                  sum(losses) / len(losses))

        # Stop after 10000 steps.
        # Training the model to convergence is left
        # as an exercise to the reader.
        # if step >= 10000:
        #     break

output_dir = prepare_output_dir(args=None,
                                user_specified_dir=None,
                                suffix="{}_{}".format(policy.policy_name,
                                                      args.dir_suffix))
checkpoint = tf.train.Checkpoint(policy=policy)
checkpoint_manager = tf.train.CheckpointManager(checkpoint,
                                                directory=output_dir,
                                                max_to_keep=5)
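
Example #7 begins mid-function; a hedged reconstruction of the kind of training_step it was cut from (the model, optimizer, and placeholder loss are assumptions):

import tensorflow as tf

actor = tf.keras.Sequential([tf.keras.layers.Dense(1)])
opt = tf.keras.optimizers.Adam(learning_rate=1e-3)

@tf.function
def training_step(x):
    with tf.GradientTape() as tape:
        prediction = actor(x, training=True)
        loss = tf.reduce_mean(tf.square(prediction))  # placeholder loss
    grads = tape.gradient(loss, actor.trainable_weights)
    opt.apply_gradients(zip(grads, actor.trainable_weights))
    return loss
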
Example #8
def evaluator(is_training_done,
              env,
              policy_fn,
              set_weights_fn,
              queue,
              gpu,
              save_model_interval=int(1e6),
              n_evaluation=10,
              episode_max_steps=1000,
              show_test_progress=False):
    """
    Evaluate trained network weights periodically.

    :param is_training_done (multiprocessing.Event):
        multiprocessing.Event object to share the status of training.
    :param env (Gym environment):
        Environment object.
    :param policy_fn (function):
        Method object to generate an explorer.
    :param set_weights_fn (function):
        Method object to set network weights gotten from queue.
    :param queue (multiprocessing.Queue):
        A FIFO shared with the learner to get the latest network weights.
        This is process-safe, so you don't need to lock it before use.
    :param gpu (int):
        GPU id. If this is set to -1, then this process uses only CPU.
    :param save_model_interval (int):
        Interval to save model.
    :param n_evaluation (int):
        Number of episodes to evaluate.
    :param episode_max_steps (int):
        Maximum number of steps of an episode.
    :param show_test_progress (bool):
        If true, `render` will be called to visualize evaluation process.
    """
    tf = import_tf()
    logger = logging.getLogger("tf2rl")

    output_dir = prepare_output_dir(args=None,
                                    user_specified_dir="./results",
                                    suffix="evaluator")
    writer = tf.summary.create_file_writer(output_dir,
                                           filename_suffix="_evaluation")
    writer.set_as_default()

    policy = policy_fn(env, "Learner", gpu=gpu)
    model_save_threshold = save_model_interval

    checkpoint = tf.train.Checkpoint(policy=policy)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint,
                                                    directory=output_dir,
                                                    max_to_keep=10)

    while not is_training_done.is_set():
        n_evaluated_episode = 0
        # Wait until new weights arrive
        if queue.empty():
            continue
        else:
            set_weights_fn(policy, queue.get())
            trained_steps = queue.get()
            tf.summary.experimental.set_step(trained_steps)
            avg_test_return = 0.
            for _ in range(n_evaluation):
                n_evaluated_episode += 1
                episode_return = 0.
                obs = env.reset()
                done = False
                for _ in range(episode_max_steps):
                    action = policy.get_action(obs, test=True)
                    next_obs, reward, done, _ = env.step(action)
                    if show_test_progress:
                        env.render()
                    episode_return += reward
                    obs = next_obs
                    if done:
                        break
                avg_test_return += episode_return
                # Break if new weights arrive
                if not queue.empty():
                    break
            avg_test_return /= n_evaluated_episode
            logger.info("Evaluation: {} over {} run".format(
                avg_test_return, n_evaluated_episode))
            tf.summary.scalar(name="apex/average_test_return",
                              data=avg_test_return)
            writer.flush()
            if trained_steps > model_save_threshold:
                model_save_threshold += save_model_interval
                checkpoint_manager.save()
    checkpoint_manager.save()
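
A hedged sketch of how such an evaluator is typically launched in its own process (make_env, policy_fn, and set_weights_fn are assumed helpers, not tf2rl's exact API):

import multiprocessing as mp

if __name__ == "__main__":
    is_training_done = mp.Event()
    weights_queue = mp.Queue()
    eval_proc = mp.Process(
        target=evaluator,
        kwargs=dict(is_training_done=is_training_done,
                    env=make_env(),                 # hypothetical env factory
                    policy_fn=policy_fn,            # assumed helper
                    set_weights_fn=set_weights_fn,  # assumed helper
                    queue=weights_queue,
                    gpu=-1))
    eval_proc.start()
    # ... the learner pushes (weights, then trained step count) into weights_queue ...
    is_training_done.set()
    eval_proc.join()
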
Example #9
def learner(global_rb, trained_steps, is_training_done, lock, env, policy_fn,
            get_weights_fn, n_training, update_freq, evaluation_freq, gpu,
            queues):
    """
    Update network weights using samples collected by explorers.

    :param global_rb (multiprocessing.managers.AutoProxy[PrioritizedReplayBuffer]):
        Prioritized replay buffer shared between multiple explorers and a single learner.
        This object is shared across processes, so any operation on it must be
        guarded with the `lock` object.
    :param trained_steps (multiprocessing.Value):
        Shared counter of how many gradient updates have been applied so far.
    :param is_training_done (multiprocessing.Event):
        multiprocessing.Event object to share the status of training.
    :param lock (multiprocessing.Lock):
        multiprocessing.Lock to lock other processes.
    :param env (Gym environment):
        Environment object.
    :param policy_fn (function):
        Method object to generate an explorer.
    :param get_weights_fn (function):
        Method object to get network weights and put them to queue.
    :param n_training (int):
        Maximum number of times to apply gradients. Once this many gradient updates
        have been applied, training ends by setting `is_training_done` to `True`.
    :param update_freq (int):
        Interval (in gradient steps) at which network parameters are put into `queues`.
    :param evaluation_freq (int):
        Frequency to call `evaluator`.
    :param gpu (int):
        GPU id. If this is set to -1, then this process uses only CPU.
    :param queues (List):
        List of Queues shared with explorers to send latest network parameters.
    """
    tf = import_tf()
    logger = logging.getLogger("tf2rl")

    policy = policy_fn(env, "Learner", global_rb.get_buffer_size(), gpu=gpu)

    output_dir = prepare_output_dir(args=None,
                                    user_specified_dir="./results",
                                    suffix="learner")
    writer = tf.summary.create_file_writer(output_dir)
    writer.set_as_default()

    # Wait until explorers collect transitions
    while (not is_training_done.is_set()
           and global_rb.get_stored_size() < policy.n_warmup):
        continue

    start_time = time.time()
    while not is_training_done.is_set():
        trained_steps.value += 1
        tf.summary.experimental.set_step(trained_steps.value)
        lock.acquire()
        samples = global_rb.sample(policy.batch_size)
        lock.release()
        td_errors = policy.train(samples["obs"], samples["act"],
                                 samples["next_obs"], samples["rew"],
                                 samples["done"], samples["weights"])
        writer.flush()
        lock.acquire()
        global_rb.update_priorities(samples["indexes"],
                                    np.abs(td_errors) + 1e-6)
        lock.release()

        # Put updated weights to queue
        if trained_steps.value % update_freq == 0:
            weights = get_weights_fn(policy)
            for i in range(len(queues) - 1):
                queues[i].put(weights)
            fps = update_freq / (time.time() - start_time)
            tf.summary.scalar(name="apex/fps", data=fps)
            logger.info(
                "Update weights. {0:.2f} FPS for GRAD. Learned {1:.2f} steps".
                format(fps, trained_steps.value))
            start_time = time.time()

        # Periodically do evaluation
        if trained_steps.value % evaluation_freq == 0:
            queues[-1].put(get_weights_fn(policy))
            queues[-1].put(trained_steps.value)

        if trained_steps.value >= n_training:
            is_training_done.set()
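
A hedged sketch of how the queues argument is typically assembled: one queue per explorer plus a final one reserved for the evaluator (names are illustrative):

import multiprocessing as mp

n_explorers = 4
explorer_queues = [mp.Queue() for _ in range(n_explorers)]
evaluator_queue = mp.Queue()
queues = explorer_queues + [evaluator_queue]
# learner() broadcasts weights to queues[:-1] every update_freq gradient steps and
# pushes (weights, then trained step count) to queues[-1] every evaluation_freq steps.
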
Example #10
def learner(global_rb, trained_steps, is_training_done,
            lock, env_fn, policy_fn, n_training, update_freq, *queues):
    """
    Update network weights using samples collected by explorers.
    Args:
        global_rb:
            Prioritized replay buffer shared between multiple explorers and a single learner.
            This object is shared across processes, so any operation on it must be
            guarded with the `lock` object.
        trained_steps:
            Shared counter of how many gradient updates have been applied so far.
        is_training_done:
            multiprocessing.Event object that signals whether training has finished.
        lock:
            multiprocessing.Lock to lock other processes.
            It must be released once the guarded operation is done.
        env_fn:
            Method object to generate an environment.
        policy_fn:
            Method object to generate an explorer.
        n_training:
            Maximum number of times to apply gradients. Once this many gradient updates
            have been applied, training ends by setting `is_training_done` to `True`.
        update_freq:
            Interval (in gradient steps) at which network parameters are put into `queues`.
        queues:
            FIFOs shared with explorers to send latest network parameters.
    """
    env = env_fn()
    policy = policy_fn(env, "Learner", global_rb.get_buffer_size())
    update_step = update_freq

    output_dir = prepare_output_dir(args=None, user_specified_dir="./results")
    writer = tf.contrib.summary.create_file_writer(output_dir)
    writer.set_as_default()
    tf.contrib.summary.initialize()
    total_steps = tf.train.create_global_step()

    # Wait until explorers collect transitions
    while not is_training_done.is_set() and global_rb.get_stored_size() == 0:
        continue

    start_time = time.time()
    while not is_training_done.is_set():
        with tf.contrib.summary.record_summaries_every_n_global_steps(1000):
            trained_steps.value += 1
            total_steps.assign(trained_steps.value)
            lock.acquire()
            samples = global_rb.sample(policy.batch_size)
            with tf.contrib.summary.always_record_summaries():
                td_error = policy.train(
                    samples["obs"], samples["act"], samples["next_obs"],
                    samples["rew"], np.array(samples["done"], dtype=np.float64),
                    samples["weights"])
                writer.flush()
            global_rb.update_priorities(samples["indexes"], np.abs(td_error) + 1e-6)
            lock.release()

            # Put updated weights to queue
            if trained_steps.value > update_step:
                weights = []
                weights.append(policy.actor.weights)
                weights.append(policy.critic.weights)
                weights.append(policy.critic_target.weights)
                for queue in queues:
                    queue.put(weights)
                update_step += update_freq
                with tf.contrib.summary.always_record_summaries():
                    fps = update_freq / (time.time() - start_time)
                    tf.contrib.summary.scalar(name="FPS", tensor=fps, family="loss")
                    print("Update weights for explorer. {0:.2f} FPS for GRAD. Learned {1:.2f} steps".format(fps, trained_steps.value))
                start_time = time.time()

        if trained_steps.value >= n_training:
            is_training_done.set()