Example #1
    def __init__(
            self,
            policy,
            env,
            params,
            test_env=None):
        """Initializing the training instance."""

        self._params = params
        self._set_from_params()
        self._policy = policy
        self._env = env
        self._test_env = self._env if test_env is None else test_env
        args = self._get_args_from_params()

        # Convolutional Autoencoder:
        self._CAE = CAE(pooling=self._params["cae"]["pooling"],
                        latent_dim=self._params["cae"]["latent_dim"],
                        input_shape=self._env.workspace.shape,
                        conv_filters=self._params["cae"]["conv_filters"])
        self._CAE.build(input_shape=(1, self._env.workspace.shape[0], self._env.workspace.shape[1], 1))
        self._CAE.load_weights(filepath=self._params["cae"]["weights_path"])
        for layer, _ in self._CAE._get_trainable_state().items():
            layer.trainable = False

        # Initialize list for trajectory storage
        self.trajectory = []

        # Initialize workspace relabeler:
        self._relabeler = PointrobotRelabeler(
            ws_shape=(self._env.grid_size, self._env.grid_size),
            mode=params["trainer"]["relabeling_mode"],
            remove_zigzaging=params["trainer"]["remove_zigzaging"]
            )

        # prepare log directory
        self._output_dir = prepare_output_dir(
            args=args, user_specified_dir=self._logdir,
            suffix="{}_{}".format(self._policy.policy_name, params["trainer"]["dir_suffix"]))
        self.logger = initialize_logger(
            logging_level=logging.getLevelName(params["trainer"]["logging_level"]),
            output_dir=self._output_dir)
        if self._save_test_path_sep:
            sep_logdirs = ['successful_trajs', 'unsuccessful_trajs', 'unfinished_trajs']
            for logdir in sep_logdirs:
                if not os.path.exists(os.path.join(self._logdir, logdir)):
                    os.makedirs(os.path.join(self._logdir, logdir))

        if params["trainer"]["mode"] == "evaluate":
            assert glob.glob(os.path.join(params["trainer"]["model_dir"], '*'))
        self._set_check_point(params["trainer"]["model_dir"])

        # prepare TensorBoard output
        self.writer = tf.summary.create_file_writer(self._output_dir)
        self.writer.set_as_default()

        # relabeling visualization:
        self._relabel_fig = plt.figure(2)
Example #2
def test_cae_initialization():
    pooling = 'max'
    latent_dim = 16
    input_shape = (32, 32)
    conv_filters = [4, 8, 16]
    model = CAE(pooling, latent_dim, input_shape, conv_filters)

    x = tf.random.uniform([1, 32, 32, 1])
    x_hat = model(x)
    assert x_hat.shape == (1, 32, 32, 1), "output shape is not (1, 32, 32, 1)"

    workspace = np.random.uniform(size=(32, 32))
    y = model.evaluate(workspace)
    assert isinstance(y, np.ndarray), "Type of latent output is not np.ndarray"
    assert y.shape == (16, ), "latent output shape is not (16,)"
Example #3
    def setUp(self):
        """setup"""
        self.params = load_params('params/test_params.json')

        self.env = gym.make(self.params["env"]["name"], params=self.params)
        self.test_env = gym.make(self.params["env"]["name"],
                                 params=self.params)

        self.policy = DDPG(env=self.env, params=self.params)

        self.cae = CAE(pooling='max',
                       latent_dim=16,
                       input_shape=(32, 32),
                       conv_filters=[4, 8, 16])
        self.cae.build(input_shape=(1, 32, 32, 1))
        self.cae.load_weights(filepath='../models/cae/model_num_5_size_8.h5')
Example #4
class PointrobotTrainerTests(unittest.TestCase):
    """For testing the Pointrobot trainer."""
    def setUp(self):
        """setup"""
        self.params = load_params('params/test_params.json')

        self.env = gym.make(self.params["env"]["name"], params=self.params)
        self.test_env = gym.make(self.params["env"]["name"],
                                 params=self.params)

        self.policy = DDPG(env=self.env, params=self.params)

        self.cae = CAE(pooling='max',
                       latent_dim=16,
                       input_shape=(32, 32),
                       conv_filters=[4, 8, 16])
        self.cae.build(input_shape=(1, 32, 32, 1))
        self.cae.load_weights(filepath='../models/cae/model_num_5_size_8.h5')

    def test_pointrobot_trainer_init(self):
        """tests the __init__() function of the pointrobot trainer"""
        trainer = PointrobotTrainer(self.policy,
                                    self.env,
                                    self.params,
                                    test_env=self.test_env)

    def test_evaluation(self):
        """tests the evaluation method of the pointrobot trainer"""

        trainer = PointrobotTrainer(self.policy,
                                    self.env,
                                    self.params,
                                    test_env=self.test_env)

        trainer.evaluate()

    def test_training(self):
        """sanity check of the training method."""

        self.params["trainer"]["max_steps"] = 1e4

        trainer = PointrobotTrainer(self.policy,
                                    self.env,
                                    self.params,
                                    test_env=self.test_env)

        trainer.train()
Example #5
def test_autoencoder_training():

    parser = CAEtrainer.get_arguments()
    args = parser.parse_args()

    args.num_workspaces = 10
    args.epochs = 10
    args.batch_size = 2
    if len(os.listdir(args.workspace_dir)) == 0:
        args.gen_workspace = True

    input_shape = (args.grid_size, args.grid_size)
    model = CAE(args.pooling, args.latent_dim, input_shape, args.conv_filters)
    optimizer = opt.Adam(learning_rate=args.learning_rate,
                         beta_1=0.9,
                         beta_2=0.999,
                         epsilon=1e-7)
    print('optimizer: {}'.format(optimizer))

    # loss function. Calculating the positive weights for it:
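    # beta is roughly (total number of grid pixels) / (expected number of obstacle
    # pixels), so the sparse obstacle pixels (presumably the positive class) are
    # up-weighted in the cross-entropy.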
    mean_obj_num = (args.num_obj_max + 1) / 2
    ratio = args.grid_size**2 / (mean_obj_num * (args.obj_size_avg**2))
    beta = ratio
    loss_func = weighted_cross_entropy(beta=beta)
    print('Loss function: WCE with beta: {}'.format(beta))

    trainer = CAEtrainer(CAE=model,
                         optimizer=optimizer,
                         loss_func=loss_func,
                         args=args)

    trainer()

    # Plot results on an unseen workspace: #

    fig = plt.figure(num=1, figsize=(10, 5))
    plt.plot(trainer._train_losses)
    plt.plot(trainer._val_losses)

    # check out the model:

    path = os.path.join('../workspaces/',
                        ('ws_' + str(args.num_workspaces - 1) + '.csv'))
    x = np.expand_dims(np.loadtxt(path), axis=2).astype('float32')
    x = np.expand_dims(x, axis=0)
    x = tf.convert_to_tensor(x)

    x_hat = tf.cast(trainer._CAE(x) >= 0.5, tf.float32)

    fig2 = visualize_workspace(x.numpy()[0, :, :, 0], fignum=2)
    fig3 = visualize_workspace(x_hat.numpy()[0, :, :, 0], fignum=3)

    plt.show()
Example #6
def test_weight_loading():
    model = CAE(pooling='max',
                latent_dim=16,
                input_shape=(32, 32),
                conv_filters=[4, 8, 16])
    model.build(input_shape=(1, 32, 32, 1))
    model.load_weights(filepath='../models/cae/model_num_5_size_8.h5')

    for k, _ in model._get_trainable_state().items():
        k.trainable = False
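
The loop above flips `trainable` through the private `_get_trainable_state()` helper. The same effect can usually be achieved with the public Keras attributes; a minimal sketch, assuming `CAE` subclasses `tf.keras.Model` and using the same constructor arguments and weight file as the test above:

from hwr.cae.cae import CAE

model = CAE(pooling='max',
            latent_dim=16,
            input_shape=(32, 32),
            conv_filters=[4, 8, 16])
model.build(input_shape=(1, 32, 32, 1))
model.load_weights(filepath='../models/cae/model_num_5_size_8.h5')

# Setting `trainable` on the model propagates to all sub-layers and freezes the
# weights without touching the private _get_trainable_state() API.
model.trainable = False

assert len(model.trainable_variables) == 0
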
Example #7
import numpy as np
import os
import matplotlib.pyplot as plt

from tensorflow.data import Dataset
from tensorflow.keras import optimizers as opt  # needed for opt.Adam below
from tensorflow.keras.losses import BinaryCrossentropy
from hwr.cae.cae import CAE
from hwr.cae.cae_trainer import CAEtrainer, weighted_cross_entropy
from hwr.random_workspace import visualize_workspace
"""Train a Convolutional Autoencoder."""

parser = CAEtrainer.get_arguments()
args = parser.parse_args()

input_shape = (args.grid_size, args.grid_size)
model = CAE(args.pooling, args.latent_dim, input_shape, args.conv_filters)
optimizer = opt.Adam(learning_rate=args.learning_rate,
                     beta_1=0.9,
                     beta_2=0.999,
                     epsilon=1e-7)
print('optimizer: {}'.format(optimizer))

# loss function. Calculating the positive weights for it:
mean_obj_num = (args.num_obj_max + 1) / 2
ratio = args.grid_size**2 / (mean_obj_num * (args.obj_size_avg**2))
beta = ratio
loss_func = weighted_cross_entropy(beta=beta)
print('Loss function: WCE with beta: {}'.format(beta))

trainer = CAEtrainer(CAE=model,
                     optimizer=optimizer,
                     loss_func=loss_func,
                     args=args)
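
For reference, `weighted_cross_entropy(beta)` is imported from `hwr.cae.cae_trainer` above and returns the loss function handed to the trainer. The sketch below is an illustration of such a factory, not the package's actual implementation; the name `weighted_bce` is hypothetical and it assumes the CAE outputs per-pixel probabilities in [0, 1]:

import tensorflow as tf

def weighted_bce(beta):
    """Binary cross-entropy in which positive (obstacle) pixels are weighted by beta."""
    def loss(y_true, y_pred):
        eps = 1e-7
        y_pred = tf.clip_by_value(y_pred, eps, 1.0 - eps)
        per_pixel = -(beta * y_true * tf.math.log(y_pred)
                      + (1.0 - y_true) * tf.math.log(1.0 - y_pred))
        return tf.reduce_mean(per_pixel)
    return loss
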
Example #8
import tensorflow as tf
import numpy as np
import os
import matplotlib.pyplot as plt

from hwr.cae.cae import CAE
from hwr.random_workspace import visualize_workspace
"""Visualize the output of the trained Convolutional Autoencoder"""

pooling = 'max'
latent_dim = 16
input_shape = (32, 32)
conv_filters = [4, 8, 16]
model = CAE(
    pooling=pooling,
    latent_dim=latent_dim,
    input_shape=input_shape,
    conv_filters=conv_filters,
)
model.build(input_shape=(1, 32, 32, 1))
model.load_weights(filepath='../models/cae/model_num_5_size_8.h5')

# Plot results on an unseen workspace: #
path = os.path.join('../workspaces/', ('ws_' + str(9500) + '.csv'))
x = np.expand_dims(np.loadtxt(path), axis=2).astype('float32')
x = np.expand_dims(x, axis=0)
x = tf.convert_to_tensor(x)

x_hat = tf.cast(model(x) >= 0.5, tf.float32)

fig2 = visualize_workspace(x.numpy()[0, :, :, 0], fignum=2)
fig3 = visualize_workspace(x_hat.numpy()[0, :, :, 0], fignum=3)
Example #9
class PointrobotTrainer:
    def __init__(
            self,
            policy,
            env,
            params,
            test_env=None):
        """Initializing the training instance."""

        self._params = params
        self._set_from_params()
        self._policy = policy
        self._env = env
        self._test_env = self._env if test_env is None else test_env
        args = self._get_args_from_params()

        # Convolutional Autoencoder:
        self._CAE = CAE(pooling=self._params["cae"]["pooling"],
                        latent_dim=self._params["cae"]["latent_dim"],
                        input_shape=self._env.workspace.shape,
                        conv_filters=self._params["cae"]["conv_filters"])
        self._CAE.build(input_shape=(1, self._env.workspace.shape[0], self._env.workspace.shape[1], 1))
        self._CAE.load_weights(filepath=self._params["cae"]["weights_path"])
        for layer, _ in self._CAE._get_trainable_state().items():
            layer.trainable = False

        # Initialize list for trajectory storage
        self.trajectory = []

        # Initialize workspace relabeler:
        self._relabeler = PointrobotRelabeler(
            ws_shape=(self._env.grid_size, self._env.grid_size),
            mode=params["trainer"]["relabeling_mode"],
            remove_zigzaging=params["trainer"]["remove_zigzaging"]
            )

        # prepare log directory
        self._output_dir = prepare_output_dir(
            args=args, user_specified_dir=self._logdir,
            suffix="{}_{}".format(self._policy.policy_name, params["trainer"]["dir_suffix"]))
        self.logger = initialize_logger(
            logging_level=logging.getLevelName(params["trainer"]["logging_level"]),
            output_dir=self._output_dir)
        if self._save_test_path_sep:
            sep_logdirs = ['successful_trajs', 'unsuccessful_trajs', 'unfinished_trajs']
            for logdir in sep_logdirs:
                if not os.path.exists(os.path.join(self._logdir, logdir)):
                    os.makedirs(os.path.join(self._logdir, logdir))

        if params["trainer"]["mode"] == "evaluate":
            assert glob.glob(os.path.join(params["trainer"]["model_dir"], '*'))
        self._set_check_point(params["trainer"]["model_dir"])

        # prepare TensorBoard output
        self.writer = tf.summary.create_file_writer(self._output_dir)
        self.writer.set_as_default()

        # relabeling visualization:
        self._relabel_fig = plt.figure(2)


    def _set_check_point(self, model_dir):
        # Save and restore model
        self._checkpoint = tf.train.Checkpoint(policy=self._policy)
        self.checkpoint_manager = tf.train.CheckpointManager(
            self._checkpoint, directory=model_dir, max_to_keep=5)

        if model_dir is not None:
            if not os.path.isdir(model_dir):
                os.makedirs(model_dir)
            self._latest_path_ckpt = tf.train.latest_checkpoint(model_dir)
            self._checkpoint.restore(self._latest_path_ckpt)
            self.logger.info("Restored {}".format(self._latest_path_ckpt))


    def train(self):
        """method for training an agent with Hindsight Workspace Relabeling"""

        # training mode:
        self._policy.eval_mode = False

        total_steps = 0
        tf.summary.experimental.set_step(total_steps)
        episode_steps = 0
        episode_return = 0
        episode_start_time = time.perf_counter()
        n_episode = 0
        success_traj_train = 0.

        relabeling_times, training_times = [], []

        #Initialize replay buffer
        self._replay_buffer = get_replay_buffer(
            self._policy, self._env, self._use_prioritized_rb,
            self._use_nstep_rb, self._n_step)

        # resetting:
        self.trajectory = []
        workspace, goal, obs = self._env.reset()
        
        # Concatenate the position observation with the goal and the reduced workspace
        reduced_workspace = self._CAE.evaluate(workspace)
        obs_full = np.concatenate((obs, goal, reduced_workspace))
        

        while total_steps < self._max_steps:
        
            # Visualize the environment if show_progress is enabled
            if self._show_progress and \
                ((n_episode % self._show_progress_interval) == 0) and \
                total_steps > self._policy.n_warmup:
                self._env.render()

            if total_steps in self._params["agent"]["lr_decay_steps"]:
                ind = self._params["agent"]["lr_decay_steps"].index(total_steps)
                self._params["agent"]["lr_actor"] = self._params["agent"]["actor_lr_decay_vals"][ind]
                self._params["agent"]["lr_actor"] = self._params["agent"]["critic_lr_decay_vals"][ind]
                self._policy.actor_optimizer.learning_rate = self._params["agent"]["lr_actor"]
                self._policy.critic_optimizer.learning_rate = self._params["agent"]["lr_critic"]
                print("---- Learning rate: {}".format(self._policy.actor_optimizer.learning_rate))

            # Sample a random action during warmup, otherwise query the actor network
            if total_steps < self._policy.n_warmup:
                action = self._env.action_space.sample()
            else:
                action = self._policy.get_action(obs_full)

            # Step the environment to get next_obs, reward and the done flag
            next_obs, reward, done, _ = self._env.step(action)
            next_obs_full = np.concatenate((next_obs, goal, reduced_workspace))

            # add the new point to replay buffer
            self._replay_buffer.add(obs=obs_full, act=action,
                next_obs=next_obs_full, rew=reward, done=done)

            # Add the observation to the trajectory storage
            self.trajectory.append({'workspace': workspace,'position': obs,
                'next_position': next_obs,'goal': goal, 'action': action, 'reward': reward, 'done': done})

            obs = next_obs
            obs_full = next_obs_full        
            
            episode_steps += 1
            episode_return += reward
            total_steps += 1
            tf.summary.experimental.set_step(total_steps)

            if done or episode_steps == self._episode_max_steps:
                
                if reward != self._env.goal_reward:
                    # Hindsight workspace relabeling: the episode did not end with the
                    # goal reward, so generate a new workspace that is consistent with
                    # the recorded trajectory and add the relabeled transitions to the
                    # replay buffer.

                    # plotting the trajectory:
                    if self._params["trainer"]["show_relabeling"]:                    
                        self._relabel_fig = visualize_trajectory(
                            trajectory=self.trajectory, 
                            fig=self._relabel_fig,
                            env=self._env
                            )
                        plt.pause(1)

                    relabeling_begin = time.time()
                    # Create new workspace for the trajectory:
                    relabeled_trajectory = self._relabeler.relabel(trajectory=self.trajectory, env=self._env)

                    if relabeled_trajectory:
                        relabeled_ws = relabeled_trajectory[0]['workspace']
                        relabeled_reduced_ws = self._CAE.evaluate(relabeled_ws)
                        
                        # adding the points of the relabeled trajectory to the replay buffer:
                        for point in relabeled_trajectory:
                            relabeled_obs_full = np.concatenate((point['position'],
                                point['goal'], relabeled_reduced_ws))
                            relabeled_next_obs_full = np.concatenate((point['next_position'],
                                point['goal'], relabeled_reduced_ws))
                            self._replay_buffer.add(obs=relabeled_obs_full, act=point['action'],
                                next_obs=relabeled_next_obs_full, rew=point['reward'], done=point['done'])

                        # plotting the relabeled trajectory:
                        if self._params["trainer"]["show_relabeling"]:
                            self._relabel_fig = visualize_trajectory( 
                                trajectory=relabeled_trajectory,
                                fig=self._relabel_fig,
                                env=self._env
                                )
                            plt.pause(1)

                        relabeling_times.append(time.time() - relabeling_begin)

                else:
                    success_traj_train += 1

                # resetting:
                workspace, goal, obs = self._env.reset()
                reduced_workspace = self._CAE.evaluate(workspace)
                obs_full = np.concatenate((obs, goal, reduced_workspace))
                self.trajectory = []

                # Log the training success rate
                n_episode += 1
                if n_episode % self._test_episodes == 0:
                    train_success_rate = success_traj_train / self._test_episodes

                    fps = episode_steps / (time.perf_counter() - episode_start_time)
                    self.logger.info("Total Epi: {0: 5} Train success rate: {1: 5.4f} Total Steps: {2: 7} Episode Steps: {3: 5} Return: {4: 5.4f} Last reward: {5: 5.4f} FPS: {6: 5.2f}".format(
                        n_episode, train_success_rate, total_steps, episode_steps, episode_return, reward, fps))
                    tf.summary.scalar(
                        name="Common/training_return", data=episode_return)
                    tf.summary.scalar(
                        name="Common/training_success_rate", data=train_success_rate)
                    success_traj_train = 0

                    if len(relabeling_times) != 0:
                        print('average relabeling time: {}'.format(sum(relabeling_times) / len(relabeling_times)))
                        relabeling_times = []
                    if len(training_times) != 0:
                        print('average training time: {}'.format(sum(training_times) / len(training_times)))
                        training_times = []

                episode_steps = 0
                episode_return = 0
                episode_start_time = time.perf_counter()

            # During warmup we only collect experience without training
            if total_steps <= self._policy.n_warmup:
                continue
            
            # After every Update_interval we want to train/update the Actor-NN, Critic-NN, 
            # and the Target-Actor-NN & Target-Critic-NN
            if total_steps % self._policy.update_interval == 0:
                training_begin = time.time()
                #Sample a new batch of experiences from the replay buffer for training
                samples = self._replay_buffer.sample(self._policy.batch_size)

                with tf.summary.record_if(total_steps % self._save_summary_interval == 0):
                    # Here we update the Actor-NN, Critic-NN, and the Target-Actor-NN & Target-Critic-NN 
                    # after computing the Critic-loss and the Actor-loss
                    self._policy.train(
                        samples["obs"], samples["act"], samples["next_obs"],
                        samples["rew"], np.array(samples["done"], dtype=np.float32),
                        None if not self._use_prioritized_rb else samples["weights"])
            
                if self._use_prioritized_rb:
                    #Here we compute the Td-Critic-Loss/error
                    td_error = self._policy.compute_td_error(
                        samples["obs"], samples["act"], samples["next_obs"],
                        samples["rew"], np.array(samples["done"], dtype=np.float32))
                    self._replay_buffer.update_priorities(
                        samples["indexes"], np.abs(td_error) + 1e-6)

                training_times.append(time.time() - training_begin)

            # Every test_interval we want to test our agent 
            if total_steps % self._test_interval == 0:
                
                # setting evaluation mode for deterministic actions:
                self._policy.eval_mode = True

                avg_test_return, success_rate, ratio_straight_lines, success_rate_straight_line, success_rate_no_straight_line = self.evaluate_policy(total_steps)
                self.logger.info("Evaluation: Total Steps: {0: 7} Average Reward {1: 5.4f} and Sucess rate: {2: 5.4f} for {3: 2} episodes".format(
                    total_steps, avg_test_return, success_rate, self._test_episodes))
                tf.summary.scalar(
                    name="Common/average_test_return", data=avg_test_return)
                tf.summary.scalar(
                    name="Common/test_success_rate", data=success_rate)
                tf.summary.scalar(
                    name="Ratio_feasible straight_line episodes", data=ratio_straight_lines)
                tf.summary.scalar(
                    name="test_success_rate straight_line episodes", data=success_rate_straight_line)
                tf.summary.scalar(
                    name="test_success_rate no_straight_line episodes", data=success_rate_no_straight_line)
                tf.summary.scalar(name="Common/fps", data=fps)
                self.writer.flush()

                # setting evaluation mode back to false:
                self._policy.eval_mode = False

            # Every save_model_interval we save the model
            if total_steps % self._save_model_interval == 0:
                self.checkpoint_manager.save()

        tf.summary.flush()

    
    def evaluate(self):
        """method for evaluating a pretrained agent for some episodes."""
        self._policy.eval_mode = True

        avg_test_return, success_rate, ratio_straight_lines, success_rate_straight_line, success_rate_no_straight_line = self.evaluate_policy(total_steps=0)
        print("----- Evaluation -----")
        print("avg test return: {}".format(avg_test_return))
        print("avg test success rate: {}".format(success_rate))
        print("Ratio of feasible straight_line episodes: {}".format(ratio_straight_lines))
        print("avg test success_rate for straight_line episodes: {}".format(success_rate_straight_line))
        print("avg test success_rate for no_straight_line episodes: {}".format(success_rate_no_straight_line))

        return avg_test_return, success_rate, ratio_straight_lines, success_rate_straight_line, success_rate_no_straight_line


    def evaluate_policy_continuously(self):
        """
        Periodically search the latest checkpoint, and keep evaluating with the latest model until user kills process.
        """
        if self._model_dir is None:
            self.logger.error("Please specify model directory by passing command line argument `--model-dir`")
            exit(-1)

        self.evaluate_policy(total_steps=0)
        while True:
            latest_path_ckpt = tf.train.latest_checkpoint(self._model_dir)
            if self._latest_path_ckpt != latest_path_ckpt:
                self._latest_path_ckpt = latest_path_ckpt
                self._checkpoint.restore(self._latest_path_ckpt)
                self.logger.info("Restored {}".format(self._latest_path_ckpt))
            self.evaluate_policy(total_steps=0)

    def evaluate_policy(self, total_steps):
        """evaluating the policy."""

        tf.summary.experimental.set_step(total_steps)
        
        total_test_return = 0.
        success_traj = 0
        if self._save_test_path:
            replay_buffer = get_replay_buffer(
                self._policy, self._test_env, size=self._episode_max_steps)

        straight_line_episode = 0 
        no_straight_line_episode = 0
        success_traj_straight_line = 0
        success_traj_no_straight_line = 0

        for i in range(self._test_episodes):
            episode_return = 0.
            frames = []
            workspace, goal, obs = self._test_env.reset()
            start = obs
            reduced_workspace = self._CAE.evaluate(workspace)
            # Concatenate the position observation with the goal and the reduced workspace
            obs_full = np.concatenate((obs, goal, reduced_workspace))

            for _ in range(self._episode_max_steps):
                action = self._policy.get_action(obs_full)
                next_obs, reward, done, _ = self._test_env.step(action)
                # Concatenate the next position observation with the goal and the reduced workspace
                next_obs_full = np.concatenate((next_obs, goal, reduced_workspace))

                # Add the observation to the trajectory storage
                self.trajectory.append({'workspace': workspace,'position': obs,
                    'next_position': next_obs,'goal': goal, 'action': action, 'reward': reward, 'done': done})
                
                if self._save_test_path:
                    replay_buffer.add(obs=obs_full, act=action,
                                next_obs=next_obs_full, rew=reward, done=done)

                if self._save_test_movie:
                    frames.append(self._test_env.render(mode='plot'))

                elif self._show_test_progress:
                    self._test_env.render()

                episode_return += reward
                obs = next_obs
                obs_full = next_obs_full
                
                if done:
                    break

            prefix = "step_{0:08d}_epi_{1:02d}_return_{2:010.4f}".format(
                total_steps, i, episode_return)

            if self._save_test_path:
                save_path(replay_buffer._encode_sample(np.arange(self._episode_max_steps)),
                          os.path.join(self._output_dir, prefix + ".pkl"))
                replay_buffer.clear()

            if self._save_test_movie:
                frames_to_gif(frames, prefix, self._output_dir)

            if self._save_test_path_sep:
                self._save_traj_separately(prefix)
                   
            total_test_return += episode_return

            if straight_line_feasible(workspace, start, goal, self._test_env):
                straight_line_episode += 1
                if reward == self._test_env.goal_reward:        
                    success_traj_straight_line += 1
            else:
                no_straight_line_episode += 1
                if reward == self._test_env.goal_reward:        
                    success_traj_no_straight_line += 1

            if reward == self._test_env.goal_reward:        
                success_traj += 1

            # empty trajectory:
            self.trajectory = []

        if self._show_test_images:
            images = tf.cast(
                tf.expand_dims(np.array(obs).transpose(2, 0, 1), axis=2),
                tf.uint8)
            tf.summary.image('train/input_img', images,)

        avg_test_return = total_test_return / self._test_episodes
        success_rate = success_traj / self._test_episodes
        if straight_line_episode > 0:
            success_rate_straight_line = success_traj_straight_line/straight_line_episode
        else:
            success_rate_straight_line = 0
        if no_straight_line_episode > 0:
            success_rate_no_straight_line = success_traj_no_straight_line/no_straight_line_episode
        else:
            success_rate_no_straight_line = 0
        ratio_straight_lines = straight_line_episode / self._test_episodes

        return avg_test_return, success_rate, ratio_straight_lines, success_rate_straight_line, success_rate_no_straight_line


    def _save_traj_separately(self, prefix):
        """Saves the test trajectories into separate folders under the logdir
        based on the ending of the trajectory.
        """
        last_reward = self.trajectory[-1]['reward']

        if last_reward == self._env.goal_reward:
            log_dir = os.path.join(self._logdir, 'successful_trajs')
        elif last_reward == self._env.collision_reward:
            log_dir = os.path.join(self._logdir, 'unsuccessful_trajs')
        else:
            log_dir = os.path.join(self._logdir, 'unfinished_trajs')

        file_name = os.path.join(log_dir, prefix + '.pkl')
        joblib.dump(self.trajectory, file_name)


    def _set_from_params(self):
        # experiment settings
        self._max_steps = self._params["trainer"]["max_steps"]
        self._episode_max_steps = self._params["trainer"]["episode_max_steps"] \
            if self._params["trainer"]["episode_max_steps"] is not None \
            else self._params["trainer"]["max_steps"]
        self._n_experiments = self._params["trainer"]["n_experiments"]
        self._show_progress = self._params["trainer"]["show_progress"]
        self._show_progress_interval = self._params["trainer"]["show_progress_interval"]
        self._save_model_interval = self._params["trainer"]["save_model_interval"]
        self._save_summary_interval = self._params["trainer"]["save_summary_interval"]
        self._normalize_obs = self._params["trainer"]["normalize_obs"]
        self._logdir = self._params["trainer"]["logdir"]
        self._model_dir = self._params["trainer"]["model_dir"]
        # replay buffer
        self._use_prioritized_rb = self._params["trainer"]["use_prioritized_rb"]
        self._use_nstep_rb = self._params["trainer"]["use_nstep_rb"]
        self._n_step = self._params["trainer"]["n_step"]
        # test settings
        self._test_interval = self._params["trainer"]["test_interval"]
        self._show_test_progress = self._params["trainer"]["show_test_progress"]
        self._test_episodes = self._params["trainer"]["test_episodes"]
        self._save_test_path = self._params["trainer"]["save_test_path"]
        self._save_test_path_sep = self._params["trainer"]["save_test_path_sep"]
        self._save_test_movie = self._params["trainer"]["save_test_movie"]
        self._show_test_images = self._params["trainer"]["show_test_images"]


    def _get_args_from_params(self):
        """creates an argparse Namespace object from params for the tf2rl based classes."""

        args = {}
        for key in self._params["trainer"]:
            args[key] = self._params["trainer"][key]

        return argparse.Namespace(**args)
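
Putting the pieces together, a minimal evaluation script could look roughly like the sketch below. The constructor calls mirror the unittest setup in Example #4, but the import paths for DDPG, PointrobotTrainer and load_params are assumptions, since they are not shown in the snippets above.

import gym

# NOTE: only the hwr.cae.* import paths are confirmed by the snippets above;
# the module paths below are assumptions for illustration.
from hwr.agents.ddpg import DDPG                                # hypothetical path
from hwr.training.pointrobot_trainer import PointrobotTrainer   # hypothetical path
from hwr.utils import load_params                               # hypothetical path

params = load_params('params/test_params.json')

env = gym.make(params["env"]["name"], params=params)
test_env = gym.make(params["env"]["name"], params=params)

policy = DDPG(env=env, params=params)
trainer = PointrobotTrainer(policy, env, params, test_env=test_env)

# The constructor restores the latest checkpoint from params["trainer"]["model_dir"];
# evaluate() then reports average return and the success rates,
# while train() would run training with hindsight workspace relabeling.
trainer.evaluate()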