    def render_rollout(self, horizon=50, path=None, render_type="pretty",
                       fps=8):
        """ Render a rollout into a video.

        Args:
            horizon: The number of timesteps to roll out.
            path: Directory where the video will be saved.
            render_type: Either 'pretty' (render the video from saved frame
                images) or 'fast' (render directly from the RGB observations).
            fps: Integer frames per second.
        """
        if path is None:
            path = os.path.abspath(os.path.dirname(__file__)) + "/videos"
            print(path)
            if not os.path.exists(path):
                os.makedirs(path)
        video_name = self.env_name + "_trajectory"

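        # 'pretty' writes each frame to disk and stitches the saved images into
        # a video; 'fast' builds the video directly from the in-memory RGB
        # observations returned by rollout().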
        if render_type == "pretty":
            image_path = os.path.join(path, "frames/")
            if not os.path.exists(image_path):
                os.makedirs(image_path)

            rewards, observations, full_obs = self.rollout(horizon=horizon, save_path=image_path)
            utility_funcs.make_video_from_image_dir(path, image_path, fps=fps, video_name=video_name)

            # Clean up images
            shutil.rmtree(image_path)
        else:
            rewards, observations, full_obs = self.rollout(horizon=horizon)
            utility_funcs.make_video_from_rgb_imgs(full_obs, path, fps=fps, video_name=video_name)
Example #2
    def render_rollout(self, horizon=500, path=None,
                       render_type='pretty', fps=8):
        """ Render a rollout into a video.

        Args:
            horizon: The number of timesteps to roll out.
            path: Directory where the video will be saved.
            render_type: Either 'pretty' (render the video from saved frame
                images) or 'fast' (render directly from the RGB observations).
            fps: Integer frames per second.
        """
        if path is None:
            path = os.path.abspath(os.path.dirname(__file__)) + '/videos'
            print(path)
            if not os.path.exists(path):
                os.makedirs(path)
        video_name = self.env_name + '_trajectory'

        if render_type == 'pretty':
            image_path = os.path.join(path, 'frames/')
            if not os.path.exists(image_path):
                os.makedirs(image_path)
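            # 'explore' environments render their frames via explore(); other
            # environments run a full rollout and dump its final_result to a
            # CSV file.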
            if self.env_name == 'explore':
                self.explore(horizon=horizon, save_path=image_path)
                utility_funcs.make_video_from_image_dir(path, image_path, fps=fps,
                                                        video_name=video_name)

            else:
                rewards, observations, full_obs, final_result = \
                    self.rollout(horizon=horizon, save_path=image_path)
                utility_funcs.make_video_from_image_dir(path, image_path, fps=fps,
                                                        video_name=video_name)

                with open('4-agents-50-hor-imps-nonuniform-rew-prior-ex1.csv', 'w') as writeFile:
                    writer = csv.writer(writeFile)
                    writer.writerows(final_result)

            # Clean up images
            shutil.rmtree(image_path)
        else:
            if self.env_name == 'explore':
                # Assumes explore() returns the per-step RGB frames when no
                # save_path is given.
                full_obs = self.explore(horizon=horizon)
                utility_funcs.make_video_from_rgb_imgs(full_obs, path, fps=fps,
                                                       video_name=video_name)
            else:
                rewards, observations, full_obs, final_result = self.rollout(horizon=horizon)
                utility_funcs.make_video_from_rgb_imgs(full_obs, path, fps=fps,
                                                       video_name=video_name)
Example #3
    def render_rollout(self, horizon=50, path=None, fps=8):
        """ Render a rollout into a video.

        Args:
            horizon: The number of timesteps to roll out.
            path: Directory where the video will be saved.
            fps: Integer frames per second.
        """
        if path is None:
            path = os.path.abspath(os.path.dirname(__file__)) + '/videos'
            print(path)
            if not os.path.exists(path):
                os.makedirs(path)
        video_name = self.env_name + '_trajectory'

        # if render_type == 'pretty':
        #     image_path = os.path.join(path, 'frames/')
        #     if not os.path.exists(image_path):
        #         os.makedirs(image_path)
        #
        #     rewards, observations, full_obs = self.rollout(
        #         horizon=horizon, save_path=image_path, train_agents=False)
        #     utility_funcs.make_video_from_image_dir(path, image_path, fps=fps,
        #                                             video_name=video_name)
        #
        #     # Clean up images
        #     shutil.rmtree(image_path)
        # else:
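        # Roll out without training the agents and build the video directly
        # from the collected RGB observations.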
        rewards, observations, full_obs = self.rollout(horizon=horizon,
                                                       train_agents=False,
                                                       print_act=False)
        utility_funcs.make_video_from_rgb_imgs(full_obs,
                                               path,
                                               fps=fps,
                                               video_name=video_name)
        return rewards
Example #4
def visualizer_rllib(args):
    result_dir = args.result_dir if args.result_dir[-1] != '/' \
        else args.result_dir[:-1]

    config = get_rllib_config(result_dir)
    pkl = get_rllib_pkl(result_dir)
    result = pkl

    # check if we have a multiagent scenario but in a
    # backwards compatible way
    if config.get('multiagent', {}).get('policies', {}):
        multiagent = True
        config['multiagent'] = pkl['multiagent']
    else:
        multiagent = False

    # Create and register a gym+rllib env
    env_creator = pkl['env_config']['func_create']
    env_name = config['env_config']['env_name']
    register_env(env_name, env_creator)

    ModelCatalog.register_custom_model("conv_to_fc_net", ObedienceLSTM)

    # Determine agent and checkpoint
    config_run = config['env_config']['run'] if 'run' in config['env_config'] \
        else None
    if (args.run and config_run):
        if (args.run != config_run):
            print('visualizer_rllib.py: error: run argument '
                  + '\'{}\' passed in '.format(args.run)
                  + 'differs from the one stored in params.json '
                  + '\'{}\''.format(config_run))
            sys.exit(1)
    if (args.run):
        agent_cls = get_agent_class(args.run)
    elif (config_run):
        agent_cls = get_agent_class(config_run)
    else:
        print('visualizer_rllib.py: error: could not find flow parameter '
              '\'run\' in params.json, '
              'add argument --run to provide the algorithm or model used '
              'to train the results\n e.g. '
              'python ./visualizer_rllib.py /tmp/ray/result_dir 1 --run PPO')
        sys.exit(1)

    # Run on only one cpu for rendering purposes if possible; A3C requires two
    if config_run == 'A3C':
        config['num_workers'] = 1
        config["sample_async"] = False
    else:
        config['num_workers'] = 0

    # create the agent that will be used to compute the actions
    agent = agent_cls(env=env_name, config=result)
    checkpoint = result_dir + '/checkpoint_' + args.checkpoint_num
    checkpoint = checkpoint + '/checkpoint-' + args.checkpoint_num
    print('Loading checkpoint', checkpoint)
    agent.restore(checkpoint)
    if hasattr(agent, "local_evaluator"):
        env = agent.local_evaluator.env

    if args.save_video:
        shape = env.base_map.shape
        full_obs = [np.zeros((shape[0], shape[1], 3), dtype=np.uint8)
                    for i in range(config["horizon"])]

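    # Pull each policy's initial recurrent state from the local evaluator; a
    # non-empty initial state marks the policy as an LSTM policy.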
    if hasattr(agent, "local_evaluator"):
        multiagent = agent.local_evaluator.multiagent
        if multiagent:
            policy_agent_mapping = agent.config["multiagent"][
                "policy_mapping_fn"]
            mapping_cache = {}
        policy_map = agent.local_evaluator.policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
    else:
        multiagent = False
        use_lstm = {DEFAULT_POLICY_ID: False}

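    # Roll out episodes until the configured horizon is reached, computing one
    # action per agent at every step.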
    steps = 0
    while steps < (config['horizon'] or steps + 1):
        state = env.reset()
        done = False
        reward_total = 0.0
        while not done and steps < (config['horizon'] or steps + 1):
            if multiagent:
                action_dict = {}
                for agent_id in state.keys():
                    a_state = state[agent_id]
                    if a_state is not None:
                        policy_id = mapping_cache.setdefault(
                            agent_id, policy_agent_mapping(agent_id))
                        p_use_lstm = use_lstm[policy_id]
                        if p_use_lstm:
                            a_action, p_state_init, _ = agent.compute_action(
                                a_state,
                                state=state_init[policy_id],
                                policy_id=policy_id)
                            state_init[policy_id] = p_state_init
                        else:
                            a_action = agent.compute_action(
                                a_state, policy_id=policy_id)
                        action_dict[agent_id] = a_action
                action = action_dict
            else:
                if use_lstm[DEFAULT_POLICY_ID]:
                    action, state_init, _ = agent.compute_action(
                        state, state=state_init)
                else:
                    action = agent.compute_action(state)

            if agent.config["clip_actions"]:
                # clipped_action = clip_action(action, env.action_space)
                next_state, reward, done, _ = env.step(action)
            else:
                next_state, reward, done, _ = env.step(action)

            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
            else:
                reward_total += reward

            if args.save_video:
                rgb_arr = env.map_to_colors()
                full_obs[steps] = rgb_arr.astype(np.uint8)

            steps += 1
            state = next_state
        print("Episode reward", reward_total)

    if args.save_video:
        path = os.path.abspath(os.path.dirname(__file__)) + '/videos'
        if not os.path.exists(path):
            os.makedirs(path)
        images_path = path + '/images/'
        if not os.path.exists(images_path):
            os.makedirs(images_path)
        utility_funcs.make_video_from_rgb_imgs(full_obs, path)

        # Clean up images
        shutil.rmtree(images_path)
Example #5
def rollout(args,
            agent,
            config,
            num_episodes,
            considered_player=None,
            coalition=None):
    if hasattr(agent, "workers"):
        env = agent.workers.local_worker().env

        if args.save_video:
            shape = env.base_map.shape
            full_obs = [
                np.zeros((shape[0], shape[1], 3), dtype=np.uint8)
                for i in range(config["horizon"])
            ]

        multiagent = isinstance(env, MultiAgentEnv)
        policy_agent_mapping = (
            agent.config["multiagent"]["policy_mapping_fn"]
            if agent.workers.local_worker().multiagent else None)
        mapping_cache = {}
        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
    else:
        multiagent = False
        use_lstm = {DEFAULT_POLICY_ID: False}

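    # Optionally restrict the agents' field of view before rolling out.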
    if config["agents_fov"] is not None:
        env.set_agents_fov(config["agents_fov"])

    agents_active = [f"agent-{i}" for i in range(args.agents_active)]

    # Rollout
    episode = 0
    rewards_list = []
    while episode < num_episodes:
        steps = 0

        state = env.reset()

        done = False
        reward_total = 0.0
        while not done and steps < (config['horizon'] or steps + 1):
            if args.render:
                print("render")
                env.render()
            if multiagent:
                if args.shapley_M is not None:
                    action = take_actions_for_coalition(
                        env, agent, considered_player, state, mapping_cache,
                        use_lstm, policy_agent_mapping, state_init, coalition,
                        args.missing_agents_behaviour, agents_active, args.run)
                else:
                    action = take_action(env, agent, state, mapping_cache,
                                         use_lstm, policy_agent_mapping,
                                         state_init, agents_active, args.run)

            else:
                if use_lstm[DEFAULT_POLICY_ID]:
                    action, state_init, _ = agent.compute_action(
                        state, state=state_init)
                else:
                    action = agent.compute_action(state)

            if agent.config["clip_actions"]:
                # action = clip_action(action, env.action_space)
                next_state, reward, done, _ = env.step(action)
            else:
                next_state, reward, done, _ = env.step(action)

            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
            else:
                reward_total += reward

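            # Log per-agent rewards for this step so social-outcome metrics
            # can be computed from the CSV afterwards.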
            if args.social_metrics:
                with open(f'{args.save_dir}/{args.exp_name}.csv',
                          'a',
                          newline='') as csvfile:
                    writer = csv.writer(csvfile, delimiter=',')
                    row = [episode] + [steps] + list(reward.values())
                    writer.writerow(row)

            if args.save_video:
                rgb_arr = env.map_to_colors()
                full_obs[steps] = rgb_arr.astype(np.uint8)

            steps += 1
            state = next_state

        print("Episode reward", reward_total)
        episode += 1
        rewards_list.append(reward_total)

        if args.save_video:
            path = os.path.abspath(os.path.dirname(__file__)) + '/videos'
            if not os.path.exists(path):
                os.makedirs(path)
            images_path = path + '/images/'
            if not os.path.exists(images_path):
                os.makedirs(images_path)
            utility_funcs.make_video_from_rgb_imgs(full_obs, path)

            # Clean up images
            shutil.rmtree(images_path)

    return rewards_list

def visualizer_rllib(args):
    result_dir = args.result_dir if args.result_dir[-1] != '/' \
        else args.result_dir[:-1]

    config = get_rllib_config(result_dir)
    pkl = get_rllib_pkl(result_dir)

    # check if we have a multiagent scenario but in a
    # backwards compatible way
    if config.get('multiagent', {}).get('policy_graphs', {}):
        multiagent = True
        config['multiagent'] = pkl['multiagent']
    else:
        multiagent = False

    # Create and register a gym+rllib env
    env_creator = pkl['env_config']['func_create']
    env_name = config['env_config']['env_name']
    register_env(env_name, env_creator.func)

    ModelCatalog.register_custom_model("conv_to_fc_net", ConvToFCNet)

    # Determine agent and checkpoint
    config_run = config['env_config']['run'] if 'run' in config['env_config'] \
        else None
    if (args.run and config_run):
        if (args.run != config_run):
            print('visualizer_rllib.py: error: run argument ' +
                  '\'{}\' passed in '.format(args.run) +
                  'differs from the one stored in params.json ' +
                  '\'{}\''.format(config_run))
            sys.exit(1)
    if (args.run):
        agent_cls = get_agent_class(args.run)
    elif (config_run):
        agent_cls = get_agent_class(config_run)
    else:
        print('visualizer_rllib.py: error: could not find flow parameter '
              '\'run\' in params.json, '
              'add argument --run to provide the algorithm or model used '
              'to train the results\n e.g. '
              'python ./visualizer_rllib.py /tmp/ray/result_dir 1 --run PPO')
        sys.exit(1)

    # Run on only one cpu for rendering purposes if possible; A3C requires two
    if config_run == 'A3C':
        config['num_workers'] = 1
    else:
        config['num_workers'] = 0

    # create the agent that will be used to compute the actions
    agent = agent_cls(env=env_name, config=config)
    checkpoint = result_dir + '/checkpoint_' + args.checkpoint_num
    checkpoint = checkpoint + '/checkpoint-' + args.checkpoint_num
    agent.restore(checkpoint)
    if hasattr(agent, "local_evaluator"):
        env = agent.local_evaluator.env

    if args.save_video:
        shape = env.map.shape
        full_obs = [
            np.zeros((shape[0], shape[1], 3), dtype=np.uint8)
            for i in range(config["horizon"])
        ]

    rets = {}
    # map the agent id to its policy
    policy_map_fn = config['multiagent']['policy_mapping_fn'].func
    for key in config['multiagent']['policy_graphs'].keys():
        rets[key] = []

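    # Recurrent models carry a shared hidden/cell state of size lstm_cell_size
    # across compute_action calls.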
    if config['model']['use_lstm']:
        use_lstm = True
        state_init = [
            np.zeros(config['model']['lstm_cell_size'], np.float32),
            np.zeros(config['model']['lstm_cell_size'], np.float32)
        ]
    else:
        use_lstm = False

    for i in range(args.num_rollouts):
        state = env.reset()
        done = False
        if multiagent:
            ret = {key: [0] for key in rets.keys()}
        else:
            ret = 0
        for j in range(config["horizon"]):
            action = {}
            for agent_id in state.keys():
                if use_lstm:
                    action[agent_id], state_init, logits = \
                        agent.compute_action(
                            state[agent_id],
                            state=state_init,
                            policy_id=policy_map_fn(agent_id))
                else:
                    action[agent_id] = agent.compute_action(
                        state[agent_id], policy_id=policy_map_fn(agent_id))
            observations, reward, done, _ = env.step(action)
            if args.render:
                env.render_map()
            if args.save_video:
                rgb_arr = env.map_to_colors()
                full_obs[j] = rgb_arr.astype(np.uint8)

            for actor, rew in reward.items():
                ret[policy_map_fn(actor)][0] += rew

            if multiagent and done['__all__']:
                break
            if not multiagent and done:
                break

        for key in rets.keys():
            rets[key].append(ret[key])

        for agent_id, rew in rets.items():
            print('Round {}, Return: {} for agent {}'.format(
                i, ret[agent_id], agent_id))
    for agent_id, rew in rets.items():
        print('Average, std return: {}, {} for agent {}'.format(
            np.mean(rew), np.std(rew), agent_id))

    if args.save_video:
        path = os.path.abspath(os.path.dirname(__file__)) + '/videos'
        if not os.path.exists(path):
            os.makedirs(path)
        images_path = path + '/images/'
        if not os.path.exists(images_path):
            os.makedirs(images_path)
        utility_funcs.make_video_from_rgb_imgs(full_obs, path)

        # Clean up images
        shutil.rmtree(images_path)

def rollout(agent,
            env_name,
            num_steps,
            num_episodes=0,
            saver=None,
            no_render=True,
            monitor=False):
    policy_agent_mapping = default_policy_agent_mapping

    if saver is None:
        saver = RolloutSaver()

    if hasattr(agent, "workers"):
        env = agent.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"][
                "policy_mapping_fn"]

        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
        action_init = {
            p: _flatten_action(m.action_space.sample())
            for p, m in policy_map.items()
        }
    else:
        env = gym.make(env_name)
        multiagent = False
        use_lstm = {DEFAULT_POLICY_ID: False}

    if monitor and not no_render and saver and saver.outfile is not None:
        # If monitoring has been requested,
        # manually wrap our environment with a gym monitor
        # which is set to record every episode.
        env = gym.wrappers.Monitor(
            env, os.path.join(os.path.dirname(saver.outfile), "monitor"),
            lambda x: True)

    steps = 0
    episodes = 0
    if not no_render:
        shape = env.base_map.shape
        full_obs = [
            np.zeros((shape[0], shape[1], 3), dtype=np.uint8)
            for i in range(num_steps * num_episodes)
        ]
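    # Run the requested number of episodes, tracking environmental and
    # intrinsic reward components in addition to the total reward.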
    while episodes < num_episodes:
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        saver.begin_rollout()
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_total = 0.0
        intrinsic_total = 0.0
        env_total = 0.0
        out_file = open('videos/communication_log.txt', 'w')
        out_file.write(f'\n\n episode-{episodes} \n\n')
        while not done and steps < num_steps:
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state
                    else:
                        a_action = agent.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                    a_action = _flatten_action(a_action)  # tuple actions
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict

            action = action if multiagent else action[_DUMMY_AGENT_ID]
            next_obs, reward, done, info = env.step(action)
            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward

            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
                intrinsic_total += sum([f['intrinsic'] for f in info.values()])
                env_total += sum([f['environmental'] for f in info.values()])
            else:
                reward_total += reward
            if not no_render:
                # env.render()
                rgb_arr = env.map_to_colors()
                full_obs[steps + (num_steps * episodes)] = rgb_arr.astype(
                    np.uint8)
                out_file.write(f'step-{steps}: {action}\n')

            saver.append_step(obs, action, next_obs, reward, done, info)
            steps += 1
            obs = next_obs
        out_file.close()
        saver.end_rollout()
        print("Episode #{}: reward: {}, intrinsic: {}, env: {}".format(
            episodes, reward_total, intrinsic_total, env_total))
        episodes += 1
        steps = 0

    if not no_render:
        path = os.path.abspath(os.path.dirname(__file__)) + '/videos'
        print('saving video to ', path)
        if not os.path.exists(path):
            os.makedirs(path)
        images_path = path + '/images/'
        if not os.path.exists(images_path):
            os.makedirs(images_path)
        utility_funcs.make_video_from_rgb_imgs(full_obs, path)

        # Clean up images
        shutil.rmtree(images_path)

def rollout(
    agent,
    env_name,
    num_steps,
    num_episodes=0,
    saver=None,
    no_render=True,
    video_dir=None,
    video_name=None,
):
    policy_agent_mapping = default_policy_agent_mapping

    if saver is None:
        saver = RolloutSaver()

    if hasattr(agent, "workers") and isinstance(agent.workers, WorkerSet):
        env = agent.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"][
                "policy_mapping_fn"]

        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
    else:
        env = gym.make(env_name)
        multiagent = False
        try:
            policy_map = {DEFAULT_POLICY_ID: agent.policy}
        except AttributeError:
            raise AttributeError(
                "Agent ({}) does not have a `policy` property! This is needed "
                "for performing (trained) agent rollouts.".format(agent))
        use_lstm = {DEFAULT_POLICY_ID: False}

    action_init = {
        p: flatten_to_single_ndarray(m.action_space.sample())
        for p, m in policy_map.items()
    }

    # If rendering, create an array to store observations
    if video_dir:
        shape = env.base_map.shape
        total_num_steps = max(num_steps,
                              num_episodes * agent.config["horizon"])
        all_obs = [
            np.zeros((shape[0], shape[1], 3), dtype=np.uint8)
            for _ in range(total_num_steps)
        ]

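    # keep_going() decides when to stop based on num_steps and num_episodes.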
    steps = 0
    episodes = 0
    while keep_going(steps, num_steps, episodes, num_episodes):
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        saver.begin_rollout()
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.0)
        done = False
        reward_total = 0.0
        while not done and keep_going(steps, num_steps, episodes,
                                      num_episodes):
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id,
                        )
                        agent_states[agent_id] = p_state
                    else:
                        a_action = agent.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id,
                        )
                    a_action = flatten_to_single_ndarray(a_action)
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict

            action = action if multiagent else action[_DUMMY_AGENT_ID]
            next_obs, reward, done, info = env.step(action)
            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward

            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
            else:
                reward_total += reward
            if not no_render:
                rgb_arr = env.full_map_to_colors()
                all_obs[steps] = rgb_arr.astype(np.uint8)
            saver.append_step(obs, action, next_obs, reward, done, info)
            steps += 1
            obs = next_obs
        saver.end_rollout()
        print("Episode #{}: reward: {}".format(episodes, reward_total))
        if done:
            episodes += 1

    # Render video from observations
    if video_dir:
        if not os.path.exists(video_dir):
            os.makedirs(video_dir)
        images_path = video_dir + "/images/"
        if not os.path.exists(images_path):
            os.makedirs(images_path)
        height, width, _ = all_obs[0].shape
        # Upscale to be more legible
        width *= 20
        height *= 20
        utility_funcs.make_video_from_rgb_imgs(all_obs,
                                               video_dir,
                                               video_name=video_name,
                                               resize=(width, height))

        # Clean up images
        shutil.rmtree(images_path)