Example #1
def play(env, model, video_path, num_episodes, timesteps, metadata):
    # Trajectory logs for the observation components and the actions taken.
    theta, theta_dot, actions = [], [], []
    for i_episodes in range(num_episodes):
        video_recorder = VideoRecorder(
            env=env, path=video_path, metadata=metadata, enabled=video_path is not None)
        obs = env.reset()
        for t in range(timesteps):
            # Reshape the observation into the batched input format the model expects.
            obs = [np.array([[list(obs)]])]
            video_recorder.capture_frame()
            action = model.predict(obs)[0]
            obs, rew, done, info = env.step(action)
            env.render()
            theta.append(obs[0])
            theta_dot.append(obs[1])
            actions.append(action[0])
            if done:
                print("Episode finished after {} timesteps".format(t + 1))
                # save video of first episode
                print("Saved video.")
                video_recorder.close()
                video_recorder.enabled = False
                break
    env.close()
    return theta
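The example above follows the Gym VideoRecorder pattern that recurs throughout this page: build the recorder with the environment and an output path, call capture_frame() once per step, then close() it and switch enabled off once the episode worth keeping is done. A minimal self-contained sketch of that pattern, assuming an older Gym release (pre-0.26, classic reset/step API) and a random policy standing in for a trained model:

# Minimal VideoRecorder sketch (assumes gym < 0.26: reset() -> obs, step() -> 4-tuple).
import gym
from gym.wrappers.monitoring.video_recorder import VideoRecorder

env = gym.make("CartPole-v1")
video_recorder = VideoRecorder(env, path="episode.mp4", enabled=True)

obs = env.reset()
done = False
while not done:
    video_recorder.capture_frame()              # grab a frame before each step
    action = env.action_space.sample()          # random policy stands in for model.predict
    obs, reward, done, info = env.step(action)

video_recorder.close()                          # flush frames and write episode.mp4
video_recorder.enabled = False
env.close()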
Example #2
def play(env, act, stochastic, video_path):
    video_recorder = VideoRecorder(env,
                                   video_path,
                                   enabled=video_path is not None)
    obs = env.reset()
    while True:
        env.unwrapped.render()
        video_recorder.capture_frame()
        action = act(np.array(obs)[None], stochastic=stochastic)[0]
        obs, rew, done, _ = env.step(action)
        if done:
            obs = env.reset()
            if video_recorder.enabled:
                # save video of first episode
                print("Saved video.")
                video_recorder.close()
                video_recorder.enabled = False
Example #3
def save_video(agent, video_path):
    video_recorder = VideoRecorder(
        agent.env, video_path, enabled=video_path is not None)
    state = agent.env.reset()
    state = state2tensor(state)
    for t in count():
        agent.env.unwrapped.render()
        video_recorder.capture_frame()
        action = select_best_action(state=state, agent=agent)
        next_state, rew, done, info = agent.env.step(action.item())
        next_state = state2tensor(next_state)
        state = next_state

        if done:
            # save video of first episode
            print("Saved video.")
            video_recorder.close()
            video_recorder.enabled = False
            break
Example #4
def _hrl_run_episodes(env,
                      agent: HIROAgent,
                      n_steps,
                      n_episodes,
                      max_episode_len=None,
                      logger=None,
                      step_number=None,
                      video_outdir=None):
    """Run multiple episodes and return returns."""
    assert (n_steps is None) != (n_episodes is None)
    evaluation_videos_dir = f'{video_outdir}/evaluation_videos'
    os.makedirs(evaluation_videos_dir, exist_ok=True)
    video_recorder = VideoRecorder(
        env, path=f'{evaluation_videos_dir}/evaluation_{step_number}.mp4')
    video_recorder.enabled = step_number is not None
    logger = logger or logging.getLogger(__name__)
    scores = []
    successes = 0
    success_rate = 0
    terminate = False
    timestep = 0
    env.evaluate = True
    reset = True
    while not terminate:
        if reset:
            # env.seed(np.random.randint(0, 2 ** 32 - 1))
            obs_dict = env.reset()
            fg = obs_dict['desired_goal']
            obs = obs_dict['observation']
            sg = env.subgoal_space.sample()
            done = False
            test_r = 0
            episode_len = 0
            info = {}

        a = agent.act_low_level(obs, sg)
        obs_dict, r, done, info = env.step(a)

        video_recorder.capture_frame()

        obs = obs_dict['observation']
        # select subgoal for the lower level controller.
        n_sg = agent.act_high_level(obs, fg, sg, timestep)

        test_r += r
        episode_len += 1
        timestep += 1
        reset = done or episode_len == max_episode_len or info.get(
            "needs_reset", False)
        agent.observe(obs, fg, n_sg, r, done, reset, timestep)
        sg = n_sg
        if reset:
            logger.info("evaluation episode %s length:%s R:%s", len(scores),
                        episode_len, test_r)
            success = agent.evaluate_final_goal(fg, obs)
            successes += 1 if success else 0
            logger.info(f"{successes} successes so far.")
            # As mixing float and numpy float causes errors in statistics
            # functions, here every score is cast to float.
            scores.append(float(test_r))

        if n_steps is None:
            terminate = len(scores) >= n_episodes
        else:
            terminate = timestep >= n_steps
    # If all steps were used for a single unfinished episode
    if len(scores) == 0:
        scores.append(float(test_r))
        logger.info("evaluation episode %s length:%s R:%s", len(scores),
                    episode_len, test_r)

    # n_episodes may be None (when n_steps drives termination), so normalize by the
    # number of episodes actually evaluated.
    success_rate = successes / len(scores)
    logger.info(f"Success Rate: {success_rate}")

    if step_number is not None:
        print("Saved video.")
    video_recorder.close()
    return scores, success_rate
Example #5
def run_learner(env_name,
                agent_class,
                max_episodes=1000,
                n_deterministic_episodes=10,
                output_freq=10,
                env_seed=None,
                tf_seed=None,
                max_time=200,
                video_save_root=None,
                **kwargs):
    """
    Run the reinforcement learning process (in either deterministic or training mode)

    :param env_name: name of the environment to use for training
    :param agent_class: class that can be used to create an agent object
    :param max_episodes: number of episodes to use for training
    :param n_deterministic_episodes: number of deterministic episodes to test the controller
    :param output_freq: Frequency at which to output episode results
    :param env_seed: random seed for the environment (to control the start conditions)
    :param tf_seed: random seed for tensorflow (to control network initialization)
    :param max_time: maximum number of time steps before terminating an episode
    :param video_save_root: directory to specify video save location
    :param kwargs: passed to the agent's __init__ method
    :return: trained agent and data from run
    """

    # Create the OpenAI Gym environment
    env = gym.make(env_name)

    # Set up the video saving directory
    if video_save_root and not video_save_root.exists():
        video_save_root.mkdir()

    # Implement random seeds
    if env_seed:
        env.seed(env_seed)

    if tf_seed:
        tf.compat.v1.random.set_random_seed(tf_seed)

    # Find state and action sizes of the environment
    state_size = env.observation_space.shape[0]
    if not hasattr(env.action_space, 'n'):
        action_size = env.action_space.shape[0]
    else:
        action_size = env.action_space.n

    # Create agent
    agent = agent_class(state_size, action_size,
                        actor_limit=env.action_space.high[0],  # size of action (assumes all actions scaled the same)
                        **kwargs)

    # Initialize data lists for episodes
    return_info = dict(score_list=[],
                       deterministic_runs=[],
                       states=[],
                       actions=[],
                       rewards=[],
                       dones=[])

    # Iterate through episodes
    for episode in range(max_episodes + n_deterministic_episodes):

        # Reset environment for beginning of episode
        state = env.reset()

        # Determine if the action should be stochastic or not (only true during training)
        deterministic_action = episode >= max_episodes

        # Sum of reward over an episode
        episode_rewards_sum = 0

        # Episode data lists
        states = []
        rewards = []
        actions = []
        dones = []

        # Set up video recorder if we want a video of this episode
        video_recorder = None
        if deterministic_action and video_save_root:
            video_path = str(video_save_root / f'episode-{agent.current_episode_number}.mp4')
            video_recorder = VideoRecorder(env, video_path, enabled=video_save_root is not None)

        # Conduct episode
        done = False
        t = 0
        while not done:

            # Show the frame and save the movie frame if this is in testing mode
            if deterministic_action:
                env.render()
                video_recorder.capture_frame()

            # Get action from agent
            action = agent.act(state, deterministic=deterministic_action)

            # Get signals from environment
            new_state, reward, done, info = env.step(action)

            # Save data
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            dones.append(done)
            episode_rewards_sum += reward

            # enforce maximum time for episode
            done = done or t == max_time - 1

            # Tell the agent a step in the environment has occurred
            if not deterministic_action:
                agent.on_t_update(state, action, new_state, reward, done)

            # Update state value
            state = new_state

            if done:

                # Output data
                if episode % output_freq == 0 or deterministic_action:
                    print(f"episode: {episode}/{max_episodes},     "
                          f"score: {episode_rewards_sum},     "
                          f"critic loss: {'-' if not agent.critic_losses else np.abs(agent.critic_losses[-1])},     "
                          f"actor loss: {'-' if not agent.actor_losses else np.abs(agent.actor_losses[-1])},     "
                          f"loss: {'-' if not agent.updates else np.abs(agent.updates[-1])},     "
                          f"greedy: {deterministic_action}")

                if episode > 0 and episode % 100 == 0:
                    avg_reward = np.average(return_info['score_list'][-100:])
                    print(f"Average reward over 100 episodes: {avg_reward}")

                if deterministic_action:
                    # Save episode reward
                    return_info['deterministic_runs'].append(episode_rewards_sum)

                    # Close the video recorder
                    video_recorder.close()
                    video_recorder.enabled = False
                else:
                    # Save episode reward
                    return_info['score_list'].append(episode_rewards_sum)

                    # Inform the agent that an episode has completed
                    agent.on_episode_complete(episode)

                # Update data lists
                return_info['states'].append(states)
                return_info['actions'].append(actions)
                return_info['rewards'].append(rewards)
                return_info['dones'].append(dones)

            # Update time
            t += 1

    # Save the agent if training has occurred
    if max_episodes > 0:
        agent.save(agent.sess, global_step=max_episodes)

    return agent, return_info
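The docstring above lists the knobs run_learner exposes; a hypothetical invocation under those assumptions might look like the sketch below. SomeActorCriticAgent and the environment name are placeholders, not part of the original code; the agent class only needs the act / on_t_update / on_episode_complete / save interface used in the loop above.

# Hypothetical call to run_learner (sketch only; SomeActorCriticAgent is a placeholder).
from pathlib import Path

agent, run_data = run_learner(
    env_name="Pendulum-v1",            # assumed continuous-action environment
    agent_class=SomeActorCriticAgent,  # placeholder agent class, not defined on this page
    max_episodes=500,
    n_deterministic_episodes=10,
    output_freq=10,
    env_seed=42,
    tf_seed=42,
    max_time=200,
    video_save_root=Path("videos"),    # a pathlib.Path, since .exists()/.mkdir() are called on it
)
print(run_data['deterministic_runs'])  # returns from the recorded evaluation episodes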
Example #6
def run_policy(env,
               get_action,
               env_params_list,
               max_ep_len=None,
               episode_id=0,
               record=False,
               recording_path=None,
               no_render=False,
               use_baselines=False):
    '''
        Run an episode of a trained policy.

        Args:
            env: Environment
            get_action: Policy function
            env_params_list: List of tasks from which one must be loaded
            max_ep_len: Maximum number of steps allowed in the episode
            episode_id: Id of the episode to load in `env_params_list`
            record: Whether a video of the episode should be recorded
            recording_path: Path at which the video must be saved
            no_render: Whether the episode must be run without rendering a frame
            use_baselines: Whether the policy was trained using OpenAI Baselines
    '''
    if record:
        if os.name == "nt":
            # Windows limits paths to 260 characters (MAX_PATH); trim the recording
            # path so the final file name, suffix included, stays under that limit.
            full_path = os.path.join(pathlib.Path().absolute(), recording_path)
            full_path_len = len(full_path)
            nb_char_to_remove = full_path_len - 245
            if nb_char_to_remove > 0:
                recording_path = recording_path[:-nb_char_to_remove]
        video_recorder = VideoRecorder(env,
                                       recording_path + "_ep" +
                                       str(episode_id) + ".mp4",
                                       enabled=True)

    if use_baselines:
        env.get_raw_env().set_environment(**env_params_list[episode_id])
    else:
        env.set_environment(**env_params_list[episode_id])

    if use_baselines:
        _, o = env.reset()
    else:
        o = env.reset()

    r, d, ep_ret, ep_len, n = 0, False, 0, 0, 0
    while True:
        if record and video_recorder.enabled:
            video_recorder.capture_frame()
        if not record and not no_render:
            env.render()
            time.sleep(1e-3)

        a = get_action(o)
        o, r, d, i = env.step(a)
        if use_baselines:
            ep_ret += i[0]["original_reward"][0]
        else:
            ep_ret += r
        ep_len += 1

        if d or (ep_len == max_ep_len):
            print('Episode %d \t EpRet %.3f \t EpLen %d' %
                  (episode_id, ep_ret, ep_len))
            if record and video_recorder.enabled:
                video_recorder.close()
                video_recorder.enabled = False
            break
    return ep_ret
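For context, a hypothetical call to run_policy under the assumptions of the docstring; env, get_action and env_params_list come from the surrounding project (a parametric environment exposing set_environment, a policy function and a list of task parameter dicts) and are placeholders here. With record=True, the video lands at recording_path + "_ep<episode_id>.mp4".

# Hypothetical invocation of run_policy (sketch only; env, get_action and
# env_params_list are placeholders from the surrounding project).
ep_ret = run_policy(env,
                    get_action,
                    env_params_list,
                    max_ep_len=2000,
                    episode_id=0,
                    record=True,
                    recording_path="videos/demo")  # video written to videos/demo_ep0.mp4
print("Episode return:", ep_ret)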
Example #7
    def work(self):
        self.summary_writer.add_graph(self.sess.graph)
        n_episodes = 1000

        episode_i = 0
        episode_len = 0
        cur_state = preprocessor_for_ddn.obs_to_state(self.env.reset())
        count = 1
        cum_reward = 0
        # start_life = 5
        # need_restart = True

        while episode_i < n_episodes:
            # setup video recorder
            video_path = os.path.join(BASE_VIDEO_PATH, f"{episode_i}.mp4")
            video_recorder = VideoRecorder(self.env,
                                           video_path,
                                           enabled=video_path is not None)

            # 1) sync from global model to local model
            # self._copy_to_local()

            # 2) collect t_max steps (if terminated then i++)
            steps = []
            # print(self.local_model.predict_policy(cur_state, self.sess), int(np.argmax(cur_state))% (6), int(np.argmax(cur_state)/(6)))
            for _ in range(self.t_max):
                # if need_restart:
                #     action = 0
                #     need_restart = False
                #     print('start using life: ' + str(start_life))
                # else:
                action = self.local_model.get_action(cur_state, self.sess)
                # print(action)

                next_state, reward, done, info = self.env.step(action)
                next_state = preprocessor_for_ddn.obs_to_state(next_state)

                # if start_life > info['ale.lives']:
                #     need_restart = True
                #     start_life = info['ale.lives']

                # capture video
                video_recorder.capture_frame()

                # reward *= MULT_FAC
                cum_reward += np.power(self.gamma, episode_len) * reward

                if reward != 0:
                    print('cum_reward: ' + str(cum_reward))
                # print(episode_len)

                episode_len = episode_len + 1
                # steps.append(
                #     Step(
                #         cur_step=cur_state,
                #         action=action,
                #         next_step=next_state,
                #         reward=reward,
                #         done=done
                #     )
                # )
                if done or episode_len >= MAX_STEPS_PER_EPISODE:
                    self.history.append(episode_len)
                    summary = tf.Summary()
                    summary.value.add(tag='Perf/episode_len',
                                      simple_value=float(episode_len))
                    summary.value.add(tag='Perf/episode_reward',
                                      simple_value=float(cum_reward))
                    self.summary_writer.add_summary(summary, episode_i)
                    print(episode_i)
                    print(summary)
                    print(
                        'worker {}: episode {} finished in {} steps, cumulative reward: {}'
                        .format(self.name, episode_i, episode_len, cum_reward))
                    # print(action)
                    if episode_i % 100 == 0 and episode_i != 0:
                        saver.save(
                            self.sess, self.model_path + '/model-' +
                            str(episode_i) + '.cptk')
                        print("Saved Model")
                    cum_reward = 0
                    episode_i = episode_i + 1
                    episode_len = 0
                    # start_life = 5
                    # need_restart = True
                    cur_state = preprocessor_for_ddn.obs_to_state(
                        self.env.reset())
                    break
                cur_state = next_state

            # save video
            print(f"Saving video to {video_path}")
            video_recorder.close()
            video_recorder.enabled = False
            print(f"Video saved")