Example #1
def test_malformed_lookup():
    try:
        spec("“Breakout-v0”")
    except error.Error as e:
        assert "Malformed environment ID" in f"{e}", f"Unexpected message: {e}"
    else:
        assert False
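A more compact form of the same check uses pytest.raises with a match pattern, as later examples already do; a minimal sketch, assuming spec and error come from gym (gym.envs.registration.spec and gym.error):

import pytest
from gym import error
from gym.envs.registration import spec

def test_malformed_lookup_compact():
    # The curly quotes deliberately make the ID malformed.
    with pytest.raises(error.Error, match="Malformed environment ID"):
        spec("“Breakout-v0”")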
Example #2
def test_default_lookups():
    register("test/Test3")

    with pytest.raises(error.DeprecatedEnv):
        spec("test/Test3-v0")

    # Lookup default
    spec("test/Test3")
Example #3
def test_versioned_lookups():
    register("test/Test2-v5")

    with pytest.raises(error.VersionNotFound):
        spec("test/Test2-v9")

    with pytest.raises(error.DeprecatedEnv):
        spec("test/Test2-v4")

    assert spec("test/Test2-v5")
Example #4
def test_missing_lookup():
    register(id="Test1-v0", entry_point=None)
    register(id="Test1-v15", entry_point=None)
    register(id="Test1-v9", entry_point=None)
    register(id="Other1-v100", entry_point=None)

    with pytest.raises(error.DeprecatedEnv):
        spec("Test1-v1")

    try:
        spec("Test1-v1000")
    except error.UnregisteredEnv:
        pass
    else:
        assert False

    try:
        spec("Unknown1-v1")
    except error.UnregisteredEnv:
        pass
    else:
        assert False
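The two try/except/else blocks above can also be written with pytest.raises, which this test already uses for the deprecated case; a sketch under the same assumptions (gym's error module and the registrations made above):

import pytest
from gym import error
from gym.envs.registration import spec

def check_unregistered_lookups():
    # A version far above any registered one is reported as unregistered...
    with pytest.raises(error.UnregisteredEnv):
        spec("Test1-v1000")
    # ...and so is an ID that was never registered at all.
    with pytest.raises(error.UnregisteredEnv):
        spec("Unknown1-v1")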
Example #5
def test_spec():
    spec = envs.spec("CartPole-v0")
    assert spec.id == "CartPole-v0"
Example #6
def test_spec():
    spec = envs.spec('CartPole-v0')
    assert spec.id == 'CartPole-v0'
Example #7
    def score_evaluation(self, benchmark, env_id, data_sources, initial_reset_timestamps, episode_lengths, episode_rewards, episode_types, timestamps):
        tasks = benchmark.task_specs(env_id)
        spec = envs.spec(env_id)

        #### 0. Compute timing stats

        if len(initial_reset_timestamps) > 0:
            initial_reset_timestamp = min(initial_reset_timestamps)
        else:
            initial_reset_timestamp = 0


        # How long each episode actually took
        timestamps = np.array(timestamps)
        durations = _compute_episode_durations(initial_reset_timestamps, data_sources, timestamps)

        #### Grab the data corresponding to each of evaluation/training
        lengths = np.array(episode_lengths)
        rewards = np.array(episode_rewards)

        #### Calculate the total elapsed time (in various units)
        #### for each episode

        # How many training timesteps have elapsed by the end of each
        # episode. Not to be confused with Unix timestamps.
        elapsed_timesteps = np.cumsum(lengths)
        # Total number of seconds elapsed by the end of each
        # episode. Note that with n parallel workers each running for
        # m seconds, we want to count the total time as n * m.
        elapsed_seconds = np.cumsum(durations)
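        # Worked illustration with assumed numbers (not from the source data):
        # two workers whose episodes took [2.0, 3.0] s and [4.0] s give
        # durations of [2.0, 3.0, 4.0] and elapsed_seconds of [2.0, 5.0, 9.0],
        # i.e. 9 worker-seconds in total even though wall-clock time was less.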

        # List of score for each task
        scores = []
        # List of lists of solved episodes for each task
        solves = []
        # List of lists of episode rewards for each task
        rewards = []
        _timestamps = []
        elapsed_times = []
        for task in tasks:
            # Find the first episode where we're over the allotted
            # training timesteps.
            cutoff_idx = _find_cutoffs_for_task(task, elapsed_timesteps, elapsed_seconds)
            if not np.isfinite(cutoff_idx):
                # All episodes are fair game
                cutoff_idx = len(lengths)

            reward = np.array(episode_rewards)[:cutoff_idx]

            score, solved = self.score_and_solved_func(task, reward, elapsed_seconds[:cutoff_idx])

            scores.append(score)
            solves.append(solved)
            rewards.append(reward)

            if np.any(timestamps[:cutoff_idx]):
                last_timestamp = timestamps[cutoff_idx - 1]
                elapsed_time = elapsed_seconds[cutoff_idx - 1]
            else:
                # If we don't have any valid episodes, then the
                # last valid timestamp is when we started.
                last_timestamp = initial_reset_timestamp
                elapsed_time = 0.0

            # Record the timestamp of the last episode
            _timestamps.append(last_timestamp)
            elapsed_times.append(elapsed_time)

        return {
            'rewards': rewards,
            'scores': scores,
            'solves': solves,
            'timestamps': _timestamps,
            'elapsed_times': elapsed_times,
            'initial_reset_timestamp': initial_reset_timestamp,
        }
Example #8
def generate_yaml(
        exp_names,
        output_path,  # number=None
):
    # The yaml file should reflect the complete information of the experiment,
    # so we do not allow `number` as an argument.
    # Get the trial_name -> json_path dict.

    assert spec(args.env_name)  # make sure no typo in env_name
    trial_json_dict = {}
    if isinstance(exp_names, str):
        exp_names = [exp_names]
    for exp_name in exp_names:
        trial_json_dict.update(get_trial_json_dict(exp_name))
    # Get the trial_name-trial_data dict. This is not ordered.
    trial_data_dict = get_trial_data_dict(trial_json_dict)
    K = 3
    trial_performance_list = []
    for i, (trial_name, data) in enumerate(trial_data_dict.items()):
        avg = data[PERFORMANCE_METRIC].tail(K).mean()
        if np.isnan(avg):
            avg = float("-inf")
            print("Avg: ", avg, np.isnan(avg))
        trial_performance_list.append([trial_name, avg])
    # print("Collected trial_performance_list: ", trial_performance_list)
    sorted_trial_pfm_list = sorted(trial_performance_list,
                                   key=lambda pair: pair[1])

    def get_video_name(trial_name, performance):
        # trial_name: PPO_BipedalWalker-v2_38_seed=138
        # result: "PPO seed=138 rew=249.01"
        components = trial_name.split("_")
        try:
            ret = "{0} {3} rew={4:.2f}".format(*components, performance)
        except ValueError:
            strs = components + [performance]
            strs = [str(s) for s in strs]
            ret = ",".join(strs)
        return ret

    # Return: [{"name": NAME, "path": CKPT_PATH, ...}, {...}, ...]
    results = []
    for (trial_name, performance) in sorted_trial_pfm_list:
        json_path = trial_json_dict[trial_name]
        trial_path = os.path.dirname(json_path)
        ckpt = get_latest_checkpoint(trial_path)
        if ckpt is None:
            continue
        run_name = trial_name.split("_")[0]
        env_name = trial_name.split("_")[1]
        cool_name = get_video_name(trial_name, performance)
        results.append({
            "name": cool_name,
            "path": ckpt["path"],
            "performance": float(performance),
            "run_name": run_name,
            "env_name": env_name,
            "iter": ckpt["iter"]
        })
    save_yaml(results, output_path)
    print("Successfully collected yaml file containing {} checkpoints.".format(
        len(results)))

    # if rollout:
    #     pass
    # several_agent_rollout(output_path, num_rollouts, seed)

    return results
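Hypothetical usage, assuming args.env_name is set and the experiment folders exist in the layout the helper functions above expect (the experiment name is illustrative):

checkpoints = generate_yaml(["PPO_BipedalWalker-v2"], "evaluate_checkpoints.yaml")
for entry in checkpoints:
    print(entry["name"], entry["iter"], entry["path"])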
Example #9
def test_spec():
    spec = envs.spec('CartPole-v0')
    assert spec.id == 'CartPole-v0'
Example #10
    def score_evaluation(self, benchmark, env_id, data_sources,
                         initial_reset_timestamps, episode_lengths,
                         episode_rewards, episode_types, timestamps):
        # TODO refactor code shared with the clip scoring rule above
        tasks = benchmark.task_specs(env_id)
        spec = envs.spec(env_id)

        #### 0. Compute timing stats

        if len(initial_reset_timestamps) > 0:
            initial_reset_timestamp = min(initial_reset_timestamps)
        else:
            initial_reset_timestamp = 0

        # How long each episode actually took
        durations = np.zeros(len(timestamps))

        data_sources = np.array(data_sources)
        timestamps = np.array(timestamps)
        for source, initial_ts in enumerate(initial_reset_timestamps):
            (source_indexes, ) = np.where(data_sources == source)

            # Once we know the indexes corresponding to a particular
            # source (i.e. worker thread), we can just subtract
            # adjoining values
            durations[
                source_indexes[0]] = timestamps[source_indexes[0]] - initial_ts
            durations[source_indexes[1:]] = timestamps[
                source_indexes[1:]] - timestamps[source_indexes[:-1]]
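            # Illustration with assumed timestamps: if worker 0 reset at t=100
            # and its episodes ended at t=102 and t=105, its durations are
            # 102 - 100 = 2 s and 105 - 102 = 3 s, i.e. each entry is the gap
            # to the previous timestamp from the same source.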

        #### Grab the data corresponding to each of evaluation/training
        lengths = np.array(episode_lengths)
        rewards = np.array(episode_rewards)
        durations = np.array(durations)

        #### Calculate the total elapsed time (in various units)
        #### for each episode

        # How many training timesteps have elapsed by the end of each
        # episode. Not to be confused with Unix timestamps.
        elapsed_timesteps = np.cumsum(lengths)
        # Total number of seconds elapsed by the end of each
        # episode. Note that with n parallel workers each running for
        # m seconds, we want to count the total time as n * m.
        elapsed_seconds = np.cumsum(durations)

        scores = []
        solves = []
        rewards = []
        _timestamps = []
        elapsed_times = []
        for task in tasks:
            # Find the first episode where we're over the allotted
            # training timesteps.
            cutoff_idx = np.inf
            if task.max_timesteps:
                # this looks a little funny, but we want the first idx greater
                # than the cutoff
                (timestep_cutoff, ) = np.where(
                    elapsed_timesteps > task.max_timesteps)
                if len(timestep_cutoff) > 0:
                    cutoff_idx = min(cutoff_idx, timestep_cutoff[0])
            if task.max_seconds:
                (seconds_cutoff, ) = np.where(
                    elapsed_seconds > task.max_seconds)
                if len(seconds_cutoff) > 0:
                    cutoff_idx = min(cutoff_idx, seconds_cutoff[0])
            if not np.isfinite(cutoff_idx):
                # All episodes are fair game
                cutoff_idx = len(lengths)

            reward = np.array(episode_rewards)[:cutoff_idx]

            floor = task.reward_floor
            ceiling = task.reward_ceiling

            solved = reward >= ceiling
            # Average the raw rewards, then linearly rescale to between 0 and 1
            score = np.clip((np.mean(reward) - floor) / (ceiling - floor), 0,
                            1)
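            # Numeric example with assumed values: floor=0, ceiling=200 and a
            # mean reward of 150 give (150 - 0) / (200 - 0) = 0.75; means below
            # the floor clip to 0 and means above the ceiling clip to 1.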

            # Take the mean rescaled score
            scores.append(score)
            # Record the list of solved episodes
            solves.append(solved)
            # Record the list of rewards
            rewards.append(reward)

            if np.any(timestamps[:cutoff_idx]):
                last_idx = cutoff_idx - 1
                last_timestamp = timestamps[last_idx]
                elapsed_time = elapsed_seconds[last_idx]
            else:
                # If we don't have any valid episodes, then the
                # last valid timestamp is when we started.
                last_timestamp = initial_reset_timestamp
                elapsed_time = 0.0

            # Record the timestamp of the last episode
            _timestamps.append(last_timestamp)
            elapsed_times.append(elapsed_time)

        return {
            'rewards': rewards,
            'scores': scores,
            'solves': solves,
            'timestamps': _timestamps,
            'elapsed_times': elapsed_times,
            'initial_reset_timestamp': initial_reset_timestamp,
        }
Example #11
def test_spec():
    spec = envs.spec("CartPole-v0")
    assert spec.id == "CartPole-v0"
Example #12
    def score_evaluation(self, benchmark, env_id, episode_lengths,
                         episode_rewards, episode_types, timestamps,
                         initial_reset_timestamp):
        tasks = benchmark.task_groups[env_id]
        spec = envs.spec(env_id)

        (t_idx, ) = np.where([t == 't'
                              for t in episode_types])  # training episodes
        (e_idx, ) = np.where([t == 'e'
                              for t in episode_types])  # evaluation episodes
        if len(e_idx) == 0:
            # If no episodes marked for evaluation, consider
            # everything both a training and evaluation episode.
            (t_idx, ) = np.where([True for t in episode_types])
            (e_idx, ) = np.where([True for t in episode_types])

        training_lengths = np.array(episode_lengths)[t_idx]
        training_rewards = np.array(episode_rewards)[t_idx]

        evaluation_lengths = np.array(episode_lengths)[e_idx]
        evaluation_rewards = np.array(episode_rewards)[e_idx]

        # How many training timesteps have elapsed by the end of each
        # episode. Not to be confused with Unix timestamps.
        elapsed_timesteps = np.cumsum(training_lengths)

        scores = []
        solves = []
        rewards = []
        _timestamps = []
        for task in tasks:
            # Find the first episode where we're over the allotted
            # training timesteps.
            (cutoff, ) = np.where(elapsed_timesteps > task.timesteps)
            if len(cutoff) > 0:
                cutoff_idx = cutoff[-1]
                orig_cutoff_idx = t_idx[
                    cutoff_idx]  # cutoff index in the original
                (allowed_e_idx, ) = np.where(
                    e_idx < orig_cutoff_idx)  # restrict to earlier episodes
            else:
                # All episodes are fair game
                allowed_e_idx = e_idx

            if len(allowed_e_idx) > 0:
                last_timestamp = timestamps[allowed_e_idx[-1]]
            else:
                # If we don't have any evaluation episodes, then the
                # last valid timestamp is when we started.
                last_timestamp = initial_reset_timestamp

            # Grab the last num_episodes evaluation episodes from
            # before the cutoff (at which point we've gathered too
            # much experience).
            #
            # This probably won't work long-term but is fine for now.
            allowed_episode_rewards = np.array(episode_rewards)[allowed_e_idx]
            reward = allowed_episode_rewards[-self.num_episodes:]

            floor = task.reward_floor
            ceiling = task.reward_ceiling

            if len(reward) < self.num_episodes:
                extra = self.num_episodes - len(reward)
                logger.info('Only %s rewards for %s; adding %s', len(reward),
                            env_id, extra)
                reward = np.concatenate([reward, [floor] * extra])
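                # Padding example with assumed numbers: num_episodes=100 with
                # only 40 recorded rewards appends 60 copies of the floor,
                # pulling the score toward 0 for under-evaluated tasks.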

            # Grab the indexes where we reached the ceiling
            solved = reward >= ceiling
            # Linearly rescale rewards to between 0 and 1
            clipped = np.clip((reward - floor) / (ceiling - floor), 0, 1)

            # Take the mean rescaled score
            score = np.mean(clipped)
            scores.append(score)
            # Record the list of solved episodes
            solves.append(solved)
            # Record the list of rewards
            rewards.append(reward)
            # Record the timestamp of the last episode
            _timestamps.append(last_timestamp)

        return {
            'rewards': rewards,
            'scores': scores,
            'solves': solves,
            'timestamps': _timestamps,
        }
Example #13
    def score_evaluation(self, benchmark, env_id, episode_lengths, episode_rewards, episode_types, timestamps, initial_reset_timestamp):
        tasks = benchmark.task_groups[env_id]
        spec = envs.spec(env_id)

        (t_idx,) = np.where([t == 't' for t in episode_types]) # training episodes
        (e_idx,) = np.where([t == 'e' for t in episode_types]) # evaluation episodes
        if len(e_idx) == 0:
            # If no episodes marked for evaluation, consider
            # everything both a training and evaluation episode.
            (t_idx,) = np.where([True for t in episode_types])
            (e_idx,) = np.where([True for t in episode_types])

        training_lengths = np.array(episode_lengths)[t_idx]
        training_rewards = np.array(episode_rewards)[t_idx]

        evaluation_lengths = np.array(episode_lengths)[e_idx]
        evaluation_rewards = np.array(episode_rewards)[e_idx]

        # How many training timesteps have elapsed by the end of each
        # episode. Not to be confused with Unix timestamps.
        elapsed_timesteps = np.cumsum(training_lengths)

        scores = []
        solves = []
        rewards = []
        _timestamps = []
        for task in tasks:
            # Find the first episode where we're over the allotted
            # training timesteps.
            (cutoff,) = np.where(elapsed_timesteps > task.timesteps)
            if len(cutoff) > 0:
                cutoff_idx = cutoff[-1]
                orig_cutoff_idx = t_idx[cutoff_idx] # cutoff index in the original
                (allowed_e_idx,) = np.where(e_idx < orig_cutoff_idx) # restrict to earlier episodes
            else:
                # All episodes are fair game
                allowed_e_idx = e_idx

            if len(allowed_e_idx) > 0:
                last_timestamp = timestamps[allowed_e_idx[-1]]
            else:
                # If we don't have any evaluation episodes, then the
                # last valid timestamp is when we started.
                last_timestamp = initial_reset_timestamp

            # Grab the last num_episodes evaluation episodes from
            # before the cutoff (at which point we've gathered too
            # much experience).
            #
            # This probably won't work long-term but is fine for now.
            allowed_episode_rewards = np.array(episode_rewards)[allowed_e_idx]
            reward = allowed_episode_rewards[-self.num_episodes:]

            floor = task.reward_floor
            ceiling = task.reward_ceiling

            if len(reward) < self.num_episodes:
                extra = self.num_episodes-len(reward)
                logger.info('Only %s rewards for %s; adding %s', len(reward), env_id, extra)
                reward = np.concatenate([reward, [floor] * extra])

            # Grab the indexes where we reached the ceiling
            solved = reward >= ceiling
            # Linearly rescale rewards to between 0 and 1
            clipped = np.clip((reward - floor) / (ceiling - floor), 0, 1)

            # Take the mean rescaled score
            score = np.mean(clipped)
            scores.append(score)
            # Record the list of solved episodes
            solves.append(solved)
            # Record the list of rewards
            rewards.append(reward)
            # Record the timestamp of the last episode
            _timestamps.append(last_timestamp)

        return {
            'rewards': rewards,
            'scores': scores,
            'solves': solves,
            'timestamps': _timestamps,
        }
Example #14
    def score_evaluation(self, benchmark, env_id, data_sources, initial_reset_timestamps, episode_lengths, episode_rewards, episode_types, timestamps):
        # TODO refactor code shared with the clip scoring rule above
        tasks = benchmark.task_specs(env_id)
        spec = envs.spec(env_id)

        #### 0. Compute timing stats

        if len(initial_reset_timestamps) > 0:
            initial_reset_timestamp = min(initial_reset_timestamps)
        else:
            initial_reset_timestamp = 0


        # How long each episode actually took
        durations = np.zeros(len(timestamps))

        data_sources = np.array(data_sources)
        timestamps = np.array(timestamps)
        for source, initial_ts in enumerate(initial_reset_timestamps):
            (source_indexes,) = np.where(data_sources == source)

            # Once we know the indexes corresponding to a particular
            # source (i.e. worker thread), we can just subtract
            # adjoining values
            durations[source_indexes[0]] = timestamps[source_indexes[0]] - initial_ts
            durations[source_indexes[1:]] = timestamps[source_indexes[1:]] - timestamps[source_indexes[:-1]]

        #### Grab the data corresponding to each of evaluation/training
        lengths = np.array(episode_lengths)
        rewards = np.array(episode_rewards)
        durations = np.array(durations)

        #### Calculate the total elapsed time (in various units)
        #### for each episode

        # How many training timesteps have elapsed by the end of each
        # episode. Not to be confused with Unix timestamps.
        elapsed_timesteps = np.cumsum(lengths)
        # Total number of seconds elapsed by the end of each
        # episode. Note that with n parallel workers each running for
        # m seconds, we want to count the total time as n * m.
        elapsed_seconds = np.cumsum(durations)

        scores = []
        solves = []
        rewards = []
        _timestamps = []
        elapsed_times = []
        for task in tasks:
            # Find the first episode where we're over the allotted
            # training timesteps.
            cutoff_idx = np.inf
            if task.max_timesteps:
                # this looks a little funny, but we want the first idx greater
                # than the cutoff
                (timestep_cutoff,) = np.where(elapsed_timesteps > task.max_timesteps)
                if len(timestep_cutoff) > 0:
                    cutoff_idx = min(cutoff_idx, timestep_cutoff[0])
            if task.max_seconds:
                (seconds_cutoff,) = np.where(elapsed_seconds > task.max_seconds)
                if len(seconds_cutoff) > 0:
                    cutoff_idx = min(cutoff_idx, seconds_cutoff[0])
            if not np.isfinite(cutoff_idx):
                # All episodes are fair game
                cutoff_idx = len(lengths)

            reward = np.array(episode_rewards)[:cutoff_idx]

            floor = task.reward_floor
            ceiling = task.reward_ceiling

            solved = reward >= ceiling
            # Average the raw rewards, then linearly rescale to between 0 and 1
            score = np.clip((np.mean(reward) - floor) / (ceiling - floor), 0, 1)

            # Take the mean rescaled score
            scores.append(score)
            # Record the list of solved episodes
            solves.append(solved)
            # Record the list of rewards
            rewards.append(reward)

            if np.any(timestamps[:cutoff_idx]):
                last_idx = cutoff_idx - 1
                last_timestamp = timestamps[last_idx]
                elapsed_time = elapsed_seconds[last_idx]
            else:
                # If we don't have any valid episodes, then the
                # last valid timestamp is when we started.
                last_timestamp = initial_reset_timestamp
                elapsed_time = 0.0

            # Record the timestamp of the last episode
            _timestamps.append(last_timestamp)
            elapsed_times.append(elapsed_time)

        return {
            'rewards': rewards,
            'scores': scores,
            'solves': solves,
            'timestamps': _timestamps,
            'elapsed_times': elapsed_times,
            'initial_reset_timestamp': initial_reset_timestamp,
        }
Example #15
    def score_evaluation(self, benchmark, env_id, data_sources, initial_reset_timestamps, episode_lengths, episode_rewards, episode_types, timestamps):
        tasks = benchmark.task_specs(env_id)
        spec = envs.spec(env_id)

        #### 0. Compute timing stats

        if len(initial_reset_timestamps) > 0:
            initial_reset_timestamp = min(initial_reset_timestamps)
        else:
            initial_reset_timestamp = 0


        # How long each episode actually took
        durations = np.zeros(len(timestamps))

        data_sources = np.array(data_sources)
        timestamps = np.array(timestamps)
        for source, initial_ts in enumerate(initial_reset_timestamps):
            (source_indexes,) = np.where(data_sources == source)

            if len(source_indexes) == 0:
                continue
            # Once we know the indexes corresponding to a particular
            # source (i.e. worker thread), we can just subtract
            # adjoining values
            durations[source_indexes[0]] = timestamps[source_indexes[0]] - initial_ts
            durations[source_indexes[1:]] = timestamps[source_indexes[1:]] - timestamps[source_indexes[:-1]]

        #### 1. Select out which indexes are for evaluation and which are for training

        (t_idx,) = np.where([t == 't' for t in episode_types]) # training episodes
        (e_idx,) = np.where([t == 'e' for t in episode_types]) # evaluation episodes
        if len(e_idx) == 0:
            # If no episodes marked for evaluation, consider
            # everything both a training and evaluation episode.
            (t_idx,) = np.where([True for t in episode_types])
            (e_idx,) = np.where([True for t in episode_types])

        #### 2. Grab the data corresponding to each of evaluation/training

        training_lengths = np.array(episode_lengths)[t_idx]
        training_rewards = np.array(episode_rewards)[t_idx]
        training_durations = np.array(durations)[t_idx]

        evaluation_lengths = np.array(episode_lengths)[e_idx]
        evaluation_rewards = np.array(episode_rewards)[e_idx]
        evaluation_durations = np.array(durations)[e_idx]

        #### 3. Calculate the total elapsed time (in various units)
        #### for each episode

        # How many training timesteps have elapsed by the end of each
        # episode. Not to be confused with Unix timestamps.
        elapsed_timesteps = np.cumsum(training_lengths)
        # Total number of seconds elapsed by the end of each
        # episode. Note that with n parallel workers each running for
        # m seconds, we want to count the total time as n * m.
        elapsed_seconds = np.cumsum(training_durations)

        scores = []
        solves = []
        rewards = []
        _timestamps = []
        elapsed_times = []
        for task in tasks:
            # Find the first episode where we're over the allotted
            # training timesteps.
            cutoff_idx = np.inf
            if task.max_timesteps:
                # this looks a little funny, but we want the first idx greater
                # than the cutoff
                (timestep_cutoff,) = np.where(elapsed_timesteps > task.max_timesteps)
                if len(timestep_cutoff) > 0:
                    cutoff_idx = min(cutoff_idx, timestep_cutoff[0])
            if task.max_seconds:
                (seconds_cutoff,) = np.where(elapsed_seconds > task.max_seconds)
                if len(seconds_cutoff) > 0:
                    cutoff_idx = min(cutoff_idx, seconds_cutoff[0])
            if np.isfinite(cutoff_idx):
                orig_cutoff_idx = t_idx[cutoff_idx] # cutoff index in the original (i.e. before filtering to training/evaluation)
                (allowed_e_idx,) = np.where(e_idx < orig_cutoff_idx) # restrict to earlier episodes
            else:
                # All episodes are fair game
                allowed_e_idx = e_idx

            # Grab the last num_episodes evaluation episodes from
            # before the cutoff (at which point we've gathered too
            # much experience).
            #
            # This probably won't work long-term but is fine for now.
            allowed_episode_rewards = np.array(episode_rewards)[allowed_e_idx]
            reward = allowed_episode_rewards[-self.num_episodes:]

            floor = task.reward_floor
            ceiling = task.reward_ceiling

            if len(reward) < self.num_episodes:
                extra = self.num_episodes-len(reward)
                logger.info('Only %s rewards for %s; adding %s', len(reward), env_id, extra)
                reward = np.concatenate([reward, [floor] * extra])

            # Grab the indexes where we reached the ceiling
            solved = reward >= ceiling
            # Linearly rescale rewards to between 0 and 1
            clipped = np.clip((reward - floor) / (ceiling - floor), 0, 1)

            # Take the mean rescaled score
            score = np.mean(clipped)
            scores.append(score)
            # Record the list of solved episodes
            solves.append(solved)
            # Record the list of rewards
            rewards.append(reward)

            if len(allowed_e_idx) > 0:
                if not np.isfinite(cutoff_idx):
                    cutoff_idx = len(elapsed_seconds) - 1
                last_t_idx = t_idx[cutoff_idx]
                # timestamps is full length
                last_timestamp = timestamps[last_t_idx]
                # elapsed seconds contains only training
                elapsed_time = elapsed_seconds[cutoff_idx]
            else:
                # If we don't have any evaluation episodes, then the
                # last valid timestamp is when we started.
                last_timestamp = initial_reset_timestamp
                elapsed_time = 0.0

            # Record the timestamp of the last episode
            _timestamps.append(last_timestamp)
            elapsed_times.append(elapsed_time)

        return {
            'rewards': rewards,
            'scores': scores,
            'solves': solves,
            'timestamps': _timestamps,
            'elapsed_times': elapsed_times,
            'initial_reset_timestamp': initial_reset_timestamp,
        }
Example #16
    def score_evaluation(self, benchmark, env_id, data_sources, initial_reset_timestamps, episode_lengths, episode_rewards, episode_types, timestamps):
        tasks = benchmark.task_specs(env_id)
        spec = envs.spec(env_id)

        #### 0. Compute timing stats

        if len(initial_reset_timestamps) > 0:
            initial_reset_timestamp = min(initial_reset_timestamps)
        else:
            initial_reset_timestamp = 0


        # How long each episode actually took
        durations = np.zeros(len(timestamps))

        data_sources = np.array(data_sources)
        timestamps = np.array(timestamps)
        for source, initial_ts in enumerate(initial_reset_timestamps):
            (source_indexes,) = np.where(data_sources == source)

            # Once we know the indexes corresponding to a particular
            # source (i.e. worker thread), we can just subtract
            # adjoining values
            durations[source_indexes[0]] = timestamps[source_indexes[0]] - initial_ts
            durations[source_indexes[1:]] = timestamps[source_indexes[1:]] - timestamps[source_indexes[:-1]]

        #### 1. Select out which indexes are for evaluation and which are for training

        (t_idx,) = np.where([t == 't' for t in episode_types]) # training episodes
        (e_idx,) = np.where([t == 'e' for t in episode_types]) # evaluation episodes
        if len(e_idx) == 0:
            # If no episodes marked for evaluation, consider
            # everything both a training and evaluation episode.
            (t_idx,) = np.where([True for t in episode_types])
            (e_idx,) = np.where([True for t in episode_types])

        #### 2. Grab the data corresponding to each of evaluation/training

        training_lengths = np.array(episode_lengths)[t_idx]
        training_rewards = np.array(episode_rewards)[t_idx]
        training_durations = np.array(durations)[t_idx]

        evaluation_lengths = np.array(episode_lengths)[e_idx]
        evaluation_rewards = np.array(episode_rewards)[e_idx]
        evaluation_durations = np.array(durations)[e_idx]

        #### 3. Calculate the total elapsed time (in various units)
        #### for each episode

        # How many training timesteps have elapsed by the end of each
        # episode. Not to be confused with Unix timestamps.
        elapsed_timesteps = np.cumsum(training_lengths)
        # Total number of seconds elapsed by the end of each
        # episode. Note that with n parallel workers each running for
        # m seconds, we want to count the total time as n * m.
        elapsed_seconds = np.cumsum(training_durations)

        scores = []
        solves = []
        rewards = []
        _timestamps = []
        elapsed_times = []
        for task in tasks:
            # Find the first episode where we're over the allotted
            # training timesteps.
            cutoff_idx = np.inf
            if task.max_timesteps:
                # this looks a little funny, but we want the first idx greater
                # than the cutoff
                (timestep_cutoff,) = np.where(elapsed_timesteps > task.max_timesteps)
                if len(timestep_cutoff) > 0:
                    cutoff_idx = min(cutoff_idx, timestep_cutoff[0])
            if task.max_seconds:
                (seconds_cutoff,) = np.where(elapsed_seconds > task.max_seconds)
                if len(seconds_cutoff) > 0:
                    cutoff_idx = min(cutoff_idx, seconds_cutoff[0])
            if np.isfinite(cutoff_idx):
                orig_cutoff_idx = t_idx[cutoff_idx] # cutoff index in the original (i.e. before filtering to training/evaluation)
                (allowed_e_idx,) = np.where(e_idx < orig_cutoff_idx) # restrict to earlier episodes
            else:
                # All episodes are fair game
                allowed_e_idx = e_idx

            # Grab the last num_episodes evaluation episodes from
            # before the cutoff (at which point we've gathered too
            # much experience).
            #
            # This probably won't work long-term but is fine for now.
            allowed_episode_rewards = np.array(episode_rewards)[allowed_e_idx]
            reward = allowed_episode_rewards[-self.num_episodes:]

            floor = task.reward_floor
            ceiling = task.reward_ceiling

            if len(reward) < self.num_episodes:
                extra = self.num_episodes-len(reward)
                logger.info('Only %s rewards for %s; adding %s', len(reward), env_id, extra)
                reward = np.concatenate([reward, [floor] * extra])

            # Grab the indexes where we reached the ceiling
            solved = reward >= ceiling
            # Linearly rescale rewards to between 0 and 1
            clipped = np.clip((reward - floor) / (ceiling - floor), 0, 1)

            # Take the mean rescaled score
            score = np.mean(clipped)
            scores.append(score)
            # Record the list of solved episodes
            solves.append(solved)
            # Record the list of rewards
            rewards.append(reward)

            if len(allowed_e_idx) > 0:
                if not np.isfinite(cutoff_idx):
                    cutoff_idx = len(elapsed_seconds) - 1
                last_t_idx = t_idx[cutoff_idx]
                # timestamps is full length
                last_timestamp = timestamps[last_t_idx]
                # elapsed seconds contains only training
                elapsed_time = elapsed_seconds[cutoff_idx]
            else:
                # If we don't have any evaluation episodes, then the
                # last valid timestamp is when we started.
                last_timestamp = initial_reset_timestamp
                elapsed_time = 0.0

            # Record the timestamp of the last episode
            _timestamps.append(last_timestamp)
            elapsed_times.append(elapsed_time)

        return {
            'rewards': rewards,
            'scores': scores,
            'solves': solves,
            'timestamps': _timestamps,
            'elapsed_times': elapsed_times,
            'initial_reset_timestamp': initial_reset_timestamp,
        }
Example #17
def generate_progress_yaml(exp_names, output_path, number=None):
    # The meaning of number: if None, extract all checkpoints from all trials;
    # if it is an integer, extract that many checkpoints from each trial.
    assert (number is None) or (isinstance(number, int))
    assert spec(args.env_name)  # make sure no typo in env_name
    trial_json_dict = {}
    if isinstance(exp_names, str):
        exp_names = [exp_names]

    for exp_name in exp_names:
        trial_json_dict.update(get_trial_json_dict(exp_name))
    # Get the trial_name-trial_data dict. This is not ordered.
    trial_data_dict = get_trial_data_dict(trial_json_dict)

    def get_video_name(trial_name, performance, num_iters):
        # trial_name: PPO_BipedalWalker-v2_38_seed=138
        # result: "PPO seed=138 rew=249.01"
        components = trial_name.split("_")
        assert len(components) == 4
        return "{0} {3} rew={4:.2f} iter={5:}" \
            .format(*components, performance, num_iters)

    # Return: [{"name": NAME, "path": CKPT_PATH, ...}, {...}, ...]
    results = []
    for (trial_name, dataframe) in trial_data_dict.items():

        # We assume every iteration has stored a checkpoint,
        # but sometimes checkpoints are only saved at some interval.
        if number is None or number * 2 > len(dataframe):
            data_list = dataframe
        else:
            interval = int(floor(len(dataframe) / number))
            start_index = len(dataframe) % number - 1
            data_list = dataframe[:start_index:-interval][::-1]
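        # Slicing sketch with assumed numbers: len(dataframe) == 10 and
        # number == 3 give interval = 3 and start_index = 10 % 3 - 1 = 0, so
        # dataframe[:0:-3] picks rows 9, 6, 3 and the trailing [::-1] restores
        # chronological order (3, 6, 9): `number` roughly evenly spaced rows.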
        assert (len(data_list) == number) or (
                len(dataframe) == len(data_list)), \
            len(data_list)
        for _, series in data_list.iterrows():
            # variables shown here:
            #    trial_name: PPO_xx_seed=199
            #    json_path: xxx/xxx/trial/result.json
            #    trial_path: xxx/xxx/trial
            num_iters = series["training_iteration"]
            json_path = trial_json_dict[trial_name]
            trial_path = os.path.dirname(json_path)
            # TODO: confirm that the dataframe index corresponds to the training iteration.
            ckpt = get_checkpoint(trial_path, num_iters)
            if ckpt is None:
                continue
            run_name = trial_name.split("_")[0]
            env_name = trial_name.split("_")[1]
            performance = series[PERFORMANCE_METRIC]
            cool_name = get_video_name(trial_name, performance, num_iters)
            results.append({
                "name": cool_name,
                "path": ckpt['path'],
                "performance": float(performance),
                "run_name": run_name,
                "env_name": env_name,
                "iter": num_iters
            })
    save_yaml(results, output_path)
    print("Successfully collected yaml file containing {} checkpoints.".format(
        len(results)))
    return results