Example #1
import os
from typing import List, Optional

# Module paths below are inferred from the identifiers used in this snippet
# (BIG-bench repo layout); verify against the actual source tree.
import bigbench.api.model as api_model
import bigbench.api.util as util
import bigbench.benchmark_tasks as benchmark_tasks

# run_single_task and verify_canary are assumed to be helpers defined
# elsewhere in the same module.


def run_tasks(model: api_model.Model, task_list: Optional[List[str]]):
    """Tests format of task class, correctness of task output, presence of canary strings.

    Args:
      model: The model to test on
      task_list: A list of task directory names to test on.
        If None, all tasks will be tested.
    """

    bench_dir = benchmark_tasks.__path__[0]
    task_dirs = os.listdir(bench_dir)

    if task_list:
        for task in task_list:
            if task not in task_dirs:
                raise ValueError(f"Task not found: {task}")

    for task_dir in task_dirs:
        if task_dir == "__pycache__":
            continue

        if task_list and task_dir not in task_list:
            continue

        task_path = os.path.join(bench_dir, task_dir)
        if not os.path.isdir(task_path):
            continue
        task_files = os.listdir(task_path)

        print(f">>>>>>>>> Testing {task_dir}")

        assert (
            "README.md" in task_files or "README.pdf" in task_files
        ), f"Task {task_dir} is missing a README.md or .pdf file"

        if "task.json" in task_files:
            json_path = os.path.join(task_path, "task.json")
            this_task = util.load_json_task(json_path)
            task_name = json_path
        elif "task.py" in task_files:
            task_module_name = "bigbench.benchmark_tasks." + task_dir + ".task"
            this_task = util.load_programmatic_task(task_module_name)
            task_name = task_module_name
        else:
            assert (
                False
            ), f"Task definition file (task.py or task.json) not found in task directory {task_path}"

        run_single_task(model, this_task, task_name)

        verify_canary(task_path)
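
A minimal usage sketch (not from the source): assuming the repo provides a trivial Model implementation under bigbench/models/dummy_model.py (module path unverified), the checks above can be smoke-run on a couple of task directories.

import bigbench.models.dummy_model as dummy_model

# Check two specific task directories; an unknown name raises ValueError
# before any task runs. "arithmetic" and "emoji_movie" are illustrative
# BIG-bench directory names.
run_tasks(dummy_model.DummyModel(), task_list=["arithmetic", "emoji_movie"])

# task_list=None walks every directory under bigbench/benchmark_tasks.
run_tasks(dummy_model.DummyModel(), task_list=None)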
Example #2
import os
from typing import Dict, List, Optional

# Same inferred bigbench imports as in Example #1, plus the JsonTask type.
import bigbench.api.json_task as json_task
import bigbench.api.util as util
import bigbench.benchmark_tasks as benchmark_tasks


def _get_json_tasks(
    task_list: Optional[List[str]] = None,
    task_blacklist: Optional[List[str]] = None,
    max_examples: Optional[int] = None,
) -> Dict[str, json_task.JsonTask]:
    """Instantiates json tasks.

    Args:
      task_list: list of tasks to load, if None return all json tasks
      task_blacklist: list of tasks to skip
      max_examples: maximum number of examples to evaluate

    Returns:
      mapping of task.json path -> JsonTask instance
    """
    blacklist = task_blacklist or []
    bench_dir = benchmark_tasks.__path__[0]
    task_dirs = os.listdir(bench_dir)
    task_dict = {}

    if task_list:
        for task in task_list:
            if task not in task_dirs:
                raise ValueError(f"Task not found: {task}")

    for task_dir in task_dirs:
        if task_dir == "__pycache__":
            continue

        if task_list and task_dir not in task_list:
            continue

        if task_dir in blacklist:
            continue

        task_path = os.path.join(bench_dir, task_dir)
        if not os.path.isdir(task_path):
            continue
        task_files = os.listdir(task_path)

        if "task.json" not in task_files:
            continue

        json_path = os.path.join(task_path, "task.json")
        this_task = util.load_json_task(json_path, max_examples=max_examples)

        task_dict[json_path] = this_task

    return task_dict
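
A hypothetical call sketch for the loader above: fetch all JSON tasks except one skipped directory, cap each at 16 examples, and list what was loaded. The blacklist entry is an illustrative directory name, and get_task_details() is the metadata accessor the Task API appears to expose (see Example #3).

tasks = _get_json_tasks(
    task_list=None,  # None -> every task.json under benchmark_tasks
    task_blacklist=["simple_arithmetic_json"],  # illustrative name
    max_examples=16,
)
for json_path, task in tasks.items():
    details = task.get_task_details()
    print(f"{json_path}: {details.description[:60]}")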
Example #3
import os

# Same inferred bigbench imports as above.
import bigbench.api.util as util
import bigbench.benchmark_tasks as benchmark_tasks


def single_task_metadata(task_dir):
    """Returns (description, keywords) for the task in the given directory."""
    # bench_dir was a free (undefined) name in the original snippet; resolve it
    # the same way the other examples do.
    bench_dir = benchmark_tasks.__path__[0]
    task_path = os.path.join(bench_dir, task_dir)

    task_files = os.listdir(task_path)

    if "task.json" in task_files:
        json_path = os.path.join(task_path, "task.json")
        this_task = util.load_json_task(json_path)
        task_name = json_path
    elif "task.py" in task_files:
        task_module_name = "bigbench.benchmark_tasks." + task_dir + ".task"
        this_task = util.load_programmatic_task(task_module_name)
        task_name = task_module_name
    else:
        assert False, task_path

    task_info = this_task.get_task_details()
    description = task_info.description.replace("\n", " ").replace("|", "/")
    return description, task_info.keywords
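
A sketch that puts the helper to work (same assumed imports): build a pipe-delimited summary row per task directory. The replace("|", "/") in the function exists precisely so descriptions stay safe inside such a table.

bench_dir = benchmark_tasks.__path__[0]
for task_dir in sorted(os.listdir(bench_dir)):
    if task_dir == "__pycache__":
        continue
    if not os.path.isdir(os.path.join(bench_dir, task_dir)):
        continue
    description, keywords = single_task_metadata(task_dir)
    print(f"| {task_dir} | {description} | {', '.join(keywords)} |")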
Example #4
import os
from typing import List, Optional

# Same inferred bigbench imports as in Example #1.
import bigbench.api.model as api_model
import bigbench.api.util as util
import bigbench.benchmark_tasks as benchmark_tasks


def run_tasks(
    model: api_model.Model,
    task_list: Optional[List[str]],
    max_examples: Optional[int] = None,
    task_denylist: Optional[List[str]] = None,
):
    """Tests format of task class, correctness of task output, presence of canary strings.

    Args:
      model: a Model instance
      task_list: A list of task directory names to test on.
        If None, all tasks will be tested.
      max_examples: maximum number of examples to evaluate for json tasks
      task_denylist: a list of tasks to be skipped
    """

    denylist = task_denylist or []
    bench_dir = benchmark_tasks.__path__[0]
    task_dirs = os.listdir(bench_dir)
    # test the most recently modified tasks first, to fail fast if there's a bug
    task_dirs.sort(
        reverse=True,
        key=lambda task_dir: os.path.getmtime(os.path.join(bench_dir, task_dir)),
    )

    if task_list:
        for task in task_list:
            if task not in task_dirs:
                raise ValueError(f"Task not found: {task}")

    for task_dir in task_dirs:
        if task_dir == "__pycache__":
            continue

        if task_list and task_dir not in task_list:
            continue

        if task_dir in denylist:
            continue

        task_path = os.path.join(bench_dir, task_dir)
        if not os.path.isdir(task_path):
            continue
        task_files = os.listdir(task_path)

        print(f">>>>>>>>> Testing {task_dir}")

        assert ("README.md" in task_files or "README.pdf" in task_files
                ), f"Task {task_dir} is missing a README.md or .pdf file"

        if "task.json" in task_files:
            json_path = os.path.join(task_path, "task.json")
            this_task = util.load_json_task(json_path)
            if this_task.subtasks:  # evaluate one example per subtask
                eval_examples = len(this_task.subtasks)
            else:
                eval_examples = 1
        elif "task.py" in task_files:
            task_module_name = "bigbench.benchmark_tasks." + task_dir + ".task"
            this_task = util.load_programmatic_task(task_module_name)
            eval_examples = max_examples
        else:
            assert (
                False
            ), f"Task definition file (task.py or task.json) not found in task directory {task_path}"

        run_single_task(model, this_task, task_dir, max_examples=eval_examples)

        verify_canary(task_path)
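
A final hedged sketch mirroring Example #1's usage, exercising the newer knobs: cap evaluation at 8 examples per programmatic task and skip a hypothetical slow task by directory name.

import bigbench.models.dummy_model as dummy_model  # as in the sketch after Example #1

run_tasks(
    dummy_model.DummyModel(),  # any api_model.Model implementation works
    task_list=None,            # None -> test all task directories
    max_examples=8,
    task_denylist=["some_slow_task"],  # hypothetical directory name
)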