# Note: dataset_fn is a nested dataset builder; task_name, task_path,
# subtask_name, json_util, max_examples, num_shots, bigbench_task_type and the
# module-level helpers (BigBenchJsonTaskFetcher, Split, SPLIT_SHUFFLE_SEED,
# _shuffle, _slice, _parse_examples) are captured from the enclosing scope.
# Assumed imports for this snippet:
import copy
from typing import Optional

import numpy as np
import tensorflow as tf

from bigbench.api import json_task


def dataset_fn(split: str,
               shuffle_files: Optional[bool] = None,
               seed: Optional[int] = 42) -> tf.data.Dataset:

        if not Split.is_valid(split):
            allowed_splits = [Split.to_str(spl) for spl in Split]
            raise ValueError(
                f"splits supported: {allowed_splits}, got: {split}")

        bigbench_task = BigBenchJsonTaskFetcher.get_bigbench_json_task_or_subtask(
            task_name=task_name,
            task_path=task_path,
            subtask_name=subtask_name,
            json_util=json_util)

        bigbench_json_data = copy.deepcopy(bigbench_task._ds)  # pylint: disable=protected-access
        scaled_max_examples = max_examples  # avoid mutating nonlocal var
        split = Split.from_str(split)
        if split != Split.ALL:
            # Shuffle and slice the data deterministically, so that examples in the
            # split are mixed, but reproducible.
            bigbench_json_data = _shuffle(bigbench_json_data,
                                          shuffle_seed=SPLIT_SHUFFLE_SEED)
            scaled_max_examples, bigbench_json_data = _slice(
                bigbench_json_data, split, max_num_examples=max_examples)
        bigbench_task_rng = np.random.RandomState(seed=seed)

        if num_shots:
            if scaled_max_examples:
                num_dataset_examples = min(scaled_max_examples,
                                           len(bigbench_json_data))
            else:
                num_dataset_examples = len(bigbench_json_data)
            bigbench_json_data = json_task.make_nshot_dataset(
                bigbench_json_data,
                num_shots,
                random_seed=seed,
                separator=bigbench_task.few_shot_example_separator,
                max_examples=num_dataset_examples)
        elif scaled_max_examples:
            # Ideally we would only sample when scaled_max_examples < len(ds);
            # otherwise this call just shuffles the data. Kept as-is to match
            # BIG-bench functionality.
            bigbench_json_data = bigbench_task_rng.choice(
                bigbench_json_data,
                min(scaled_max_examples, len(bigbench_json_data)),
                replace=False)

        if shuffle_files:
            # BIG-bench task data is contained in one file, so shuffle_files
            # should be a no-op. However, FunctionDataSource.get_dataset(
            # shuffle=True) passes its shuffle arg through shuffle_files, so
            # shuffle_files is used here to shuffle the examples.
            bigbench_json_data = _shuffle(bigbench_json_data, seed)

        examples = _parse_examples(bigbench_json_data, bigbench_task_type,
                                   bigbench_task.task_prefix, split)
        ds = tf.data.Dataset.from_tensor_slices(examples)
        return ds
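# Sketch (not part of the original source): dataset_fn matches the
# (split, shuffle_files, seed) signature expected by seqio.FunctionDataSource,
# which is why the shuffle flag arrives via shuffle_files above. The task
# name, split names and ByteVocabulary below are placeholder choices.
import seqio

vocab = seqio.ByteVocabulary()
seqio.TaskRegistry.add(
    "my_bigbench_task",
    source=seqio.FunctionDataSource(
        dataset_fn=dataset_fn,
        splits=["all", "train", "validation"]),
    output_features={
        "inputs": seqio.Feature(vocabulary=vocab),
        "targets": seqio.Feature(vocabulary=vocab),
    },
)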
Example 2
    def evaluate_fixed_shot(self, model, num_shots) -> Dict[str, Any]:
        """Evaluates task for different metrics.

        Args:
          model: model to evaluate.
          num_shots: number of shots.

        Returns:
          Dictionary with results for metrics.
        """
        results = {}
        invalid_results = {'creativity_and_consistency_score': 0}
        if num_shots:
            samples = json_task.make_nshot_dataset(self._ds, num_shots)
        else:
            samples = copy.deepcopy(self._ds)

        inputs = [
            self.task_prefix + sample['input'] + sample['target'][0]
            for sample in samples
        ]

        generative_responses = model.generate_text(inputs=inputs)

        new_samples, results[
            'created_valid_percentage'] = get_multiple_choice_task_from_incepted(
                generative_responses,
                num_choices=self.num_choices,
                input_prefix=self.input_prefix,
                output_prefix=self.output_prefix)

        if not new_samples:
            return invalid_results

        multiple_choice_inputs = [sample['input'] for sample in new_samples]
        choice_targets = [
            list(sample['target_scores'].keys()) for sample in new_samples
        ]

        scores = model.cond_log_prob(inputs=multiple_choice_inputs,
                                     targets=choice_targets)

        for i, sample in enumerate(new_samples):
            sample['log_prob'] = scores[i]

        accuracy = metrics.measure_multiple_choice_metrics(
            new_samples, ['multiple_choice_grade'])['multiple_choice_grade']

        results['multiple_choice_grade'] = accuracy
        results['creativity_and_consistency_score'] = results[
            'created_valid_percentage'] * results['multiple_choice_grade']

        return results
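# evaluate_fixed_shot only needs a model exposing the two methods used above,
# generate_text and cond_log_prob, mirroring the shape of the BIG-bench model
# API. A minimal stand-in that satisfies that interface (canned outputs, not
# real model behavior); with responses like these the parsing step will
# usually find no valid tasks and the method falls back to invalid_results.
import math


class StubModel:

    def generate_text(self, inputs):
        # One fixed "generated" string per input.
        return ["stub response" for _ in inputs]

    def cond_log_prob(self, inputs, targets):
        # Uniform log-probabilities over each input's candidate targets.
        return [[math.log(1.0 / len(t))] * len(t) for t in targets]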
Example 3
def check_fewshot_samples(samples, expected_samples, append=True):
    """Checks 1-shot formatting of `samples` against `expected_samples`."""
    samples = copy.deepcopy(samples)
    formatted_samples = [
        default_format_fn(
            sample,
            input_prefix="Q: ",
            output_prefix=" A: ",
            append_choices_to_input=append,
        )
        for sample in samples
    ]
    fewshot_samples = make_nshot_dataset(formatted_samples, shots=1)
    for sample, expected in zip(fewshot_samples, expected_samples):
        assert sample["input"] == expected
    return
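# Illustrative call with made-up data: raw BIG-bench samples carry an 'input'
# string and a 'target_scores' dict. The strings passed as expected_samples
# must match exactly what default_format_fn plus make_nshot_dataset(shots=1)
# produce for that data, so the call is left commented out with a placeholder.
raw_samples = [
    {"input": "1 + 1 =", "target_scores": {"2": 1, "3": 0}},
    {"input": "2 + 2 =", "target_scores": {"4": 1, "5": 0}},
]
# check_fewshot_samples(raw_samples, expected_samples=[...])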
Example 4
  def evaluate_fixed_shot(
      self,
      model,
      num_shots,
      score: bool = True,
      random_seed: int = 42,
      generate_probs_for_targets: bool = True) -> Dict[str, Any]:
    """Evaluates task for different metrics.

    Args:
      model: model to evaluate
      num_shots: number of shots
      score: if False, skip scoring, useful for testing
      random_seed: random number generator seed
      generate_probs_for_targets: unused

    Returns:
      Dictionary with results for metrics.
    """
    del generate_probs_for_targets
    results = {}
    invalid_results = {
        'creativity_and_consistency_score': 0
    }
    if num_shots:
      samples = json_task.make_nshot_dataset(
          self._ds, num_shots, random_seed=random_seed)
    else:
      samples = copy.deepcopy(self._ds)

    inputs = [
        self.task_prefix + sample['input'] + sample['target'][0]
        for sample in samples
    ]

    generative_responses = model.generate_text(inputs=inputs)

    new_samples, results[
        'created_valid_percentage'] = get_multiple_choice_task_from_incepted(
            generative_responses,
            num_choices=self.num_choices,
            input_prefix=self.input_prefix,
            output_prefix=self.output_prefix)

    if not new_samples:
      return invalid_results

    multiple_choice_inputs = [sample['input'] for sample in new_samples]
    choice_targets = [
        list(sample['target_scores'].keys()) for sample in new_samples
    ]

    if not score:
      return {}

    scores = model.cond_log_prob(inputs=multiple_choice_inputs,
                                 targets=choice_targets)

    for i, sample in enumerate(new_samples):
      sample['log_prob'] = scores[i]

    accuracy = metrics.measure_multiple_choice_metrics(
        new_samples, ['multiple_choice_grade'])['multiple_choice_grade']

    results['multiple_choice_grade'] = accuracy
    results['creativity_and_consistency_score'] = results[
        'created_valid_percentage'] * results['multiple_choice_grade'] 
    
    return results
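# Worked example of the combined metric, assuming created_valid_percentage is
# reported as a fraction in [0, 1]: if 0.8 of the generated responses parse
# into valid multiple-choice tasks and the model answers half of those
# correctly, the combined score is 0.8 * 0.5 = 0.4.
created_valid_percentage = 0.8
multiple_choice_grade = 0.5
creativity_and_consistency_score = (
    created_valid_percentage * multiple_choice_grade)  # 0.4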