def dataset_fn(split: str,
               shuffle_files: Optional[bool] = None,
               seed: Optional[int] = 42) -> tf.data.Dataset:
  if not Split.is_valid(split):
    allowed_splits = [Split.to_str(spl) for spl in Split]
    raise ValueError(f"splits supported: {allowed_splits}, got: {split}")

  bigbench_task = BigBenchJsonTaskFetcher.get_bigbench_json_task_or_subtask(
      task_name=task_name,
      task_path=task_path,
      subtask_name=subtask_name,
      json_util=json_util)
  bigbench_json_data = copy.deepcopy(bigbench_task._ds)  # pylint: disable=protected-access
  scaled_max_examples = max_examples  # avoid mutating nonlocal var

  split = Split.from_str(split)
  if split != Split.ALL:
    # Shuffle and slice the data deterministically, so that examples in the
    # split are mixed, but reproducible.
    bigbench_json_data = _shuffle(
        bigbench_json_data, shuffle_seed=SPLIT_SHUFFLE_SEED)
    scaled_max_examples, bigbench_json_data = _slice(
        bigbench_json_data, split, max_num_examples=max_examples)

  bigbench_task_rng = np.random.RandomState(seed=seed)
  if num_shots:
    if scaled_max_examples:
      num_dataset_examples = min(scaled_max_examples, len(bigbench_json_data))
    else:
      num_dataset_examples = len(bigbench_json_data)
    bigbench_json_data = json_task.make_nshot_dataset(
        bigbench_json_data,
        num_shots,
        random_seed=seed,
        separator=bigbench_task.few_shot_example_separator,
        max_examples=num_dataset_examples)
  elif scaled_max_examples:
    # Ideally sample if scaled_max_examples < len(ds), otherwise the data is
    # just shuffled, but this matches BIG-bench functionality.
    bigbench_json_data = bigbench_task_rng.choice(
        bigbench_json_data,
        min(scaled_max_examples, len(bigbench_json_data)),
        replace=False)

  if shuffle_files:
    # BIG-bench task data is contained in one file, so shuffle_files should be
    # a no-op, but FunctionDataSource.get_dataset(shuffle=True) passes the
    # shuffle arg through shuffle_files; hence use shuffle_files to shuffle
    # the examples.
    bigbench_json_data = _shuffle(bigbench_json_data, seed)

  examples = _parse_examples(bigbench_json_data, bigbench_task_type,
                             bigbench_task.task_prefix, split)
  ds = tf.data.Dataset.from_tensor_slices(examples)
  return ds
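# Hedged usage sketch (not part of the original source): the comment above
# mentions FunctionDataSource.get_dataset(shuffle=True) forwarding the shuffle
# flag through shuffle_files, so dataset_fn is presumably meant to back a
# seqio.FunctionDataSource. The wiring below is an assumption of how that
# would look; the split names are derived from the same Split enum the
# function validates against.
import seqio

bigbench_source = seqio.FunctionDataSource(
    dataset_fn=dataset_fn,
    splits=[Split.to_str(spl) for spl in Split])
# get_dataset(shuffle=True) reaches dataset_fn via the shuffle_files argument.
ds = bigbench_source.get_dataset(
    split=Split.to_str(Split.ALL), shuffle=True, seed=0)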
def evaluate_fixed_shot(self, model, num_shots) -> Dict[str, Any]:
  """Evaluates task for different metrics.

  Args:
    model: model to evaluate.
    num_shots: number of shots.

  Returns:
    Dictionary with results for metrics.
  """
  results = {}
  invalid_results = {'creativity_and_consistency_score': 0}

  if num_shots:
    samples = json_task.make_nshot_dataset(self._ds, num_shots)
  else:
    samples = copy.deepcopy(self._ds)

  inputs = [
      self.task_prefix + sample['input'] + sample['target'][0]
      for sample in samples
  ]
  generative_responses = model.generate_text(inputs=inputs)
  new_samples, results[
      'created_valid_percentage'] = get_multiple_choice_task_from_incepted(
          generative_responses,
          num_choices=self.num_choices,
          input_prefix=self.input_prefix,
          output_prefix=self.output_prefix)
  if not new_samples:
    return invalid_results

  multiple_choice_inputs = [sample['input'] for sample in new_samples]
  choice_targets = [
      list(sample['target_scores'].keys()) for sample in new_samples
  ]
  scores = model.cond_log_prob(inputs=multiple_choice_inputs,
                               targets=choice_targets)
  for i, sample in enumerate(new_samples):
    sample['log_prob'] = scores[i]

  accuracy = metrics.measure_multiple_choice_metrics(
      new_samples, ['multiple_choice_grade'])['multiple_choice_grade']
  results['multiple_choice_grade'] = accuracy
  results['creativity_and_consistency_score'] = results[
      'created_valid_percentage'] * results['multiple_choice_grade']
  return results
def check_fewshot_samples(samples, expected_samples, append=True):
  samples = copy.deepcopy(samples)
  formatted_samples = [
      default_format_fn(
          sample,
          input_prefix="Q: ",
          output_prefix=" A: ",
          append_choices_to_input=append,
      ) for sample in samples
  ]
  fewshot_samples = make_nshot_dataset(formatted_samples, shots=1)

  for sample, expected in zip(fewshot_samples, expected_samples):
    assert sample["input"] == expected
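# Hedged illustration (not from the original source): the raw `samples` passed
# to check_fewshot_samples are BIG-bench style dicts with an `input` and
# `target_scores` field; the concrete values below are hypothetical. The
# matching `expected_samples` would be the exact 1-shot prompt strings that
# default_format_fn plus make_nshot_dataset produce for these dicts, which
# depends on the library's choice formatting and separator, so they are not
# spelled out here.
example_samples = [
    {"input": "2 + 2 =", "target_scores": {"4": 1, "5": 0}},
    {"input": "1 + 3 =", "target_scores": {"4": 1, "6": 0}},
]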
def evaluate_fixed_shot(self,
                        model,
                        num_shots,
                        score: bool = True,
                        random_seed: int = 42,
                        generate_probs_for_targets: bool = True
                        ) -> Dict[str, Any]:
  """Evaluates task for different metrics.

  Args:
    model: model to evaluate.
    num_shots: number of shots.
    score: if False, skip scoring; useful for testing.
    random_seed: random number generator seed.
    generate_probs_for_targets: unused.

  Returns:
    Dictionary with results for metrics.
  """
  del generate_probs_for_targets

  results = {}
  invalid_results = {'creativity_and_consistency_score': 0}

  if num_shots:
    samples = json_task.make_nshot_dataset(
        self._ds, num_shots, random_seed=random_seed)
  else:
    samples = copy.deepcopy(self._ds)

  inputs = [
      self.task_prefix + sample['input'] + sample['target'][0]
      for sample in samples
  ]
  generative_responses = model.generate_text(inputs=inputs)
  new_samples, results[
      'created_valid_percentage'] = get_multiple_choice_task_from_incepted(
          generative_responses,
          num_choices=self.num_choices,
          input_prefix=self.input_prefix,
          output_prefix=self.output_prefix)
  if not new_samples:
    return invalid_results

  multiple_choice_inputs = [sample['input'] for sample in new_samples]
  choice_targets = [
      list(sample['target_scores'].keys()) for sample in new_samples
  ]
  if not score:
    return {}

  scores = model.cond_log_prob(inputs=multiple_choice_inputs,
                               targets=choice_targets)
  for i, sample in enumerate(new_samples):
    sample['log_prob'] = scores[i]

  accuracy = metrics.measure_multiple_choice_metrics(
      new_samples, ['multiple_choice_grade'])['multiple_choice_grade']
  results['multiple_choice_grade'] = accuracy
  results['creativity_and_consistency_score'] = results[
      'created_valid_percentage'] * results['multiple_choice_grade']
  return results
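# Hedged usage sketch (not part of the original source): evaluate_fixed_shot
# only ever calls model.generate_text(inputs=...) and
# model.cond_log_prob(inputs=..., targets=...), so any object exposing those
# two methods can be passed in. _UniformDummyModel below is a hypothetical
# stand-in for smoke tests; because it generates empty text, the incepted
# multiple-choice parser will most likely find no valid samples and the call
# will return the invalid-results sentinel.
class _UniformDummyModel:

  def generate_text(self, inputs):
    # One (empty) continuation per input prompt.
    return ['' for _ in inputs]

  def cond_log_prob(self, inputs, targets):
    # Uniform log-probabilities over each input's candidate targets.
    return [[0.0] * len(choices) for choices in targets]

# `task` is assumed to be an instance of the class defining this method:
# results = task.evaluate_fixed_shot(_UniformDummyModel(), num_shots=1)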