Example #1
    def _evaluate_model(self,
                        model,
                        number_of_shots=0,
                        max_examples_per_shot=None,
                        random_seed=0):
        """Evaluate model with a specific number of examples in the prompt"""
        # Use same prompt/example ordering for each number_of_shots
        np.random.seed(random_seed)

        trials = 0
        score, tasks = 0, 0
        for idx, val_task in enumerate(
                tqdm(np.random.permutation(self.examples))):
            # Stop once the optional trial budget is exhausted.
            if (max_examples_per_shot is not None
                    and trials >= max_examples_per_shot):
                break
            tasks += 1
            prompt = self._make_prompt(number_of_shots)
            val_prompts, targets = self._make_val_prompts(val_task)
            correct_val = 0

            for val_prompt, target in zip(val_prompts, targets):
                full_prompt = prompt + val_prompt
                if self.verbose:
                    print("Task {}".format(idx))
                    if idx == 0:
                        print(full_prompt)
                for i in range(self.trials_per_input):
                    trials += 1
                    result = model.generate_text(full_prompt,
                                                 output_regex=r"\d[\d\n]+\d")

                    if result == target:
                        correct_val += 1
                        break
                if self.verbose:
                    print(
                        f"Problem: {val_prompt}, Answer: {result}, "
                        f"Expected Answer: {target}, Correct: {result == target}"
                    )
            # A task counts as solved only if every validation prompt was
            # answered correctly.
            if correct_val == len(val_prompts):
                score += 1
            if self.verbose:
                print("Intermediate score", score, tasks)
        return task.ScoreData(
            {"exact_str_match": score / tasks},
            preferred_score="exact_str_match",
            subtask_description=(
                "Exact string match on Abstraction and Reasoning Corpus with "
                f"{number_of_shots} training samples"),
            low_score=0.0,
            high_score=1.0,
            number_of_shots=number_of_shots,
        )
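    # A minimal sketch, not part of the task above: one way a public entry
    # point could drive _evaluate_model across several shot counts. The
    # method name `evaluate_model`, the shot counts, and the defaults are
    # assumptions for illustration only.
    def evaluate_model(self, model, max_examples=None, random_seed=0):
        """Assumed driver: score the model at a few shot counts."""
        score_data = []
        for number_of_shots in (0, 1, 2):  # illustrative shot counts
            score_data.append(
                self._evaluate_model(
                    model,
                    number_of_shots=number_of_shots,
                    max_examples_per_shot=max_examples,
                    random_seed=random_seed,
                ))
        return score_data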
    def play_round(self, model: model.Model) -> None:
        """Play a single round of the game."""
        prompt = get_prompt(
            self.letter_set,
            self.correct_words,
            self.incorrect_words,
            self.score,
            self.round,
            self.total_rounds,
        )
        logging.info(prompt)
        attempt = model.generate_text(prompt)
        logging.info(attempt)
        self._score_round(attempt)
        self.round += 1
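    # A minimal sketch, not part of the task above: an assumed driver that
    # plays every remaining round and then reports the final score. The
    # method name, score key, and score bounds are placeholders.
    def evaluate_model(self, model, max_examples=None, random_seed=None):
        """Assumed driver: play all rounds, then report the final score."""
        while self.round < self.total_rounds:
            self.play_round(model)
        return task.ScoreData(
            {"score": self.score},
            preferred_score="score",
            subtask_description=(
                f"Letter-set word game played over {self.total_rounds} rounds"),
            low_score=0.0,
            high_score=1.0,  # placeholder bound; adjust to the real scoring
            number_of_shots=0,
        )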
# ----------------------------------------------------------------------------
if __name__ == "__main__":
    """Use this to run a simple test of the HF model types."""

    # test a few gpt models
    for model_name in ["openai-gpt", "gpt2", "gpt2-medium", "gpt2-large"]:
        print("-" * 80)
        print(f"model: {model_name}")
        set_seed(42)

        model = BIGBenchHFModel(model_name=model_name)
        prompt = "It was the best of times, it was"
        response = model.generate_text(
            inputs=prompt,
            max_length=32,
            stop_string=".",
        )

        print(f"prompt: {prompt}")
        print(f"response: {response}")

        prompts = ["These are the times that", "Stately, plump Buck Mulligan"]
        responses = model.generate_text(inputs=prompts,
                                        max_length=32,
                                        stop_string=".")

        for p, r in zip(prompts, responses):
            print(f"prompt: {p}")
            print(f"response: {r}")