Example #1
    def clean_filter_sample_gpt_eval(self, sample):
        """
        Does tokenization for final model evaluation. This should return
        input_ids as the context and labels as the true answer.
        """

        if sample is None:
            return None

        if self.mode_answer == 'eval_peeking':
            return self.clean_filter_sample_peeking_gpt_eval(sample)
        elif self.mode_answer == 'eval_nopack_padding':
            return self.clean_filter_sample_nopackpadding_gpt_eval(sample)

        question, answer = sample

        if self.clean_numbers:
            question = _clean_numbers(question)
            answer = _clean_numbers(answer)
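        # Keep only the last \boxed{...} expression from the full solution.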
        answer_final = last_boxed_only_string(answer)

        assert not answer.isspace()

        question_ids = torch.LongTensor(
            self.tokenizer.encode("\nQUESTION:\n" + question, verbose=False))
        sep_ids = torch.LongTensor(
            self.tokenizer.encode("\nFULL SOLUTION:\n", verbose=False))
        answer_final_ids = torch.LongTensor(
            self.tokenizer.encode(
                answer_final,
                verbose=False))  # Loss only counted on these tokens.

        input_ids = torch.cat([
            question_ids,
            sep_ids,
        ], dim=0)

        label_ids = torch.cat([answer_final_ids.clone()], dim=0)

        # Stop early if this Q,A pair is too long
        if input_ids.shape[0] + label_ids.shape[0] > self.max_tokens:
            # Print reason for skipping
            # print(f"Skipping due to input_ids being too big. input_ids.shape[0] = {input_ids.shape[0]}.")
            return None

        return {
            'input_ids_list': input_ids.tolist(),
            'label_ids_list': label_ids.tolist()
        }
Example #2
    def clean_filter_sample_nopackpadding_gpt_eval(self, sample):
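        """
        Tokenization for evaluation without packing: the context is the
        question, a fixed run of space-token padding, and the "FINAL ANSWER"
        separator; the labels are the final boxed answer.
        """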

        if sample is None:
            return None

        question, answer = sample

        if self.clean_numbers:
            question = _clean_numbers(question)
            answer = _clean_numbers(answer)

        answer_final = last_boxed_only_string(answer)

        question_ids = torch.LongTensor(
            self.tokenizer.encode("\nQUESTION:\n" + question, verbose=False))
        sep_ids = torch.LongTensor(
            self.tokenizer.encode("\nFINAL ANSWER:\n", verbose=False))
        final_answer_ids = torch.LongTensor(
            self.tokenizer.encode(answer_final, verbose=False))

        num_to_pad = 32
        # 220 is the token id for a space in the GPT-2 BPE vocabulary.
        # Use a long dtype so the padding can be concatenated with the id tensors.
        padding_tensor = torch.ones(num_to_pad, dtype=torch.long) * 220

        input_ids = torch.cat([
            question_ids,
            padding_tensor,
            sep_ids,
        ], dim=0)

        # Only answer_ids contribute to the loss
        label_ids = torch.cat([final_answer_ids.clone()], dim=0)

        # Stop early if this Q,A pair is too long
        if input_ids.shape[0] + label_ids.shape[0] > self.max_tokens:
            # Print reason for skipping
            # print(f"Skipping due to input_ids being too big. input_ids.shape[0] = {input_ids.shape[0]}.")
            return None

        input_ids = input_ids.tolist()
        label_ids = label_ids.tolist()

        return {'input_ids_list': input_ids, 'label_ids_list': label_ids}
Example #3
    def clean_filter_sample_nopackpadding_gpt(self, sample):
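        """
        Tokenization for training without packing: the context is the question,
        a fixed run of space-token padding, the "FINAL ANSWER" separator, and
        the final boxed answer. Only the answer tokens contribute to the loss;
        all other positions are labeled -100.
        """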

        if sample is None:
            return None

        question, answer = sample

        if self.clean_numbers:
            question = _clean_numbers(question)
            answer = _clean_numbers(answer)

        answer_final = last_boxed_only_string(answer)

        question_ids = torch.LongTensor(
            self.tokenizer.encode("\nQUESTION:\n" + question, verbose=False))
        sep_ids = torch.LongTensor(
            self.tokenizer.encode("\nFINAL ANSWER:\n", verbose=False))
        final_answer_ids = torch.LongTensor(
            self.tokenizer.encode(answer_final, verbose=False))

        num_to_pad = 32
        # 220 is the token id for a space in the GPT-2 BPE vocabulary.
        # Use a long dtype so the padding can be concatenated with the id tensors.
        padding_tensor = torch.ones(num_to_pad, dtype=torch.long) * 220

        input_ids = torch.cat(
            [question_ids, padding_tensor, sep_ids, final_answer_ids], dim=0)

        # Only answer_ids contribute to the loss
        label_ids = torch.cat([
            torch.ones_like(question_ids) * -100,
            torch.ones_like(padding_tensor) * -100,
            torch.ones_like(sep_ids) * -100,
            final_answer_ids.clone()
        ],
                              dim=0)

        # Stop early if this Q,A pair is too long
        if input_ids.shape[0] > self.max_tokens:
            return None

        input_ids = input_ids.tolist()
        label_ids = label_ids.tolist()

        return {'input_ids_list': input_ids, 'label_ids_list': label_ids}
Example #4
def run_eval(args):
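    """
    Generation-based evaluation of a GPT-2 style model on MATH-style problems:
    decodes an answer for each problem, compares it to the ground-truth boxed
    answer with is_equiv, and reports (and writes to a text file) per-level,
    per-subject, and overall accuracy.
    """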

    argsdict = vars(args)
    print(pprint.pformat(argsdict))

    if args.tokenizer_merges_file is not None:
        tokenizer = transformers.GPT2Tokenizer.from_pretrained(
            args.arch, merges_file=args.tokenizer_merges_file)
    else:
        tokenizer = transformers.GPT2Tokenizer.from_pretrained(args.arch)

    eval_data = get_dataset(args)
    for inner_dset in eval_data.datasets:
        inner_dset.tokenizer = tokenizer

    dataloader = torch.utils.data.DataLoader(
        eval_data,
        batch_size=1,
        num_workers=0,
        pin_memory=True,
    )
    """
    with torch.no_grad():
        correct = 0
        total = 0
        for i, batch in enumerate(tqdm(dataloader)):
            batch = dict_to_gpu(batch, device_id=0)
            print(batch['fnames'])
            print(batch['input_ids'])
            quit()
    """

    # Set up model
    if args.load == "NONE":
        model = transformers.GPT2LMHeadModel.from_pretrained(args.arch)
    else:
        print(f"Loading model from {args.load}")
        model = transformers.GPT2LMHeadModel.from_pretrained(args.load)
        print(f"Successfully loaded model from {args.load}")

    model = model.eval()
    model = model.cuda()

    loss_moving_average = 0

    outputs = []
    answers = []
    types = []
    levels = []
    fnames_list = []

    cors = {}
    subject_cors = {}
    level_cors = {}

    with torch.no_grad():
        correct = 0
        total = 0
        skipped = 0
        mean_max_probs_correct = []
        mean_max_probs_wrong = []
        for i, batch in enumerate(tqdm(dataloader)):

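            # All-zero input_ids presumably mark a sample the dataset filtered
            # out (e.g. one that was too long), so skip it.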
            if torch.sum(batch['input_ids']) == 0:
                skipped += 1
                print("SKIPPING", batch['fnames'][0])
                continue

            fnames = batch['fnames'][0]
            assert len(fnames) == 1
            fnames_list.append(fnames[0])
            prob_level, prob_type = get_level_type(fnames[0])
            batch = dict_to_gpu(batch, device_id=0)

            output_ids = model.generate(
                batch['input_ids'],
                num_beams=args.num_beams,
                early_stopping=True,
                temperature=1.0,
                max_length=384 if args.arch == 'gpt2-xl' else 1024)

            # logits = model(output_ids).logits
            # probs = F.softmax(logits, dim=2) # torch.Size([1, L, 50257])
            # max_probs, max_tokens = probs.max(2) # torch.Size([1, L]), torch.Size([1, L])

            # num_tokens_for_question = batch['input_ids'].shape[1]
            # probs_sol = max_probs[:, num_tokens_for_question-1:]
            # tokens_sol = max_tokens[:, num_tokens_for_question-1:]

            # real_sol_start_idx, real_sol_stop_idx = get_real_sol_idxs(tokens_sol, tokenizer)
            # if real_sol_start_idx is None or real_sol_stop_idx is None:
            #     skipped += 1
            #     print("BAD ANSWER, SKIPPING", batch['fnames'][0])
            #     continue
            # probs_sol = probs_sol[:, real_sol_start_idx:real_sol_stop_idx + 1]
            # mean_probs_sol = torch.mean(probs_sol).item()
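            # Placeholder while the probability computation above is commented out.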
            mean_probs_sol = 0

            output_tokens = get_model_output(batch['input_ids'][0],
                                             output_ids[0], tokenizer)

            # Print this iteration
            output_str = tokenizer.decode(output_tokens)
            output_full = output_str
            output_str = last_boxed_only_string(output_str)

            if args.aops_mode == "eval_peeking":
                answer_str = last_boxed_only_string(
                    tokenizer.decode(batch['labels'][0]))
            else:
                answer_str = tokenizer.decode(batch['labels'][0])

            output, answer = remove_boxed(output_str), remove_boxed(answer_str)

            print("Problem String:")
            print(tokenizer.decode(batch['input_ids'][0]) + "\n")
            print("Model output:")
            print(output_full)
            print(output)
            print("Correct answer:")
            print(answer)
            print("fname")
            print(fnames)
            print("--------------------------------------------")

            # scratchwork_fname = "___".join(fnames[0].split("/")[-2:])
            # with open(f"scratchwork_Temp2e-1_{args.arch}/{scratchwork_fname}.txt", 'w') as f:
            #     f.write("Problem String:" + "\n")
            #     f.write(tokenizer.decode(batch['input_ids'][0]) + "\n")
            #     f.write("Model output:" + "\n")
            #     f.write(output_full + "\n")
            #     f.write(str(output) + "\n")
            #     f.write("Correct answer:" + "\n")
            #     f.write(answer + "\n")
            #     f.write("--------------------------------------------" + "\n")

            outputs.append(output)
            answers.append(answer)
            types.append(prob_type)
            levels.append(prob_level)

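            # is_equiv checks whether the generated answer and the ground-truth
            # answer are mathematically equivalent.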
            equiv = is_equiv(output, answer)
            if (prob_level, prob_type) in cors:
                cors[(prob_level, prob_type)].append(equiv)
            else:
                cors[(prob_level, prob_type)] = [equiv]

            if prob_level in level_cors:
                level_cors[prob_level].append(equiv)
            else:
                if prob_level is not None:
                    level_cors[prob_level] = [equiv]

            if prob_type in subject_cors:
                subject_cors[prob_type].append(equiv)
            else:
                if prob_type is not None:
                    subject_cors[prob_type] = [equiv]

            if equiv:
                correct += 1
                mean_max_probs_correct.append(mean_probs_sol)
            else:
                mean_max_probs_wrong.append(mean_probs_sol)

            # print("CORRECT", mean_max_probs_correct)
            # print("WRONG", mean_max_probs_wrong)

            total += 1

    subjects = [
        'Prealgebra', 'Algebra', 'Number Theory', 'Counting & Probability',
        'Geometry', 'Intermediate Algebra', 'Precalculus'
    ]

    # Guard against empty lists so a run with no correct (or no wrong) answers
    # does not crash with a ZeroDivisionError.
    if mean_max_probs_correct:
        print(
            f"Average of mean_max_probs_correct = {sum(mean_max_probs_correct)}/{len(mean_max_probs_correct)} = ",
            sum(mean_max_probs_correct) / len(mean_max_probs_correct))
    if mean_max_probs_wrong:
        print(
            f"Average of mean_max_probs_wrong   = {sum(mean_max_probs_wrong)}/{len(mean_max_probs_wrong)} = ",
            sum(mean_max_probs_wrong) / len(mean_max_probs_wrong))

    # now save outputs and answers
    with open(f"outputs_answers_Temp2e-1_{args.arch}.txt", "w+") as f:
        for k, (output, answer, prob_type, prob_level, fname) in enumerate(
                zip(outputs, answers, types, levels, fnames_list)):
            f.write(
                "{} TYPE: {} | LEVEL: {} | OUTPUT: {} | ANSWER: {} | FNAME: {}\n"
                .format(k, prob_type, prob_level, output, answer, fname))

        # print(cors)
        for prob_type in subjects:
            for prob_level in [1, 2, 3, 4, 5]:
                if (prob_level, prob_type) in cors:
                    cors_list = cors[(prob_level, prob_type)]
                    print("{} Level {} Accuracy = {}/{} = {:.3f}".format(
                        prob_type, prob_level, np.sum(cors_list),
                        len(cors_list), np.mean(cors_list)))
                    f.write("{} Level {} Accuracy = {}/{} = {:.3f}\n".format(
                        prob_type, prob_level, np.sum(cors_list),
                        len(cors_list), np.mean(cors_list)))

        print("#####################")
        f.write("#####################\n")
        # also get accuracies for each
        for level in sorted(level_cors):
            cors_list = level_cors[level]
            print("Level {} Accuracy = {}/{} = {:.3f}".format(
                level, np.sum(cors_list), len(cors_list), np.mean(cors_list)))
            f.write("Level {} Accuracy = {}/{} = {:.3f}\n".format(
                level, np.sum(cors_list), len(cors_list), np.mean(cors_list)))
        print("#####################")
        f.write("#####################\n")

        for subject in subjects:
            # for subject in sorted(subject_cors):
            if subject in subject_cors:
                cors_list = subject_cors[subject]
                print("{} Accuracy = {}/{} = {:.3f}".format(
                    subject, np.sum(cors_list), len(cors_list),
                    np.mean(cors_list)))
                f.write("{} Accuracy = {}/{} = {:.3f}\n".format(
                    subject, np.sum(cors_list), len(cors_list),
                    np.mean(cors_list)))
        print("#####################")
        f.write("#####################\n")

        print("Overall Accuracy = {}/{} = {:.3f}".format(
            correct, total, correct / total))
        print("Skipped = {}".format(skipped))
        f.write("Overall Accuracy = {}/{} = {:.3f}\n".format(
            correct, total, correct / total))
        f.write("Skipped = {}\n".format(skipped))

    print()
Example #5
def run(engine="davinci", max=-1):
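    """
    Evaluates a completions engine (e.g. GPT-3 "davinci") on MATH-style
    problems stored as JSON files under rootdir, prompting via
    call_engine(train_prompt, ...), and reports per-level, per-subject, and
    overall accuracy. If max > 0, evaluation stops after roughly max problems.
    """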
    outputs = []
    answers = []
    types = []
    levels = []

    fnames_list = []

    cors = {}
    subject_cors = {}
    level_cors = {}
    correct = 0
    total = 0
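    # Walk every problem JSON file under rootdir; each file is expected to
    # contain "problem", "solution", "level", and "type" fields.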
    for subdir, dirs, files in os.walk(rootdir):
        for file in files:
            fnames_list.append(os.path.join(subdir, file))
            with open(os.path.join(subdir, file), 'r') as fp:
                try:
                    problem_data = json.load(fp)
                except Exception as e:
                    print(f"Error loading JSON from {file}", e)
                    raise e
                prob_level = problem_data["level"]
                prob_type = problem_data["type"]
                try:
                    prob_level = int(prob_level.split("Level ")[1])
                except Exception:
                    prob_level = None
                model_output = call_engine(train_prompt, problem_data["problem"], engine=engine)
                answer = remove_boxed(last_boxed_only_string(problem_data["solution"]))

                levels.append(prob_level)
                types.append(prob_type)
                outputs.append(model_output)
                answers.append(answer)

                print("Model output:")
                print(model_output)
                print("Correct answer:")
                print(answer)
                print("--------------------------------------------")

                try:
                    equiv = is_equiv(model_output, answer)
                except Exception:
                    equiv = False
                if (prob_level, prob_type) in cors:
                    cors[(prob_level, prob_type)].append(equiv)
                else:
                    cors[(prob_level, prob_type)] = [equiv]
                if prob_level in level_cors:
                    level_cors[prob_level].append(equiv)
                else:
                    if prob_level is not None:
                        level_cors[prob_level] = [equiv]
                if prob_type in subject_cors:
                    subject_cors[prob_type].append(equiv)
                else:
                    if prob_type is not None:
                        subject_cors[prob_type] = [equiv]
                if equiv:
                    correct += 1
                total += 1

                print(str(correct) + "/" + str(total))

            if max > 0 and total > max:
                break
        if max > 0 and total > max:
            break

    with open("outputs_answers_gpt3_{}.txt".format(engine), "w+") as f:
        for k, (output, answer, prob_type, prob_level, fname) in enumerate(zip(outputs, answers, types, levels, fnames_list)):
            f.write("{} TYPE: {} | LEVEL: {} | OUTPUT: {} | ANSWER: {} | FNAME: {}\n".format(k, prob_type, prob_level, output, answer, fname))

        f.write("#####################\n")
        # also get accuracies for each
        for subject in ['Prealgebra', 'Algebra', 'Number Theory', 'Counting & Probability', 'Geometry', 'Intermediate Algebra', 'Precalculus']:
            for level in range(1, 6):
                key = (level, subject)
                if key not in cors.keys():
                    print("Skipping", key)
                    continue
                cors_list = cors[key]
                print("{} Level {} Accuracy = {}/{} = {:.3f}".format(subject, level, np.sum(cors_list), len(cors_list), np.mean(cors_list)))
                f.write("{} Level {} Accuracy = {}/{} = {:.3f}\n".format(subject, level, np.sum(cors_list), len(cors_list), np.mean(cors_list)))
        print("#####################")
        f.write("#####################\n")
        for level in sorted(level_cors):
            cors_list = level_cors[level]
            print("Level {} Accuracy = {}/{} = {:.3f}".format(level, np.sum(cors_list), len(cors_list), np.mean(cors_list)))
            f.write("Level {} Accuracy = {}/{} = {:.3f}\n".format(level, np.sum(cors_list), len(cors_list), np.mean(cors_list)))
        print("#####################")
        f.write("#####################\n")
        for subject in ['Prealgebra', 'Algebra', 'Number Theory', 'Counting & Probability', 'Geometry', 'Intermediate Algebra', 'Precalculus']:
            if subject not in subject_cors.keys():
                print("Skipping", subject)
                continue
            cors_list = subject_cors[subject]
            print("{} Accuracy = {}/{} = {:.3f}".format(subject, np.sum(cors_list), len(cors_list), np.mean(cors_list)))
            f.write("{} Accuracy = {}/{} = {:.3f}\n".format(subject, np.sum(cors_list), len(cors_list), np.mean(cors_list)))
        print("#####################")
        f.write("#####################\n")
        print("Overall Accuracy = {}/{} = {:.3f}".format(correct, total, correct/total))
        f.write("Overall Accuracy = {}/{} = {:.3f}\n".format(correct, total, correct/total))
Example #6
    def clean_filter_sample_gpt(self, sample):
        """
        Does the actual tokenization for training samples, dispatching on
        self.mode_answer ('full', 'final_boxed', peeking, and the mixed
        variants). Should be parallelized because it can be a bit slow.
        """

        if sample is None:
            return None

        if self.mode_answer == 'peeking_only':
            return self.clean_filter_sample_peeking_gpt(sample)
        if self.mode_answer == 'mixed_full_and_peeking':
            if random.random() < 0.5:
                return self.clean_filter_sample_peeking_gpt(sample)
            else:
                _mode_answer = 'full'
        elif self.mode_answer == 'mixed_full_and_nopack_padding':
            if random.random() < 0.5:
                return self.clean_filter_sample_nopackpadding_gpt(sample)
            else:
                _mode_answer = 'full'
        elif self.mode_answer == 'mixed_final_boxed_and_full':
            if random.random() < 0.5:
                _mode_answer = 'full'
            else:
                _mode_answer = 'final_boxed'
        elif self.mode_answer == 'full':
            _mode_answer = 'full'
        elif self.mode_answer == 'final_boxed':
            _mode_answer = 'final_boxed'
        else:
            raise NotImplementedError(
                f"self.mode_answer = {self.mode_answer} not recognized.")

        if _mode_answer == 'full':
            question, answer = sample

            if self.clean_numbers:
                question = _clean_numbers(question)
                answer = _clean_numbers(answer)

            answer_final = last_boxed_only_string(answer)

            question_ids = torch.LongTensor(
                self.tokenizer.encode("\nQUESTION:\n" + question,
                                      verbose=False))

            sep_ids_2 = torch.LongTensor(
                self.tokenizer.encode("\nFULL SOLUTION:\n", verbose=False))
            answer_ids = self.tokenizer.encode(answer, verbose=False)
            answer_ids.append(self.tokenizer.eos_token_id)
            answer_ids = torch.LongTensor(answer_ids)

            input_ids = torch.cat([question_ids, sep_ids_2, answer_ids], dim=0)

            # Only answer_ids contribute to the loss
            label_ids = torch.cat([
                torch.ones_like(question_ids) * -100,
                torch.ones_like(sep_ids_2) * -100,
                answer_ids.clone()
            ],
                                  dim=0)

        elif _mode_answer == 'final_boxed':
            question, answer = sample

            if self.clean_numbers:
                question = _clean_numbers(question)
                answer = _clean_numbers(answer)
            answer_final = last_boxed_only_string(answer)
            if not answer_final:
                print("ERROR FROM", question, answer)
                return None

            question_ids = torch.LongTensor(
                self.tokenizer.encode("\nQUESTION:\n" + question,
                                      verbose=False))

            sep_ids_1 = torch.LongTensor(
                self.tokenizer.encode("\nFINAL ANSWER:\n", verbose=False))
            answer_final_ids = self.tokenizer.encode(answer_final,
                                                     verbose=False)
            answer_final_ids.append(self.tokenizer.eos_token_id)
            answer_final_ids = torch.LongTensor(answer_final_ids)

            input_ids = torch.cat([
                question_ids,
                sep_ids_1,
                answer_final_ids,
            ],
                                  dim=0)

            # Only answer_ids contribute to the loss
            label_ids = torch.cat([
                torch.ones_like(question_ids) * -100,
                torch.ones_like(sep_ids_1) * -100,
                answer_final_ids.clone(),
            ],
                                  dim=0)

        else:
            raise NotImplementedError()

        # Stop early if this Q,A pair is too long
        if input_ids.shape[0] > self.max_tokens:
            # Print reason for skipping
            # print(f"Skipping due to input_ids being too big. input_ids.shape[0] = {input_ids.shape[0]}.")
            return None

        input_ids = input_ids.tolist()
        label_ids = label_ids.tolist()

        return {'input_ids_list': input_ids, 'label_ids_list': label_ids}
Example #7
    def clean_filter_sample_peeking_gpt_eval(self, sample):
        """
        Tokenization for 'peeking' evaluation: the context is the question plus
        the first peek_fraction of the solution's tokens, and the labels are
        the remaining solution tokens. Should be parallelized because it can be
        a bit slow.
        """

        if sample is None:
            return None

        question, answer = sample

        if self.clean_numbers:
            question = _clean_numbers(question)
            answer = _clean_numbers(answer)

        answer_final = last_boxed_only_string(answer)

        question_ids = torch.LongTensor(
            self.tokenizer.encode("\nQUESTION:\n" + question +
                                  "\nFULL SOLUTION:\n",
                                  verbose=False))
        # Tokenize to token strings first so the solution can be truncated at
        # the first \boxed{...}, then re-encode the truncated prefix into ids.
        answer_tokens = self.tokenizer.tokenize(answer)
        answer_ids_full = torch.LongTensor(self.tokenizer.encode(answer))
        answer_tokens = only_until_first_boxed_from_tokens(answer, answer_tokens)
        if not answer_tokens:
            return None
        answer_ids = torch.LongTensor(
            self.tokenizer.encode(answer_tokens, verbose=False))

        # Take a fraction
        if isinstance(self.peek_fraction, tuple):
            final_idx = int(
                len(answer_ids) * random.uniform(*self.peek_fraction))
        else:
            final_idx = int(len(answer_ids) * self.peek_fraction)

        answer_ids = answer_ids[:final_idx]

        # sep_ids          = torch.LongTensor(self.tokenizer.encode("\nFINAL ANSWER\n", verbose=False))
        final_answer_ids = answer_ids_full[final_idx:]
        # print(final_answer_ids)

        input_ids = torch.cat(
            [
                question_ids,
                answer_ids,
                # sep_ids,
            ],
            dim=0)

        # Only answer_ids contribute to the loss
        label_ids = torch.cat([final_answer_ids.clone()], dim=0)

        # Stop early if this Q,A pair is too long
        if input_ids.shape[0] + label_ids.shape[0] > self.max_tokens:
            # Print reason for skipping
            # print(f"Skipping due to input_ids being too big. input_ids.shape[0] = {input_ids.shape[0]}.")
            return None

        input_ids = input_ids.tolist()
        label_ids = label_ids.tolist()

        return {'input_ids_list': input_ids, 'label_ids_list': label_ids}