def clean_filter_sample_gpt_eval(self, sample):
    """
    Tokenizes a sample for final model evaluation. Returns input_ids as the
    context and labels as the true answer.
    """
    if sample is None:
        return None

    if self.mode_answer == 'eval_peeking':
        return self.clean_filter_sample_peeking_gpt_eval(sample)
    elif self.mode_answer == 'eval_nopack_padding':
        return self.clean_filter_sample_nopackpadding_gpt_eval(sample)

    question, answer = sample

    if self.clean_numbers:
        question = _clean_numbers(question)
        answer = _clean_numbers(answer)

    answer_final = last_boxed_only_string(answer)
    assert not answer.isspace()

    question_ids = torch.LongTensor(
        self.tokenizer.encode("\nQUESTION:\n" + question, verbose=False))
    sep_ids = torch.LongTensor(
        self.tokenizer.encode("\nFULL SOLUTION:\n", verbose=False))
    # Loss is only counted on these tokens.
    answer_final_ids = torch.LongTensor(
        self.tokenizer.encode(answer_final, verbose=False))

    input_ids = torch.cat([
        question_ids,
        sep_ids,
    ], dim=0)
    label_ids = torch.cat([answer_final_ids.clone()], dim=0)

    # Stop early if this Q,A pair is too long
    if input_ids.shape[0] + label_ids.shape[0] > self.max_tokens:
        # Print reason for skipping
        # print(f"Skipping due to input_ids being too big. input_ids.shape[0] = {input_ids.shape[0]}.")
        return None

    return {
        'input_ids_list': input_ids.tolist(),
        'label_ids_list': label_ids.tolist()
    }
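# Hedged sketch (not the repo's actual implementation): last_boxed_only_string
# and remove_boxed are imported from the repo's utilities and are not shown in
# this excerpt. The minimal versions below only illustrate the contract the
# tokenization and grading code relies on: extract the last "\boxed{...}" span
# from a solution string, then strip the wrapper to obtain the final answer.
def _sketch_last_boxed_only_string(s):
    # Return the last "\boxed{...}" span, including its matching closing brace.
    start = s.rfind("\\boxed{")
    if start < 0:
        return None
    depth = 0
    for i in range(start, len(s)):
        if s[i] == "{":
            depth += 1
        elif s[i] == "}":
            depth -= 1
            if depth == 0:
                return s[start:i + 1]
    return None  # unbalanced braces


def _sketch_remove_boxed(s):
    # Strip the "\boxed{" prefix and trailing "}" so only the answer remains.
    prefix = "\\boxed{"
    if s is None or not s.startswith(prefix) or not s.endswith("}"):
        return None
    return s[len(prefix):-1]

# e.g. _sketch_last_boxed_only_string("so the answer is $\\boxed{42}$.") -> "\\boxed{42}"
#      _sketch_remove_boxed("\\boxed{42}") -> "42"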
def clean_filter_sample_nopackpadding_gpt_eval(self, sample):
    if sample is None:
        return None

    question, answer = sample

    if self.clean_numbers:
        question = _clean_numbers(question)
        answer = _clean_numbers(answer)

    answer_final = last_boxed_only_string(answer)

    question_ids = torch.LongTensor(
        self.tokenizer.encode("\nQUESTION:\n" + question, verbose=False))
    sep_ids = torch.LongTensor(
        self.tokenizer.encode("\nFINAL ANSWER:\n", verbose=False))
    final_answer_ids = torch.LongTensor(
        self.tokenizer.encode(answer_final, verbose=False))

    num_to_pad = 32
    # 220 is the token id for a space in GPT-2 vocabularies. Use a long dtype so
    # the concatenation below remains a valid index tensor.
    padding_tensor = torch.ones(num_to_pad, dtype=torch.long) * 220

    input_ids = torch.cat([
        question_ids,
        padding_tensor,
        sep_ids,
    ], dim=0)

    # Only the final answer tokens contribute to the loss
    label_ids = torch.cat([final_answer_ids.clone()], dim=0)

    # Stop early if this Q,A pair is too long
    if input_ids.shape[0] + label_ids.shape[0] > self.max_tokens:
        # Print reason for skipping
        # print(f"Skipping due to input_ids being too big. input_ids.shape[0] = {input_ids.shape[0]}.")
        return None

    input_ids = input_ids.tolist()
    label_ids = label_ids.tolist()

    return {'input_ids_list': input_ids, 'label_ids_list': label_ids}
def clean_filter_sample_nopackpadding_gpt(self, sample):
    if sample is None:
        return None

    question, answer = sample

    if self.clean_numbers:
        question = _clean_numbers(question)
        answer = _clean_numbers(answer)

    answer_final = last_boxed_only_string(answer)

    question_ids = torch.LongTensor(
        self.tokenizer.encode("\nQUESTION:\n" + question, verbose=False))
    sep_ids = torch.LongTensor(
        self.tokenizer.encode("\nFINAL ANSWER:\n", verbose=False))
    final_answer_ids = torch.LongTensor(
        self.tokenizer.encode(answer_final, verbose=False))

    num_to_pad = 32
    # 220 is the token id for a space in GPT-2 vocabularies. Use a long dtype so
    # the concatenation below remains a valid index tensor.
    padding_tensor = torch.ones(num_to_pad, dtype=torch.long) * 220

    input_ids = torch.cat(
        [question_ids, padding_tensor, sep_ids, final_answer_ids], dim=0)

    # Only the final answer tokens contribute to the loss
    label_ids = torch.cat([
        torch.ones_like(question_ids) * -100,
        torch.ones_like(padding_tensor) * -100,
        torch.ones_like(sep_ids) * -100,
        final_answer_ids.clone()
    ], dim=0)

    # Stop early if this Q,A pair is too long
    if input_ids.shape[0] > self.max_tokens:
        return None

    input_ids = input_ids.tolist()
    label_ids = label_ids.tolist()

    return {'input_ids_list': input_ids, 'label_ids_list': label_ids}
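# Note on the -100 convention used above (a fact about PyTorch / Hugging Face,
# not specific to this repo): torch.nn.CrossEntropyLoss defaults to
# ignore_index=-100, and GPT2LMHeadModel feeds `labels` into that loss (after
# shifting them by one position), so every position masked with -100 above
# (question, padding, separator) contributes nothing to the loss. Minimal
# self-contained sketch of the masking behaviour:
def _sketch_label_masking():
    import torch
    logits = torch.randn(1, 4, 50257)                # (batch, seq_len, vocab)
    labels = torch.tensor([[-100, -100, 220, 513]])  # only the last two positions are scored
    loss_fn = torch.nn.CrossEntropyLoss()            # ignore_index defaults to -100
    return loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))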
def run_eval(args):
    argsdict = vars(args)
    print(pprint.pformat(argsdict))

    if args.tokenizer_merges_file is not None:
        tokenizer = transformers.GPT2Tokenizer.from_pretrained(
            args.arch, merges_file=args.tokenizer_merges_file)
    else:
        tokenizer = transformers.GPT2Tokenizer.from_pretrained(args.arch)

    eval_data = get_dataset(args)
    for inner_dset in eval_data.datasets:
        inner_dset.tokenizer = tokenizer

    dataloader = torch.utils.data.DataLoader(
        eval_data,
        batch_size=1,
        num_workers=0,
        pin_memory=True,
    )

    """
    with torch.no_grad():
        correct = 0
        total = 0
        for i, batch in enumerate(tqdm(dataloader)):
            batch = dict_to_gpu(batch, device_id=0)
            print(batch['fnames'])
            print(batch['input_ids'])
            quit()
    """

    # Set up model
    if args.load == "NONE":
        model = transformers.GPT2LMHeadModel.from_pretrained(args.arch)
    else:
        print(f"Loading model from {args.load}")
        model = transformers.GPT2LMHeadModel.from_pretrained(args.load)
        print(f"Successfully loaded model from {args.load}")

    model = model.eval()
    model = model.cuda()

    loss_moving_average = 0

    outputs = []
    answers = []
    types = []
    levels = []
    fnames_list = []

    cors = {}
    subject_cors = {}
    level_cors = {}

    with torch.no_grad():
        correct = 0
        total = 0
        skipped = 0
        mean_max_probs_correct = []
        mean_max_probs_wrong = []
        for i, batch in enumerate(tqdm(dataloader)):

            if torch.sum(batch['input_ids']) == 0:
                skipped += 1
                print("SKIPPING", batch['fnames'][0])
                continue

            fnames = batch['fnames'][0]
            assert len(fnames) == 1
            fnames_list.append(fnames[0])

            prob_level, prob_type = get_level_type(fnames[0])
            batch = dict_to_gpu(batch, device_id=0)

            output_ids = model.generate(
                batch['input_ids'],
                num_beams=args.num_beams,
                early_stopping=True,
                temperature=1.0,
                max_length=384 if args.arch == 'gpt2-xl' else 1024)

            # logits = model(output_ids).logits
            # probs = F.softmax(logits, dim=2)  # torch.Size([1, L, 50257])
            # max_probs, max_tokens = probs.max(2)  # torch.Size([1, L]), torch.Size([1, L])
            # num_tokens_for_question = batch['input_ids'].shape[1]
            # probs_sol = max_probs[:, num_tokens_for_question-1:]
            # tokens_sol = max_tokens[:, num_tokens_for_question-1:]
            # real_sol_start_idx, real_sol_stop_idx = get_real_sol_idxs(tokens_sol, tokenizer)
            # if real_sol_start_idx is None or real_sol_stop_idx is None:
            #     skipped += 1
            #     print("BAD ANSWER, SKIPPING", batch['fnames'][0])
            #     continue
            # probs_sol = probs_sol[:, real_sol_start_idx:real_sol_stop_idx + 1]
            # mean_probs_sol = torch.mean(probs_sol).item()
            mean_probs_sol = 0

            output_tokens = get_model_output(batch['input_ids'][0],
                                             output_ids[0], tokenizer)

            # Print this iteration
            output_str = tokenizer.decode(output_tokens)
            output_full = output_str
            output_str = last_boxed_only_string(output_str)

            if args.aops_mode == "eval_peeking":
                answer_str = last_boxed_only_string(
                    tokenizer.decode(batch['labels'][0]))
            else:
                answer_str = tokenizer.decode(batch['labels'][0])

            output, answer = remove_boxed(output_str), remove_boxed(answer_str)

            print("Problem String:")
            print(tokenizer.decode(batch['input_ids'][0]) + "\n")
            print("Model output:")
            print(output_full)
            print(output)
            print("Correct answer:")
            print(answer)
            print("fname")
            print(fnames)
            print("--------------------------------------------")

            # scratchwork_fname = "___".join(fnames[0].split("/")[-2:])
            # with open(f"scratchwork_Temp2e-1_{args.arch}/{scratchwork_fname}.txt", 'w') as f:
            #     f.write("Problem String:" + "\n")
            #     f.write(tokenizer.decode(batch['input_ids'][0]) + "\n")
            #     f.write("Model output:" + "\n")
            #     f.write(output_full + "\n")
            #     f.write(str(output) + "\n")
            #     f.write("Correct answer:" + "\n")
            #     f.write(answer + "\n")
            #     f.write("--------------------------------------------" + "\n")

            outputs.append(output)
            answers.append(answer)
            types.append(prob_type)
            levels.append(prob_level)

            equiv = is_equiv(output, answer)
            if (prob_level, prob_type) in cors:
                cors[(prob_level, prob_type)].append(equiv)
            else:
                cors[(prob_level, prob_type)] = [equiv]
            if prob_level in level_cors:
                level_cors[prob_level].append(equiv)
            else:
                if prob_level is not None:
                    level_cors[prob_level] = [equiv]
            if prob_type in subject_cors:
                subject_cors[prob_type].append(equiv)
            else:
                if prob_type is not None:
                    subject_cors[prob_type] = [equiv]

            if equiv:
                correct += 1
                mean_max_probs_correct.append(mean_probs_sol)
            else:
                mean_max_probs_wrong.append(mean_probs_sol)

            # print("CORRECT", mean_max_probs_correct)
            # print("WRONG", mean_max_probs_wrong)

            total += 1

    subjects = [
        'Prealgebra', 'Algebra', 'Number Theory', 'Counting & Probability',
        'Geometry', 'Intermediate Algebra', 'Precalculus'
    ]

    print(
        f"Average of mean_max_probs_correct = {sum(mean_max_probs_correct)}/{len(mean_max_probs_correct)} = ",
        sum(mean_max_probs_correct) / len(mean_max_probs_correct))
    print(
        f"Average of mean_max_probs_wrong = {sum(mean_max_probs_wrong)}/{len(mean_max_probs_wrong)} = ",
        sum(mean_max_probs_wrong) / len(mean_max_probs_wrong))

    # Now save outputs and answers
    with open(f"outputs_answers_Temp2e-1_{args.arch}.txt", "w+") as f:
        for k, (output, answer, prob_type, prob_level, fname) in enumerate(
                zip(outputs, answers, types, levels, fnames_list)):
            f.write(
                "{} TYPE: {} | LEVEL: {} | OUTPUT: {} | ANSWER: {} | FNAME: {}\n"
                .format(k, prob_type, prob_level, output, answer, fname))

        # print(cors)
        # Accuracy broken down by (subject, level)
        for prob_type in subjects:
            for prob_level in [1, 2, 3, 4, 5]:
                if (prob_level, prob_type) in cors:
                    cors_list = cors[(prob_level, prob_type)]
                    print("{} Level {} Accuracy = {}/{} = {:.3f}".format(
                        prob_type, prob_level, np.sum(cors_list),
                        len(cors_list), np.mean(cors_list)))
                    f.write("{} Level {} Accuracy = {}/{} = {:.3f}\n".format(
                        prob_type, prob_level, np.sum(cors_list),
                        len(cors_list), np.mean(cors_list)))

        print("#####################")
        f.write("#####################\n")

        # Accuracy broken down by level
        for level in sorted(level_cors):
            cors_list = level_cors[level]
            print("Level {} Accuracy = {}/{} = {:.3f}".format(
                level, np.sum(cors_list), len(cors_list), np.mean(cors_list)))
            f.write("Level {} Accuracy = {}/{} = {:.3f}\n".format(
                level, np.sum(cors_list), len(cors_list), np.mean(cors_list)))

        print("#####################")
        f.write("#####################\n")

        # Accuracy broken down by subject
        for subject in subjects:
            if subject in subject_cors:
                cors_list = subject_cors[subject]
                print("{} Accuracy = {}/{} = {:.3f}".format(
                    subject, np.sum(cors_list), len(cors_list),
                    np.mean(cors_list)))
                f.write("{} Accuracy = {}/{} = {:.3f}\n".format(
                    subject, np.sum(cors_list), len(cors_list),
                    np.mean(cors_list)))

        print("#####################")
        f.write("#####################\n")

        print("Overall Accuracy = {}/{} = {:.3f}".format(
            correct, total, correct / total))
        print("Skipped = {}".format(skipped))
        f.write("Overall Accuracy = {}/{} = {:.3f}\n".format(
            correct, total, correct / total))
        f.write("Skipped = {}".format(skipped))

    print()
def run(engine="davinci", max=-1):
    outputs = []
    answers = []
    types = []
    levels = []
    fnames_list = []

    cors = {}
    subject_cors = {}
    level_cors = {}
    correct = 0
    total = 0

    for subdir, dirs, files in os.walk(rootdir):
        for file in files:
            fnames_list.append(os.path.join(subdir, file))
            with open(os.path.join(subdir, file), 'r') as fp:
                try:
                    problem_data = json.load(fp)
                except Exception as e:
                    print(f"Error loading JSON from {file}", e)
                    raise e

            prob_level = problem_data["level"]
            prob_type = problem_data["type"]
            try:
                prob_level = int(prob_level.split("Level ")[1])
            except:
                prob_level = None

            model_output = call_engine(train_prompt, problem_data["problem"],
                                       engine=engine)
            answer = remove_boxed(last_boxed_only_string(problem_data["solution"]))

            levels.append(prob_level)
            types.append(prob_type)
            outputs.append(model_output)
            answers.append(answer)

            print("Model output:")
            print(model_output)
            print("Correct answer:")
            print(answer)
            print("--------------------------------------------")

            try:
                equiv = is_equiv(model_output, answer)
            except:
                equiv = False

            if (prob_level, prob_type) in cors:
                cors[(prob_level, prob_type)].append(equiv)
            else:
                cors[(prob_level, prob_type)] = [equiv]
            if prob_level in level_cors:
                level_cors[prob_level].append(equiv)
            else:
                if prob_level is not None:
                    level_cors[prob_level] = [equiv]
            if prob_type in subject_cors:
                subject_cors[prob_type].append(equiv)
            else:
                if prob_type is not None:
                    subject_cors[prob_type] = [equiv]

            if equiv:
                correct += 1
            total += 1
            print(str(correct) + "/" + str(total))

            if max > 0 and total > max:
                break
        if max > 0 and total > max:
            break

    subjects = [
        'Prealgebra', 'Algebra', 'Number Theory', 'Counting & Probability',
        'Geometry', 'Intermediate Algebra', 'Precalculus'
    ]

    with open("outputs_answers_gpt3_{}.txt".format(engine), "w+") as f:
        for k, (output, answer, prob_type, prob_level, fname) in enumerate(
                zip(outputs, answers, types, levels, fnames_list)):
            f.write(
                "{} TYPE: {} | LEVEL: {} | OUTPUT: {} | ANSWER: {} | FNAME: {}\n"
                .format(k, prob_type, prob_level, output, answer, fname))

        f.write("#####################\n")

        # Accuracy broken down by (subject, level)
        for subject in subjects:
            for level in range(1, 6):
                key = (level, subject)
                if key not in cors:
                    print("Skipping", key)
                    continue
                cors_list = cors[key]
                print("{} Level {} Accuracy = {}/{} = {:.3f}".format(
                    subject, level, np.sum(cors_list), len(cors_list),
                    np.mean(cors_list)))
                f.write("{} Level {} Accuracy = {}/{} = {:.3f}\n".format(
                    subject, level, np.sum(cors_list), len(cors_list),
                    np.mean(cors_list)))

        print("#####################")
        f.write("#####################\n")

        # Accuracy broken down by level
        for level in sorted(level_cors):
            cors_list = level_cors[level]
            print("Level {} Accuracy = {}/{} = {:.3f}".format(
                level, np.sum(cors_list), len(cors_list), np.mean(cors_list)))
            f.write("Level {} Accuracy = {}/{} = {:.3f}\n".format(
                level, np.sum(cors_list), len(cors_list), np.mean(cors_list)))

        print("#####################")
        f.write("#####################\n")

        # Accuracy broken down by subject
        for subject in subjects:
            if subject not in subject_cors:
                print("Skipping", subject)
                continue
            cors_list = subject_cors[subject]
            print("{} Accuracy = {}/{} = {:.3f}".format(
                subject, np.sum(cors_list), len(cors_list), np.mean(cors_list)))
            f.write("{} Accuracy = {}/{} = {:.3f}\n".format(
                subject, np.sum(cors_list), len(cors_list), np.mean(cors_list)))

        print("#####################")
        f.write("#####################\n")

        print("Overall Accuracy = {}/{} = {:.3f}".format(correct, total,
                                                         correct / total))
        f.write("Overall Accuracy = {}/{} = {:.3f}\n".format(
            correct, total, correct / total))
def clean_filter_sample_gpt(self, sample):
    """
    Does the actual tokenization. Should be parallelized because it can be
    a bit slow.
    """
    if sample is None:
        return None

    if self.mode_answer == 'peeking_only':
        return self.clean_filter_sample_peeking_gpt(sample)

    if self.mode_answer == 'mixed_full_and_peeking':
        if random.random() < 0.5:
            return self.clean_filter_sample_peeking_gpt(sample)
        else:
            _mode_answer = 'full'
    elif self.mode_answer == 'mixed_full_and_nopack_padding':
        if random.random() < 0.5:
            return self.clean_filter_sample_nopackpadding_gpt(sample)
        else:
            _mode_answer = 'full'
    elif self.mode_answer == 'mixed_final_boxed_and_full':
        if random.random() < 0.5:
            _mode_answer = 'full'
        else:
            _mode_answer = 'final_boxed'
    elif self.mode_answer == 'full':
        _mode_answer = 'full'
    elif self.mode_answer == 'final_boxed':
        _mode_answer = 'final_boxed'
    else:
        raise NotImplementedError(
            f"self.mode_answer = {self.mode_answer} not recognized.")

    if _mode_answer == 'full':
        question, answer = sample

        if self.clean_numbers:
            question = _clean_numbers(question)
            answer = _clean_numbers(answer)

        answer_final = last_boxed_only_string(answer)

        question_ids = torch.LongTensor(
            self.tokenizer.encode("\nQUESTION:\n" + question, verbose=False))
        sep_ids_2 = torch.LongTensor(
            self.tokenizer.encode("\nFULL SOLUTION:\n", verbose=False))
        answer_ids = self.tokenizer.encode(answer, verbose=False)
        answer_ids.append(self.tokenizer.eos_token_id)
        answer_ids = torch.LongTensor(answer_ids)

        input_ids = torch.cat([question_ids, sep_ids_2, answer_ids], dim=0)

        # Only answer_ids contribute to the loss
        label_ids = torch.cat([
            torch.ones_like(question_ids) * -100,
            torch.ones_like(sep_ids_2) * -100,
            answer_ids.clone()
        ], dim=0)

    elif _mode_answer == 'final_boxed':
        question, answer = sample

        if self.clean_numbers:
            question = _clean_numbers(question)
            answer = _clean_numbers(answer)

        answer_final = last_boxed_only_string(answer)
        if not answer_final:
            print("ERROR FROM", question, answer)
            return None

        question_ids = torch.LongTensor(
            self.tokenizer.encode("\nQUESTION:\n" + question, verbose=False))
        sep_ids_1 = torch.LongTensor(
            self.tokenizer.encode("\nFINAL ANSWER:\n", verbose=False))
        answer_final_ids = self.tokenizer.encode(answer_final, verbose=False)
        answer_final_ids.append(self.tokenizer.eos_token_id)
        answer_final_ids = torch.LongTensor(answer_final_ids)

        input_ids = torch.cat([
            question_ids,
            sep_ids_1,
            answer_final_ids,
        ], dim=0)

        # Only answer_ids contribute to the loss
        label_ids = torch.cat([
            torch.ones_like(question_ids) * -100,
            torch.ones_like(sep_ids_1) * -100,
            answer_final_ids.clone(),
        ], dim=0)

    else:
        raise NotImplementedError()

    # Stop early if this Q,A pair is too long
    if input_ids.shape[0] > self.max_tokens:
        # Print reason for skipping
        # print(f"Skipping due to input_ids being too big. input_ids.shape[0] = {input_ids.shape[0]}.")
        return None

    input_ids = input_ids.tolist()
    label_ids = label_ids.tolist()

    return {'input_ids_list': input_ids, 'label_ids_list': label_ids}
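# Hedged sketch (assumption): the docstring above notes that this tokenization
# "should be parallelized". If the dataset object and its tokenizer are
# picklable, one straightforward option is a multiprocessing pool over the raw
# (question, answer) pairs; `dataset` and `samples` here are hypothetical names,
# not part of the original code.
def _sketch_tokenize_in_parallel(dataset, samples, processes=8):
    from multiprocessing import Pool
    with Pool(processes=processes) as pool:
        tokenized = pool.map(dataset.clean_filter_sample_gpt, samples)
    # Drop samples that were filtered out (too long, missing boxed answer, etc.)
    return [t for t in tokenized if t is not None]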
def clean_filter_sample_peeking_gpt_eval(self, sample):
    """
    Tokenizes a sample for "peeking" evaluation: the context is the question
    plus a fraction (peek_fraction) of the solution, and the labels are the
    remaining solution tokens.
    """
    if sample is None:
        return None

    question, answer = sample

    if self.clean_numbers:
        question = _clean_numbers(question)
        answer = _clean_numbers(answer)

    answer_final = last_boxed_only_string(answer)

    question_ids = torch.LongTensor(
        self.tokenizer.encode("\nQUESTION:\n" + question + "\nFULL SOLUTION:\n",
                              verbose=False))

    answer_ids = self.tokenizer.tokenize(answer)
    answer_ids_full = torch.LongTensor(self.tokenizer.encode(answer))
    answer_ids = only_until_first_boxed_from_tokens(answer, answer_ids)
    if len(answer_ids) == 0:
        return None
    answer_ids = torch.LongTensor(
        self.tokenizer.encode(answer_ids, verbose=False))

    # Take a fraction of the solution as the "peek"
    if isinstance(self.peek_fraction, tuple):
        final_idx = int(len(answer_ids) * random.uniform(*self.peek_fraction))
    else:
        final_idx = int(len(answer_ids) * self.peek_fraction)

    answer_ids = answer_ids[:final_idx]

    # sep_ids = torch.LongTensor(self.tokenizer.encode("\nFINAL ANSWER\n", verbose=False))
    final_answer_ids = answer_ids_full[final_idx:]
    print(final_answer_ids)

    input_ids = torch.cat(
        [
            question_ids,
            answer_ids,
            # sep_ids,
        ],
        dim=0)

    # Only the remaining solution tokens contribute to the loss
    label_ids = torch.cat([final_answer_ids.clone()], dim=0)

    # Stop early if this Q,A pair is too long
    if input_ids.shape[0] + label_ids.shape[0] > self.max_tokens:
        # Print reason for skipping
        # print(f"Skipping due to input_ids being too big. input_ids.shape[0] = {input_ids.shape[0]}.")
        return None

    input_ids = input_ids.tolist()
    label_ids = label_ids.tolist()

    return {'input_ids_list': input_ids, 'label_ids_list': label_ids}