def test_evaluator(taskname, task_class):
    task_dict = tasks.get_task_dict([taskname])

    os.system("rm test_cache.db")
    lm = base.CachingLM(models.get_model("dummy")(), "test_cache.db")

    def ll_fn(reqs):
        for ctx, cont in reqs:
            if len(ctx) == 0:
                continue
            # space convention
            assert ctx[-1] != " "
            assert cont[0] == " " or ctx[-1] == "\n"

        res = []
        random.seed(42)
        for _ in reqs:
            res.append((-random.random(), False))

        return res

    def ll_perp_fn(reqs):
        for (string,) in reqs:
            assert isinstance(string, str)

        res = []
        random.seed(42)
        for _ in reqs:
            res.append(-random.random())

        return res

    lm.loglikelihood = ll_fn
    lm.loglikelihood_rolling = ll_perp_fn

    limit = 10
    e1 = evaluator.evaluate(
        lm=lm,
        task_dict=task_dict,
        num_fewshot=0,
        limit=limit,
        bootstrap_iters=10,
        description_dict=None,
    )
    e2 = evaluator.evaluate(
        lm=lm,
        task_dict=task_dict,
        num_fewshot=0,
        limit=limit,
        bootstrap_iters=10,
        description_dict=None,
    )

    # check that caching is working
    assert e1 == e2
def main():
    args = parse_args()

    if os.path.exists(args.output_path):
        print(f"Output path {args.output_path} exists!!!")
        return

    random.seed(args.seed)
    np.random.seed(args.seed)

    if args.limit:
        print(
            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

    if args.tasks == "all_tasks":
        task_names = tasks.ALL_TASKS
    else:
        task_names = args.tasks.split(",")
    task_dict = tasks.get_task_dict(task_names)

    lm = models.get_model(args.model)

    train_args = simple_parse_args_string(args.train_args)
    model_args = simple_parse_args_string(args.model_args)
    if train_args:
        train_args.update(model_args)
        train_args["seed"] = args.seed

    results = evaluator.evaluate(lm, task_dict, args.provide_description,
                                 args.num_fewshot, args.limit, train_args,
                                 args.model_args, args.seed)

    results["args"] = args.__dict__
    dumped = json.dumps(results, indent=2)
    print(dumped)

    if args.output_path:
        with open(args.output_path, "w") as f:
            f.write(dumped)

    for task, task_res in results.items():
        if task not in task_names:
            continue
        if "train_args" not in task_res:
            experiment = comet_ml.Experiment(
                api_key=os.environ.get('COMET_API_KEY'),
                project_name=os.environ.get('COMET_PROJECT', "few-shot"),
                workspace=os.environ.get('COMET_WORKSPACE', "yuvalkirstain"),
            )
            experiment.log_asset(args.output_path)
        else:
            experiment = comet_ml.ExistingExperiment(
                api_key=os.environ.get('COMET_API_KEY'),
                previous_experiment=task_res["train_args"]["previous_experiment"])
            experiment.log_asset(args.output_path)
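
# `simple_parse_args_string` above is assumed to turn a comma-separated
# "key=value" string such as "lr=1e-5,epochs=3" into a plain dict of strings.
# The helper below is only a minimal sketch of that assumed behaviour; the name
# `_parse_args_string_sketch` is hypothetical, not the library's own function.
def _parse_args_string_sketch(args_string: str) -> dict:
    # Empty input yields an empty dict, so a check like `if train_args:` stays falsy.
    if not args_string:
        return {}
    # Split on commas, then on the first "=" of each pair.
    return dict(pair.split("=", 1) for pair in args_string.split(","))


# Example: _parse_args_string_sketch("lr=1e-5,epochs=3")
# -> {"lr": "1e-5", "epochs": "3"}
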
def main():
    lm = DryrunLM()

    task_list = "arc_challenge,arc_easy,boolq,cola,copa,headqa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,record,rte,sciq,sst,triviaqa,webqs,wic,wikitext,winogrande,wnli,wsc"
    values = []
    for taskname in task_list.split(","):
        lm.tokencost = 0
        evaluator.evaluate(
            lm=lm,
            task_dict={taskname: tasks.get_task(taskname)()},
            num_fewshot=0,
            limit=None,
            bootstrap_iters=10,
            description_dict=None,
        )

        print(taskname, lm.tokencost)
        values.append([
            taskname,
            lm.tokencost,
            lm.tokencost / 1000 * 0.0008,
            lm.tokencost / 1000 * 0.0012,
            lm.tokencost / 1000 * 0.006,
            lm.tokencost / 1000 * 0.06,
        ])

    from pytablewriter import MarkdownTableWriter

    writer = MarkdownTableWriter()
    writer.headers = ["Task", "Tokens", "Ada", "Babbage", "Curie", "Davinci"]

    values.sort(key=lambda x: -x[1])
    totcost = sum([x[1] for x in values])
    values.append([
        "**Total**",
        totcost,
        totcost / 1000 * 0.0008,
        totcost / 1000 * 0.0012,
        totcost / 1000 * 0.006,
        totcost / 1000 * 0.06,
    ])

    writer.value_matrix = values
    print(writer.dumps())
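
# `DryrunLM` is not defined in this excerpt; its use above implies an LM stub whose
# request handlers only accumulate a token count in `tokencost` instead of calling a
# model. The class below is a rough sketch under that assumption: whitespace splitting
# stands in for real tokenization, and the class name is hypothetical.
class DryrunLMSketch:
    def __init__(self):
        self.tokencost = 0

    def loglikelihood(self, requests):
        # Count tokens for each (context, continuation) pair, return dummy scores.
        for ctx, cont in requests:
            self.tokencost += len((ctx + cont).split())
        return [(0.0, False) for _ in requests]

    def loglikelihood_rolling(self, requests):
        # Count tokens for each full string scored with a rolling window.
        for (string,) in requests:
            self.tokencost += len(string.split())
        return [0.0 for _ in requests]

    def greedy_until(self, requests):
        # Count only the prompt tokens; generation length is unknown in a dry run.
        for ctx, _ in requests:
            self.tokencost += len(ctx.split())
        return ["" for _ in requests]
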
def test_evaluator(taskname, Task):
    task_dict = tasks.get_task_dict([taskname])
    lm = models.get_model('dummy')()

    def ll_fn(reqs):
        for ctx, cont in reqs:
            # space convention
            assert ctx[-1] != ' '
            assert cont[0] == ' ' or ctx[-1] == '\n'

        res = []
        random.seed(42)
        for _ in reqs:
            res.append((-random.random(), False))

        return res

    lm.loglikelihood = ll_fn
    evaluator.evaluate(lm, task_dict, False, 0, 10)
def main():
    lm = DryrunLM()

    values = []
    for taskname in list(tasks.TASK_REGISTRY.keys()):
        lm.tokencost = 0
        evaluator.evaluate(lm, {taskname: tasks.get_task(taskname)()}, False, 0, None)

        print(taskname, lm.tokencost)
        values.append([taskname, lm.tokencost, lm.tokencost / 1000 * 0.06])

    from pytablewriter import MarkdownTableWriter

    writer = MarkdownTableWriter()
    writer.headers = ["Task", "Tokens", "Davinci Cost"]

    values.sort(key=lambda x: -x[1])
    totcost = sum([x[1] for x in values])
    values.append(["**Total**", totcost, totcost / 1000 * 0.06])

    writer.value_matrix = values
    print(writer.dumps())
def main():
    args = parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)

    lm = models.get_model(args.model).create_from_arg_string(args.model_args)

    if args.limit:
        print(
            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

    if not args.no_cache:
        lm = base.CachingLM(
            lm, 'lm_cache/' + args.model + '_' +
            args.model_args.replace('=', '-').replace(',', '_') + '.db')

    if args.tasks == "all_tasks":
        task_names = tasks.ALL_TASKS
    else:
        task_names = args.tasks.split(",")

    task_dict = tasks.get_task_dict(task_names)

    results = evaluator.evaluate(lm, task_dict, args.provide_description,
                                 args.num_fewshot, args.limit)

    dumped = json.dumps(results, indent=2)
    print(dumped)

    if args.output_path:
        with open(args.output_path, "w") as f:
            f.write(dumped)

    # MAKE TABLE
    from pytablewriter import MarkdownTableWriter

    writer = MarkdownTableWriter()
    writer.headers = ["Task", "Metric", "Value"]

    values = []
    for k, dic in results.items():
        for m, v in dic.items():
            values.append([k, m, '%.4f' % v])
            k = ""
    writer.value_matrix = values

    print(writer.dumps())
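
# The argparse attributes referenced above (model, model_args, tasks, num_fewshot,
# limit, no_cache, provide_description, output_path, seed) suggest an invocation
# roughly like the one below; the flag spellings are inferred from the attribute
# names rather than taken from parse_args() itself, so treat this as illustrative:
#
#   python main.py --model gpt2 --model_args device=cuda:0 \
#       --tasks lambada,hellaswag --num_fewshot 0 --output_path results.json
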
def main():
    random.seed(42)
    np.random.seed(42)

    lm = models.get_model(model).create_from_arg_string(model_args)

    if limit:
        print(
            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

    if not no_cache:
        lm = base.CachingLM(
            lm, 'lm_cache/' + model + '_' +
            model_args.replace('=', '-').replace(',', '_') + '.db')

    task_dict = tasks.get_task_dict([task])

    for desc in fewshot_descriptions:
        custom_task_dict = {
            k: CustomDescTask(v, desc)
            for k, v in task_dict.items()
        }

        results = evaluator.evaluate(lm, custom_task_dict, True, num_fewshot,
                                     limit)

        dumped = json.dumps(results, indent=2)
        print('Description:', desc)
        print(dumped)

        # MAKE TABLE
        from pytablewriter import MarkdownTableWriter

        writer = MarkdownTableWriter()
        writer.headers = ["Task", "Metric", "Value"]

        values = []
        for k, dic in results.items():
            for m, v in dic.items():
                values.append([k, m, '%.4f' % v])
                k = ""
        writer.value_matrix = values

        print(writer.dumps())
def run_eval(self, eval_tasks=None):
    was_training = self.model.training
    self.model.eval()
    in_micro_batches = self.model.micro_batches  # store input microbatches - we need to set to 1 during eval
    self.model.micro_batches = 1
    if eval_tasks is None:
        eval_tasks = ["lambada", "piqa", "hellaswag", "winogrande", "mathqa", "pubmedqa"]
    results = evaluator.evaluate(lm=self,
                                 task_dict=tasks.get_task_dict(eval_tasks),
                                 provide_description=False,
                                 num_fewshot=0,
                                 limit=None,
                                 bootstrap_iters=2).get('results')
    if was_training:
        self.model.train()
    self.model.micro_batches = in_micro_batches
    return results
def main():
    args = parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)

    lm = models.get_model(args.model).create_from_arg_string(args.model_args)

    if args.cache:
        lm = base.CachingLM(
            lm, 'lm_cache/' + args.model + '_' +
            args.model_args.replace('=', '-').replace(',', '_') + '.db')

    if args.tasks == "all_tasks":
        task_names = tasks.ALL_TASKS
    else:
        task_names = args.tasks.split(",")

    task_dict = tasks.get_task_dict(task_names)

    results = evaluator.evaluate(lm, task_dict, args.provide_description,
                                 args.num_fewshot, args.limit)

    dumped = json.dumps(results, indent=2)
    print(dumped)

    if args.output_path:
        with open(args.output_path, "w") as f:
            f.write(dumped)
t = build_model(params, tpu_name, region, preemptible)

adaptor = EvalHarnessAdaptor(t, seq, total_batch, shrink=pe != "fixed")

step, aux = t.load(bucket, model_dir)
t.move()

results = evaluator.evaluate(
    adaptor,
    tasks.get_task_dict([
        "lambada",
        "piqa",
        "hellaswag",
        "winogrande",
        "mathqa",
        "pubmedqa",
        # "boolq",
        # "cb",
        # "copa",
        # "multirc",
        # "record",
        # "wic",
        # "wsc",
    ]),
    False,
    0,
    None)
dumped = json.dumps(results, indent=2)
print(dumped)

results = evaluator.evaluate(adaptor, tasks.get_task_dict([
    "lambada_cloze",
def main():
    args = parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)

    lm = models.get_model(args.model).create_from_arg_string(
        args.model_args, {
            'batch_size': args.batch_size,
            'device': args.device
        })

    if args.limit:
        print(
            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

    if not args.no_cache:
        lm = base.CachingLM(
            lm, 'lm_cache/' + args.model + '_' + args.model_args.replace(
                '=', '-').replace(',', '_').replace('/', '-') + '.db')

    if args.tasks == "all_tasks":
        task_names = tasks.ALL_TASKS
    else:
        task_names = args.tasks.split(",")

    task_dict = tasks.get_task_dict(task_names)

    results = evaluator.evaluate(lm, task_dict, args.provide_description,
                                 args.num_fewshot, args.limit)

    dumped = json.dumps(results, indent=2)
    print(dumped)

    if args.output_path:
        with open(args.output_path, "w") as f:
            f.write(dumped)

    # MAKE TABLE
    from pytablewriter import MarkdownTableWriter, LatexTableWriter

    md_writer = MarkdownTableWriter()
    latex_writer = LatexTableWriter()
    md_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]
    latex_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]

    values = []
    for k, dic in results["results"].items():
        version = results["versions"][k]
        for m, v in dic.items():
            if m.endswith("_stderr"):
                continue

            if m + "_stderr" in dic:
                se = dic[m + "_stderr"]
                values.append([k, version, m, '%.4f' % v, '±', '%.4f' % se])
            else:
                values.append([k, version, m, '%.4f' % v, '', ''])
            k = ""
            version = ""
    md_writer.value_matrix = values
    latex_writer.value_matrix = values

    # todo: make latex table look good
    # print(latex_writer.dumps())

    print(
        f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}"
    )
    print(md_writer.dumps())
def run_eval(
    self,
    eval_tasks=None,
    num_fewshot=0,
    bootstrap_iters=2,
    description_dict=None,
    use_cache=True,
    name="neox",
    limit=None,
):
    was_training = self.model.training
    self.model.eval()
    in_micro_batches = (
        self.model.micro_batches
    )  # store input microbatches - we need to set to 1 during eval, but want to return to its original value after
    self.model.micro_batches = 1
    if eval_tasks is None:
        eval_tasks = [
            "lambada",
            "piqa",
            "hellaswag",
            "winogrande",
            "mathqa",
            "pubmedqa",
        ]

    # **HACK INCOMING**:
    # first get task dict on local main rank
    # the tasks are downloaded *as they are initialized*, and the downloads don't like multithreading.
    # so we download them once on the local main rank, wait, and then initialize them on all other ranks, which *should* load from the cache.
    if self.is_local_main:
        task_dict = tasks.get_task_dict(eval_tasks)
    # torch barrier
    if torch.distributed.is_initialized():
        torch.distributed.barrier()
    task_dict = tasks.get_task_dict(eval_tasks)

    lm = self

    if use_cache:
        # TODO(jon-tow): Append a subset of `neox_args` to the cache database
        # name arg to distinguish model runs that use different configurations.
        lm = base.CachingLM(lm, 'lm_cache/' + name + '.db')

    results = evaluator.evaluate(
        lm=lm,
        task_dict=task_dict,
        description_dict=description_dict,
        num_fewshot=num_fewshot,
        limit=limit,
        bootstrap_iters=bootstrap_iters,
    )

    results["config"] = {
        "model": name,
        "model_args": dataclasses.asdict(self.neox_args),
        "num_fewshot": num_fewshot,
        "batch_size": self.batch_size,
        "device": str(self.device),
        "no_cache": not use_cache,
        "limit": limit,
        "bootstrap_iters": bootstrap_iters,
        "description_dict": description_dict,
    }

    if was_training:
        self.model.train()
    self.model.micro_batches = in_micro_batches
    return results
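
# Assuming `trainer` is an instance of the class that defines run_eval above, a call
# could look like the following; the argument values are illustrative only, and the
# exact keys inside `results` depend on the evaluation-harness version in use:
#
#   results = trainer.run_eval(
#       eval_tasks=["lambada", "piqa"],
#       num_fewshot=0,
#       use_cache=False,
#       limit=100,
#   )
#   print(json.dumps(results, indent=2))
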
print(f"step {step} done") if step % val_every == 0: for name, val_set in val_sets.items(): val_loss = [] for i, _ in tqdm( zip(val_set.sample_once(), range(val_batches)), desc=f"validation for step {step}, set {name}", total=val_batches): val_loss.append(t.eval(i)) val_loss = np.array(val_loss).mean() print( f"validation loss for step {step}, set {name}: {val_loss}") wandb.log({f'val/loss_{name}': float(val_loss)}, step) results = evaluator.evaluate(adaptor, eval_task_dict, False, 0, None) flat_results = {} for task_name, task_res in results.items(): for metric_name, metric_res in task_res.items(): flat_results[f"{task_name}/{metric_name}"] = float( metric_res) dumped = json.dumps(results, indent=2) print(f"step {step} val results: {dumped}") wandb.log(flat_results, step) step += 1