def test_evaluate_from_args(self): kebab_args = ["evaluate", str(self.FIXTURES_ROOT / "bidaf" / "serialization" / "model.tar.gz"), str(self.FIXTURES_ROOT / "data" / "squad.json"), "--cuda-device", "-1"] args = self.parser.parse_args(kebab_args) metrics = evaluate_from_args(args) assert metrics.keys() == {'span_acc', 'end_acc', 'start_acc', 'em', 'f1', 'loss'}
def test_evaluate_from_args(self): kebab_args = ["evaluate", "tests/fixtures/bidaf/serialization/model.tar.gz", "--evaluation-data-file", "tests/fixtures/data/squad.json", "--cuda-device", "-1"] args = self.parser.parse_args(kebab_args) metrics = evaluate_from_args(args) assert metrics.keys() == {'span_acc', 'end_acc', 'start_acc', 'em', 'f1'}
def test_evaluate_from_args(self): kebab_args = ["evaluate", str(self.FIXTURES_ROOT / "bidaf" / "serialization" / "model.tar.gz"), str(self.FIXTURES_ROOT / "data" / "squad.json"), "--cuda-device", "-1"] args = self.parser.parse_args(kebab_args) metrics = evaluate_from_args(args) assert metrics.keys() == {'span_acc', 'end_acc', 'start_acc', 'em', 'f1'}
def evaluate(config, serialization_dir, bert_path, trainable):
    if os.path.exists(serialization_dir):
        print(f"{serialization_dir} exists, removing...")
        shutil.rmtree(serialization_dir)

    print("#" * 40)
    print("# Training")
    print("#" * 40)
    os.environ["BERT_DIMS"] = str(BertModel.from_pretrained(bert_path).config.hidden_size)
    os.environ["BERT_PATH"] = bert_path
    os.environ["TRAINABLE"] = str(int(trainable))
    train_model_from_file(config, serialization_dir)

    print("#" * 40)
    print("# Evaluating")
    print("#" * 40)
    args = eval_args(serialization_dir)
    evaluate_from_args(args)
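eval_args above is a project-specific helper that is not shown in these snippets. Purely as an illustration of the kind of namespace evaluate_from_args consumes, here is a minimal sketch with a hypothetical name (build_eval_args) and placeholder paths; the field names mirror the restore_and_evaluate snippet further down, and nothing here claims to be that project's actual helper.

import argparse
import os


def build_eval_args(serialization_dir):
    # Hypothetical stand-in for eval_args: fills the fields evaluate_from_args reads.
    # Field names follow the restore_and_evaluate snippet below; values are placeholders.
    args = argparse.Namespace()
    args.archive_file = os.path.join(serialization_dir, "model.tar.gz")
    args.input_file = "data/dev.json"  # assumed evaluation data path
    args.weights_file = None
    args.cuda_device = -1
    args.overrides = ""
    args.embedding_sources_mapping = ""
    args.extend_vocab = False
    args.batch_size = None
    args.batch_weight_key = None
    args.file_friendly_logging = False
    args.output_file = os.path.join(serialization_dir, "metrics.json")
    args.predictions_output_file = None
    return args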
def test_output_file_evaluate_from_args(self):
    output_file = str(self.TEST_DIR / "metrics.json")
    kebab_args = ["evaluate",
                  str(self.FIXTURES_ROOT / "bidaf" / "serialization" / "model.tar.gz"),
                  str(self.FIXTURES_ROOT / "data" / "squad.json"),
                  "--cuda-device", "-1",
                  "--output-file", output_file]
    args = self.parser.parse_args(kebab_args)
    computed_metrics = evaluate_from_args(args)
    with open(output_file, 'r') as file:
        saved_metrics = json.load(file)
    assert computed_metrics == saved_metrics
def test_evaluate_from_args(self): kebab_args = [ "evaluate", "tests/fixtures/bidaf/serialization/model.tar.gz", "--evaluation-data-file", "tests/fixtures/data/squad.json", "--cuda-device", "-1" ] args = self.parser.parse_args(kebab_args) metrics = evaluate_from_args(args) assert metrics.keys() == { 'span_acc', 'end_acc', 'start_acc', 'em', 'f1' }
def test_evaluate_from_args(self): kebab_args = [ u"evaluate", unicode(self.FIXTURES_ROOT / u"bidaf" / u"serialization" / u"model.tar.gz"), unicode(self.FIXTURES_ROOT / u"data" / u"squad.json"), u"--cuda-device", u"-1" ] args = self.parser.parse_args(kebab_args) metrics = evaluate_from_args(args) assert list(metrics.keys()) == set( [u'span_acc', u'end_acc', u'start_acc', u'em', u'f1'])
def test_evaluate_from_args(self):
    parser = argparse.ArgumentParser(description="Testing")
    subparsers = parser.add_subparsers(title='Commands', metavar='')
    add_subparser(subparsers)

    raw_args = ["evaluate",
                "--config_file", "tests/fixtures/bidaf/experiment.json",
                "--evaluation_data_file", "tests/fixtures/data/squad.json"]
    args = parser.parse_args(raw_args)
    metrics = evaluate_from_args(args)
    assert metrics == {'full_span_acc': 0.0, 'span_end_acc': 0.0, 'span_start_acc': 0.0}
def restore_and_evaluate(self) -> Dict[str, Any]:
    allennlp_args = argparse.Namespace()
    allennlp_args.file_friendly_logging = False
    allennlp_args.archive_file = "result/model.tar.gz"
    allennlp_args.weights_file = None
    allennlp_args.cuda_device = -1
    allennlp_args.overrides = ""
    allennlp_args.input_file = "data/movie_review/test.tsv"
    allennlp_args.embedding_sources_mapping = ""
    allennlp_args.extend_vocab = False
    allennlp_args.batch_size = None
    allennlp_args.batch_weight_key = None
    allennlp_args.output_file = "evaluation/evaluation"
    allennlp_args.predictions_output_file = "evaluation/pred"

    metric = evaluate_from_args(allennlp_args)
    # The signature promises Dict[str, Any], so return the metrics dict.
    return metric
def test_output_file_evaluate_from_args(self):
    output_file = str(self.TEST_DIR / "metrics.json")
    kebab_args = [
        "evaluate",
        str(self.FIXTURES_ROOT / "simple_tagger_with_span_f1" / "serialization" / "model.tar.gz"),
        str(self.FIXTURES_ROOT / "data" / "conll2003.txt"),
        "--cuda-device", "-1",
        "--output-file", output_file,
    ]
    args = self.parser.parse_args(kebab_args)
    computed_metrics = evaluate_from_args(args)
    with open(output_file, "r") as file:
        saved_metrics = json.load(file)
    assert computed_metrics == saved_metrics
def test_evaluate_from_args(self):
    parser = argparse.ArgumentParser(description="Testing")
    subparsers = parser.add_subparsers(title='Commands', metavar='')
    add_subparser(subparsers)

    raw_args = [
        "evaluate",
        "--archive_file", "tests/fixtures/bidaf/serialization/model.tar.gz",
        "--evaluation_data_file", "tests/fixtures/data/squad.json"
    ]
    args = parser.parse_args(raw_args)
    metrics = evaluate_from_args(args)
    assert metrics.keys() == {
        'span_acc', 'end_acc', 'start_acc', 'em', 'f1'
    }
def test_evaluate_from_args(self):
    parser = argparse.ArgumentParser(description="Testing")
    subparsers = parser.add_subparsers(title='Commands', metavar='')
    Evaluate().add_subparser('evaluate', subparsers)

    snake_args = ["evaluate",
                  "--archive_file", "tests/fixtures/bidaf/serialization/model.tar.gz",
                  "--evaluation_data_file", "tests/fixtures/data/squad.json",
                  "--cuda_device", "-1"]
    kebab_args = ["evaluate",
                  "--archive-file", "tests/fixtures/bidaf/serialization/model.tar.gz",
                  "--evaluation-data-file", "tests/fixtures/data/squad.json",
                  "--cuda-device", "-1"]

    for raw_args in [snake_args, kebab_args]:
        args = parser.parse_args(raw_args)
        metrics = evaluate_from_args(args)
        assert metrics.keys() == {'span_acc', 'end_acc', 'start_acc', 'em', 'f1'}
def test_evaluate_from_args(self): kebab_args = [ "evaluate", str(self.FIXTURES_ROOT / "simple_tagger_with_span_f1" / "serialization" / "model.tar.gz"), str(self.FIXTURES_ROOT / "data" / "conll2003.txt"), "--cuda-device", "-1", ] args = self.parser.parse_args(kebab_args) metrics = evaluate_from_args(args) assert metrics.keys() == { "accuracy", "accuracy3", "precision-overall", "recall-overall", "f1-measure-overall", "loss", }
def test_evaluate_from_args(self):
    snake_args = [
        "evaluate",
        "--archive_file", "tests/fixtures/bidaf/serialization/model.tar.gz",
        "--evaluation_data_file", "tests/fixtures/data/squad.json",
        "--cuda_device", "-1"
    ]
    kebab_args = [
        "evaluate",
        "--archive-file", "tests/fixtures/bidaf/serialization/model.tar.gz",
        "--evaluation-data-file", "tests/fixtures/data/squad.json",
        "--cuda-device", "-1"
    ]

    for raw_args in [snake_args, kebab_args]:
        args = self.parser.parse_args(raw_args)
        metrics = evaluate_from_args(args)
        assert metrics.keys() == {
            'span_acc', 'end_acc', 'start_acc', 'em', 'f1'
        }
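The self.parser used by several of the tests above is created in the test class's setup, which these snippets omit. A minimal sketch of that setup follows, assuming the inline-parser pattern visible in the other snippets; both the add_subparser signature (one argument in recent AllenNLP releases, a name plus the subparsers object in older ones, as in a snippet above) and the setup hook name (setup_method vs. setUp) depend on the AllenNLP version.

import argparse

from allennlp.commands.evaluate import Evaluate
from allennlp.common.testing import AllenNlpTestCase


class TestEvaluate(AllenNlpTestCase):
    # AllenNlpTestCase supplies FIXTURES_ROOT and TEST_DIR used by the tests above.
    def setup_method(self):
        super().setup_method()
        self.parser = argparse.ArgumentParser(description="Testing")
        subparsers = self.parser.add_subparsers(title="Commands", metavar="")
        # Single-argument form from recent AllenNLP releases; older releases
        # use Evaluate().add_subparser("evaluate", subparsers) instead.
        Evaluate().add_subparser(subparsers)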
def generate_probs(model_dir, inp_fp, weights_fp, type_, out_fp, out_ext, cuda_device,
                   overwrite, batch_size, extraction_ratio, hparams):
    import_submodules('imojie')

    if out_fp == None:
        inp_base = os.path.basename(inp_fp)
        inp_dir = os.path.basename(os.path.dirname(inp_fp))
        eval_dir = os.path.basename(os.path.dirname(os.path.dirname(inp_fp)))  # train, dev or test
        inp_base = inp_base.replace('extractions', eval_dir)
        prob_dir = model_dir + '/prob'
        os.makedirs(prob_dir, exist_ok=True)
        out_fp = prob_dir + '/' + inp_base + '.' + inp_dir + '.' + out_ext
    else:
        os.makedirs(os.path.dirname(out_fp), exist_ok=True)

    if os.path.exists(out_fp) and not overwrite:
        print('found ', out_fp)
        return

    args = argparse.Namespace()
    args.archive_file = model_dir
    args.cuda_device = cuda_device
    args.embedding_sources_mapping = {}
    args.extend_vocab = None
    args.batch_weight_key = ''
    args.output_file = ''
    args.overrides = "{'model': {'token_based_metric': null}, 'iterator': {'batch_size': " + str(batch_size) + ", \
        'instances_per_epoch': null}, 'trainer':{'num_epochs':1}, 'dataset_reader': {'max_tokens': 10000, \
        'gradients': false, 'max_extractions': 30, 'extraction_ratio': " + str(extraction_ratio) + ", 'probability': true \
        }, 'validation_dataset_reader': null}"
    args.weights_file = weights_fp
    args.input_file = inp_fp

    probs = evaluate_from_args(args)

    probsD = dict()
    # For some reason the last batch results are repeated in the probs
    # Not an issue as they are just overwritten while forming the probsD
    for i in range(len(probs['example_ids'])):
        probsD[probs['example_ids'][i]] = probs['probs'][i]

    lines = open(inp_fp).readlines()
    all_fields = []
    for line_number, line in enumerate(lines):
        line = line.strip('\n')
        fields = line.split('\t')
        if line_number not in probsD:
            # the example is too large and rejected by dataloader ('max_tokens' argument)
            continue
        # Removing appended extractions after reranking
        fields[0] = fields[0].split('[SEP]')[0].strip()
        fields[2] = str(probsD[line_number])
        all_fields.append('\t'.join(fields))

    # if type_ == 'single':
    #     all_fields = []
    #     for line_number, line in enumerate(lines):
    #         line = line.strip('\n')
    #         fields = line.split('\t')
    #         if line_number not in probsD:  # the example is too large and rejected by dataloader ('max_tokens' argument)
    #             continue
    #         # Removing appended extractions after reranking
    #         fields[0] = fields[0].split('[SEP]')[0].strip()
    #         fields[2] = str(probsD[line_number])
    #         all_fields.append('\t'.join(fields))
    # elif type_ == 'append':
    #     all_fields = []
    #     all_examples = dict()
    #     extractions = []
    #     lines = lines + ['']
    #     for line_number, line in enumerate(lines):
    #         line = line.strip('\n')
    #         if line_number != len(lines)-1:
    #             sentence, extraction, confidence = line.split('\t')
    #         else:
    #             sentence, extraction, confidence = '', '', 1
    #         if line_number == 0:
    #             old_sentence = sentence
    #         if line_number == len(lines)-1 or sentence != old_sentence:
    #             # if line_number == len(lines)-1:
    #             #     extractions.append(extraction)
    #             #     old_sentence = sentence
    #             all_examples[line_number-1] = [old_sentence, extractions]
    #             old_sentence = sentence
    #             extractions = []
    #         extractions.append(extraction)
    #     for line_number in probsD:
    #         # assert line_number in all_examples
    #         if line_number not in all_examples:
    #             continue
    #         sentence, extractions = all_examples[line_number]
    #         for ext_num, extraction in enumerate(extractions):
    #             confidence = probsD[line_number][ext_num].item()
    #             out = sentence+'\t'+extraction+'\t'+str(confidence)
    #             all_fields.append(out)

    # sorting all_fields according to the confidences assigned by bert_encoder
    all_fields_sorted = []
    prev_sent = None
    exts = []
    for f in all_fields:
        sent = f.split('\t')[0]
        if sent != prev_sent:
            if prev_sent != None:
                exts = sorted(exts, reverse=True, key=lambda x: float(x.split('\t')[2]))
                if hparams.topk != None:
                    exts = exts[:hparams.topk]
                all_fields_sorted.extend(exts)
            prev_sent = sent
            exts = [f]
        else:
            exts.append(f)
    exts = sorted(exts, reverse=True, key=lambda x: float(x.split('\t')[2]))
    all_fields_sorted.extend(exts)

    open(out_fp, 'w').write('\n'.join(all_fields_sorted))
    print('Probabilities written to: ', out_fp)
    return
def generate_probs(model_dir, inp_fp, weights_fp, topk, out_ext, cuda_device,
                   overwrite, batch_size, extraction_ratio, out):
    import_submodules('imojie')

    args = argparse.Namespace()
    args.archive_file = model_dir
    args.cuda_device = cuda_device
    args.embedding_sources_mapping = {}
    args.extend_vocab = None
    args.batch_weight_key = ''
    args.output_file = ''
    args.overrides = "{'model': {'token_based_metric': null}, 'iterator': {'batch_size': " + str(batch_size) + ", \
        'instances_per_epoch': null}, 'trainer':{'num_epochs':1}, 'dataset_reader': {'max_tokens': 10000, \
        'gradients': false, 'max_extractions': 30, 'extraction_ratio': " + str(extraction_ratio) + ", 'probability': true \
        }, 'validation_dataset_reader': null}"
    args.weights_file = weights_fp
    args.input_file = inp_fp

    probs = evaluate_from_args(args)

    probsD = dict()
    # For some reason the last batch results are repeated in the probs
    # Not an issue as they are just overwritten while forming the probsD
    for i in range(len(probs['example_ids'])):
        probsD[probs['example_ids'][i]] = probs['probs'][i]

    lines = open(inp_fp).readlines()
    all_fields = []
    for line_number, line in enumerate(lines):
        line = line.strip('\n')
        fields = line.split('\t')
        if line_number not in probsD:
            # the example is too large and rejected by dataloader ('max_tokens' argument)
            continue
        # Removing appended extractions after reranking
        fields[0] = fields[0].split('[SEP]')[0].strip()
        fields[2] = str(probsD[line_number])
        fields.append(str(line_number))
        all_fields.append('\t'.join(fields))

    if topk == None:
        return all_fields
    else:
        # sorting all_fields according to the confidences assigned by bert_encoder
        all_fields_sorted = []
        prev_sent = None
        exts = []
        for f in all_fields:
            sent = f.split('\t')[0]
            if sent != prev_sent:
                if prev_sent != None:
                    exts = toolz.unique(exts, key=lambda x: x.split('\t')[1])
                    exts = sorted(exts, reverse=True, key=lambda x: float(x.split('\t')[2]))
                    if topk != None:
                        exts = exts[:topk]
                    all_fields_sorted.extend(exts)
                prev_sent = sent
                exts = [f]
            else:
                exts.append(f)
        exts = sorted(exts, reverse=True, key=lambda x: float(x.split('\t')[2]))
        all_fields_sorted.extend(exts)

        open(out, 'w').write('\n'.join(all_fields_sorted))
        print('Probabilities written to: ', out)
        return all_fields_sorted
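Both generate_probs variants splice numbers into the overrides string by hand, which makes the quoting and null/true literals easy to break. A small sketch of an alternative, assuming the overrides field also accepts a strict-JSON string (the hand-written one above uses single-quoted, Jsonnet-style syntax): build the same structure as a dict and let json.dumps handle the serialization.

import json


def build_overrides(batch_size, extraction_ratio):
    # Same override structure as the concatenated string above, built as a dict so
    # json.dumps handles quoting, null/true/false literals, and number formatting.
    overrides = {
        "model": {"token_based_metric": None},
        "iterator": {"batch_size": batch_size, "instances_per_epoch": None},
        "trainer": {"num_epochs": 1},
        "dataset_reader": {
            "max_tokens": 10000,
            "gradients": False,
            "max_extractions": 30,
            "extraction_ratio": extraction_ratio,
            "probability": True,
        },
        "validation_dataset_reader": None,
    }
    return json.dumps(overrides)

With such a helper, args.overrides = build_overrides(batch_size, extraction_ratio) would replace the hand-built string.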