def test_evaluate(create1, create2, f1, units, errors):
    """Check that evaluating two passages does not corrupt either of them.

    Passages built by `create1`/`create2` are validated before and after
    `evaluate`; a passage that was clean before must stay clean after.
    Finally the scores object is checked against the expected F1.
    """
    first, second = create1(), create2()
    before = [list(validate(p, linkage=False)) for p in (first, second)]
    scores = evaluate(first, second, units=units, errors=errors)
    after = [list(validate(p, linkage=False)) for p in (first, second)]
    # Evaluation must not introduce validation errors into a clean passage.
    for pre, post in zip(before, after):
        if not pre:
            assert not post
    check_primary_remote(scores, f1)
def validate(passage, normalization=False, extra_normalization=False, ucca_validation=False, output_format=None, **kwargs):
    """Yield validation error messages for `passage`.

    :param passage: the passage to check (optionally normalized in place first)
    :param normalization: whether to run `normalize` before validating
    :param extra_normalization: passed through to `normalize` as `extra`
    :param ucca_validation: run UCCA-specific validations instead of the generic ones
    :param output_format: fallback format key when the passage carries none
    :param kwargs: ignored (accepted for interface compatibility)
    :raises ValueError: when no constraints are defined for the passage's format
    """
    del kwargs
    if normalization:
        normalize(passage, extra=extra_normalization)
    if ucca_validation:
        yield from ucca_validations.validate(passage)
        return
    # Generic validations depending on format-specific constraints
    try:
        constraints = CONSTRAINTS[passage.extra.get("format", output_format)]()
    except KeyError as e:
        raise ValueError("No validations defined for '%s' format" % output_format) from e
    yield from detect_cycles(passage)
    terminal_layer = passage.layer(layer0.LAYER_ID)
    foundational_layer = passage.layer(layer1.LAYER_ID)
    for term in terminal_layer.all:
        yield from check_orphan_terminals(constraints, term)
        yield from check_root_terminal_children(constraints, foundational_layer, term)
        yield from check_multiple_incoming(constraints, term)
    yield from check_top_level_allowed(constraints, foundational_layer)
    for unit in foundational_layer.all:
        yield from check_multigraph(constraints, unit)
        yield from check_implicit_children(constraints, unit)
        yield from check_multiple_incoming(constraints, unit)
        yield from check_top_level_only(constraints, foundational_layer, unit)
        yield from check_required_outgoing(constraints, unit)
        yield from check_tag_rules(constraints, unit)
def download_task(self, task_id, normalize=False, write=True, validate=None, binary=None, log=None, out_dir=None, prefix=None, by_external_id=False, verbose=False, write_valid_only=False, **kwargs):
    """Fetch a task, convert it to a passage, and optionally validate/write it.

    :param task_id: ID of the task to fetch via `get_user_task`
    :param normalize: whether to normalize the passage after conversion
    :param write: whether to write the passage to disk
    :param validate: optional open file handle to which validation errors are logged
    :param binary: passed through to `write_passage`
    :param log: optional open file handle for a one-line progress record
    :param out_dir: output directory for `write_passage`
    :param prefix: filename prefix for `write_passage`
    :param by_external_id: passed through to `from_json`
    :param verbose: passed through to `write_passage`
    :param write_valid_only: skip writing when the passage has validation errors
    :param kwargs: ignored (accepted for interface compatibility)
    :return: tuple of (passage, task_id, user_id)
    :raises ValueError: when the task JSON cannot be read or normalization fails
    """
    del kwargs
    task = self.get_user_task(task_id)
    user_id = task["user"]["id"]
    try:
        passage = from_json(task, by_external_id=by_external_id)
    except ValueError as e:
        raise ValueError("Failed reading json for task %s:\n%s" % (task_id, json.dumps(task))) from e
    if normalize:
        try:
            normalization.normalize(passage)
        except AssertionError as e:
            raise ValueError("Failed normalizing task %s:\n%s" % (task_id, json.dumps(task))) from e
    if log:
        print(passage.ID, task_id, user_id, task["user_comment"], task["created_at"], task["updated_at"],
              file=log, sep="\t", flush=True)
    ret = passage, task_id, user_id
    if validate or write_valid_only:
        validation_errors = list(validation.validate(passage, linkage=False))
        if validate:
            # Log every error (previously, combining `validate` with
            # `write_valid_only` logged only the first error before returning).
            for error in validation_errors:
                print(passage.ID, task_id, user_id, error, file=validate, sep="\t", flush=True)
        if write_valid_only and validation_errors:
            return ret  # invalid passage: skip writing
    if write:
        write_passage(passage, binary=binary, outdir=out_dir, prefix=prefix, verbose=verbose)
    return ret
def submit_tasks(self, filename, log_file, **kwargs):
    """Submit every annotation/review task whose ID is listed in `filename`.

    Each task is fetched, converted to a passage, normalized and validated;
    only tasks with no validation errors are submitted. Outcomes are written
    to `log_file` as tab-separated lines.

    :param filename: path of a text file with one task ID per line
    :param log_file: path of the log file to create (opened in "w" mode)
    :param kwargs: ignored (accepted for interface compatibility)
    :raises ValueError: when a task's JSON cannot be read
    """
    del kwargs
    # Open the log with a context manager so the handle is always closed
    # (the original code leaked the file object).
    with open(log_file, "w") as log:
        with open(filename) as f:
            task_ids = list(f)
        for task_id in task_ids:
            try:
                task_id = task_id.strip()
                task = self.get_user_task(int(task_id))
                if task["type"] not in ("ANNOTATION", "REVIEW"):
                    print(task_id, "NOT AN ANNOTATION/REVIEW TASK", file=log, sep="\t", flush=True)
                    continue
                try:
                    # from_json may produce several passages; only the first is used
                    passage = next(iter(convert.from_json(task)))
                except ValueError as e:
                    raise ValueError("Failed reading json for task %s:\n%s" % (task_id, json.dumps(task))) from e
                # validate the task
                normalization.normalize(passage)
                validation_errors = list(validation.validate(passage, linkage=False))
                if not validation_errors:
                    self.submit_task(**task)
                    print(task_id, "SUBMITTED", file=log, sep="\t", flush=True)
                else:
                    for error in validation_errors:
                        print(task_id, error, file=log, sep="\t", flush=True)
            except requests.exceptions.HTTPError as e:
                print(task_id, "HTTP Request Error: " + str(e), file=log, sep="\t", flush=True)
def submit_tasks(self, filename, log_file, **kwargs):
    """Submit every annotation/review task whose ID is listed in `filename`.

    Each task is fetched, converted to a passage, normalized and validated;
    only tasks with no validation errors are submitted. Outcomes are written
    to `log_file` as tab-separated lines.

    :param filename: path of a text file with one task ID per line
    :param log_file: path of the log file to create (opened in "w" mode)
    :param kwargs: ignored (accepted for interface compatibility)
    :raises ValueError: when a task's JSON cannot be read
    """
    del kwargs
    # Open the log with a context manager so the handle is always closed
    # (the original code leaked the file object).
    with open(log_file, "w") as log:
        with open(filename) as f:
            task_ids = list(f)
        for task_id in task_ids:
            try:
                task_id = task_id.strip()
                task = self.get_user_task(int(task_id))
                if task["type"] not in ("ANNOTATION", "REVIEW"):
                    print(task_id, "NOT AN ANNOTATION/REVIEW TASK", file=log, sep="\t", flush=True)
                    continue
                try:
                    # NOTE(review): a sibling version of this method takes
                    # next(iter(convert.from_json(task))) here — confirm whether
                    # from_json returns a passage or an iterable of passages.
                    passage = convert.from_json(task)
                except ValueError as e:
                    raise ValueError("Failed reading json for task %s:\n%s" % (task_id, json.dumps(task))) from e
                # validate the task
                normalization.normalize(passage)
                validation_errors = list(validation.validate(passage, linkage=False))
                if not validation_errors:
                    self.submit_task(**task)
                    print(task_id, "SUBMITTED", file=log, sep="\t", flush=True)
                else:
                    for error in validation_errors:
                        print(task_id, error, file=log, sep="\t", flush=True)
            except requests.exceptions.HTTPError as e:
                print(task_id, "HTTP Request Error: " + str(e), file=log, sep="\t", flush=True)
def test_evaluate_self(create, valid):
    """A passage expected to be valid must validate cleanly, and vice versa."""
    passage = create()
    problems = list(validate(passage))
    if valid:
        assert not problems, passage
    else:
        assert problems, passage
def validate_passage(self, passage):
    """Optionally normalize, then validate a single passage.

    :param passage: the passage to process (normalized in place if configured)
    :return: tuple of (passage ID, list of validation errors)
    """
    if self.normalization:
        normalize(passage, extra=self.extra)
    found = list(validate(passage, linkage=self.linkage))
    # In strict mode, report the errors immediately as well as returning them.
    if self.strict:
        print_errors(passage.ID, found)
    return passage.ID, found
def main(args):
    """Validate all passages in `args.filenames`; print errors and exit 1 if any."""
    per_passage = ((p.ID, list(validate(p)))
                   for p in get_passages_with_progress_bar(args.filenames, desc="Validating"))
    failed = {pid: errs for pid, errs in per_passage if errs}
    if not failed:
        print("No errors found.")
        return
    # Align the error column by padding to the widest passage ID.
    width = max(map(len, failed))
    for pid, errs in sorted(failed.items()):
        for index, err in enumerate(errs):
            # Print the passage ID only on its first error line.
            print("%-*s|%s" % (width, pid if index == 0 else "", err))
    sys.exit(1)
def validate_passage(self, passage):
    """Optionally normalize, then validate a single passage.

    The returned identifier is the passage ID, extended with the user ID and
    annotation (task) ID when those attributes are present.

    :param passage: the passage to process (normalized in place if configured)
    :return: tuple of (display identifier, list of validation errors)
    """
    if self.normalization:
        normalize(passage, extra=self.extra)
    found = list(validate(passage, linkage=self.linkage, multigraph=self.multigraph))
    # Assemble the display identifier from whichever IDs are available.
    id_parts = [passage.ID]
    for key in ("userID", "annotationID"):
        value = passage.attrib.get(key)
        if value:
            id_parts.append(value)
    display_id = " ".join(id_parts)
    # In strict mode, report the errors immediately as well as returning them.
    if self.strict:
        print_errors(display_id, found)
    return display_id, found