def _ucca_parse_text(text, output_dir, filename, clean, normalize_sentence, model_path):
    # text = [normalize_sentence(x) for x in text]
    # text = from_text(text, split=True, one_per_line=True)
    # text = list(text)
    # Bypass the UCCA tokenizer: each sentence is pre-split on spaces.
    text = [next(from_text(normalize_sentence(val).split(' '), passage_id=idx, tokenized=True))
            for idx, val in enumerate(text)]
    # print(text)
    parser = get_parser(model_path)
    out_location = os.path.dirname(parse_location(output_dir, filename, 0))
    if not os.path.isdir(out_location):
        os.makedirs(out_location)
    for i, (passage, *_) in enumerate(parser.parse(text)):
        passage2file(passage, parse_location(output_dir, filename, i))
    # Create an empty marker file announcing that parsing finished successfully.
    parsed_file = os.path.join(out_location, PARSED_FILE)
    with open(parsed_file, "w") as _:
        pass
    if clean:
        # Remove intermediate .txt files from the output directory.
        for name in os.listdir(output_dir):
            if name.endswith(".txt"):
                os.remove(os.path.join(output_dir, name))
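# Note: _ucca_parse_text above (and its variant further below) relies on helpers that are
# not defined in this file: get_parser, parse_location and PARSED_FILE. The sketch below is
# only an assumption about the contract they provide, not the actual implementation:
# parse_location(output_dir, filename, i) returns the path for the i-th parsed passage, and
# PARSED_FILE is the name of a marker file written next to the parses.
import os

PARSED_FILE = "PARSED"  # hypothetical marker-file name


def parse_location(output_dir, filename, index):
    # Hypothetical layout: one sub-directory per input document, one XML file per passage.
    return os.path.join(output_dir, filename, "%s_%d.xml" % (filename, index))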
def main():
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument('filenames', nargs='+', help="passage file names to convert")
    argparser.add_argument('-o', '--outdir', default='.', help="output directory")
    argparser.add_argument('-p', '--prefix', default='', help="output filename prefix")
    argparser.add_argument('-r', '--remarks', action='store_true', help="annotate original IDs")
    argparser.add_argument("-b", "--binary", action="store_true",
                           help="write in pickle binary format (.pickle)")
    args = argparser.parse_args()
    for filename in args.filenames:
        passage = file2passage(filename)
        sentences = ucca.convert.split2sentences(passage, remarks=args.remarks)
        for i, sentence in enumerate(sentences):
            outfile = "%s/%s.%s" % (args.outdir, args.prefix + sentence.ID,
                                    "pickle" if args.binary else "xml")
            sys.stderr.write("Writing passage file for sentence '%s'...\n" % outfile)
            passage2file(sentence, outfile, args.binary)
    sys.exit(0)
def main(args):
    splitter = Splitter.read_file(args.sentences, enum=args.enumerate,
                                  suffix_format=args.suffix_format, suffix_start=args.suffix_start)
    os.makedirs(args.outdir, exist_ok=True)
    i = 0
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        for sentence in splitter.split(passage) if splitter else split2sentences(
                passage, remarks=args.remarks, lang=args.lang,
                ids=map(str, count(i)) if args.enumerate else None):
            i += 1
            outfile = os.path.join(args.outdir,
                                   args.prefix + sentence.ID + (".pickle" if args.binary else ".xml"))
            if args.verbose:
                with external_write_mode():
                    print("Writing passage file for sentence '%s'..." % outfile, file=sys.stderr)
            if args.normalize:
                normalize(sentence)
            passage2file(sentence, outfile, binary=args.binary)
    if splitter and len(splitter.matched_indices) < len(splitter.sentences):
        print("Unmatched sentences:",
              *[s for i, s in enumerate(splitter.sentences) if i not in splitter.matched_indices],
              sep="\n")
def main():
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="file names to convert")
    argparser.add_argument("-f", "--format", choices=convert.CONVERTERS, help="input file format")
    argparser.add_argument("-o", "--outdir", default=".", help="output directory")
    argparser.add_argument("-p", "--prefix", default="", help="output filename prefix")
    argparser.add_argument("-b", "--binary", action="store_true",
                           help="write in pickle binary format (.pickle)")
    argparser.add_argument("-s", "--split", action="store_true",
                           help="split each sentence to its own passage")
    argparser.add_argument("-T", "--tree", action="store_true", help="currently unused")
    argparser.add_argument("-m", "--markaux", action="store_true",
                           help="mark auxiliary edges introduced on conversion")
    args = argparser.parse_args()
    for pattern in args.filenames:
        filenames = glob.glob(pattern)
        if not filenames:
            raise IOError("Not found: " + pattern)
        for filename in filenames:
            no_ext, ext = os.path.splitext(filename)
            basename = os.path.basename(no_ext)
            try:
                passage_id = re.search(r"\d+", basename).group(0)
            except AttributeError:
                passage_id = basename
            converter = convert.FROM_FORMAT.get(args.format or ext.lstrip("."))
            if converter is None:
                raise IOError("Unknown extension '%s'. Specify format using -f" % ext)
            with open(filename, encoding="utf-8") as f:
                for passage in converter(f, passage_id, args.split, args.markaux):
                    outfile = "%s/%s.%s" % (args.outdir, args.prefix + passage.ID,
                                            "pickle" if args.binary else "xml")
                    sys.stderr.write("Writing '%s'...\n" % outfile)
                    passage2file(passage, outfile, args.binary)
    sys.exit(0)
def main():
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="file names to convert and evaluate")
    add_verbose_argument(argparser, help="detailed evaluation output")
    argparser.add_argument("-o", "--outdir",
                           help="output directory (if unspecified, files are not written)")
    args = argparser.parse_args()
    scores = []
    for pattern in args.filenames:
        filenames = glob.glob(pattern)
        if not filenames:
            raise IOError("Not found: " + pattern)
        for filename in filenames:
            sys.stdout.write("\rConverting '%s'" % filename)
            if args.outdir:
                sys.stdout.write("\n")
            sys.stdout.flush()
            basename = os.path.basename(os.path.splitext(filename)[0])
            with open(filename, encoding="utf-8") as f:
                for passage, ref, amr_id in from_amr(f, passage_id=basename, return_amr=True):
                    if args.outdir:
                        outfile = "%s/%s.xml" % (args.outdir, passage.ID)
                        print("Writing '%s'..." % outfile, file=sys.stderr, flush=True)
                        ioutil.passage2file(passage, outfile)
                    try:
                        guessed = "\n".join(to_amr(passage, amr_id))
                    except Exception as e:
                        raise ValueError("Error converting %s back from AMR" % filename) from e
                    if args.outdir:
                        outfile = "%s/%s.txt" % (args.outdir, passage.ID)
                        print("Writing '%s'..." % outfile, file=sys.stderr, flush=True)
                        with open(outfile, "w", encoding="utf-8") as f_out:
                            print(str(guessed), file=f_out)
                    try:
                        s = evaluate(guessed, ref, verbose=args.verbose > 1)
                    except Exception as e:
                        raise ValueError("Error evaluating conversion of %s" % filename) from e
                    scores.append(s)
                    if args.verbose:
                        s.print(flush=True)
    print()
    if args.verbose and len(scores) > 1:
        print("Aggregated scores:")
        SmatchScores.aggregate(scores).print()
    sys.exit(0)
def main(args):
    for filename in args.filenames:
        print("Reading passage '%s'..." % filename, file=sys.stderr)
        passage = file2passage(filename)
        basename = os.path.splitext(os.path.basename(filename))[0]
        outfile = args.outdir + os.path.sep + basename + ".pickle"
        print("Writing file '%s'..." % outfile, file=sys.stderr)
        passage2file(passage, outfile, binary=True)
def main(args):
    for filename in args.filenames:
        sys.stderr.write("Reading passage '%s'...\n" % filename)
        passage = file2passage(filename)
        basename = os.path.splitext(os.path.basename(filename))[0]
        outfile = args.outdir + os.path.sep + basename + ".xml"
        sys.stderr.write("Writing file '%s'...\n" % outfile)
        passage2file(passage, outfile)
def diff_passages(true_passage, pred_passage):
    """
    Debug method to print missing or mistaken attributes, nodes and edges
    """
    lines = list()
    if not true_passage._attrib.equals(pred_passage._attrib):
        lines.append("Passage attributes mismatch: %s, %s" %
                     (true_passage._attrib, pred_passage._attrib))
    try:
        for lid, l1 in true_passage._layers.items():
            l2 = pred_passage.layer(lid)  # look up the layer with the same ID in the predicted passage
            if not l1._attrib.equals(l2._attrib):
                lines.append("Layer %s attributes mismatch: %s, %s" %
                             (lid, l1._attrib, l2._attrib))
    except KeyError:  # no layer with same ID found
        lines.append("Missing layer: %s, %s" %
                     (true_passage._layers, pred_passage._layers))
    pred_ids = {node.extra["remarks"]: node
                for node in pred_passage.missing_nodes(true_passage)}
    true_ids = {node.ID: node
                for node in true_passage.missing_nodes(pred_passage)}
    for pred_id, pred_node in list(pred_ids.items()):
        true_node = true_ids.get(pred_id)
        if true_node:
            pred_ids.pop(pred_id)
            true_ids.pop(pred_id)
            pred_edges = {edge.tag + "->" + edge.child.ID: edge
                          for edge in pred_node.missing_edges(true_node)}
            true_edges = {edge.tag + "->" + edge.child.ID: edge
                          for edge in true_node.missing_edges(pred_node)}
            intersection = set(pred_edges).intersection(set(true_edges))
            pred_edges = {s: edge for s, edge in pred_edges.items() if s not in intersection}
            true_edges = {s: edge for s, edge in true_edges.items() if s not in intersection}
            node_lines = []
            if not pred_node._attrib.equals(true_node._attrib):
                node_lines.append(" Attributes mismatch: %s, %s" %
                                  (sorted(true_node._attrib.items()),
                                   sorted(pred_node._attrib.items())))
            if pred_edges:
                node_lines.append(" Mistake edges: %s" % ", ".join(pred_edges))
            if true_edges:
                node_lines.append(" Missing edges: %s" % ", ".join(true_edges))
            if node_lines:
                lines.append("For node " + pred_id + ":")
                lines.extend(node_lines)
    if pred_ids:
        lines.append("Mistake nodes: %s" % ", ".join(pred_ids))
    if true_ids:
        lines.append("Missing nodes: %s" % ", ".join(true_ids))
    if lines:
        outfile = "%s.xml" % true_passage.ID
        sys.stderr.write("Writing passage '%s'...\n" % outfile)
        passage2file(true_passage, outfile)
        outfile = "%s_pred.xml" % pred_passage.ID
        sys.stderr.write("Writing passage '%s'...\n" % outfile)
        passage2file(pred_passage, outfile)
    return "\n" + "\n".join(lines)
def write_passage(passage, args):
    suffix = args.format or ("pickle" if args.binary else "xml")
    outfile = args.outdir + os.path.sep + args.prefix + passage.ID + "." + suffix
    print("Writing passage '%s'..." % outfile)
    if args.format is None:
        ioutil.passage2file(passage, outfile, binary=args.binary)
    else:
        converter = convert.TO_FORMAT[args.format]
        output = "\n".join(line for line in converter(passage))
        with open(outfile, "w") as f:
            f.write(output + "\n")
def main():
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="passage file names to join")
    argparser.add_argument("-o", "--outdir", default=".", help="output directory")
    argparser.add_argument("-p", "--prefix", default="", help="output filename prefix")
    argparser.add_argument("-r", "--remarks", action="store_true", help="annotate original IDs")
    argparser.add_argument("-b", "--binary", action="store_true",
                           help="write in pickle binary format (.pickle)")
    argparser.add_argument("-j", "--join-by-prefix", action="store_true",
                           help="join each set of passages whose IDs share all but the last 3 characters")
    args = argparser.parse_args()
    passages = [file2passage(filename)
                for pattern in args.filenames
                for filename in sorted(glob.glob(pattern))]
    if args.join_by_prefix:
        subsets = defaultdict(list)
        for passage in passages:
            subsets[passage.ID[:-3]].append(passage)
    else:
        subsets = {passages[0].ID: passages}
    for passage_id, subset in sorted(subsets.items()):
        sys.stderr.write("Joining passages " + ", ".join(passage.ID for passage in subset) + "\n")
        # Join only the passages in the current subset, not all input passages.
        joined = ucca.convert.join_passages(subset, passage_id=passage_id, remarks=args.remarks)
        outfile = "%s/%s.%s" % (args.outdir, args.prefix + joined.ID,
                                "pickle" if args.binary else "xml")
        sys.stderr.write("Writing joined passage file '%s'...\n" % outfile)
        passage2file(joined, outfile, args.binary)
    sys.exit(0)
def main(args):
    os.makedirs(args.outdir, exist_ok=True)
    for filename in tqdm(args.filenames, desc="Converting", unit=" passages"):
        if args.verbose:
            with external_write_mode():
                print("Reading passage '%s'..." % filename, file=sys.stderr)
        passage = file2passage(filename)
        basename = os.path.splitext(os.path.basename(filename))[0]
        outfile = args.outdir + os.path.sep + basename + ".pickle"
        if args.verbose:
            with external_write_mode():
                print("Writing file '%s'..." % outfile, file=sys.stderr)
        passage2file(passage, outfile, binary=True)
def main():
    argparser = configargparse.ArgParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="file names to convert and evaluate")
    add_verbose_arg(argparser, help="detailed evaluation output")
    add_boolean_option(argparser, "wikification", "Spotlight to wikify any named node (for AMR)")
    argparser.add_argument("-o", "--out-dir",
                           help="output directory (if unspecified, files are not written)")
    args = argparser.parse_args()
    scores = []
    for pattern in args.filenames:
        filenames = glob(pattern)
        if not filenames:
            raise IOError("Not found: " + pattern)
        for filename in filenames:
            print("\rConverting '%s'" % filename, end="")
            if args.out_dir or args.verbose:
                print(flush=True)
            basename, ext = os.path.splitext(os.path.basename(filename))
            passage_format = ext.lstrip(".")
            converters = CONVERTERS.get(passage_format, CONVERTERS["amr"])
            evaluator = EVALUATORS.get(passage_format, EVALUATORS["amr"]).evaluate
            with open(filename, encoding="utf-8") as f:
                for passage, ref, passage_id in converters[0](f, passage_id=basename,
                                                              return_original=True):
                    if args.out_dir:
                        os.makedirs(args.out_dir, exist_ok=True)
                        outfile = "%s/%s.xml" % (args.out_dir, passage.ID)
                        print("Writing '%s'..." % outfile, file=sys.stderr, flush=True)
                        ioutil.passage2file(passage, outfile)
                    try:
                        guessed = converters[1](passage, wikification=args.wikification,
                                                use_original=False)
                    except Exception as e:
                        raise ValueError("Error converting %s back from %s"
                                         % (filename, passage_format)) from e
                    if args.out_dir:
                        outfile = "%s/%s%s" % (args.out_dir, passage.ID, ext)
                        print("Writing '%s'..." % outfile, file=sys.stderr, flush=True)
                        with open(outfile, "w", encoding="utf-8") as f_out:
                            print("\n".join(guessed), file=f_out)
                    try:
                        s = evaluator(guessed, ref, verbose=args.verbose > 1)
                    except Exception as e:
                        raise ValueError("Error evaluating conversion of %s" % filename) from e
                    scores.append(s)
                    if args.verbose:
                        print(passage_id)
                        s.print()
    print()
    if args.verbose and len(scores) > 1:
        print("Aggregated scores:")
        Scores(scores).print()
    sys.exit(0)
def main(args):
    splitter = Splitter.read_file(args.sentences, enum=args.enumerate)
    os.makedirs(args.outdir, exist_ok=True)
    i = 0
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        for sentence in splitter.split(passage) if splitter else split2sentences(
                passage, remarks=args.remarks, lang=args.lang,
                ids=map(str, count(i)) if args.enumerate else None):
            i += 1
            outfile = os.path.join(args.outdir,
                                   args.prefix + sentence.ID + (".pickle" if args.binary else ".xml"))
            with external_write_mode():
                print("Writing passage file for sentence '%s'..." % outfile, file=sys.stderr)
            if args.normalize:
                normalize(sentence)
            passage2file(sentence, outfile, binary=args.binary)
def main(args):
    passages = list(get_passages(args.filenames))
    if args.join_by_prefix:
        subsets = defaultdict(list)
        for passage in passages:
            subsets[passage.ID[:-3]].append(passage)
    else:
        subsets = {passages[0].ID: passages}
    for passage_id, subset in sorted(subsets.items()):
        print("Joining passages " + ", ".join(passage.ID for passage in subset), file=sys.stderr)
        # Join only the passages in the current subset, not all input passages.
        joined = ucca.convert.join_passages(subset, passage_id=passage_id, remarks=args.remarks)
        outfile = "%s/%s.%s" % (args.outdir, args.prefix + joined.ID,
                                "pickle" if args.binary else "xml")
        print("Writing joined passage file '%s'..." % outfile, file=sys.stderr)
        passage2file(joined, outfile, args.binary)
def main(args):
    order = None
    if args.sentences:
        with open(args.sentences, encoding="utf-8") as f:
            order = dict(map(reversed, enumerate(map(str.strip, f))))
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        for sentence in split(passage, order) if order else split2sentences(
                passage, remarks=args.remarks, lang=args.lang):
            outfile = os.path.join(args.outdir,
                                   args.prefix + sentence.ID + (".pickle" if args.binary else ".xml"))
            with tqdm.external_write_mode():
                print("Writing passage file for sentence '%s'..." % outfile, file=sys.stderr)
            passage2file(sentence, outfile, args.binary)
def main():
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument('filenames', nargs='+', help="XML file names to convert")
    argparser.add_argument('-o', '--outdir', default='.', help="output directory")
    args = argparser.parse_args()
    for filename in args.filenames:
        sys.stderr.write("Reading passage '%s'...\n" % filename)
        passage = file2passage(filename)
        basename = os.path.splitext(os.path.basename(filename))[0]
        outfile = args.outdir + os.path.sep + basename + ".pickle"
        sys.stderr.write("Writing file '%s'...\n" % outfile)
        passage2file(passage, outfile, binary=True)
    sys.exit(0)
def write_passage(passage, args):
    ext = {None: UCCA_EXT[args.binary], "amr": ".txt"}.get(args.output_format) \
        or "." + args.output_format
    outfile = args.outdir + os.path.sep + args.prefix + passage.ID + ext
    sys.stderr.write("Writing '%s'...\n" % outfile)
    if args.output_format is None:  # UCCA output
        ioutil.passage2file(passage, outfile, args.binary)
    else:
        converter = CONVERTERS[args.output_format][1]
        output = "\n".join(converter(passage)) if args.output_format == "amr" else \
            "\n".join(line for p in (convert.split2sentences(passage) if args.split else [passage])
                      for line in converter(p, test=args.test, tree=args.tree, mark_aux=args.mark_aux))
        with open(outfile, "w", encoding="utf-8") as f:
            print(output, file=f)
def write_passage(passage, out_dir=".", output_format=None, binary=False, verbose=False,
                  label_map=False, split=False, join=None, **kwargs):
    ext = {None: UCCA_EXT[binary], "amr": ".txt"}.get(output_format) or "." + output_format
    if join and join.endswith(ext):
        ext = ""
    outfile = os.path.join(out_dir, (join or passage.ID) + ext)
    if verbose:
        with ioutil.external_write_mode():
            print("Writing '%s'..." % outfile, file=sys.stderr)
    if output_format is None:  # UCCA output
        ioutil.passage2file(passage, outfile, binary=binary)
    else:
        converter = TO_FORMAT[output_format]
        with open(outfile, "a" if join else "w", encoding="utf-8") as f:
            for line in converter(passage, format=output_format if label_map else None,
                                  sentences=split, **kwargs):
                print(line, file=f)
def main():
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument('directory', help="directory containing XML files to process")
    args = argparser.parse_args()
    passages = glob.glob(args.directory + "/*.xml")
    for filename in passages:
        sys.stderr.write("Fixing passage '%s'...\n" % filename)
        passage = file2passage(filename)
        terminals = passage.layer(layer0.LAYER_ID).all
        for terminal in terminals:
            terminal.tag = layer0.NodeTags.Punct if is_punctuation(terminal.attrib.get("text")) \
                else layer0.NodeTags.Word
        passage2file(passage, filename, indent=False)
    sys.exit(0)
def main(args):
    scores = []
    for pattern in args.filenames:
        filenames = glob(pattern)
        if not filenames:
            raise IOError("Not found: " + pattern)
        for filename in filenames:
            print("\rConverting '%s'" % filename, end="")
            if args.out_dir or args.verbose:
                print(flush=True)
            basename, ext = os.path.splitext(os.path.basename(filename))
            passage_format = ext.lstrip(".")
            converters = CONVERTERS.get(passage_format, CONVERTERS["amr"])
            evaluator = EVALUATORS.get(passage_format, EVALUATORS["amr"]).evaluate
            with open(filename, encoding="utf-8") as f:
                for passage, ref, passage_id in converters[0](f, passage_id=basename,
                                                              return_original=True):
                    if args.normalize:
                        normalize(passage, extra=args.extra_normalization)
                    if args.out_dir:
                        os.makedirs(args.out_dir, exist_ok=True)
                        outfile = "%s/%s.xml" % (args.out_dir, passage.ID)
                        print("Writing '%s'..." % outfile, file=sys.stderr, flush=True)
                        ioutil.passage2file(passage, outfile)
                    try:
                        guessed = converters[1](passage, wikification=args.wikification,
                                                use_original=False)
                    except Exception as e:
                        raise ValueError("Error converting %s back from %s"
                                         % (filename, passage_format)) from e
                    if args.out_dir:
                        outfile = "%s/%s%s" % (args.out_dir, passage.ID, ext)
                        print("Writing '%s'..." % outfile, file=sys.stderr, flush=True)
                        with open(outfile, "w", encoding="utf-8") as f_out:
                            print("\n".join(guessed), file=f_out)
                    try:
                        s = evaluator(guessed, ref, verbose=args.verbose > 1)
                    except Exception as e:
                        raise ValueError("Error evaluating conversion of %s" % filename) from e
                    scores.append(s)
                    if args.verbose:
                        print(passage_id)
                        s.print()
    print()
    if args.verbose and len(scores) > 1:
        print("Aggregated scores:")
        Scores(scores).print()
def main(args):
    splitter = Splitter.read_file(args.sentences, enum=args.enumerate,
                                  suffix_format=args.suffix_format, suffix_start=args.suffix_start)
    os.makedirs(args.outdir, exist_ok=True)
    i = 0
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        for sentence in splitter.split(passage) if splitter else split2sentences(
                passage, remarks=args.remarks, lang=args.lang,
                ids=map(str, count(i)) if args.enumerate else None):
            i += 1
            outfile = os.path.join(args.outdir,
                                   args.prefix + sentence.ID + (".pickle" if args.binary else ".xml"))
            if args.verbose:
                with external_write_mode():
                    print(sentence, file=sys.stderr)
                    print("Writing passage file for sentence '%s'..." % outfile, file=sys.stderr)
            if args.normalize:
                normalize(sentence)
            passage2file(sentence, outfile, binary=args.binary)
    if splitter and len(splitter.matched_indices) < len(splitter.sentences):
        print("", "Unmatched sentences:",
              *[s for i, s in enumerate(splitter.sentences) if i not in splitter.matched_indices],
              sep="\n")
def main(args):
    os.makedirs(args.outdir, exist_ok=True)
    i = 0
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        for paragraph in split2paragraphs(
                passage, remarks=args.remarks, lang=args.lang,
                ids=map(str, count(i)) if args.enumerate else None):
            i += 1
            outfile = os.path.join(args.outdir,
                                   args.prefix + paragraph.ID + (".pickle" if args.binary else ".xml"))
            if args.verbose:
                with external_write_mode():
                    print(paragraph, file=sys.stderr)
                    print("Writing passage file for paragraph '%s'..." % outfile, file=sys.stderr)
            if args.normalize:
                normalize(paragraph)
            passage2file(paragraph, outfile, binary=args.binary)
def _ucca_parse_text(text, output_dir, filename, clean, normalize_sentence):
    text = [normalize_sentence(x) for x in text]
    # print("parsing", text)
    text = from_text(text, split=True, one_per_line=True)
    text = list(text)
    # print("output_dir", output_dir)
    # print(filename, "filename")
    # print("parsed to", parse_location(output_dir, filename, 0))
    # raise
    parser = get_parser()
    for i, passage in enumerate(parser.parse(text)):
        passage2file(passage, parse_location(output_dir, filename, i))
    # Create an empty marker file announcing that parsing finished successfully.
    parsed_file = os.path.join(os.path.dirname(parse_location(output_dir, filename, 0)),
                               PARSED_FILE)
    with open(parsed_file, "w") as _:
        pass
    if clean:
        # Remove intermediate .txt files from the output directory.
        for name in os.listdir(output_dir):
            if name.endswith(".txt"):
                os.remove(os.path.join(output_dir, name))
def main():
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="passage file names to annotate")
    argparser.add_argument("-v", "--verbose", action="store_true",
                           help="print tagged text for each passage")
    args = argparser.parse_args()
    for pattern in args.filenames:
        filenames = glob.glob(pattern)
        if not filenames:
            raise IOError("Not found: " + pattern)
        for filename in filenames:
            passage = file2passage(filename)
            annotate(passage, verbose=args.verbose, replace=True)
            sys.stderr.write("Writing '%s'...\n" % filename)
            passage2file(passage, filename, binary=not filename.endswith("xml"))
    sys.exit(0)
def write_passage(passage, out_dir=".", output_format=None, binary=False, verbose=False,
                  test=False, tree=False, mark_aux=False, wikification=False, default_label=None,
                  label_map=False, split=False, **kwargs):
    del kwargs
    ext = {None: UCCA_EXT[binary], "amr": ".txt"}.get(output_format) or "." + output_format
    outfile = os.path.join(out_dir, passage.ID + ext)
    if verbose:
        with ioutil.external_write_mode():
            print("Writing '%s'..." % outfile, file=sys.stderr)
    if output_format is None:  # UCCA output
        ioutil.passage2file(passage, outfile, binary=binary)
    else:
        converter = TO_FORMAT[output_format]
        with open(outfile, "w", encoding="utf-8") as f:
            for line in converter(passage, test=test, tree=tree, mark_aux=mark_aux,
                                  wikification=wikification, default_label=default_label,
                                  format=output_format if label_map else None, sentences=split):
                print(line, file=f)
def main():
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="file names to convert")
    argparser.add_argument("-f", "--format", choices=convert.CONVERTERS, help="input file format")
    argparser.add_argument("-o", "--outdir", default=".", help="output directory")
    argparser.add_argument("-p", "--prefix", default="", help="output filename prefix")
    argparser.add_argument("-b", "--binary", action="store_true",
                           help="write in pickle binary format (.pickle)")
    argparser.add_argument("-s", "--split", action="store_true",
                           help="split each sentence to its own passage")
    argparser.add_argument("-T", "--tree", action="store_true", help="currently unused")
    argparser.add_argument("-m", "--markaux", action="store_true",
                           help="mark auxiliary edges introduced on conversion")
    args = argparser.parse_args()
    for pattern in args.filenames:
        filenames = glob.glob(pattern)
        if not filenames:
            raise IOError("Not found: " + pattern)
        for filename in filenames:
            no_ext, ext = os.path.splitext(filename)
            basename = os.path.basename(no_ext)
            try:
                passage_id = re.search(r"\d+", basename).group(0)
            except AttributeError:
                passage_id = basename
            converter = convert.FROM_FORMAT.get(args.format or ext.lstrip("."))
            if converter is None:
                raise IOError("Unknown extension '%s'. Specify format using -f" % ext)
            with open(filename, encoding="utf-8") as f:
                for passage in converter(f, passage_id, args.split, args.markaux):
                    outfile = "%s/%s.%s" % (args.outdir, args.prefix + passage.ID,
                                            "pickle" if args.binary else "xml")
                    sys.stderr.write("Writing '%s'...\n" % outfile)
                    passage2file(passage, outfile, args.binary)
    sys.exit(0)
def main(args):
    if args.out_dir:
        os.makedirs(args.out_dir, exist_ok=True)
    scores = []
    for pattern in args.filenames:
        for filename in glob(pattern) or [pattern]:
            file_scores = []
            basename, ext = os.path.splitext(os.path.basename(filename))
            passage_format = ext.lstrip(".")
            if passage_format == "txt":
                passage_format = args.format
            in_converter, out_converter = CONVERTERS.get(passage_format, CONVERTERS[args.format])
            evaluate = EVALUATORS.get(passage_format, EVALUATORS[args.format])
            with open(filename, encoding="utf-8") as f:
                t = tqdm(in_converter(f, passage_id=basename, return_original=True),
                         unit=" passages",
                         desc=("Converting '%s'" % filename) +
                              ((", writing to '%s'" % args.out_dir) if args.out_dir else ""))
                for passage, ref, passage_id in t:
                    if args.normalize:
                        normalize(passage, extra=args.extra_normalization)
                    if args.out_dir:
                        os.makedirs(args.out_dir, exist_ok=True)
                        outfile = os.path.join(args.out_dir, passage.ID + ".xml")
                        if args.verbose:
                            with ioutil.external_write_mode():
                                print("Writing '%s'..." % outfile, file=sys.stderr, flush=True)
                        ioutil.passage2file(passage, outfile)
                    try:
                        guessed = out_converter(passage, wikification=args.wikification,
                                                use_original=False)
                    except Exception as e:
                        raise ValueError("Error converting %s back from %s"
                                         % (filename, passage_format)) from e
                    if args.out_dir:
                        outfile = os.path.join(args.out_dir, passage.ID + ext)
                        if args.verbose:
                            with ioutil.external_write_mode():
                                print("Writing '%s'..." % outfile, file=sys.stderr, flush=True)
                        with open(outfile, "w", encoding="utf-8") as f_out:
                            print("\n".join(guessed), file=f_out)
                    try:
                        s = evaluate(guessed, ref, verbose=args.verbose > 1, units=args.units)
                    except Exception as e:
                        raise ValueError("Error evaluating conversion of %s" % filename) from e
                    file_scores.append(s)
                    if args.verbose:
                        with ioutil.external_write_mode():
                            print(passage_id)
                            s.print()
                    t.set_postfix(F1="%.2f" % (100.0 * Scores(file_scores).average_f1()))
            scores += file_scores
    print()
    if args.verbose and len(scores) > 1:
        print("Aggregated scores:")
        Scores(scores).print()
def diff_passages(true_passage, pred_passage):
    """
    Debug method to print missing or mistaken attributes, nodes and edges
    """
    lines = list()
    if not true_passage._attrib.equals(pred_passage._attrib):
        lines.append("Passage attributes mismatch: %s, %s" %
                     (true_passage._attrib, pred_passage._attrib))
    try:
        for lid, l1 in true_passage._layers.items():
            l2 = pred_passage.layer(lid)  # look up the layer with the same ID in the predicted passage
            if not l1._attrib.equals(l2._attrib):
                lines.append("Layer %s attributes mismatch: %s, %s" %
                             (lid, l1._attrib, l2._attrib))
    except KeyError:  # no layer with same ID found
        lines.append("Missing layer: %s, %s" % (true_passage._layers, pred_passage._layers))
    pred_ids = {node.extra["remarks"]: node
                for node in pred_passage.missing_nodes(true_passage)}
    true_ids = {node.ID: node
                for node in true_passage.missing_nodes(pred_passage)}
    for pred_id, pred_node in list(pred_ids.items()):
        true_node = true_ids.get(pred_id)
        if true_node:
            pred_ids.pop(pred_id)
            true_ids.pop(pred_id)
            pred_edges = {edge.tag + "->" + edge.child.ID: edge
                          for edge in pred_node.missing_edges(true_node)}
            true_edges = {edge.tag + "->" + edge.child.ID: edge
                          for edge in true_node.missing_edges(pred_node)}
            intersection = set(pred_edges).intersection(set(true_edges))
            pred_edges = {s: edge for s, edge in pred_edges.items() if s not in intersection}
            true_edges = {s: edge for s, edge in true_edges.items() if s not in intersection}
            node_lines = []
            if not pred_node._attrib.equals(true_node._attrib):
                node_lines.append(" Attributes mismatch: %s, %s" %
                                  (true_node._attrib, pred_node._attrib))
            if pred_edges:
                node_lines.append(" Mistake edges: %s" % ", ".join(pred_edges))
            if true_edges:
                node_lines.append(" Missing edges: %s" % ", ".join(true_edges))
            if node_lines:
                lines.append("For node " + pred_id + ":")
                lines.extend(node_lines)
    if pred_ids:
        lines.append("Mistake nodes: %s" % ", ".join(pred_ids))
    if true_ids:
        lines.append("Missing nodes: %s" % ", ".join(true_ids))
    if lines:
        outfile = "ucca_passage%s.xml" % true_passage.ID
        sys.stderr.write("Writing passage '%s'...\n" % outfile)
        passage2file(true_passage, outfile)
        outfile = "ucca_passage%s_pred.xml" % pred_passage.ID
        sys.stderr.write("Writing passage '%s'...\n" % outfile)
        passage2file(pred_passage, outfile)
    return "\n" + "\n".join(lines)