def udpipe(sentences, model_name, verbose=False):
    """Parse text to Universal Dependencies using UDPipe.

    :param sentences: iterable of iterables of strings (one string per line)
    :param model_name: filename containing UDPipe model to load
    :param verbose: print extra information
    :return: iterable of lines containing parsed output
    """
    from ufal.udpipe import Model, Pipeline, ProcessingError
    model = Model.load(model_name)
    if not model:
        raise ValueError("Invalid model: '%s'" % model_name)
    pipeline = Pipeline(model, "conllu", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")
    # tee the token stream: one copy to build the input text, one to count tokens
    text_lines, count_lines = tee(line for sentence in sentences for line in sentence)
    text = "\n".join(text_lines)
    error = ProcessingError()
    num_tokens = sum(1 for line in count_lines if line)
    with ioutil.external_write_mode():
        print("Running %s on %d tokens... " % (model_name, num_tokens), end="", flush=True)
    start = time()
    processed = pipeline.process(text, error)
    duration = time() - start
    with ioutil.external_write_mode():
        # guard against division by zero on a sub-resolution duration
        rate = num_tokens / duration if duration else 0
        print("Done (%.3fs, %.0f tokens/s)" % (duration, rate))
        if verbose:
            print(processed)
    if error.occurred():
        raise RuntimeError(error.message)
    return processed.splitlines()
def get_most_recent_passage_by_uid(uid, passage_id, host_name, db_name, verbose=False, write_xids=None, **kwargs):
    """Fetch the most recent annotation XML for a user/passage pair from the DB.

    :param uid: user ID to look up
    :param passage_id: passage ID (the ``paid`` column)
    :param host_name: database host to connect to
    :param db_name: database name to connect to
    :param verbose: if True, print the timestamp and xid of the row used
    :param write_xids: optional filename; if given, append the selected xid to it
    :return: XML root element parsed from the stored annotation
    :raises Exception: if the user has no annotation for the passage
    """
    del kwargs  # accepted for interface compatibility, unused
    c = get_cursor(host_name, db_name)
    c.execute("SELECT xml,status,ts,id FROM xmls WHERE uid=%s AND paid = %s ORDER BY ts DESC",
              (uid, passage_id))
    queryset = c.fetchone()
    if queryset is None:
        # %-formatting instead of "+" concatenation: avoids a TypeError when uid or
        # passage_id is an int rather than a str (matches the sibling variant's style)
        raise Exception("The user %s did not annotate passage %s" % (uid, passage_id))
    raw_xml, status, ts, xid = queryset
    if int(status) != 1:  # if not submitted
        with external_write_mode():
            print("The most recent xml for uid %s and paid %s is not submitted." % (uid, passage_id),
                  file=sys.stderr)
    if verbose:
        with external_write_mode():
            print("Timestamp: %s, xid: %d" % (ts, xid))
    if write_xids:
        with open(write_xids, "a") as f:
            print(xid, file=f)
    return fromstring(raw_xml)
def main(args):
    """Download passages listed in args.filename, optionally saving site XML and converted output."""
    os.makedirs(args.outdir, exist_ok=True)
    with open(args.filename, encoding="utf-8") as f:
        entries = [line.split() for line in f]
    if not args.verbose:
        # progress bar replaces per-passage prints in quiet mode
        entries = tqdm(entries, desc="Downloading", unit=" passages")
    for passage_id, id_field in entries:
        if args.verbose:
            with external_write_mode():
                print("Getting passage " + passage_id + " with " + args.method + "=" + id_field, end="\t")
        else:
            entries.set_postfix({"passage_id": passage_id, args.method: id_field})
        xml_root = get_by_method(id_field=id_field.split(","), passage_id=passage_id, **vars(args))
        if xml_root is None:
            continue
        if args.write_site:
            site_filename = passage_id + "_site_download.xml"
            with open(site_filename, "w", encoding="utf-8") as fsite:
                print(tostring(xml_root).decode(), file=fsite)
            if args.verbose:
                with external_write_mode():
                    print("Wrote '%s'" % site_filename)
        if args.write:
            write_passage(convert.from_site(xml_root), outdir=args.outdir, verbose=args.verbose)
def evaluate_all(evaluate, files, name=None, verbose=0, quiet=False, basename=False, matching_ids=False,
                 units=False, errors=False, unlabeled=False, normalize=True, constructions=None, **kwargs):
    """Evaluate guessed passages against reference passages, yielding one result per pair.

    :param evaluate: evaluation callable applied to each (guessed, reference) passage pair
    :param files: sequence of file lists: [guessed, reference] plus an optional third
                  element with fine-grained yield-tag reference files
    :param name: description for the progress bar
    :param verbose: >0 prints progress details, >1 passes verbose on to `evaluate`
    :param quiet: suppress per-passage ID/F1 printing
    :param basename: force passage IDs to file basenames when reading
    :param matching_ids: skip passages until guessed/reference IDs align
    :param unlabeled: evaluate with UNLABELED instead of LABELED criterion
    :param normalize: passed through to `evaluate`
    :param constructions: passed through to `evaluate`
    :return: generator of evaluation result objects
    """
    guessed, ref = [
        iter(read_files(f, verbose=verbose, force_basename=basename, **kwargs))
        for f in files[:2]
    ]
    # Third file list is optional; repeat(None) keeps the zip below in lockstep without it
    ref_yield_tags = repeat(None) if len(files) < 3 or files[2] is None else \
        iter(read_files(files[2], verbose=verbose, dep=True, **kwargs))
    t = tqdm(zip(guessed, ref, ref_yield_tags), unit=" passages", desc=name, total=len(files[1]))
    for (g, r, ryt) in t:
        if matching_ids:
            # Advance whichever side lags until IDs match; ryt tracks the reference side
            while g.ID < r.ID:
                g = next(guessed)
            while g.ID > r.ID:
                r = next(ref)
                ryt = next(ref_yield_tags)
        if not quiet:
            with ioutil.external_write_mode():
                print(r.ID, end=" ")
            t.set_postfix(ID=r.ID)
        if g.format != r.format:
            # Convert the guessed passage to the reference's format before comparison
            # noinspection PyCallingNonCallable
            g.passage = g.converted if r.out_converter is None else r.out_converter(g.converted)
        if ryt is not None and ryt.in_converter is not None:
            ryt.passage = ryt.converted  # Passage for fine-grained yield reference must be in UCCA format or similar
        result = evaluate(g.passage, r.passage, verbose=verbose > 1 or units, units=units, errors=errors,
                          eval_type=UNLABELED if unlabeled else None, normalize=normalize,
                          constructions=constructions, ref_yield_tags=ryt.passage if ryt else None)
        if not quiet:
            with ioutil.external_write_mode():
                print("F1: %.3f" % result.average_f1(UNLABELED if unlabeled else LABELED))
        if verbose:
            with ioutil.external_write_mode():
                result.print()
        yield result
def main(args):
    """Convert each UCCA XML passage file to a binary pickle in args.outdir."""
    os.makedirs(args.outdir, exist_ok=True)
    for filename in tqdm(args.filenames, desc="Converting", unit=" passages"):
        if args.verbose:
            with external_write_mode():
                print("Reading passage '%s'..." % filename, file=sys.stderr)
        passage = file2passage(filename)
        stem = os.path.splitext(os.path.basename(filename))[0]
        outfile = args.outdir + os.path.sep + stem + ".pickle"
        if args.verbose:
            with external_write_mode():
                print("Writing file '%s'..." % outfile, file=sys.stderr)
        passage2file(passage, outfile, binary=True)
def main(args):
    """Split passages into sentence passages and write each one to args.outdir."""
    splitter = Splitter.read_file(args.sentences, enum=args.enumerate,
                                  suffix_format=args.suffix_format, suffix_start=args.suffix_start)
    os.makedirs(args.outdir, exist_ok=True)
    index = 0
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        if splitter:
            sentences = splitter.split(passage)
        else:
            sentences = split2sentences(passage, remarks=args.remarks, lang=args.lang,
                                        ids=map(str, count(index)) if args.enumerate else None)
        for sentence in sentences:
            index += 1
            suffix = ".pickle" if args.binary else ".xml"
            outfile = os.path.join(args.outdir, args.prefix + sentence.ID + suffix)
            if args.verbose:
                with external_write_mode():
                    print("Writing passage file for sentence '%s'..." % outfile, file=sys.stderr)
            if args.normalize:
                normalize(sentence)
            passage2file(sentence, outfile, binary=args.binary)
    # Report sentences from the split file that never matched any passage
    if splitter and len(splitter.matched_indices) < len(splitter.sentences):
        unmatched = [s for i, s in enumerate(splitter.sentences) if i not in splitter.matched_indices]
        print("Unmatched sentences:", *unmatched, sep="\n")
def copy_annotation(passages, conllu, by_id=False, as_array=True, as_extra=True, verbose=False, lang=None):
    """Transfer token-level annotation from CoNLL-U passages onto the given passages.

    When by_id is True the CoNLL-U passages are indexed by ID; otherwise they are
    consumed in order, parallel to `passages`. Yields each annotated passage.
    """
    if by_id:
        conllu_sentences = {annotated.ID: annotated
                            for annotated in get_passages_with_progress_bar(
                                conllu, converters=CONVERTERS, desc="Reading '%s'" % conllu)}
    else:
        conllu_sentences = get_passages(conllu, converters=CONVERTERS)
    for passage in passages:
        try:
            annotated = conllu_sentences[passage.ID] if by_id else next(conllu_sentences)
        except (KeyError, StopIteration) as e:
            raise ValueError("Missing annotation for passage ID '%s', by_id=%s" % (passage.ID, by_id)) from e
        if verbose:
            with external_write_mode():
                print("Reading annotation from '%s'" % annotated.ID)
        if as_array:
            passage.layer(layer0.LAYER_ID).docs()[:] = annotated.layer(layer0.LAYER_ID).docs()
        if as_extra:
            terminal_pairs = zip(passage.layer(layer0.LAYER_ID).all,
                                 annotated.layer(layer0.LAYER_ID).all)
            for terminal, annotated_terminal in terminal_pairs:
                copy_tok_to_extra(annotated_terminal, terminal, lang=lang)
        yield passage
def compare_punct(files, name=None, verbose=0, basename=False, matching_ids=False, **kwargs):
    """Yield (passage ID, left terminal, right terminal) wherever punctuation status differs.

    Asserts that aligned terminals carry identical text before comparing punctuation.
    """
    guessed, ref = (iter(read_files(f, verbose=verbose, force_basename=basename, **kwargs)) for f in files)
    for g, r in tqdm(zip(guessed, ref), unit=" passages", desc=name, total=len(files[-1])):
        if matching_ids:
            # Skip ahead on either side until passage IDs line up
            while g.ID < r.ID:
                g = next(guessed)
            while g.ID > r.ID:
                r = next(ref)
        terminals = [(f.converted if f.format else f.passage).layer(layer0.LAYER_ID).all
                     for f in (g, r)]
        for t1, t2 in zip(*terminals):
            assert t1.text == t2.text, "Terminal text: %s != %s (passage %s, terminal %s)" % (t1, t2, r.ID, t1.ID)
            if t1.punct != t2.punct:
                if verbose:
                    with ioutil.external_write_mode():
                        print("Passage %s: terminal '%s' (%s) is %s in left passage but %s in right passage"
                              % (r.ID, t1, t1.ID, t1.tag, t2.tag))
                yield r.ID, t1, t2
def print_text(args, text, suffix):
    """Write text to <args.out_dir>/<suffix>, or echo it to stdout when no out_dir is set."""
    if not args.out_dir:
        with external_write_mode():
            print(text)
        return
    target = os.path.join(args.out_dir, suffix)
    with open(target, "w") as f:
        print(text, file=f)
def read_files(files, verbose=0, force_basename=False, **kw):
    """Yield ConvertedPassage objects read from the given files.

    Filenames are sorted numerically by the digit groups embedded in them when
    possible; a mix of numbered and unnumbered names falls back to input order.

    :param files: iterable of filenames to read
    :param verbose: >0 prints a note for each converted file
    :param force_basename: use the file basename as the passage ID
    :param kw: must contain "format" (fallback format); extra keys go to the converter
    :return: generator of ConvertedPassage objects
    """
    try:
        # r"\d+" (raw string) — a plain "\d" is an invalid escape sequence and
        # warns on modern Pythons; behavior is otherwise identical
        files = sorted(files, key=lambda x: tuple(map(int, re.findall(r"\d+", x))) or (x,))
    except TypeError as e:
        # Mixed int-tuple/str-tuple keys are not comparable; keep original order
        print("Cannot sort filenames: %s" % e, file=sys.stderr)
    for filename in files:
        basename, converted_format = passage_format(filename)
        if converted_format == "txt":
            converted_format = kw["format"]
        in_converter, out_converter = CONVERTERS.get(converted_format, CONVERTERS[kw["format"]])
        kwargs = dict(converted_format=converted_format, in_converter=in_converter,
                      out_converter=out_converter)
        if in_converter:
            with open(filename, encoding="utf-8") as f:
                for converted, passage, passage_id in in_converter(
                        f, passage_id=basename, return_original=True, **kw):
                    if verbose:
                        with ioutil.external_write_mode():
                            print("Converting %s from %s" % (filename, converted_format))
                    yield ConvertedPassage(converted, passage,
                                           basename if force_basename else passage_id, **kwargs)
        else:
            # Native UCCA file: load directly, no conversion step
            passage_id = basename if force_basename else None
            yield ConvertedPassage(ioutil.file2passage(filename), passage_id=passage_id, **kwargs)
def main(args):
    """Export each passage to standalone site-format XML in args.outdir."""
    os.makedirs(args.outdir, exist_ok=True)
    for passage in get_passages_with_progress_bar(args.filenames):
        target = os.path.join(args.outdir, passage.ID + ".xml")
        xml_text = tostring(convert.to_site(passage)).decode()
        with open(target, "w", encoding="utf-8") as f:
            print(xml_text, file=f)
        if args.verbose:
            with external_write_mode():
                print("Wrote '%s'" % target)
def main(args):
    """Write every input passage as a site-format XML file under args.outdir."""
    os.makedirs(args.outdir, exist_ok=True)
    for passage in get_passages_with_progress_bar(args.filenames):
        site_filename = os.path.join(args.outdir, passage.ID + ".xml")
        site_xml = convert.to_site(passage)
        with open(site_filename, "w", encoding="utf-8") as f:
            print(tostring(site_xml).decode(), file=f)
        if not args.verbose:
            continue
        with external_write_mode():
            print("Wrote '%s'" % site_filename)
def get_most_recent_passage_by_uid(uid, passage_id, host_name, db_name, verbose=False, write_xids=None,
                                   strict=False, **kwargs):
    """Fetch the most recent annotation XML for a passage from one or more users.

    :param uid: a single user ID (str/int), an iterable of user IDs, or a
                collection containing "*" to accept any user
    :param passage_id: passage ID (the ``paid`` column)
    :param host_name: database host to connect to
    :param db_name: database name to connect to
    :param verbose: if True, print the timestamp, uid and xid of the row used
    :param write_xids: optional filename; if given, append a tab-separated
                       (passage_id, xid, uid, ts) record — written even when no
                       row was found (with None fields)
    :param strict: if True, raise when no annotation exists; otherwise return None
    :return: XML root element of the stored annotation, or None if absent and not strict
    """
    del kwargs
    c = get_cursor(host_name, db_name)
    # Normalize uid to a tuple so it can be used with SQL "IN"
    uid = (uid, ) if isinstance(uid, (str, int)) else tuple(uid)
    if "*" in uid:
        # Wildcard: most recent annotation by any user
        c.execute("SELECT xml,status,ts,id,uid FROM xmls WHERE paid = %s ORDER BY ts DESC",
                  (passage_id, ))
    else:
        c.execute("SELECT xml,status,ts,id,uid FROM xmls WHERE uid IN %s AND paid = %s ORDER BY ts DESC",
                  (uid, passage_id))
    queryset = c.fetchone()
    raw_xml, status, ts, xid, uid = 5 * [None]
    if queryset is None:
        if strict:
            raise Exception("The user %s did not annotate passage %s" % (uid, passage_id))
    else:
        raw_xml, status, ts, xid, uid = queryset
    if write_xids:
        # Record the lookup result (fields may be None when no row matched)
        with open(write_xids, "a") as f:
            print(passage_id, xid, uid, ts, file=f, sep="\t")
    if queryset is None:
        return None
    if int(status) != 1:  # if not submitted
        with external_write_mode():
            print("The most recent xml for uid %s and paid %s is not submitted." % (uid, passage_id),
                  file=sys.stderr)
    if verbose:
        with external_write_mode():
            print("Timestamp: %s, uid: %d, xid: %d" % (ts, uid, xid))
    return fromstring(raw_xml)
def copy_annotation(passages, conllu, as_array=True, as_extra=True, verbose=False, lang=None):
    """Copy token annotation from parallel CoNLL-U files onto the given passages, in order."""
    annotations = read_files_and_dirs(conllu, converters=CONVERTERS)
    for passage, annotated in zip(passages, annotations):
        if verbose:
            with external_write_mode():
                print("Reading annotation from '%s'" % annotated.ID)
        if as_array:
            annotated_docs = annotated.layer(layer0.LAYER_ID).docs()
            passage.layer(layer0.LAYER_ID).docs()[:] = annotated_docs
        if as_extra:
            pairs = zip(passage.layer(layer0.LAYER_ID).all,
                        annotated.layer(layer0.LAYER_ID).all)
            for terminal, annotated_terminal in pairs:
                copy_tok_to_extra(annotated_terminal, terminal, lang=lang)
        yield passage
def main(args):
    """Split passages into sentence passages, writing each one to args.outdir."""
    splitter = Splitter.read_file(args.sentences, enum=args.enumerate)
    os.makedirs(args.outdir, exist_ok=True)
    sentence_count = 0
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        sentences = splitter.split(passage) if splitter else split2sentences(
            passage, remarks=args.remarks, lang=args.lang,
            ids=map(str, count(sentence_count)) if args.enumerate else None)
        for sentence in sentences:
            sentence_count += 1
            ext = ".pickle" if args.binary else ".xml"
            outfile = os.path.join(args.outdir, args.prefix + sentence.ID + ext)
            with external_write_mode():
                print("Writing passage file for sentence '%s'..." % outfile, file=sys.stderr)
            if args.normalize:
                normalize(sentence)
            passage2file(sentence, outfile, binary=args.binary)
def main(args):
    """Print edges matching the requested constructions, grouped per passage."""
    for passage in get_passages_with_progress_bar(args.passages):
        extracted = constructions.extract_edges(passage, constructions=args.constructions,
                                                verbose=args.verbose)
        if not any(extracted.values()):
            continue
        with external_write_mode():
            if not args.verbose:
                print("%s:" % passage.ID)
            for construction, edges in extracted.items():
                if edges:
                    print(" %s:" % construction.description)
                    for edge in edges:
                        print(" %s [%s %s]" % (edge, edge.tag, edge.child))
                    print()
def main(args):
    """Print candidate edges per construction for each passage that has any."""
    for passage in get_passages_with_progress_bar(args.passages):
        candidates_by_construction = extract_candidates(passage, constructions=args.constructions,
                                                        verbose=args.verbose)
        c2es = OrderedDict((c, [candidate.edge for candidate in candidates])
                           for c, candidates in candidates_by_construction.items() if candidates)
        if not any(c2es.values()):
            continue
        with external_write_mode():
            if not args.verbose:
                print("%s:" % passage.ID)
            for construction, edges in c2es.items():
                if edges:
                    print(" %s:" % construction.description)
                    for edge in edges:
                        print(" %s [%s %s]" % (edge, edge.tag, edge.child))
                    print()
def write_passage(passage, out_dir=".", output_format=None, binary=False, verbose=False, label_map=False,
                  split=False, join=None, **kwargs):
    """Write a passage natively (UCCA xml/pickle) or via a format converter.

    With `join`, output is appended to one shared file named after it.
    """
    ext = {None: UCCA_EXT[binary], "amr": ".txt"}.get(output_format) or "." + output_format
    if join and join.endswith(ext):
        ext = ""  # join target already carries the extension
    outfile = os.path.join(out_dir, (join or passage.ID) + ext)
    if verbose:
        with ioutil.external_write_mode():
            print("Writing '%s'..." % outfile, file=sys.stderr)
    if output_format is None:  # UCCA output
        ioutil.passage2file(passage, outfile, binary=binary)
        return
    converter = TO_FORMAT[output_format]
    mode = "a" if join else "w"
    with open(outfile, mode, encoding="utf-8") as f:
        lines = converter(passage, format=output_format if label_map else None,
                          sentences=split, **kwargs)
        for line in lines:
            print(line, file=f)
def main(args):
    """Split passages into sentences, write each, and report unmatched split-file sentences."""
    splitter = Splitter.read_file(args.sentences, enum=args.enumerate,
                                  suffix_format=args.suffix_format, suffix_start=args.suffix_start)
    os.makedirs(args.outdir, exist_ok=True)
    n = 0
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        sentences = splitter.split(passage) if splitter else split2sentences(
            passage, remarks=args.remarks, lang=args.lang,
            ids=map(str, count(n)) if args.enumerate else None)
        for sentence in sentences:
            n += 1
            outfile = os.path.join(args.outdir,
                                   args.prefix + sentence.ID + (".pickle" if args.binary else ".xml"))
            if args.verbose:
                with external_write_mode():
                    print(sentence, file=sys.stderr)
                    print("Writing passage file for sentence '%s'..." % outfile, file=sys.stderr)
            if args.normalize:
                normalize(sentence)
            passage2file(sentence, outfile, binary=args.binary)
    if splitter and len(splitter.matched_indices) < len(splitter.sentences):
        unmatched = [s for i, s in enumerate(splitter.sentences)
                     if i not in splitter.matched_indices]
        print("", "Unmatched sentences:", *unmatched, sep="\n")
def main(args):
    """Split passages into paragraph passages and write each one to args.outdir."""
    os.makedirs(args.outdir, exist_ok=True)
    written = 0
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        paragraphs = split2paragraphs(passage, remarks=args.remarks, lang=args.lang,
                                      ids=map(str, count(written)) if args.enumerate else None)
        for paragraph in paragraphs:
            written += 1
            extension = ".pickle" if args.binary else ".xml"
            outfile = os.path.join(args.outdir, args.prefix + paragraph.ID + extension)
            if args.verbose:
                with external_write_mode():
                    print(paragraph, file=sys.stderr)
                    print("Writing passage file for paragraph '%s'..." % outfile, file=sys.stderr)
            if args.normalize:
                normalize(paragraph)
            passage2file(paragraph, outfile, binary=args.binary)
def main(args):
    """Print a CSV row of structural statistics per passage, and optionally a summary.

    Columns: id, passages, paragraphs, sentences, nodes, terminals, non-terminals,
    implicit, linkage, discont, edges, primary, remote, linkage, parents, children,
    mult-parents. With args.summary only the column-wise totals are printed; with
    args.outfile the rows (sorted by passage ID) are saved tab-separated.
    """
    print("id,passages,paragraphs,sentences,nodes,terminals,non-terminals,implicit,linkage,discont,"
          "edges,primary,remote,linkage,parents,children,mult-parents")
    data = []
    for passage in get_passages_with_progress_bar(args.filenames):
        terminals = passage.layer(layer0.LAYER_ID).all
        # "1.1" is the root node; exclude it from non-terminal counts
        non_terminals = [n for n in passage.layer(layer1.LAYER_ID).all if n.ID != "1.1"]
        non_linkage = [n for n in non_terminals if n.tag != NodeTags.Linkage]
        linkage_nodes = passage.layer(layer1.LAYER_ID).top_linkages
        edges = {e for n in non_terminals for e in n}
        remote = [e for e in edges if e.attrib.get("remote")]
        linkage_edges = [e for n in linkage_nodes for e in n]
        fields = (int(passage.ID),
                  1,  # passage count: one row per passage
                  len({t.paragraph for t in terminals}),
                  len(break2sentences(passage)),
                  len(terminals) + len(non_terminals),
                  len(terminals),
                  len(non_terminals),
                  len([n for n in non_linkage if n.attrib.get("implicit")]),
                  len(linkage_nodes),
                  len([n for n in non_linkage if n.tag == NodeTags.Foundational and n.discontiguous]),
                  len(edges),
                  # primary edges = all edges minus remote and linkage edges
                  len(edges) - len(remote) - len(linkage_edges),
                  len(remote),
                  len(linkage_edges),
                  sum(len([p for p in n.parents if p.ID != "1.1"]) for n in non_linkage),
                  sum(len(n.children) for n in non_linkage),
                  # nodes with more than one non-root parent
                  len([n for n in non_linkage if len([p for p in n.parents if p.ID != "1.1"]) > 1]),
                  )
        if not args.summary:
            with external_write_mode():
                print(",".join("%d" % f for f in fields))
        data.append(fields)
    data = np.array(data, dtype=int)
    if args.outfile:
        # Save rows sorted by passage ID
        np.savetxt(args.outfile, data[data[:, 0].argsort()], fmt="%i", delimiter="\t")
    if args.summary:
        print(",".join("%d" % f for f in data.sum(axis=0)))
def write_passage(passage, out_dir=".", output_format=None, binary=False, verbose=False, test=False, tree=False,
                  mark_aux=False, wikification=False, default_label=None, label_map=False, split=False, **kwargs):
    """Write a passage natively (UCCA xml/pickle) or through a format converter."""
    del kwargs  # accepted for interface compatibility, unused
    ext = {None: UCCA_EXT[binary], "amr": ".txt"}.get(output_format) or "." + output_format
    outfile = os.path.join(out_dir, passage.ID + ext)
    if verbose:
        with ioutil.external_write_mode():
            print("Writing '%s'..." % outfile, file=sys.stderr)
    if output_format is None:  # UCCA output
        ioutil.passage2file(passage, outfile, binary=binary)
        return
    converter = TO_FORMAT[output_format]
    with open(outfile, "w", encoding="utf-8") as f:
        for line in converter(passage, test=test, tree=tree, mark_aux=mark_aux,
                              wikification=wikification, default_label=default_label,
                              format=output_format if label_map else None, sentences=split):
            print(line, file=f)
def print_errors(passage_id, errors, id_len=None):
    """Print one aligned line per error; the passage ID is shown only on the first line."""
    width = id_len or len(passage_id)
    for index, error in enumerate(errors):
        label = "" if index else passage_id
        with external_write_mode():
            print("%-*s|%s" % (width, label, error), flush=True)
def print_errors(passage_id, errors, id_len=None):
    """Report errors for a passage, one per line, with the ID printed once in a fixed-width column."""
    for i, e in enumerate(errors):
        prefix = passage_id if i == 0 else ""
        with external_write_mode():
            print("%-*s|%s" % (id_len or len(passage_id), prefix, e), flush=True)
help="image format") args = argparser.parse_args() if args.out_dir: os.makedirs(args.out_dir, exist_ok=True) if not args.tikz: import matplotlib matplotlib.use('Agg') for passage in get_passages_with_progress_bar(args.passages, desc="Visualizing"): if args.tikz: tikz = visualization.tikz(passage) if args.out_dir: with open(os.path.join(args.out_dir, passage.ID + ".tikz.txt"), "w") as f: print(tikz, file=f) else: with external_write_mode(): print(tikz) else: import matplotlib.pyplot as plt width = len(passage.layer(layer0.LAYER_ID).all) * 19 / 27 plt.figure(figsize=(width, width * 10 / 19)) visualization.draw(passage, node_ids=args.node_ids) if args.out_dir: plt.savefig( os.path.join(args.out_dir, passage.ID + "." + args.format)) plt.close() else: plt.show()
def main(args):
    """Round-trip convert passages and evaluate the conversion against the original.

    For each input file: parse with the format's in-converter, optionally normalize
    and write intermediate files, convert back with the out-converter, then score
    the round-tripped output against the original reference. Prints per-file and
    aggregated scores.
    """
    if args.out_dir:
        os.makedirs(args.out_dir, exist_ok=True)
    scores = []
    for pattern in args.filenames:
        # glob() returns [] for a non-pattern path; fall back to the literal name
        for filename in glob(pattern) or [pattern]:
            file_scores = []
            basename, ext = os.path.splitext(os.path.basename(filename))
            passage_format = ext.lstrip(".")
            if passage_format == "txt":
                passage_format = args.format
            in_converter, out_converter = CONVERTERS.get(passage_format, CONVERTERS[args.format])
            evaluate = EVALUATORS.get(passage_format, EVALUATORS[args.format])
            with open(filename, encoding="utf-8") as f:
                t = tqdm(in_converter(f, passage_id=basename, return_original=True),
                         unit=" passages",
                         desc=("Converting '%s'" % filename) +
                              ((", writing to '%s'" % args.out_dir) if args.out_dir else ""))
                for passage, ref, passage_id in t:
                    if args.normalize:
                        normalize(passage, extra=args.extra_normalization)
                    if args.out_dir:
                        os.makedirs(args.out_dir, exist_ok=True)
                        # Intermediate UCCA XML of the parsed passage
                        outfile = os.path.join(args.out_dir, passage.ID + ".xml")
                        if args.verbose:
                            with ioutil.external_write_mode():
                                print("Writing '%s'..." % outfile, file=sys.stderr, flush=True)
                        ioutil.passage2file(passage, outfile)
                    try:
                        guessed = out_converter(passage, wikification=args.wikification,
                                                use_original=False)
                    except Exception as e:
                        raise ValueError("Error converting %s back from %s"
                                         % (filename, passage_format)) from e
                    if args.out_dir:
                        # Round-tripped output in the original format
                        outfile = os.path.join(args.out_dir, passage.ID + ext)
                        if args.verbose:
                            with ioutil.external_write_mode():
                                print("Writing '%s'..." % outfile, file=sys.stderr, flush=True)
                        with open(outfile, "w", encoding="utf-8") as f_out:
                            print("\n".join(guessed), file=f_out)
                    try:
                        s = evaluate(guessed, ref, verbose=args.verbose > 1, units=args.units)
                    except Exception as e:
                        raise ValueError("Error evaluating conversion of %s" % filename) from e
                    file_scores.append(s)
                    if args.verbose:
                        with ioutil.external_write_mode():
                            print(passage_id)
                            s.print()
                    # Running average F1 shown on the progress bar
                    t.set_postfix(F1="%.2f" % (100.0 * Scores(file_scores).average_f1()))
            scores += file_scores
    print()
    if args.verbose and len(scores) > 1:
        print("Aggregated scores:")
        Scores(scores).print()