def main():
    args = Config().args
    assert args.passages or args.train, "Either passages or --train is required (use -h for help)"
    assert args.model or args.train or args.folds, "Either --model or --train or --folds is required"
    assert not (args.train or args.dev) or args.folds is None, "--train and --dev are incompatible with --folds"
    assert args.train or not args.dev, "--dev is only possible together with --train"
    print("Running parser with %s" % Config())
    test_scores = None
    dev_scores = None
    if Config().args.testscores:
        with open(Config().args.testscores, "w") as f:
            print(",".join(evaluation.Scores.field_titles(Config().args.constructions)), file=f)
    if args.folds is not None:
        fold_scores = []
        all_passages = list(ioutil.read_files_and_dirs(args.passages, Config().args.sentences,
                                                       Config().args.paragraphs))
        assert len(all_passages) >= args.folds, \
            "%d folds are not possible with only %d passages" % (args.folds, len(all_passages))
        Config().random.shuffle(all_passages)
        folds = [all_passages[i::args.folds] for i in range(args.folds)]
        for i in range(args.folds):
            print("Fold %d of %d:" % (i + 1, args.folds))
            dev_passages = folds[i]
            test_passages = folds[(i + 1) % args.folds]
            train_passages = [passage for fold in folds
                              if fold is not dev_passages and fold is not test_passages
                              for passage in fold]
            s, _ = train_test(train_passages, dev_passages, test_passages, args, "_%d" % i)
            if s is not None:
                fold_scores.append(s)
        if fold_scores:
            test_scores = evaluation.Scores.aggregate(fold_scores)
            print("Average labeled test F1 score for each fold: " +
                  ", ".join("%.3f" % s.average_f1() for s in fold_scores))
            print("Aggregated scores across folds:\n")
            test_scores.print()
    else:  # Simple train/dev/test by given arguments
        train_passages, dev_passages, test_passages = [
            ioutil.read_files_and_dirs(arg, Config().args.sentences, Config().args.paragraphs)
            for arg in (args.train, args.dev, args.passages)]
        test_scores, dev_scores = train_test(train_passages, dev_passages, test_passages, args)
    return test_scores, dev_scores
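# A minimal sketch (not part of the parser above) of the stride-based fold
# split: passages[i::k] takes every k-th passage starting at offset i, so
# folds are assigned round-robin. The passage names here are made up.
passages = ["p0", "p1", "p2", "p3", "p4", "p5"]
k = 3
folds = [passages[i::k] for i in range(k)]
assert folds == [["p0", "p3"], ["p1", "p4"], ["p2", "p5"]]
# For fold i: folds[i] serves as dev, folds[(i + 1) % k] as test, and the
# remaining folds as train.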
def test_load_multiple_passages():
    """Test lazy-loading passages"""
    files = 3 * ["test_files/standard3.xml"]
    passages = ioutil.read_files_and_dirs(files)
    assert len(files) == len(list(passages)), "Should load one passage per file"
    assert len(files) == len(passages)  # len() still matches after the lazy collection has been iterated
    _test_passages(passages)
def test_shuffle_passages():
    """Test lazy-loading passages and shuffling them"""
    files = 3 * ["test_files/standard3.xml"]
    passages = ioutil.read_files_and_dirs(files)
    random.shuffle(passages)
    assert len(files) == len(passages)
    _test_passages(passages)
def main(args):
    guessed, ref, ref_yield_tags = [None if x is None else ioutil.read_files_and_dirs((x,))
                                    for x in (args.guessed, args.ref, args.ref_yield_tags)]
    if args.match_by_id:
        guessed = match_by_id(guessed, ref)
        ref_yield_tags = match_by_id(ref_yield_tags, ref)
    results = []
    for g, r, ryt in zip(guessed, ref, ref_yield_tags or repeat(None)):
        if len(guessed) > 1:
            sys.stdout.write("\rEvaluating %s%s" % (g.ID, ":" if args.verbose else "..."))
            sys.stdout.flush()
            if args.verbose:
                print()
        result = evaluation.evaluate(g, r, constructions=args.constructions, units=args.units,
                                     fscore=args.fscore, errors=args.errors,
                                     verbose=args.verbose or len(guessed) == 1,
                                     normalize=args.normalize, ref_yield_tags=ryt,
                                     eval_type=evaluation.UNLABELED if args.unlabeled else None)
        if args.verbose:
            print_f1(result, args.unlabeled)
        results.append(result)
    summarize(args, results)
def main(args):
    guessed, ref, ref_yield_tags = [None if x is None else ioutil.read_files_and_dirs((x,))
                                    for x in (args.guessed, args.ref, args.ref_yield_tags)]
    if args.match_by_id:
        guessed = match_by_id(guessed, ref)
        ref_yield_tags = match_by_id(ref_yield_tags, ref)
    results = []
    eval_type = evaluation.UNLABELED if args.unlabeled else evaluation.LABELED
    verbose = args.verbose or len(guessed) == 1
    for g, r, ryt in zip(guessed, ref, ref_yield_tags or repeat(None)):
        if len(guessed) > 1:
            print("Evaluating %s%s" % (g.ID, ":" if args.verbose else "..."), end="\r", flush=True)
            if args.verbose:
                print()
        result = evaluation.evaluate(g, r, constructions=args.constructions, units=args.units,
                                     fscore=args.fscore, errors=args.errors, verbose=verbose,
                                     normalize=args.normalize, ref_yield_tags=ryt,
                                     eval_type=evaluation.UNLABELED if args.unlabeled else None)
        if verbose:
            if args.errors:
                result.print_confusion_matrix(as_table=args.as_table)
            if not args.quiet:
                print_f1(result, eval_type)
        results.append(result)
    summarize(args, results, eval_type=eval_type)
def main(filenames, write, **kwargs):
    uploader = TaskUploader(**kwargs)
    downloader = TaskDownloader(**kwargs)
    scores = []
    try:
        for pattern in filenames:
            matched = glob(pattern)  # renamed from "filenames" to avoid shadowing the parameter
            if not matched:
                raise IOError("Not found: " + pattern)
            for ref in read_files_and_dirs(matched):
                print("Converting passage " + ref.ID + "... ", end="")
                task = uploader.upload_task(ref)
                guessed = downloader.download_task(task["id"], write=write, **kwargs)
                score = evaluate(guessed, ref, **kwargs)
                print("F1=%.3f" % score.average_f1())
                scores.append(score)
    except HTTPError as e:
        try:
            raise ValueError(e.response.json()) from e
        except JSONDecodeError:
            raise ValueError(e.response.text) from e
    print()
    if len(scores) > 1:
        print("Aggregated scores:")
        Scores.aggregate(scores).print()
def main(args):
    guessed, ref = [ioutil.read_files_and_dirs((x,)) for x in (args.guessed, args.ref)]
    guessed = match_by_id(guessed, ref)
    results = []
    for g, r in zip(guessed, ref):
        if len(guessed) > 1:
            sys.stdout.write("\rEvaluating %s%s" % (g.ID, ":" if args.verbose else "..."))
            sys.stdout.flush()
            if args.verbose:
                print()
        result = evaluation.evaluate(g, r, constructions=args.constructions, units=args.units,
                                     fscore=args.fscore, errors=args.errors,
                                     verbose=args.verbose or len(guessed) == 1,
                                     normalize=args.normalize)
        if args.verbose:
            print("Average labeled F1 score: %.3f\n" % result.average_f1())
        results.append(result)
    summarize(args, results)
def read_passages(args, files):
    expanded = [f for pattern in files for f in glob(pattern) or (pattern,)]
    return ioutil.read_files_and_dirs(expanded, sentences=args.sentences, paragraphs=args.paragraphs,
                                      converters=CONVERTERS, lang=Config().args.lang)
def test_read_files_and_dirs(suffix):
    for passage in read_files_and_dirs(glob(os.path.join("test_files", "*." + suffix)),
                                       converters=FROM_FORMAT):
        assert passage.layer(layer0.LAYER_ID).all, "No terminals in passage " + passage.ID
        assert len(passage.layer(layer1.LAYER_ID).all), \
            "No non-terminals other than the root in passage " + passage.ID
def test_shuffle_passages(self):
    """Test lazy-loading passages and shuffling them"""
    files = ["test_files/standard3.%s" % s for s in ("xml", "conll", "export", "sdp")]
    passages = ioutil.read_files_and_dirs(files)
    print("Passages:\n" + "\n".join(str(p.layer(layer1.LAYER_ID).heads[0]) for p in passages))
    random.shuffle(passages)
    print("Shuffled passages:\n" + "\n".join(str(p.layer(layer1.LAYER_ID).heads[0]) for p in passages))
    self.assertEqual(len(files), len(passages))
def load_passage(filename, annotate=False):
    WIKIFIER.enabled = False
    converters = {k: partial(c, annotate=annotate) for k, c in FROM_FORMAT.items()}
    passages = ioutil.read_files_and_dirs(filename, converters=converters, attempts=1, delay=0)
    try:
        return next(iter(passages))
    except StopIteration:
        return passages
def copy_annotation(passages, conllu, as_array=True, verbose=False):
    if not as_array:
        raise ValueError("Annotating with CoNLL-U files and as_array=False is currently not supported; "
                         "use --as-array")
    for passage, annotated in zip(passages, read_files_and_dirs(conllu, converters=CONVERTERS)):
        if verbose:
            with tqdm.external_write_mode():
                print("Reading annotation from '%s'" % annotated.ID)
        passage.layer(layer0.LAYER_ID).docs()[:] = annotated.layer(layer0.LAYER_ID).docs()
        yield passage
def read_specs(args):
    specs = [(pattern, args.out_dir, args.lang) for pattern in args.filenames]
    if args.list_file:
        with open(args.list_file, encoding="utf-8") as f:
            specs += [l.strip().split() for l in f if not l.startswith("#")]
    for pattern, out_dir, lang in specs:
        os.makedirs(out_dir, exist_ok=True)
        filenames = glob(pattern)
        if not filenames:
            raise IOError("Not found: " + pattern)
        yield read_files_and_dirs(filenames, converters=FROM_FORMAT), out_dir, lang
def copy_annotation(passages, conllu, as_array=True, as_extra=True, verbose=False, lang=None):
    for passage, annotated in zip(passages, read_files_and_dirs(conllu, converters=CONVERTERS)):
        if verbose:
            with external_write_mode():
                print("Reading annotation from '%s'" % annotated.ID)
        if as_array:
            passage.layer(layer0.LAYER_ID).docs()[:] = annotated.layer(layer0.LAYER_ID).docs()
        if as_extra:
            for terminal, annotated_terminal in zip(passage.layer(layer0.LAYER_ID).all,
                                                    annotated.layer(layer0.LAYER_ID).all):
                copy_tok_to_extra(annotated_terminal, terminal, lang=lang)
        yield passage
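# A hypothetical usage sketch of copy_annotation() above: the generator pairs
# each passage with the CoNLL-U passage at the same position, so both inputs
# must be in the same order. The path and the process() helper are made up.
for annotated_passage in copy_annotation(passages, "annotations.conllu", lang="en"):
    process(annotated_passage)  # hypothetical downstream step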
def upload_tasks(self, filenames, **kwargs):
    del kwargs
    try:
        for pattern in filenames:
            matched = glob(pattern)  # renamed from "filenames" to avoid shadowing the parameter
            if not matched:
                raise IOError("Not found: " + pattern)
            for passage in read_files_and_dirs(matched):
                task = self.upload_task(passage)
                print("Submitted task %d" % task["id"])
                yield task
    except HTTPError as e:
        try:
            raise ValueError(e.response.json()) from e
        except JSONDecodeError:
            raise ValueError(e.response.text) from e
def read_specs(args, converters=None):
    specs = [(pattern, args.out_dir, args.lang, args.udpipe, args.conllu, args.join)
             for pattern in args.filenames]
    if args.list_file:
        with open(args.list_file, encoding="utf-8") as f:
            specs += [l.strip().split() for l in f if not l.startswith("#")]
    for spec in specs:
        pattern = spec[0]
        filenames = glob(pattern)
        if not filenames:
            raise IOError("Not found: " + pattern)
        yield AnnotationSpecification(passages=read_files_and_dirs(filenames, converters=converters),
                                      out_dir=spec[1] if len(spec) > 1 else args.out_dir,
                                      lang=spec[2] if len(spec) > 2 else args.lang,
                                      udpipe=spec[3] if len(spec) > 3 else args.udpipe,
                                      conllu=spec[4] if len(spec) > 4 else args.conllu,
                                      join=spec[5] if len(spec) > 5 else args.join)
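# A hypothetical --list-file line, inferred from the spec tuple layout above;
# fields are whitespace-separated, and missing trailing fields fall back to
# the command-line defaults (out_dir, lang, udpipe, conllu, join):
#   corpus/*.xml  out/  en  models/english.udpipe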
def probs(d):
    filename = d + ".freq.csv"
    try:
        with open(filename, encoding="utf-8") as f:
            counts = dict((key, int(value)) for key, value in csv.reader(f))
        print("Loaded '%s'" % filename)
    except IOError:
        counts = Counter()
        for p in tqdm(read_files_and_dirs(d), unit=" passages", desc="Reading %s" % d):
            for t in p.layer(layer0.LAYER_ID).all:
                counts[t.text] += 1
        with open(filename, "w", encoding="utf-8") as f:
            csv.writer(f).writerows(counts.most_common())
        print("Saved '%s'" % filename)
    s = sum(counts.values())
    return {key: float(value) / s for key, value in counts.items()}
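# A hypothetical usage sketch of probs() above: token counts are cached in
# "<dir>.freq.csv", so only the first call per directory reads the passages.
# The directory name is made up.
dist = probs("corpus_dir")
assert abs(sum(dist.values()) - 1.0) < 1e-6  # relative frequencies sum to 1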
def main(args):
    guessed, ref = [ioutil.read_files_and_dirs((x,), converters=FROM_FORMAT)
                    for x in (args.guessed, args.ref)]
    if len(guessed) != len(ref):
        raise ValueError("Number of passages to compare does not match: %d != %d" %
                         (len(guessed), len(ref)))
    if len(guessed) > 1:
        guessed_by_id = {g.ID: g for g in tqdm(guessed, desc="Reading " + args.guessed,
                                               unit=" passages")}
        try:
            guessed = [guessed_by_id[p.ID] for p in tqdm(ref, desc="Reading " + args.ref,
                                                         unit=" passages")]
        except KeyError as e:
            raise ValueError("Passage IDs do not match") from e
    results = [evaluate(g, r, errors=True)
               for g, r in zip(tqdm(guessed, desc="Evaluating", unit=" passages"), ref)]
    confusion_matrix = Scores.aggregate(results).evaluators[LABELED].results[PRIMARY].errors.most_common()
    label_map = {}
    for (g, r), _ in confusion_matrix:
        g, *_ = g.partition("|")
        prefix, *_ = g.partition(":")
        if not any(l.startswith(prefix) for l in label_map):  # drop suffix for most common label
            g = prefix
        if g not in label_map:
            label_map[g], *_ = r.partition("|")
    with open(args.out_file, "w", encoding="utf-8") as f:
        csv.writer(f).writerows(tqdm(sorted(label_map.items()), desc="Writing " + args.out_file,
                                     unit=" rows"))
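# A worked example (made-up labels) of the mapping rule above: the first time
# a prefix is seen, the suffix is dropped, so the most frequent confusion per
# prefix claims the bare prefix; later labels keep their full form.
#   ("A:time|rem", "P|rem") -> prefix "A" unseen  -> label_map["A"] = "P"
#   ("A:loc", "Q")          -> a key starts with "A" -> label_map["A:loc"] = "Q"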
help="prints the error distribution according to its frequency") argparser.add_argument("--no-normalize", dest="normalize", action="store_false", help="do not normalize passages before evaluation") argparser.add_argument("--out-file", help="file to write results for each evaluated passage to, in CSV format") argparser.add_argument("--summary-file", help="file to write aggregated results to, in CSV format") group = argparser.add_mutually_exclusive_group() group.add_argument("-v", "--verbose", action="store_true", help="prints the results for every single pair (always true if there is only one pair)") group.add_argument("-q", "--quiet", action="store_true", help="do not print anything") constructions.add_argument(argparser) args = argparser.parse_args() if not (args.units or args.fscore or args.errors): argparser.error("At least one of -u, -f or -e is required.") guessed, ref = [ioutil.read_files_and_dirs((x,)) for x in (args.guessed, args.ref)] if len(guessed) != len(ref): raise ValueError("Number of passages to compare does not match: %d != %d" % (len(guessed), len(ref))) if len(guessed) > 1: guessed_by_id = {} for g in guessed: sys.stdout.write("\rReading %s..." % g.ID) sys.stdout.flush() guessed_by_id[g.ID] = g ids = [p.ID for p in ref] try: guessed = [guessed_by_id[i] for i in ids] except KeyError as e: raise ValueError("Passage IDs do not match") from e results = [] for g, r in zip(guessed, ref):
import os
from pdb import set_trace

import oracle
from ucca import diffutil, ioutil, textutil, layer1, evaluation  # extra modules kept available for the pdb session

files = ['../ucca_corpus_pickle/' + f for f in os.listdir('../ucca_corpus_pickle')]
passages = list(ioutil.read_files_and_dirs(files))
passage = passages[0]
ora = oracle.Oracle(passage)
set_trace()
from argparse import ArgumentParser

from ucca import constructions
from ucca.ioutil import read_files_and_dirs

if __name__ == "__main__":
    argparser = ArgumentParser(description="Extract linguistic constructions from UCCA corpus.")
    argparser.add_argument("passages", nargs="+", help="the corpus, given as xml/pickle file names")
    constructions.add_argument(argparser, False)
    argparser.add_argument("-v", "--verbose", action="store_true", help="print tagged text for each passage")
    args = argparser.parse_args()
    for passage in read_files_and_dirs(args.passages):
        if args.verbose:
            print("%s:" % passage.ID)
        extracted = constructions.extract_edges(passage, constructions=args.constructions,
                                                verbose=args.verbose)
        if any(extracted.values()):
            if not args.verbose:
                print("%s:" % passage.ID)
            for construction, edges in extracted.items():
                if edges:
                    print("  %s:" % construction.description)
                    for edge in edges:
                        print("    %s [%s %s]" % (edge, edge.tag, edge.child))
            print()
def test_passage():
    return next(iter(ioutil.read_files_and_dirs(("test_files/120.xml",))))
def read_passages(args, files):
    expanded = [f for pattern in files for f in sorted(glob(pattern)) or (pattern,)]
    return ioutil.read_files_and_dirs(expanded, sentences=args.sentences, paragraphs=args.paragraphs,
                                      converters=CONVERTERS, lang=Config().args.lang)
def load_passage(filename):
    passages = ioutil.read_files_and_dirs(filename, attempts=1, delay=0)
    try:
        return next(iter(passages))
    except StopIteration:
        return passages
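# A hypothetical usage sketch: load_passage() above returns the first passage
# found, or the exhausted lazy collection if the file yields none, so callers
# may want to check the return type. The file name is made up.
passage = load_passage("test_files/standard3.xml")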
action="store_true", help= "prints the results for every single pair (always true if there is only one pair)" ) group.add_argument("-q", "--quiet", action="store_true", help="do not print anything") constructions.add_argument(argparser) args = argparser.parse_args() if not (args.units or args.fscore or args.errors): argparser.error("At least one of -u, -f or -e is required.") guessed, ref = [ ioutil.read_files_and_dirs((x, )) for x in (args.guessed, args.ref) ] if len(guessed) != len(ref): raise ValueError( "Number of passages to compare does not match: %d != %d" % (len(guessed), len(ref))) if len(guessed) > 1: guessed_by_id = {} for g in guessed: sys.stdout.write("\rReading %s..." % g.ID) sys.stdout.flush() guessed_by_id[g.ID] = g ids = [p.ID for p in ref] try: guessed = [guessed_by_id[i] for i in ids] except KeyError as e:
def read_passages(args, files):
    # Unknown file formats fall back to the configured input converter, or to plain text
    return ioutil.read_files_and_dirs(files, args.sentences, args.paragraphs,
                                      defaultdict(lambda: Config().input_converter or from_text,
                                                  FROM_FORMAT))
def load_passages():
    passages = []
    for _ in range(NUM_PASSAGES):
        passages += ioutil.read_files_and_dirs(("ucca/test_files/standard3.xml",))
    return passages
from configargparse import ArgParser
from tqdm import tqdm

from ucca import layer1
from ucca.ioutil import read_files_and_dirs

argparser = ArgParser()
argparser.add_argument("dir")
argparser.add_argument("-v", "--verbose", action="store_true")
args = argparser.parse_args()
all_tags = {}
childful = {}
for p in tqdm(read_files_and_dirs(args.dir)):
    for n in p.layer(layer1.LAYER_ID).all:
        for e in n:
            all_tags[e.tag] = n
            if any(isinstance(x, layer1.FoundationalNode) for x in e.child.children):
                childful[e.tag] = n
print("All tags: %d" % len(all_tags))
childless = set(all_tags).difference(childful)
print("Childless: %d (%s)" % (len(childless), ", ".join(childless)))
if args.verbose:
    print("\n".join("%s: %s" % (tag, node) for tag, node in childful.items()))
def loaded(filename=None):
    return next(iter(read_files_and_dirs(filename or "test_files/conversion/120.xml")))
def test_load_passage():
    _test_passages(ioutil.read_files_and_dirs(glob(os.path.join("test_files", "standard3.xml"))))
def read_passages(file_dirs):
    logger.info("")
    logger.info("Reading from %s" % file_dirs)
    return tqdm(ioutil.read_files_and_dirs(file_dirs))