def parsed_sentence2xml(sentence, parse_dir, sent_id=None, normalize_sentence=normalize_sentence):
    """Load the parsed passage (XML/pickle) corresponding to a sentence.

    :param sentence: raw sentence text, used to locate the parse file when no ID is given
    :param parse_dir: root directory containing the parse files
    :param sent_id: optional explicit sentence ID; when given, the file is located
        directly instead of being looked up by sentence content
    :param normalize_sentence: normalization function used during the lookup
        (bound at definition time on purpose, so later rebinding of the
        module-level name does not change this default)
    :return: the passage read from the located file
    """
    if sent_id is None:
        location = get_parsed_subdir(sentence, parse_dir)
        filename = parse_location(location, "", get_sentence_id(sentence, location, False, normalize_sentence))
        return file2passage(filename)
    return file2passage(parse_location(parse_dir, sentence, sent_id))
def main():
    """Convert passages to the given format and back, then evaluate the round trip."""
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="file names to convert and evaluate")
    argparser.add_argument("-f", "--format", required=True, choices=convert.CONVERTERS, help="input file format")
    argparser.add_argument("-T", "--tree", action="store_true", help="remove multiple parents to get a tree")
    args = argparser.parse_args()
    converter1 = convert.TO_FORMAT[args.format]
    converter2 = convert.FROM_FORMAT[args.format]
    scores = []
    for pattern in args.filenames:
        filenames = glob.glob(pattern)
        if not filenames:
            raise IOError("Not found: " + pattern)
        for filename in filenames:
            ref = file2passage(filename)
            try:
                guessed = next(converter2(converter1(ref, tree=args.tree), ref.ID))
                scores.append(evaluate(guessed, ref, fscore=True, verbose=False, units=False, errors=False))
            except Exception as e:
                # Chain the original exception instead of stuffing it into the
                # ValueError args, so the full traceback is preserved.
                raise ValueError("Error evaluating conversion of %s" % filename) from e
    if len(scores) > 1:
        print("Aggregated scores:")
        Scores.aggregate(scores).print()
    sys.exit(0)
def main(args):
    """Write passages as plain text: one joined file, or one .txt per passage."""
    os.makedirs(args.outdir, exist_ok=True)
    text_kwargs = dict(sentences=args.sentences, lang=args.lang, prepend_id=args.prepend_id)
    if args.join:
        joined_path = os.path.join(args.outdir, args.join)
        ordered = sorted(args.filenames, key=numeric)
        with open(joined_path, "w", encoding="utf-8") as out:
            for passage in get_passages_with_progress_bar(ordered, desc="Converting"):
                write_text(passage, out, **text_kwargs)
        print("Wrote '%s'." % joined_path)
        return
    # No join target: emit one text file per input passage.
    for pattern in args.filenames:
        matches = glob(pattern) or [pattern]
        for in_path in tqdm(matches, desc="Converting", unit=" passages"):
            stem = os.path.splitext(os.path.basename(in_path))[0]
            passage = file2passage(in_path)
            with open(os.path.join(args.outdir, stem + ".txt"), "w", encoding="utf-8") as out:
                write_text(passage, out, **text_kwargs)
def main():
    """Collect parent/children count histograms over passages, then write and plot them."""
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument('filenames', nargs='+', help="file names to analyze")
    argparser.add_argument('-o', '--outfile', default="data/counts_", help="output file prefix for histogram")
    argparser.add_argument('-p', '--plot', default="data/plot_", help="output file prefix for plot image file")
    args = argparser.parse_args()
    histograms = defaultdict(Counter)
    for pattern in args.filenames:
        for filename in glob.glob(pattern):
            sys.stderr.write("Reading passage '%s'...\n" % filename)
            passage = file2passage(filename)
            for node in passage.layer("1").all:
                if node.ID != "1.1":  # Exclude the root node
                    histograms["parents"][clip(node.incoming, 3)] += 1
                    histograms["children"][clip(node.outgoing, 7)] += 1
    for label, counter in histograms.items():
        handle = open(args.outfile + label + ".txt", 'w') if args.outfile else sys.stdout
        try:
            handle.writelines(["%s\t%d\n" % (num, count) for num, count in counter.items()])
        finally:
            # Close even if writing fails, but never close stdout.
            if handle is not sys.stdout:
                handle.close()
        try:
            plot_histogram(counter, label, plot=args.plot)
            plot_pie(counter, label, plot=args.plot)
        except Exception:  # was a bare except: don't swallow SystemExit/KeyboardInterrupt
            pass  # plotting is best-effort (e.g. matplotlib may be unavailable)
    sys.exit(0)
def main():
    """Split each passage into sentences and write every sentence to its own file."""
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument('filenames', nargs='+', help="passage file names to convert")
    argparser.add_argument('-o', '--outdir', default='.', help="output directory")
    argparser.add_argument('-p', '--prefix', default='', help="output filename prefix")
    argparser.add_argument('-r', '--remarks', action='store_true', help="annotate original IDs")
    argparser.add_argument("-b", "--binary", action="store_true", help="write in pickle binary format (.pickle)")
    args = argparser.parse_args()
    extension = "pickle" if args.binary else "xml"
    for filename in args.filenames:
        passage = file2passage(filename)
        for sentence in ucca.convert.split2sentences(passage, remarks=args.remarks):
            outfile = "%s/%s.%s" % (args.outdir, args.prefix + sentence.ID, extension)
            sys.stderr.write("Writing passage file for sentence '%s'...\n" % outfile)
            passage2file(sentence, outfile, args.binary)
    sys.exit(0)
def main(args):
    """Convert each input file to one or more passages and write them out."""
    for pattern in args.filenames:
        matched = glob.glob(pattern)
        if not matched:
            raise IOError("Not found: " + pattern)
        for filename in matched:
            stem, ext = os.path.splitext(filename)
            if ext in UCCA_EXT:
                # Already a UCCA file: read and write directly, no conversion.
                write_passage(ioutil.file2passage(filename), args)
                continue
            basename = os.path.basename(stem)
            match = re.search(r"\d+", basename)
            passage_id = match.group(0) if match else basename
            converter = CONVERTERS.get(args.input_format or ext.lstrip("."))
            if converter is None:
                raise IOError("Unknown extension '%s'. Specify format using -f" % ext)
            with open(filename, encoding="utf-8") as f:
                for passage in converter[0](f, passage_id, split=args.split, mark_aux=args.mark_aux):
                    write_passage(passage, args)
def read_files(files, verbose=0, force_basename=False, **kw):
    """Yield ConvertedPassage objects read from the given files.

    Files are sorted numerically by the digits in their names when possible.
    :param files: iterable of file names
    :param verbose: if nonzero, print a message per converted file
    :param force_basename: use the file's basename as the passage ID
    :param kw: must contain "format" (fallback format); passed on to converters
    """
    try:
        # Raw string for the regex (fixes the invalid-escape DeprecationWarning).
        files = sorted(files, key=lambda x: tuple(map(int, re.findall(r"\d+", x))) or (x, ))
    except TypeError as e:
        print("Cannot sort filenames: %s" % e, file=sys.stderr)
    for filename in files:
        basename, converted_format = passage_format(filename)
        if converted_format == "txt":
            converted_format = kw["format"]
        in_converter, out_converter = CONVERTERS.get(converted_format, CONVERTERS[kw["format"]])
        kwargs = dict(converted_format=converted_format, in_converter=in_converter, out_converter=out_converter)
        if in_converter:
            with open(filename, encoding="utf-8") as f:
                for converted, passage, passage_id in in_converter(
                        f, passage_id=basename, return_original=True, **kw):
                    if verbose:
                        with ioutil.external_write_mode():
                            print("Converting %s from %s" % (filename, converted_format))
                    yield ConvertedPassage(converted, passage, basename if force_basename else passage_id, **kwargs)
        else:
            passage_id = basename if force_basename else None
            yield ConvertedPassage(ioutil.file2passage(filename), passage_id=passage_id, **kwargs)
def main():
    """Find edge tags that never appear more than once on a single node."""
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument('-d', '--directory', required=True, help="directory with passage files to process")
    argparser.add_argument('-o', '--outfile', default="data/unique_roles.txt", help="output file for data")
    argparser.add_argument('-D', '--direction', default="out", help="direction of edges to check (out|in)")
    args = argparser.parse_args()
    outgoing = args.direction == "out"
    if not os.path.isdir(args.directory):
        raise Exception("Not a directory: " + args.directory)
    # Start from every known edge tag, then discard any tag seen twice on one node.
    roles = {tag for name, tag in layer1.EdgeTags.__dict__.items()
             if isinstance(tag, str) and not name.startswith('__')}
    for filename in os.listdir(args.directory):
        sys.stderr.write("Reading passage '%s'...\n" % filename)
        passage = file2passage(args.directory + os.path.sep + filename)
        for node in passage.layer(layer1.LAYER_ID).all:
            tag_counts = Counter(edge.tag for edge in (node if outgoing else node.incoming))
            roles.difference_update(tag for tag, count in tag_counts.items() if count > 1)
    lines = "\n".join(sorted(roles))
    print(lines)
    if args.outfile:
        with open(args.outfile, "w", encoding="utf-8") as f:
            print(lines, file=f)
    sys.exit(0)
def read_words_and_punctuations(args):
    """Collect word and punctuation token sets for the given directory.

    If XML passages exist there, extract the tokens from them and cache the
    sorted lists to words.txt / punctuations.txt; otherwise load those files.
    :return: (punctuations, words)
    """
    words_file_name = os.path.join(args.directory, "words.txt")
    punctuations_file_name = os.path.join(args.directory, "punctuations.txt")
    passages = glob.glob(args.directory + "/*.xml")
    if passages:
        words, punctuations = set(), set()
        for filename in passages:
            sys.stderr.write("Reading passage '%s'...\n" % filename)
            terminals = file2passage(filename).layer(layer0.LAYER_ID).all
            for terminal in terminals:
                text = terminal.attrib.get("text")
                if terminal.tag == layer0.NodeTags.Word:
                    words.add(text)
                elif terminal.tag == layer0.NodeTags.Punct:
                    punctuations.add(text)
        words = sorted(words)
        punctuations = sorted(punctuations)
        with open(words_file_name, "w") as words_file:
            words_file.writelines(word + "\n" for word in words)
        with open(punctuations_file_name, "w") as punctuations_file:
            punctuations_file.writelines(punctuation + "\n" for punctuation in punctuations)
    else:
        # No passages: fall back to the previously cached token lists.
        with open(words_file_name) as words_file:
            words = [line.rstrip() for line in words_file.readlines()]
        with open(punctuations_file_name) as punctuations_file:
            punctuations = [line.rstrip() for line in punctuations_file.readlines()]
    return punctuations, words
def main(args):
    """Convert every input file to passages and write them under args.out_dir."""
    os.makedirs(args.out_dir, exist_ok=True)
    for filename in tqdm(list(iter_files(args.filenames)), unit="file", desc="Converting"):
        if not os.path.isfile(filename):
            raise IOError("Not a file: %s" % filename)
        stem, ext = os.path.splitext(filename)
        if ext in UCCA_EXT:
            # Native UCCA file: no conversion needed.
            write_passage(ioutil.file2passage(filename), args)
            continue
        basename = os.path.basename(stem)
        match = re.search(r"\d+(\.\d+)*", basename)
        passage_id = match.group(0) if match else basename
        converter = CONVERTERS.get(args.input_format or ext.lstrip("."))
        if converter is None:
            raise IOError("Unknown extension '%s'. Specify format using -f" % ext)
        with open(filename, encoding="utf-8") as f:
            for passage in converter[0](f, args.prefix + passage_id, split=args.split, mark_aux=args.mark_aux):
                write_passage(passage, args)
def main():
    """Collect parent/children count histograms over passages, then write and plot them."""
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="file names to analyze")
    argparser.add_argument("-o", "--outfile", default="data/counts_", help="output file prefix for histogram")
    argparser.add_argument("-p", "--plot", default="data/plot_", help="output file prefix for plot image file")
    args = argparser.parse_args()
    histograms = defaultdict(Counter)
    for pattern in args.filenames:
        for filename in glob.glob(pattern):
            sys.stderr.write("Reading passage '%s'...\n" % filename)
            passage = file2passage(filename)
            for node in passage.layer(layer1.LAYER_ID).all:
                if node.ID != "1.1":  # Exclude the root node
                    histograms["parents"][clip(node.incoming, 3)] += 1
                    histograms["children"][clip(node.outgoing, 7)] += 1
    for label, counter in histograms.items():
        handle = open(args.outfile + label + ".txt", "w", encoding="utf-8") if args.outfile else sys.stdout
        try:
            handle.writelines(["%s\t%d\n" % (num, count) for num, count in counter.items()])
        finally:
            # Close even if writing fails, but never close stdout.
            if handle is not sys.stdout:
                handle.close()
        try:
            plot_histogram(counter, label, plot=args.plot)
            plot_pie(counter, label, plot=args.plot)
        except Exception:  # was a bare except: don't swallow SystemExit/KeyboardInterrupt
            pass  # plotting is best-effort
    sys.exit(0)
def read_words_and_punctuations(args):
    """Gather word and punctuation vocabularies for args.directory.

    Extracts tokens from XML passages when present (caching the sorted lists
    to words.txt / punctuations.txt); otherwise reads the cached files.
    :return: (punctuations, words)
    """
    words_file_name = os.path.join(args.directory, "words.txt")
    punctuations_file_name = os.path.join(args.directory, "punctuations.txt")
    passages = glob.glob(args.directory + "/*.xml")
    if not passages:
        # Nothing to extract: load the previously written token lists.
        with open(words_file_name) as words_file:
            words = [line.rstrip() for line in words_file.readlines()]
        with open(punctuations_file_name) as punctuations_file:
            punctuations = [line.rstrip() for line in punctuations_file.readlines()]
        return punctuations, words
    words, punctuations = set(), set()
    for filename in passages:
        sys.stderr.write("Reading passage '%s'...\n" % filename)
        terminals = file2passage(filename).layer(layer0.LAYER_ID).all
        words.update(t.attrib.get("text") for t in terminals if t.tag == layer0.NodeTags.Word)
        punctuations.update(t.attrib.get("text") for t in terminals if t.tag == layer0.NodeTags.Punct)
    words = sorted(words)
    punctuations = sorted(punctuations)
    with open(words_file_name, "w") as words_file:
        words_file.writelines(word + "\n" for word in words)
    with open(punctuations_file_name, "w") as punctuations_file:
        punctuations_file.writelines(punctuation + "\n" for punctuation in punctuations)
    return punctuations, words
def iter_passages(patterns, desc=None, input_format=None, prefix="", split=False, mark_aux=False, annotate=False):
    """Yield passages from all files matching the given patterns.

    UCCA files are read directly; other files go through the converter
    matching their format, falling back to plain-text conversion.
    """
    t = tqdm(list(iter_files(patterns)), unit="file", desc=desc)
    for filename in t:
        t.set_postfix(file=filename)
        if not os.path.isfile(filename):
            raise IOError("Not a file: %s" % filename)
        no_ext, ext = os.path.splitext(filename)
        if ext in UCCA_EXT:  # UCCA input
            yield ioutil.file2passage(filename)
        else:
            basename = os.path.basename(no_ext)
            try:
                passage_id = re.search(r"\d+(\.\d+)*", basename).group(0)
            except AttributeError:
                passage_id = basename
            # BUG FIX: the fallback default must be a 2-tuple like the dict
            # values -- unpacking (from_text,) into (converter, _) raised
            # ValueError whenever the format was unknown.
            converter, _ = CONVERTERS.get(input_format or ext.lstrip("."), (from_text, None))
            with open(filename, encoding="utf-8") as f:
                yield from converter(f, prefix + passage_id, split=split, mark_aux=mark_aux, annotate=annotate)
def main():
    """Find edge tags that never occur more than once on a single node.

    Reads all passages in the given directory, counts edge tags per node in
    the requested direction, and prints (and optionally saves) the tags that
    stay unique per node.
    """
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument('-d', '--directory', required=True, help="directory with passage files to process")
    argparser.add_argument('-o', '--outfile', default="data/unique_roles.txt", help="output file for data")
    argparser.add_argument('-D', '--direction', default="out", help="direction of edges to check (out|in)")
    args = argparser.parse_args()
    out = args.direction == "out"
    if not os.path.isdir(args.directory):
        raise Exception("Not a directory: " + args.directory)
    # Start from every known edge tag; drop any tag seen twice on one node.
    roles = set(tag for name, tag in layer1.EdgeTags.__dict__.items()
                if isinstance(tag, str) and not name.startswith('__'))
    for filename in os.listdir(args.directory):
        sys.stderr.write("Reading passage '%s'...\n" % filename)
        passage = file2passage(args.directory + os.path.sep + filename)
        for node in passage.layer(layer1.LAYER_ID).all:
            counts = Counter(edge.tag for edge in (node if out else node.incoming))
            roles.difference_update(tag for tag, count in counts.items() if count > 1)
    lines = "\n".join(sorted(roles))
    print(lines)
    if args.outfile:
        # Explicit encoding for consistency with the sibling variant of this
        # script (avoids depending on the platform default encoding).
        with open(args.outfile, "w", encoding="utf-8") as f:
            print(lines, file=f)
    sys.exit(0)
def main(args):
    """Convert each passage file to a pickle file in args.outdir."""
    for filename in args.filenames:
        print("Reading passage '%s'..." % filename, file=sys.stderr)
        passage = file2passage(filename)
        stem = os.path.splitext(os.path.basename(filename))[0]
        target = args.outdir + os.path.sep + stem + ".pickle"
        print("Writing file '%s'..." % target, file=sys.stderr)
        passage2file(passage, target, binary=True)
def main(args):
    """Convert each passage file to standard XML in args.outdir."""
    for filename in args.filenames:
        sys.stderr.write("Reading passage '%s'...\n" % filename)
        stem = os.path.splitext(os.path.basename(filename))[0]
        target = args.outdir + os.path.sep + stem + ".xml"
        passage = file2passage(filename)
        sys.stderr.write("Writing file '%s'...\n" % target)
        passage2file(passage, target)
def main():
    """Round-trip each passage through the chosen format and evaluate the result."""
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="file names to convert and evaluate")
    argparser.add_argument("-f", "--format", required=True, choices=convert.CONVERTERS, help="input file format")
    argparser.add_argument("-T", "--tree", action="store_true", help="remove multiple parents to get a tree")
    argparser.add_argument("-s", "--strict", action="store_true",
                           help="stop immediately if failed to convert or evaluate a file")
    argparser.add_argument("-v", "--verbose", action="store_true",
                           help="print evaluation results for each file separately")
    args = argparser.parse_args()
    to_format = convert.TO_FORMAT[args.format]
    from_format = convert.FROM_FORMAT[args.format]
    scores = []
    for pattern in args.filenames:
        matched = glob.glob(pattern)
        if not matched:
            raise IOError("Not found: " + pattern)
        for filename in matched:
            sys.stdout.write("\rConverting %s" % filename)
            sys.stdout.flush()
            ref = file2passage(filename)
            try:
                guessed = next(from_format(to_format(ref, tree=args.tree), ref.ID))
                scores.append(evaluate(guessed, ref, verbose=args.verbose))
            except Exception as e:
                if args.strict:
                    raise ValueError("Error evaluating conversion of %s" % filename) from e
                print("Error evaluating conversion of %s: %s" % (filename, e), file=sys.stderr)
    print()  # finish the "\rConverting ..." progress line
    if args.verbose and len(scores) > 1:
        print("Aggregated scores:")
        Scores.aggregate(scores).print()
    sys.exit(0)
def convert_passage(filename, converter, args):
    """Open a passage file and return (converted string, passage ID).

    :param filename: input passage file
    :param converter: function to use for conversion
    :param args: ArgumentParser object
    """
    passage = file2passage(filename)
    if args.sentences:
        passages = convert.split2sentences(passage)
    else:
        passages = [passage]
    lines = (line for p in passages for line in converter(p, args.test, args.tree, args.markaux))
    return "\n".join(lines), passage.ID
def main():
    """Join passage files into one passage (or one per shared ID prefix) and write it."""
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="passage file names to join")
    argparser.add_argument("-o", "--outdir", default=".", help="output directory")
    argparser.add_argument("-p", "--prefix", default="", help="output filename prefix")
    argparser.add_argument("-r", "--remarks", action="store_true", help="annotate original IDs")
    argparser.add_argument("-b", "--binary", action="store_true", help="write in pickle binary format (.pickle)")
    argparser.add_argument("-j", "--join-by-prefix", action="store_true",
                           help="join each set of passages whose IDs share all but the last 3 characters")
    args = argparser.parse_args()
    passages = [file2passage(filename)
                for pattern in args.filenames
                for filename in sorted(glob.glob(pattern))]
    if args.join_by_prefix:
        # Group passages by their ID minus the last 3 characters.
        subsets = defaultdict(list)
        for passage in passages:
            subsets[passage.ID[:-3]].append(passage)
    else:
        subsets = {passages[0].ID: passages}
    for passage_id, subset in sorted(subsets.items()):
        sys.stderr.write("Joining passages " + ", ".join(passage.ID for passage in subset) + "\n")
        # BUG FIX: join only the passages in this subset -- previously
        # join_passages was called on *all* passages, so with -j every
        # output file contained the join of the entire input.
        joined = ucca.convert.join_passages(subset, passage_id=passage_id, remarks=args.remarks)
        outfile = "%s/%s.%s" % (args.outdir, args.prefix + joined.ID, "pickle" if args.binary else "xml")
        sys.stderr.write("Writing joined passage file '%s'...\n" % outfile)
        passage2file(joined, outfile, args.binary)
    sys.exit(0)
def main(args):
    """Pickle every input passage file into args.outdir."""
    os.makedirs(args.outdir, exist_ok=True)
    for filename in tqdm(args.filenames, desc="Converting", unit=" passages"):
        if args.verbose:
            with external_write_mode():
                print("Reading passage '%s'..." % filename, file=sys.stderr)
        passage = file2passage(filename)
        stem = os.path.splitext(os.path.basename(filename))[0]
        target = args.outdir + os.path.sep + stem + ".pickle"
        if args.verbose:
            with external_write_mode():
                print("Writing file '%s'..." % target, file=sys.stderr)
        passage2file(passage, target, binary=True)
def convert_passage(filename, converter, args):
    """Read a passage file, convert it, and return the text plus the passage ID.

    :param filename: input passage file
    :param converter: function to use for conversion
    :param args: ArgumentParser object
    """
    passage = file2passage(filename)
    passages = convert.split2sentences(passage) if args.sentences else [passage]
    converted_lines = []
    for p in passages:
        converted_lines.extend(converter(p, args.test, args.tree, args.markaux))
    return "\n".join(converted_lines), passage.ID
def main(args):
    """Write passages out as text: a single joined file, or one .txt per passage."""
    os.makedirs(args.outdir, exist_ok=True)
    if args.join:
        out_file = os.path.join(args.outdir, args.join)
        with open(out_file, "w", encoding="utf-8") as f:
            passages = get_passages_with_progress_bar(sorted(args.filenames, key=numeric), desc="Converting")
            for passage in passages:
                write_text(passage, f, sentences=args.sentences, lang=args.lang, prepend_id=args.prepend_id)
        print("Wrote '%s'." % out_file)
    else:
        # No join target given: emit one text file per input passage.
        for pattern in args.filenames:
            for filename in tqdm(glob(pattern) or [pattern], desc="Converting", unit=" passages"):
                passage = file2passage(filename)
                out_path = os.path.join(args.outdir, os.path.splitext(os.path.basename(filename))[0] + ".txt")
                with open(out_path, "w", encoding="utf-8") as f:
                    write_text(passage, f, sentences=args.sentences, lang=args.lang, prepend_id=args.prepend_id)
def main():
    """Convert XML passage files to pickle files."""
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument('filenames', nargs='+', help="XML file names to convert")
    argparser.add_argument('-o', '--outdir', default='.', help="output directory")
    args = argparser.parse_args()
    for filename in args.filenames:
        sys.stderr.write("Reading passage '%s'...\n" % filename)
        passage = file2passage(filename)
        stem = os.path.splitext(os.path.basename(filename))[0]
        outfile = args.outdir + os.path.sep + stem + ".pickle"
        sys.stderr.write("Writing file '%s'...\n" % outfile)
        passage2file(passage, outfile, binary=True)
    sys.exit(0)
def read_files(files, default_format=None, verbose=0, force_basename=False):
    """Yield ConvertedPassage objects read from the given files.

    :param files: iterable of file names, sorted numerically when possible
    :param default_format: format to fall back on when none matches the file
    :param verbose: if nonzero, print a message per converted file
    :param force_basename: use the file's basename as the passage ID
    """
    try:
        # Raw-string regex (fixes invalid-escape warning); fall back to a
        # 1-tuple so the key type stays uniform, and guard the sort against
        # unorderable keys -- consistent with the newer read_files variant.
        files = sorted(files, key=lambda x: tuple(map(int, re.findall(r"\d+", x))) or (x,))
    except TypeError as e:
        print("Cannot sort filenames: %s" % e, file=sys.stderr)
    for filename in files:
        basename, converted_format = passage_format(filename)
        in_converter, out_converter = CONVERTERS.get(converted_format, CONVERTERS[default_format])
        kwargs = dict(converted_format=converted_format, in_converter=in_converter, out_converter=out_converter)
        if in_converter:
            with open(filename, encoding="utf-8") as f:
                for converted, passage, passage_id in in_converter(f, passage_id=basename, return_original=True):
                    if verbose:
                        with tqdm.external_write_mode():
                            print("Converting %s from %s" % (filename, converted_format))
                    yield ConvertedPassage(converted, passage, basename if force_basename else passage_id, **kwargs)
        else:
            passage_id = basename if force_basename else None
            yield ConvertedPassage(ioutil.file2passage(filename), passage_id=passage_id, **kwargs)
def main():
    """Re-tag terminals as punctuation or word based on their text, fixing passages in place."""
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument('directory', help="directory containing XML files to process")
    args = argparser.parse_args()
    for filename in glob.glob(args.directory + "/*.xml"):
        sys.stderr.write("Fixing passage '%s'...\n" % filename)
        passage = file2passage(filename)
        for terminal in passage.layer(layer0.LAYER_ID).all:
            if is_punctuation(terminal.attrib.get("text")):
                terminal.tag = layer0.NodeTags.Punct
            else:
                terminal.tag = layer0.NodeTags.Word
        passage2file(passage, filename, indent=False)
    sys.exit(0)
def parse_sentences(self, sentences):
    """Parse raw sentences by shelling out to 'python -m tupa'.

    Each sentence is written to its own file in a temporary directory, tupa
    is run once over that directory, and the parsed XML outputs are read
    back as passages.
    :param sentences: iterable of sentence strings
    :return: list of parsed passages, or an empty list if the command failed
    """
    parsed_passages = []
    # NOTE: tempfile.mkdtemp() leaves the directory in place; consider
    # 'with tempfile.TemporaryDirectory() as dir_name' instead, which would
    # create the directory and delete it with all its contents afterwards.
    dir_name = tempfile.mkdtemp()
    print("using directory {} for input to and output from 'python -m tupa command'".format(dir_name),
          file=sys.stderr)
    for count, sentence in enumerate(sentences):
        input_path = '{}/file_{}'.format(dir_name, count)
        with open(input_path, 'w') as input_file:  # renamed from 'input': don't shadow the builtin
            input_file.write(sentence)
    command = 'cd {}; python -m tupa {} -m {} -p parsed_ -o {}'.format(
        self._tupa_utility_path, dir_name, self._model_prefix, dir_name)
    result = subprocess.run([command], shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                            universal_newlines=True)
    if result.returncode != 0:
        # Typo fixed in the error message ("commnd" -> "command").
        print("command '{}' failed ".format(command), file=sys.stderr)
        print("it's output was:\n{}".format(result.stdout), file=sys.stderr)
        # return empty list of parsed outputs
        return []
    for count, _ in enumerate(sentences):
        output_file = '{}/parsed_file_{}_0.xml'.format(dir_name, count)
        internal_parsed_passage = file2passage(output_file)
        parsed_passage = TupaParser2.__get_ucca_parsed_passage_from_passage(internal_parsed_passage)
        parsed_passages.append(parsed_passage)
    return parsed_passages
def iter_passages(patterns, desc=None, input_format=None, prefix="", label_map=None, output_format=None, **kwargs):
    """Yield passages from all files matching the given patterns.

    UCCA files are read directly; other files go through the FROM_FORMAT
    converter matching their format, defaulting to plain-text conversion.
    """
    t = tqdm(list(iter_files(patterns)), unit="file", desc=desc)
    for filename in t:
        t.set_postfix(file=os.path.basename(filename))
        if not os.path.isfile(filename):
            raise IOError("Not a file: %s" % filename)
        no_ext, ext = os.path.splitext(filename)
        if ext in UCCA_EXT:  # UCCA input
            yield ioutil.file2passage(filename)
        else:
            basename = os.path.basename(no_ext)
            try:
                passage_id = re.search(r"\d+(\.\d+)*", basename).group(0)
            except AttributeError:
                passage_id = basename
            # BUG FIX: the fallback must be the converter itself, not a tuple --
            # the value is called directly below, and a tuple is not callable,
            # so the unknown-format case raised TypeError.
            converter = FROM_FORMAT.get(input_format or ext.lstrip("."), from_text)
            with open(filename, encoding="utf-8") as f:
                yield from converter(f, prefix + passage_id, format=output_format if label_map else None, **kwargs)
def main():
    """Print per-passage UCCA statistics as CSV; optionally save them sorted by ID."""
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="files to process")
    argparser.add_argument("-o", "--outfile", help="output file for data")
    args = argparser.parse_args()
    # CSV header; one row per passage is printed in the loop below.
    print("id,passages,paragraphs,sentences,nodes,terminals,non-terminals,implicit,linkage,discont,"
          "edges,primary,remote,linkage,parents,children,mult-parents")
    data = []
    for pattern in args.filenames:
        for filename in glob.glob(pattern):
            passage = file2passage(filename)
            terminals = passage.layer(layer0.LAYER_ID).all
            # "1.1" is the root node; it is excluded from all node statistics.
            non_terminals = [n for n in passage.layer(layer1.LAYER_ID).all if n.ID != "1.1"]
            non_linkage = [n for n in non_terminals if n.tag != NodeTags.Linkage]
            linkage_nodes = passage.layer(layer1.LAYER_ID).top_linkages
            edges = {e for n in non_terminals for e in n}
            remote = [e for e in edges if e.attrib.get("remote")]
            linkage_edges = [e for n in linkage_nodes for e in n]
            fields = (int(passage.ID),
                      1,
                      len({t.paragraph for t in terminals}),
                      len(break2sentences(passage)),
                      len(terminals) + len(non_terminals),
                      len(terminals),
                      len(non_terminals),
                      len([n for n in non_linkage if n.attrib.get("implicit")]),
                      len(linkage_nodes),
                      len([n for n in non_linkage if n.tag == NodeTags.Foundational and n.discontiguous]),
                      len(edges),
                      # primary edges = all edges minus remote and linkage edges
                      len(edges) - len(remote) - len(linkage_edges),
                      len(remote),
                      len(linkage_edges),
                      sum(len([p for p in n.parents if p.ID != "1.1"]) for n in non_linkage),
                      sum(len(n.children) for n in non_linkage),
                      len([n for n in non_linkage if len([p for p in n.parents if p.ID != "1.1"]) > 1]),
                      )
            print(",".join("%d" % f for f in fields))
            data.append(fields)
    data = np.array(data, dtype=int)
    if args.outfile:
        # Save the rows sorted by passage ID (column 0), tab-separated.
        np.savetxt(args.outfile, data[data[:, 0].argsort()], fmt="%i", delimiter="\t")
    sys.exit(0)
def main():
    """Print per-passage UCCA statistics as CSV; optionally save them sorted by ID."""
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="files to process")
    argparser.add_argument("-o", "--outfile", help="output file for data")
    args = argparser.parse_args()
    # CSV header; one row per passage is printed in the loop below.
    print("id,passages,paragraphs,sentences,nodes,terminals,non-terminals,implicit,linkage,discont,"
          "edges,primary,remote,linkage,parents,children,mult-parents")
    data = []
    for pattern in args.filenames:
        for filename in glob.glob(pattern):
            passage = file2passage(filename)
            terminals = passage.layer(layer0.LAYER_ID).all
            # "1.1" is the root node; it is excluded from all node statistics.
            non_terminals = [n for n in passage.layer(layer1.LAYER_ID).all if n.ID != "1.1"]
            non_linkage = [n for n in non_terminals if n.tag != NodeTags.Linkage]
            linkage_nodes = passage.layer(layer1.LAYER_ID).top_linkages
            edges = {e for n in non_terminals for e in n}
            remote = [e for e in edges if e.attrib.get("remote")]
            linkage_edges = [e for n in linkage_nodes for e in n]
            fields = (int(passage.ID),
                      1,
                      len({t.paragraph for t in terminals}),
                      len(break2sentences(passage)),
                      len(terminals) + len(non_terminals),
                      len(terminals),
                      len(non_terminals),
                      len([n for n in non_linkage if n.attrib.get("implicit")]),
                      len(linkage_nodes),
                      len([n for n in non_linkage if n.tag == NodeTags.Foundational and n.discontiguous]),
                      len(edges),
                      # primary edges = all edges minus remote and linkage edges
                      len(edges) - len(remote) - len(linkage_edges),
                      len(remote),
                      len(linkage_edges),
                      sum(len([p for p in n.parents if p.ID != "1.1"]) for n in non_linkage),
                      sum(len(n.children) for n in non_linkage),
                      len([n for n in non_linkage if len([p for p in n.parents if p.ID != "1.1"]) > 1]),
                      )
            print(",".join("%d" % f for f in fields))
            data.append(fields)
    data = np.array(data, dtype=int)
    if args.outfile:
        # BUG FIX: -o is optional, but savetxt was called unconditionally and
        # crashed when no outfile was given (sibling script guards this too).
        np.savetxt(args.outfile, data[data[:, 0].argsort()], fmt="%i", delimiter="\t")
    sys.exit(0)
def main():
    """Convert each passage to the requested format and back, evaluating the round trip."""
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="file names to convert and evaluate")
    argparser.add_argument("-f", "--format", required=True, choices=convert.CONVERTERS, help="input file format")
    argparser.add_argument("-T", "--tree", action="store_true", help="remove multiple parents to get a tree")
    argparser.add_argument("-s", "--strict", action="store_true",
                           help="stop immediately if failed to convert or evaluate a file")
    argparser.add_argument("-v", "--verbose", action="store_true",
                           help="print evaluation results for each file separately")
    args = argparser.parse_args()
    convert_to = convert.TO_FORMAT[args.format]
    convert_from = convert.FROM_FORMAT[args.format]
    scores = []
    for pattern in args.filenames:
        filenames = glob.glob(pattern)
        if not filenames:
            raise IOError("Not found: " + pattern)
        for filename in filenames:
            sys.stdout.write("\rConverting %s" % filename)
            sys.stdout.flush()
            reference = file2passage(filename)
            try:
                round_tripped = next(convert_from(convert_to(reference, tree=args.tree), reference.ID))
                scores.append(evaluate(round_tripped, reference, verbose=args.verbose))
            except Exception as e:
                if not args.strict:
                    print("Error evaluating conversion of %s: %s" % (filename, e), file=sys.stderr)
                else:
                    raise ValueError("Error evaluating conversion of %s" % filename) from e
    print()  # finish the "\rConverting ..." progress line
    if args.verbose and len(scores) > 1:
        print("Aggregated scores:")
        Scores.aggregate(scores).print()
    sys.exit(0)
def main():
    """Evaluate how faithfully passages survive conversion to and from a format."""
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="file names to convert and evaluate")
    argparser.add_argument("-f", "--format", required=True, choices=convert.CONVERTERS, help="input file format")
    argparser.add_argument("-T", "--tree", action="store_true", help="remove multiple parents to get a tree")
    args = argparser.parse_args()
    converter1 = convert.TO_FORMAT[args.format]
    converter2 = convert.FROM_FORMAT[args.format]
    scores = []
    for pattern in args.filenames:
        filenames = glob.glob(pattern)
        if not filenames:
            raise IOError("Not found: " + pattern)
        for filename in filenames:
            ref = file2passage(filename)
            # BUG FIX: the parsed --tree flag was never used; pass it through
            # like the sibling scripts do, so -T actually takes effect.
            guessed = next(converter2(converter1(ref, tree=args.tree), ref.ID))
            scores.append(evaluate(guessed, ref, fscore=True, verbose=True, units=False, errors=False))
    if len(scores) > 1:
        print("Aggregated scores:")
        Scores.aggregate(scores).print()
    sys.exit(0)
def read_passages(files):
    """
    :param files: iterable of files or Passage objects
    :return: generator of passages from all files given
    """
    for file in files:
        if isinstance(file, core.Passage):  # Not really a file, but a Passage
            passage = file
        elif os.path.exists(file):  # A file
            try:
                passage = ioutil.file2passage(file)  # XML or binary format
            except (IOError, ParseError):  # Failed to read as passage file
                # Fall back to a format converter chosen by file extension
                # (plain text when the extension is unknown).
                base, ext = os.path.splitext(os.path.basename(file))
                converter = convert.FROM_FORMAT.get(ext.lstrip("."), convert.from_text)
                with open(file) as f:
                    yield from converter(f, passage_id=base, split=Config().split)
                # The converter already yielded (possibly split) passages;
                # skip the Config-driven splitting below.
                continue
        else:
            raise IOError("File not found: %s" % file)
        if Config().split:
            # Split the passage into segments (sentences or paragraphs,
            # per the global configuration).
            yield from convert.split2segments(passage, is_sentences=Config().sentences)
        else:
            yield passage
def main():
    """Annotate passage files in place and rewrite them."""
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="passage file names to annotate")
    argparser.add_argument("-v", "--verbose", action="store_true", help="print tagged text for each passage")
    args = argparser.parse_args()
    for pattern in args.filenames:
        matched = glob.glob(pattern)
        if not matched:
            raise IOError("Not found: " + pattern)
        for filename in matched:
            passage = file2passage(filename)
            annotate(passage, verbose=args.verbose, replace=True)
            sys.stderr.write("Writing '%s'...\n" % filename)
            # Keep the original serialization: pickle unless the name ends with "xml".
            passage2file(passage, filename, binary=not filename.endswith("xml"))
    sys.exit(0)
from argparse import ArgumentParser
from ucca.evaluation import evaluate
from ucca.ioutil import file2passage

################
# MAIN         #
################

# Command-line entry point: compare a guessed UCCA annotation to a reference.
if __name__ == "__main__":
    argparser = ArgumentParser(description="Compare two UCCA passages.")
    argparser.add_argument("guessed", help="xml/pickle file name for the guessed annotation")
    argparser.add_argument("ref", help="xml/pickle file name for the reference annotation")
    argparser.add_argument("--units", "-u", dest="units", action="store_true",
                           help="the units the annotations have in common, and those each has separately")
    argparser.add_argument("--fscore", "-f", dest="fscore", action="store_true",
                           help="outputs the traditional P,R,F instead of the scene structure evaluation")
    argparser.add_argument("--errors", "-e", dest="errors", action="store_true",
                           help="prints the error distribution according to its frequency")
    args = argparser.parse_args()
    if not (args.units or args.fscore or args.errors):
        argparser.error("At least one of -u, -f or -e is required.")
    guessed, ref = [file2passage(x) for x in (args.guessed, args.ref)]
    # NOTE(review): this condition is always true at this point, because
    # argparser.error() above exits the process when no flag is set.
    if args.units or args.fscore or args.errors:
        evaluate(guessed, ref,
                 units=args.units, fscore=args.fscore, errors=args.errors, verbose=True)
def main():
    """Align passage pairs, compute a similarity matrix in parallel, and email the result.

    NOTE(review): relies on module-level globals not visible here -- filenames,
    passage_filenames, add_path, r2s, funcs, trial_name, POOL_SIZE, distances,
    send_mail -- confirm their definitions before changing this function.
    """
    # Smoke-test the aligner on a fixed sentence pair.
    print(align.align("what has is by the meaning of the word is", "what is the men for the wk is are be"))
    # read xml files
    print("reading db xmls")
    p = []
    for filename in filenames:
        with open(add_path(filename), "rb") as fl:
            p += pickle.load(fl)[0]
        print("read ", filename, " it starts with ",
              tuple(term.text for term in textutil.extract_terminals(convert.from_site(p[-1]))[:6]))
    # convert xml to passages
    p = list(map(convert.from_site, p))
    print("reading passage xmls")
    # read passage files, caching each as a .pkl next to the original for faster reloads
    for filename in passage_filenames:
        print("reading" + filename)
        if os.path.isfile(add_path(os.path.splitext(filename)[0] + ".pkl")):
            with open(add_path(os.path.splitext(filename)[0] + ".pkl"), "rb") as fl:
                p.append(pickle.load(fl))
        else:
            p.append(file2passage(add_path(filename)))
            with open(add_path(os.path.splitext(filename)[0] + ".pkl"), "wb") as fl:
                pickle.dump(p[-1], fl)
            print("dumping", add_path(os.path.splitext(filename)[0] + ".pkl"))
    all_filenames = filenames + passage_filenames
    print("read ", all_filenames)
    # Sanity check: alignment must be symmetric between the first two passages.
    word2word = align.align_yields(p[0], p[1])
    assert align.reverse_mapping(word2word) == align.align_yields(p[1], p[0]), "align_yields asymmetrical"
    # create symmilarity matrix: passages come in (source, goal) pairs
    sources = []
    goals = []
    names = []
    i = 0
    while i < len(p):
        names.append(all_filenames[i])
        sources.append(p[i])
        i += 1
        goals.append(p[i])
        i += 1
    chunksize = 1
    if (len(goals) > 100):
        # Larger work lists get bigger chunks to reduce pool dispatch overhead.
        chunksize = int(len(goals) / POOL_SIZE / 10)
        print("multithreading with chunksize", chunksize)
    pool = Pool(POOL_SIZE)
    # r2s (module-level flag) swaps the direction of the distance computation.
    if r2s:
        results = pool.starmap(distances, zip(goals, sources, names), chunksize)
    else:
        results = pool.starmap(distances, zip(sources, goals, names), chunksize)
    print(results)
    pool.close()
    pool.join()
    sym_mat = []
    keys = []
    for row, key in results:
        keys.append(key)
        sym_mat.append(row)
    print("functions and matrix")
    print(funcs + keys)
    for item in sym_mat:
        print(item)
    print("overall token analysis")
    print(align.token_level_analysis(p))
    output_path = trial_name + "output.csv"
    with open(output_path, "w") as f:
        print("writing output to " + output_path)
        writer = csv.writer(f)
        writer.writerows(sym_mat)
    send_mail("*****@*****.**", "finished", os.path.abspath(output_path))
    return
"the units the annotations have in common, and those each has separately" ) argparser.add_argument( "--fscore", "-f", dest="fscore", action="store_true", help= "outputs the traditional P,R,F instead of the scene structure evaluation" ) argparser.add_argument( "--errors", "-e", dest="errors", action="store_true", help="prints the error distribution according to its frequency") args = argparser.parse_args() if not (args.units or args.fscore or args.errors): argparser.error("At least one of -u, -f or -e is required.") guessed, ref = [file2passage(x) for x in (args.guessed, args.ref)] if args.units or args.fscore or args.errors: evaluate(guessed, ref, units=args.units, fscore=args.fscore, errors=args.errors, verbose=True)