Esempio n. 1
0
def _ucca_parse_text(text, output_dir, filename, clean, normalize_sentence,
                     model_path):
    """Parse sentences with the UCCA parser and write one passage file each.

    Each element of ``text`` is passed through ``normalize_sentence``, split on
    single spaces and converted to a passage with ``from_text(...,
    tokenized=True)``, so the UCCA tokenizer is bypassed. The parsed passages
    are written to ``parse_location(output_dir, filename, i)`` and an empty
    ``PARSED_FILE`` marker is created next to them once parsing is done.

    :param text: iterable of sentences to parse
    :param output_dir: directory handed to ``parse_location``; also the
        directory cleaned of ``.txt`` files when ``clean`` is true
    :param filename: name handed to ``parse_location`` for every passage
    :param clean: if true, remove every ``.txt`` file found in ``output_dir``
    :param normalize_sentence: callable applied to each sentence before
        splitting it into tokens
    :param model_path: passed to ``get_parser``
    """
    # Bypass the UCCA tokenizer: every sentence becomes one pre-tokenized
    # passage whose ID is its index in the input.
    passages = [
        next(
            from_text(normalize_sentence(sentence).split(' '),
                      passage_id=idx,
                      tokenized=True))
        for idx, sentence in enumerate(text)
    ]
    parser = get_parser(model_path)
    out_location = os.path.dirname(parse_location(output_dir, filename, 0))
    os.makedirs(out_location, exist_ok=True)
    for i, (passage, *_) in enumerate(parser.parse(passages)):
        passage2file(passage, parse_location(output_dir, filename, i))
    # Create an empty file announcing that parsing finished successfully.
    parsed_file = os.path.join(out_location, PARSED_FILE)
    with open(parsed_file, "w"):
        pass
    if clean:
        # Use a dedicated loop name: the original referenced an undefined
        # ``item`` here (NameError) and shadowed the ``filename`` parameter.
        for entry in os.listdir(output_dir):
            if entry.endswith(".txt"):
                os.remove(os.path.join(output_dir, entry))
Esempio n. 2
0
def main():
    """Split passage files into sentences, writing one file per sentence.

    Command-line options choose the input files, the output directory, a
    filename prefix, whether original IDs are annotated as remarks, and
    whether files are written as pickle instead of XML. Exits with status 0.
    """
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('filenames', nargs='+',
                        help="passage file names to convert")
    parser.add_argument('-o', '--outdir', default='.',
                        help="output directory")
    parser.add_argument('-p', '--prefix', default='',
                        help="output filename prefix")
    parser.add_argument('-r', '--remarks', action='store_true',
                        help="annotate original IDs")
    parser.add_argument("-b", "--binary", action="store_true",
                        help="write in pickle binary format (.pickle)")
    args = parser.parse_args()

    for path in args.filenames:
        whole_passage = file2passage(path)
        for sentence in ucca.convert.split2sentences(whole_passage,
                                                     remarks=args.remarks):
            extension = "pickle" if args.binary else "xml"
            outfile = "%s/%s.%s" % (args.outdir, args.prefix + sentence.ID,
                                    extension)
            sys.stderr.write(
                "Writing passage file for sentence '%s'...\n" % outfile)
            passage2file(sentence, outfile, args.binary)

    sys.exit(0)
Esempio n. 3
0
def main(args):
    """Split passages into sentences and write each sentence to ``args.outdir``.

    When ``Splitter.read_file`` yields a truthy splitter it performs the split;
    otherwise ``split2sentences`` is used. After all passages are processed,
    sentences of the splitter that were never matched are printed.
    """
    splitter = Splitter.read_file(args.sentences,
                                  enum=args.enumerate,
                                  suffix_format=args.suffix_format,
                                  suffix_start=args.suffix_start)
    os.makedirs(args.outdir, exist_ok=True)
    written = 0  # running count of sentences, used as the next enumerated ID
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        if splitter:
            sentences = splitter.split(passage)
        else:
            sentences = split2sentences(
                passage,
                remarks=args.remarks,
                lang=args.lang,
                ids=map(str, count(written)) if args.enumerate else None)
        for sentence in sentences:
            written += 1
            extension = ".pickle" if args.binary else ".xml"
            outfile = os.path.join(args.outdir,
                                   args.prefix + sentence.ID + extension)
            if args.verbose:
                with external_write_mode():
                    message = "Writing passage file for sentence '%s'..." % outfile
                    print(message, file=sys.stderr)
            if args.normalize:
                normalize(sentence)
            passage2file(sentence, outfile, binary=args.binary)
    if splitter and len(splitter.matched_indices) < len(splitter.sentences):
        unmatched = [
            text for index, text in enumerate(splitter.sentences)
            if index not in splitter.matched_indices
        ]
        print("Unmatched sentences:", *unmatched, sep="\n")
Esempio n. 4
0
def main():
    """Convert files in another format to passage files.

    Every command-line pattern is expanded with ``glob``; each matching file is
    read by the converter chosen via ``-f`` or, failing that, the file
    extension, and every resulting passage is written to the output directory.
    Raises IOError for a pattern without matches or an unknown format.
    Exits with status 0.
    """
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument("filenames", nargs="+", help="file names to convert")
    parser.add_argument("-f", "--format", choices=convert.CONVERTERS, help="input file format")
    parser.add_argument("-o", "--outdir", default=".", help="output directory")
    parser.add_argument("-p", "--prefix", default="", help="output filename prefix")
    parser.add_argument("-b", "--binary", action="store_true", help="write in pickle binary format (.pickle)")
    parser.add_argument("-s", "--split", action="store_true", help="split each sentence to its own passage")
    parser.add_argument("-T", "--tree", action="store_true", help="currently unused")
    parser.add_argument("-m", "--markaux", action="store_true", help="mark auxiliary edges introduced on conversion")
    args = parser.parse_args()

    for pattern in args.filenames:
        matches = glob.glob(pattern)
        if not matches:
            raise IOError("Not found: " + pattern)
        for path in matches:
            no_ext, ext = os.path.splitext(path)
            basename = os.path.basename(no_ext)
            # The passage ID is the first run of digits in the base name,
            # or the whole base name when it contains no digits.
            digits = re.search(r"\d+", basename)
            passage_id = digits.group(0) if digits is not None else basename

            format_name = args.format or ext.lstrip(".")
            converter = convert.FROM_FORMAT.get(format_name)
            if converter is None:
                raise IOError("Unknown extension '%s'. Specify format using -f" % ext)

            with open(path, encoding="utf-8") as f:
                for passage in converter(f, passage_id, args.split, args.markaux):
                    suffix = "pickle" if args.binary else "xml"
                    outfile = "%s/%s.%s" % (args.outdir, args.prefix + passage.ID, suffix)
                    sys.stderr.write("Writing '%s'...\n" % outfile)
                    passage2file(passage, outfile, args.binary)

    sys.exit(0)
Esempio n. 5
0
def main():
    """Convert AMR files to passages and back, scoring the round trip.

    For each file matched by the command-line patterns, every AMR is converted
    with ``from_amr``, converted back with ``to_amr`` and scored with
    ``evaluate`` against the reference. If an output directory is given, the
    passage (``.xml``) and the regenerated AMR (``.txt``) are written there.
    Aggregated scores are printed at the end. Exits with status 0.
    """
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument("filenames",
                        nargs="+",
                        help="file names to convert and evaluate")
    add_verbose_argument(parser, help="detailed evaluation output")
    parser.add_argument(
        "-o",
        "--outdir",
        help="output directory (if unspecified, files are not written)")
    args = parser.parse_args()

    all_scores = []
    for pattern in args.filenames:
        matches = glob.glob(pattern)
        if not matches:
            raise IOError("Not found: " + pattern)
        for path in matches:
            sys.stdout.write("\rConverting '%s'" % path)
            if args.outdir:
                sys.stdout.write("\n")
            sys.stdout.flush()
            without_ext = os.path.splitext(path)[0]
            basename = os.path.basename(without_ext)
            with open(path, encoding="utf-8") as f:
                converted = from_amr(f, passage_id=basename, return_amr=True)
                for passage, ref, amr_id in converted:
                    if args.outdir:
                        xml_path = "%s/%s.xml" % (args.outdir, passage.ID)
                        print("Writing '%s'..." % xml_path,
                              file=sys.stderr,
                              flush=True)
                        ioutil.passage2file(passage, xml_path)
                    try:
                        guessed = "\n".join(to_amr(passage, amr_id))
                    except Exception as e:
                        raise ValueError("Error converting %s back from AMR" %
                                         path) from e
                    if args.outdir:
                        txt_path = "%s/%s.txt" % (args.outdir, passage.ID)
                        print("Writing '%s'..." % txt_path,
                              file=sys.stderr,
                              flush=True)
                        with open(txt_path, "w", encoding="utf-8") as f_out:
                            print(guessed, file=f_out)
                    try:
                        score = evaluate(guessed, ref, verbose=args.verbose > 1)
                    except Exception as e:
                        raise ValueError("Error evaluating conversion of %s" %
                                         path) from e
                    all_scores.append(score)
                    if args.verbose:
                        score.print(flush=True)
    print()
    if args.verbose and len(all_scores) > 1:
        print("Aggregated scores:")
    SmatchScores.aggregate(all_scores).print()

    sys.exit(0)
Esempio n. 6
0
def main(args):
    """Re-write every passage file in ``args.filenames`` as a pickle file.

    Each output file is placed in ``args.outdir`` and keeps the input's base
    name with a ``.pickle`` extension. Progress is reported on stderr.
    """
    for path in args.filenames:
        print("Reading passage '%s'..." % path, file=sys.stderr)
        passage = file2passage(path)
        stem, _ = os.path.splitext(os.path.basename(path))
        outfile = "".join((args.outdir, os.path.sep, stem, ".pickle"))
        print("Writing file '%s'..." % outfile, file=sys.stderr)
        passage2file(passage, outfile, binary=True)
Esempio n. 7
0
def main(args):
    """Re-write every passage file in ``args.filenames`` as an XML file.

    Each output file is placed in ``args.outdir`` and keeps the input's base
    name with a ``.xml`` extension. Progress is reported on stderr.
    """
    for path in args.filenames:
        sys.stderr.write("Reading passage '%s'...\n" % path)
        passage = file2passage(path)
        stem, _ = os.path.splitext(os.path.basename(path))
        outfile = "".join((args.outdir, os.path.sep, stem, ".xml"))
        sys.stderr.write("Writing file '%s'...\n" % outfile)
        passage2file(passage, outfile)
Esempio n. 8
0
def diff_passages(true_passage, pred_passage):
    """
    Debug method to print missing or mistaken attributes, nodes and edges.

    Compares passage attributes, the attributes of each layer of
    ``true_passage`` with the same-ID layer of ``pred_passage``, and the nodes
    each passage reports as missing from the other. Predicted nodes are paired
    with true nodes by looking up the predicted node's ``extra["remarks"]``
    among the true node IDs; paired nodes are then compared by attributes and
    edges.

    If any difference is found, both passages are written to the current
    directory as ``<true ID>.xml`` and ``<pred ID>_pred.xml``.

    :param true_passage: reference passage
    :param pred_passage: passage to check against the reference
    :return: a string consisting of a newline followed by one line per
             difference found (just the newline if there are none)
    """
    lines = []
    if not true_passage._attrib.equals(pred_passage._attrib):
        lines.append("Passage attributes mismatch: %s, %s" %
                     (true_passage._attrib, pred_passage._attrib))
    try:
        for lid, l1 in true_passage._layers.items():
            # Fetch the counterpart from the *predicted* passage; the original
            # read it back from true_passage and so compared a layer to itself.
            l2 = pred_passage.layer(lid)
            if not l1._attrib.equals(l2._attrib):
                # %s rather than %d: layer IDs are presumably not guaranteed to
                # be integers, and %d would raise TypeError for a string ID.
                lines.append("Layer %s attributes mismatch: %s, %s" %
                             (lid, l1._attrib, l2._attrib))
    except KeyError:  # no layer with same ID found
        lines.append("Missing layer: %s, %s" %
                     (true_passage._layers, pred_passage._layers))
    pred_ids = {node.extra["remarks"]: node
                for node in pred_passage.missing_nodes(true_passage)}
    true_ids = {node.ID: node
                for node in true_passage.missing_nodes(pred_passage)}
    # Iterate over a snapshot because paired IDs are removed from both dicts.
    for pred_id, pred_node in list(pred_ids.items()):
        true_node = true_ids.get(pred_id)
        if true_node:
            pred_ids.pop(pred_id)
            true_ids.pop(pred_id)
            pred_edges = {edge.tag + "->" + edge.child.ID: edge for edge in
                          pred_node.missing_edges(true_node)}
            true_edges = {edge.tag + "->" + edge.child.ID: edge for edge in
                          true_node.missing_edges(pred_node)}
            # Edges with the same "tag->child" key on both sides are not
            # reported.
            intersection = set(pred_edges).intersection(set(true_edges))
            pred_edges = {s: edge for s, edge in pred_edges.items() if s not in intersection}
            true_edges = {s: edge for s, edge in true_edges.items() if s not in intersection}

            node_lines = []
            if not pred_node._attrib.equals(true_node._attrib):
                node_lines.append("  Attributes mismatch: %s, %s" %
                                  (sorted(true_node._attrib.items()), sorted(pred_node._attrib.items())))
            if pred_edges:
                node_lines.append("  Mistake edges: %s" % ", ".join(pred_edges))
            if true_edges:
                node_lines.append("  Missing edges: %s" % ", ".join(true_edges))
            if node_lines:
                lines.append("For node " + pred_id + ":")
                lines.extend(node_lines)
    if pred_ids:
        lines.append("Mistake nodes: %s" % ", ".join(pred_ids))
    if true_ids:
        lines.append("Missing nodes: %s" % ", ".join(true_ids))
    if lines:
        # Dump both passages so the differences can be inspected offline.
        outfile = "%s.xml" % true_passage.ID
        sys.stderr.write("Writing passage '%s'...\n" % outfile)
        passage2file(true_passage, outfile)
        outfile = "%s_pred.xml" % pred_passage.ID
        sys.stderr.write("Writing passage '%s'...\n" % outfile)
        passage2file(pred_passage, outfile)
    return "\n" + "\n".join(lines)
Esempio n. 9
0
File: util.py Progetto: viksit/ucca
def write_passage(passage, args):
    """Write ``passage`` into ``args.outdir`` in the format selected by ``args``.

    With ``args.format`` set to None the passage is stored through
    ``ioutil.passage2file`` (pickle when ``args.binary``, else XML). Otherwise
    the lines produced by the matching ``convert.TO_FORMAT`` converter are
    written to a text file whose extension is the format name.
    """
    if args.binary:
        default_suffix = "pickle"
    else:
        default_suffix = "xml"
    suffix = args.format or default_suffix
    outfile = "".join(
        (args.outdir, os.path.sep, args.prefix, passage.ID, ".", suffix))
    print("Writing passage '%s'..." % outfile)
    if args.format is None:
        ioutil.passage2file(passage, outfile, binary=args.binary)
        return
    to_format = convert.TO_FORMAT[args.format]
    text = "\n".join(to_format(passage))
    with open(outfile, "w") as out:
        out.write(text + "\n")
Esempio n. 10
0
def main():
    """Join passage files into combined passages and write the results.

    All files matched by the command-line patterns are loaded (sorted per
    pattern). Without ``--join-by-prefix`` they are joined into one passage
    named after the first one; with it, passages are grouped by their ID minus
    its last three characters and each group is joined on its own.
    Exits with status 0.
    """
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames",
                           nargs="+",
                           help="passage file names to join")
    argparser.add_argument("-o",
                           "--outdir",
                           default=".",
                           help="output directory")
    argparser.add_argument("-p",
                           "--prefix",
                           default="",
                           help="output filename prefix")
    argparser.add_argument("-r",
                           "--remarks",
                           action="store_true",
                           help="annotate original IDs")
    argparser.add_argument("-b",
                           "--binary",
                           action="store_true",
                           help="write in pickle binary format (.pickle)")
    argparser.add_argument(
        "-j",
        "--join-by-prefix",
        action="store_true",
        help=
        "join each set of passages whose IDs share all but the last 3 characters"
    )
    args = argparser.parse_args()

    passages = [
        file2passage(filename) for pattern in args.filenames
        for filename in sorted(glob.glob(pattern))
    ]
    if args.join_by_prefix:
        subsets = defaultdict(list)
        for passage in passages:
            subsets[passage.ID[:-3]].append(passage)
    else:
        subsets = {passages[0].ID: passages}
    for passage_id, subset in sorted(subsets.items()):
        sys.stderr.write("Joining passages " +
                         ", ".join(passage.ID for passage in subset) + "\n")
        # Join only this group's passages. The original passed the full
        # ``passages`` list, so with --join-by-prefix every output file
        # contained all passages instead of its own group.
        joined = ucca.convert.join_passages(subset,
                                            passage_id=passage_id,
                                            remarks=args.remarks)
        outfile = "%s/%s.%s" % (args.outdir, args.prefix + joined.ID,
                                "pickle" if args.binary else "xml")
        sys.stderr.write("Writing joined passage file '%s'...\n" % outfile)
        passage2file(joined, outfile, args.binary)

    sys.exit(0)
Esempio n. 11
0
def main(args):
    """Re-write every passage file in ``args.filenames`` as a pickle file.

    The output directory is created if needed. With ``args.verbose`` set, the
    file being read and the file being written are reported on stderr without
    disturbing the progress bar.
    """
    os.makedirs(args.outdir, exist_ok=True)
    progress = tqdm(args.filenames, desc="Converting", unit=" passages")
    for path in progress:
        if args.verbose:
            with external_write_mode():
                print("Reading passage '%s'..." % path, file=sys.stderr)
        passage = file2passage(path)
        stem, _ = os.path.splitext(os.path.basename(path))
        outfile = "".join((args.outdir, os.path.sep, stem, ".pickle"))
        if args.verbose:
            with external_write_mode():
                print("Writing file '%s'..." % outfile, file=sys.stderr)
        passage2file(passage, outfile, binary=True)
Esempio n. 12
0
def main():
    """Convert files to passages and back, scoring the round trip.

    The converter pair and evaluator are looked up by file extension, falling
    back to the "amr" entries. When an output directory is given, the passage
    (``.xml``) and the regenerated file (original extension) are written
    there. Aggregated scores are printed at the end. Exits with status 0.
    """
    parser = configargparse.ArgParser(description=desc)
    parser.add_argument("filenames", nargs="+", help="file names to convert and evaluate")
    add_verbose_arg(parser, help="detailed evaluation output")
    add_boolean_option(parser, "wikification", "Spotlight to wikify any named node (for AMR)")
    parser.add_argument("-o", "--out-dir", help="output directory (if unspecified, files are not written)")
    args = parser.parse_args()

    all_scores = []
    for pattern in args.filenames:
        paths = glob(pattern)
        if not paths:
            raise IOError("Not found: " + pattern)
        for path in paths:
            print("\rConverting '%s'" % path, end="")
            if args.out_dir or args.verbose:
                print(flush=True)
            basename, ext = os.path.splitext(os.path.basename(path))
            passage_format = ext.lstrip(".")
            converters = CONVERTERS.get(passage_format, CONVERTERS["amr"])
            score_fn = EVALUATORS.get(passage_format, EVALUATORS["amr"]).evaluate
            with open(path, encoding="utf-8") as f:
                for passage, ref, passage_id in converters[0](f, passage_id=basename, return_original=True):
                    if args.out_dir:
                        os.makedirs(args.out_dir, exist_ok=True)
                        xml_path = "%s/%s.xml" % (args.out_dir, passage.ID)
                        print("Writing '%s'..." % xml_path, file=sys.stderr, flush=True)
                        ioutil.passage2file(passage, xml_path)
                    try:
                        guessed = converters[1](passage, wikification=args.wikification, use_original=False)
                    except Exception as e:
                        raise ValueError("Error converting %s back from %s" % (path, passage_format)) from e
                    if args.out_dir:
                        back_path = "%s/%s%s" % (args.out_dir, passage.ID, ext)
                        print("Writing '%s'..." % back_path, file=sys.stderr, flush=True)
                        with open(back_path, "w", encoding="utf-8") as f_out:
                            print("\n".join(guessed), file=f_out)
                    try:
                        score = score_fn(guessed, ref, verbose=args.verbose > 1)
                    except Exception as e:
                        raise ValueError("Error evaluating conversion of %s" % path) from e
                    all_scores.append(score)
                    if args.verbose:
                        print(passage_id)
                        score.print()
    print()
    if args.verbose and len(all_scores) > 1:
        print("Aggregated scores:")
    Scores(all_scores).print()

    sys.exit(0)
Esempio n. 13
0
def main(args):
    """Split passages into sentences and write each sentence to ``args.outdir``.

    A truthy splitter from ``Splitter.read_file`` performs the split; otherwise
    ``split2sentences`` is used. Every written file is reported on stderr.
    """
    splitter = Splitter.read_file(args.sentences, enum=args.enumerate)
    os.makedirs(args.outdir, exist_ok=True)
    written = 0  # running count of sentences, used as the next enumerated ID
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        if splitter:
            sentences = splitter.split(passage)
        else:
            sentences = split2sentences(
                passage, remarks=args.remarks, lang=args.lang,
                ids=map(str, count(written)) if args.enumerate else None)
        for sentence in sentences:
            written += 1
            extension = ".pickle" if args.binary else ".xml"
            outfile = os.path.join(args.outdir, args.prefix + sentence.ID + extension)
            with external_write_mode():
                print("Writing passage file for sentence '%s'..." % outfile, file=sys.stderr)
            if args.normalize:
                normalize(sentence)
            passage2file(sentence, outfile, binary=args.binary)
Esempio n. 14
0
def main(args):
    """Join passages into combined passages and write the results.

    Without ``args.join_by_prefix`` all passages are joined into one, named
    after the first passage; with it, passages are grouped by their ID minus
    its last three characters and each group is joined on its own.
    """
    passages = list(get_passages(args.filenames))
    if args.join_by_prefix:
        subsets = defaultdict(list)
        for passage in passages:
            subsets[passage.ID[:-3]].append(passage)
    else:
        subsets = {passages[0].ID: passages}
    for passage_id, subset in sorted(subsets.items()):
        print("Joining passages " + ", ".join(passage.ID for passage in subset), file=sys.stderr)
        # Join only this group's passages. The original passed the full
        # ``passages`` list, so with join_by_prefix every output file
        # contained all passages instead of its own group.
        joined = ucca.convert.join_passages(subset, passage_id=passage_id, remarks=args.remarks)
        outfile = "%s/%s.%s" % (args.outdir, args.prefix + joined.ID, "pickle" if args.binary else "xml")
        print("Writing joined passage file '%s'..." % outfile, file=sys.stderr)
        passage2file(joined, outfile, args.binary)
Esempio n. 15
0
def main(args):
    """Split passages into sentences and write each sentence to ``args.outdir``.

    If ``args.sentences`` names a file, its stripped lines are mapped to their
    line indices and passed to ``split`` as the ordering; an absent or empty
    mapping falls back to ``split2sentences``.
    """
    order = None
    if args.sentences:
        with open(args.sentences, encoding="utf-8") as f:
            # For a repeated line the last index wins.
            order = {line.strip(): index for index, line in enumerate(f)}
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        if order:
            sentences = split(passage, order)
        else:
            sentences = split2sentences(passage, remarks=args.remarks,
                                        lang=args.lang)
        for sentence in sentences:
            extension = ".pickle" if args.binary else ".xml"
            outfile = os.path.join(args.outdir,
                                   args.prefix + sentence.ID + extension)
            with tqdm.external_write_mode():
                print("Writing passage file for sentence '%s'..." % outfile,
                      file=sys.stderr)
            passage2file(sentence, outfile, args.binary)
Esempio n. 16
0
def main():
    """Convert the XML passage files named on the command line to pickle files.

    Each output file is placed in the output directory and keeps the input's
    base name with a ``.pickle`` extension. Exits with status 0.
    """
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('filenames', nargs='+', help="XML file names to convert")
    parser.add_argument('-o', '--outdir', default='.', help="output directory")
    args = parser.parse_args()

    for path in args.filenames:
        sys.stderr.write("Reading passage '%s'...\n" % path)
        passage = file2passage(path)
        stem, _ = os.path.splitext(os.path.basename(path))
        outfile = "".join((args.outdir, os.path.sep, stem, ".pickle"))
        sys.stderr.write("Writing file '%s'...\n" % outfile)
        passage2file(passage, outfile, binary=True)

    sys.exit(0)
Esempio n. 17
0
def write_passage(passage, args):
    """Write ``passage`` into ``args.outdir`` in the format selected by ``args``.

    With ``args.output_format`` set to None the passage is stored through
    ``ioutil.passage2file``. Otherwise the second converter registered for the
    format in ``CONVERTERS`` produces the lines to write: "amr" converts the
    whole passage, any other format converts the passage, or each of its
    sentences when ``args.split`` is set.
    """
    known_extensions = {None: UCCA_EXT[args.binary], "amr": ".txt"}
    ext = known_extensions.get(args.output_format) or "." + args.output_format
    outfile = "".join((args.outdir, os.path.sep, args.prefix, passage.ID, ext))
    sys.stderr.write("Writing '%s'...\n" % outfile)
    if args.output_format is None:  # UCCA output
        ioutil.passage2file(passage, outfile, args.binary)
        return
    converter = CONVERTERS[args.output_format][1]
    if args.output_format == "amr":
        output = "\n".join(converter(passage))
    else:
        units = convert.split2sentences(passage) if args.split else [passage]
        output = "\n".join(
            line for unit in units
            for line in converter(unit, test=args.test, tree=args.tree, mark_aux=args.mark_aux))
    with open(outfile, "w", encoding="utf-8") as f:
        print(output, file=f)
Esempio n. 18
0
def write_passage(passage, out_dir=".", output_format=None, binary=False, verbose=False, label_map=False, split=False,
                  join=None, **kwargs):
    """Write ``passage`` into ``out_dir`` in the requested format.

    :param passage: passage to write
    :param out_dir: directory of the output file
    :param output_format: None for UCCA output via ``ioutil.passage2file``,
        otherwise a key of ``TO_FORMAT`` selecting the converter
    :param binary: selects the UCCA extension and pickle output
    :param verbose: report the output file name on stderr
    :param label_map: pass ``output_format`` to the converter as ``format``
        (None is passed otherwise)
    :param split: passed to the converter as ``sentences``
    :param join: if given, used instead of the passage ID as the file name,
        and converted output is appended rather than overwritten
    :param kwargs: forwarded to the converter
    """
    known_extensions = {None: UCCA_EXT[binary], "amr": ".txt"}
    ext = known_extensions.get(output_format) or "." + output_format
    if join and join.endswith(ext):
        ext = ""  # the joined file name already carries the extension
    stem = join or passage.ID
    outfile = os.path.join(out_dir, stem + ext)
    if verbose:
        with ioutil.external_write_mode():
            print("Writing '%s'..." % outfile, file=sys.stderr)
    if output_format is None:  # UCCA output
        ioutil.passage2file(passage, outfile, binary=binary)
        return
    converter = TO_FORMAT[output_format]
    mode = "a" if join else "w"
    with open(outfile, mode, encoding="utf-8") as f:
        label_format = output_format if label_map else None
        for line in converter(passage, format=label_format, sentences=split, **kwargs):
            print(line, file=f)
Esempio n. 19
0
def main():
    """Re-tag the terminals of every XML passage in a directory, in place.

    Each terminal of layer 0 is tagged as punctuation or word according to
    ``is_punctuation`` applied to its text, and the passage is written back to
    the same file without indentation. Exits with status 0.
    """
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('directory', help="directory containing XML files to process")
    args = parser.parse_args()

    for path in glob.glob(args.directory + "/*.xml"):
        sys.stderr.write("Fixing passage '%s'...\n" % path)
        passage = file2passage(path)
        for terminal in passage.layer(layer0.LAYER_ID).all:
            if is_punctuation(terminal.attrib.get("text")):
                terminal.tag = layer0.NodeTags.Punct
            else:
                terminal.tag = layer0.NodeTags.Word
        passage2file(passage, path, indent=False)

    sys.exit(0)
Esempio n. 20
0
def main():
    """Re-tag the terminals of every XML passage in a directory, in place.

    Each terminal of layer 0 is tagged as punctuation or word according to
    ``is_punctuation`` applied to its text, and the passage is written back to
    the same file without indentation. Exits with status 0.
    """
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('directory',
                        help="directory containing XML files to process")
    args = parser.parse_args()

    xml_paths = glob.glob(args.directory + "/*.xml")
    for path in xml_paths:
        sys.stderr.write("Fixing passage '%s'...\n" % path)
        passage = file2passage(path)
        layer = passage.layer(layer0.LAYER_ID)
        for terminal in layer.all:
            text = terminal.attrib.get("text")
            if is_punctuation(text):
                terminal.tag = layer0.NodeTags.Punct
            else:
                terminal.tag = layer0.NodeTags.Word
        passage2file(passage, path, indent=False)

    sys.exit(0)
Esempio n. 21
0
def main(args):
    """Convert files to passages and back, scoring the round trip.

    The converter pair and evaluator are looked up by file extension, falling
    back to the "amr" entries. Passages are optionally normalized before being
    written and converted back. When ``args.out_dir`` is given, the passage
    (``.xml``) and the regenerated file (original extension) are written
    there. Aggregated scores are printed at the end.
    """
    all_scores = []
    for pattern in args.filenames:
        paths = glob(pattern)
        if not paths:
            raise IOError("Not found: " + pattern)
        for path in paths:
            print("\rConverting '%s'" % path, end="")
            if args.out_dir or args.verbose:
                print(flush=True)
            basename, ext = os.path.splitext(os.path.basename(path))
            passage_format = ext.lstrip(".")
            converters = CONVERTERS.get(passage_format, CONVERTERS["amr"])
            score_fn = EVALUATORS.get(passage_format, EVALUATORS["amr"]).evaluate
            with open(path, encoding="utf-8") as f:
                for passage, ref, passage_id in converters[0](f, passage_id=basename, return_original=True):
                    if args.normalize:
                        normalize(passage, extra=args.extra_normalization)
                    if args.out_dir:
                        os.makedirs(args.out_dir, exist_ok=True)
                        xml_path = "%s/%s.xml" % (args.out_dir, passage.ID)
                        print("Writing '%s'..." % xml_path, file=sys.stderr, flush=True)
                        ioutil.passage2file(passage, xml_path)
                    try:
                        guessed = converters[1](passage, wikification=args.wikification, use_original=False)
                    except Exception as e:
                        raise ValueError("Error converting %s back from %s" % (path, passage_format)) from e
                    if args.out_dir:
                        back_path = "%s/%s%s" % (args.out_dir, passage.ID, ext)
                        print("Writing '%s'..." % back_path, file=sys.stderr, flush=True)
                        with open(back_path, "w", encoding="utf-8") as f_out:
                            print("\n".join(guessed), file=f_out)
                    try:
                        score = score_fn(guessed, ref, verbose=args.verbose > 1)
                    except Exception as e:
                        raise ValueError("Error evaluating conversion of %s" % path) from e
                    all_scores.append(score)
                    if args.verbose:
                        print(passage_id)
                        score.print()
    print()
    if args.verbose and len(all_scores) > 1:
        print("Aggregated scores:")
    Scores(all_scores).print()
Esempio n. 22
0
def main(args):
    """Split passages into sentences and write each sentence to ``args.outdir``.

    A truthy splitter from ``Splitter.read_file`` performs the split; otherwise
    ``split2sentences`` is used. In verbose mode each sentence and its output
    file are printed to stderr. After all passages are processed, sentences of
    the splitter that were never matched are printed.
    """
    splitter = Splitter.read_file(args.sentences, enum=args.enumerate,
                                  suffix_format=args.suffix_format, suffix_start=args.suffix_start)
    os.makedirs(args.outdir, exist_ok=True)
    written = 0  # running count of sentences, used as the next enumerated ID
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        if splitter:
            sentences = splitter.split(passage)
        else:
            sentences = split2sentences(
                passage, remarks=args.remarks, lang=args.lang,
                ids=map(str, count(written)) if args.enumerate else None)
        for sentence in sentences:
            written += 1
            extension = ".pickle" if args.binary else ".xml"
            outfile = os.path.join(args.outdir, args.prefix + sentence.ID + extension)
            if args.verbose:
                with external_write_mode():
                    print(sentence, file=sys.stderr)
                    print("Writing passage file for sentence '%s'..." % outfile, file=sys.stderr)
            if args.normalize:
                normalize(sentence)
            passage2file(sentence, outfile, binary=args.binary)
    if splitter and len(splitter.matched_indices) < len(splitter.sentences):
        unmatched = [text for index, text in enumerate(splitter.sentences)
                     if index not in splitter.matched_indices]
        print("", "Unmatched sentences:", *unmatched, sep="\n")
def main():
    """Split passage files into sentences, writing one file per sentence.

    Command-line options choose the input files, the output directory, a
    filename prefix, whether original IDs are annotated as remarks, and
    whether files are written as pickle instead of XML. Exits with status 0.
    """
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('filenames', nargs='+', help="passage file names to convert")
    parser.add_argument('-o', '--outdir', default='.', help="output directory")
    parser.add_argument('-p', '--prefix', default='', help="output filename prefix")
    parser.add_argument('-r', '--remarks', action='store_true', help="annotate original IDs")
    parser.add_argument("-b", "--binary", action="store_true",
                        help="write in pickle binary format (.pickle)")
    args = parser.parse_args()

    for path in args.filenames:
        whole_passage = file2passage(path)
        for sentence in ucca.convert.split2sentences(whole_passage, remarks=args.remarks):
            extension = "pickle" if args.binary else "xml"
            outfile = "%s/%s.%s" % (args.outdir, args.prefix + sentence.ID, extension)
            sys.stderr.write("Writing passage file for sentence '%s'...\n" % outfile)
            passage2file(sentence, outfile, args.binary)

    sys.exit(0)
Esempio n. 24
0
def main(args):
    """Split passages into paragraphs and write each one to ``args.outdir``.

    Paragraphs come from ``split2paragraphs``; when ``args.enumerate`` is set
    they receive consecutive numeric IDs continuing across passages. In verbose
    mode each paragraph and its output file are printed to stderr.
    """
    os.makedirs(args.outdir, exist_ok=True)
    written = 0  # running count of paragraphs, used as the next enumerated ID
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        paragraph_ids = map(str, count(written)) if args.enumerate else None
        paragraphs = split2paragraphs(passage,
                                      remarks=args.remarks,
                                      lang=args.lang,
                                      ids=paragraph_ids)
        for paragraph in paragraphs:
            written += 1
            extension = ".pickle" if args.binary else ".xml"
            outfile = os.path.join(args.outdir,
                                   args.prefix + paragraph.ID + extension)
            if args.verbose:
                with external_write_mode():
                    print(paragraph, file=sys.stderr)
                    message = "Writing passage file for paragraph '%s'..." % outfile
                    print(message, file=sys.stderr)
            if args.normalize:
                normalize(paragraph)
            passage2file(paragraph, outfile, binary=args.binary)
Esempio n. 25
0
def _ucca_parse_text(text, output_dir, filename, clean, normalize_sentence):
    """Parse sentences with the UCCA parser and write one passage file each.

    Every element of ``text`` is passed through ``normalize_sentence`` and the
    results are converted to passages with ``from_text(split=True,
    one_per_line=True)``. Each item yielded by the parser is written to
    ``parse_location(output_dir, filename, i)``, and an empty ``PARSED_FILE``
    marker is created in the same directory once parsing is done.

    :param text: iterable of sentences to parse
    :param output_dir: directory handed to ``parse_location``; also the
        directory cleaned of ``.txt`` files when ``clean`` is true
    :param filename: name handed to ``parse_location`` for every passage
    :param clean: if true, remove every ``.txt`` file found in ``output_dir``
    :param normalize_sentence: callable applied to each sentence
    """
    sentences = [normalize_sentence(x) for x in text]
    passages = list(from_text(sentences, split=True, one_per_line=True))
    parser = get_parser()
    for i, passage in enumerate(parser.parse(passages)):
        passage2file(passage, parse_location(output_dir, filename, i))
    # Create a file announcing that parsing finished successfully.
    parsed_file = os.path.join(
        os.path.dirname(parse_location(output_dir, filename, 0)), PARSED_FILE)
    with open(parsed_file, "w"):
        pass
    if clean:
        # Use a dedicated loop name: the original referenced an undefined
        # ``item`` here (NameError) and shadowed the ``filename`` parameter.
        for entry in os.listdir(output_dir):
            if entry.endswith(".txt"):
                os.remove(os.path.join(output_dir, entry))
Esempio n. 26
0
def main():
    """
    Command-line entry point: annotate passage files in place.

    Every positional argument is treated as a glob pattern. Each matching
    file is loaded, annotated (replacing existing annotation) and written
    back to the same path. The file is written in binary form unless its
    name ends with "xml". Exits with status 0 when done.

    :raises IOError: if a pattern matches no file
    """
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument("filenames",
                        nargs="+",
                        help="passage file names to annotate")
    parser.add_argument("-v",
                        "--verbose",
                        action="store_true",
                        help="print tagged text for each passage")
    options = parser.parse_args()

    for pattern in options.filenames:
        matched = glob.glob(pattern)
        if not matched:
            raise IOError("Not found: " + pattern)
        for path in matched:
            passage = file2passage(path)
            annotate(passage, verbose=options.verbose, replace=True)
            sys.stderr.write("Writing '%s'...\n" % path)
            # Anything that is not an XML file is stored in binary form.
            as_binary = not path.endswith("xml")
            passage2file(passage, path, binary=as_binary)

    sys.exit(0)
Esempio n. 27
0
def write_passage(passage,
                  out_dir=".",
                  output_format=None,
                  binary=False,
                  verbose=False,
                  test=False,
                  tree=False,
                  mark_aux=False,
                  wikification=False,
                  default_label=None,
                  label_map=False,
                  split=False,
                  **kwargs):
    """
    Write a passage to ``out_dir`` either as UCCA or in a converted format.

    :param passage: passage to write; its ``ID`` is the output base name
    :param out_dir: directory in which the file is created
    :param output_format: ``None`` for UCCA output, otherwise a key of
        ``TO_FORMAT`` selecting the converter; also determines the extension
        (".txt" for "amr", "." + format otherwise)
    :param binary: for UCCA output, selects the extension from ``UCCA_EXT``
        and is forwarded to ``ioutil.passage2file``
    :param verbose: if true, announce the output path on stderr
    :param test, tree, mark_aux, wikification, default_label: forwarded
        unchanged to the converter
    :param label_map: if true, the converter receives ``format=output_format``,
        otherwise ``format=None``
    :param split: forwarded to the converter as ``sentences``
    :param kwargs: accepted and ignored
    """
    del kwargs  # extra keyword arguments are deliberately discarded

    # The table is built eagerly, so UCCA_EXT[binary] is looked up even for
    # non-UCCA formats.
    known_extensions = {None: UCCA_EXT[binary], "amr": ".txt"}
    ext = known_extensions.get(output_format)
    if not ext:
        ext = "." + output_format
    outfile = os.path.join(out_dir, passage.ID + ext)

    if verbose:
        with ioutil.external_write_mode():
            print("Writing '%s'..." % outfile, file=sys.stderr)

    if output_format is None:  # UCCA output
        ioutil.passage2file(passage, outfile, binary=binary)
        return

    converter = TO_FORMAT[output_format]
    converter_options = dict(
        test=test,
        tree=tree,
        mark_aux=mark_aux,
        wikification=wikification,
        default_label=default_label,
        format=output_format if label_map else None,
        sentences=split,
    )
    with open(outfile, "w", encoding="utf-8") as f:
        # The converter yields the output one line at a time.
        for line in converter(passage, **converter_options):
            print(line, file=f)
Esempio n. 28
0
def main():
    """
    Command-line entry point: convert files to UCCA passage files.

    Every positional argument is treated as a glob pattern. The input format
    comes from ``-f/--format`` or, failing that, from the file extension.
    The passage ID passed to the converter is the first run of digits in the
    file's base name, or the whole base name if it contains no digits. Each
    passage produced by the converter is written to
    ``<outdir>/<prefix><passage.ID>.xml`` (or ``.pickle`` with ``-b``).
    Exits with status 0 when done.

    :raises IOError: if a pattern matches no file, or if no converter is
        known for the requested format / file extension
    """
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames",
                           nargs="+",
                           help="file names to convert")
    argparser.add_argument("-f",
                           "--format",
                           choices=convert.CONVERTERS,
                           help="input file format")
    argparser.add_argument("-o",
                           "--outdir",
                           default=".",
                           help="output directory")
    argparser.add_argument("-p",
                           "--prefix",
                           default="",
                           help="output filename prefix")
    argparser.add_argument("-b",
                           "--binary",
                           action="store_true",
                           help="write in pickle binary format (.pickle)")
    argparser.add_argument("-s",
                           "--split",
                           action="store_true",
                           help="split each sentence to its own passage")
    argparser.add_argument("-T",
                           "--tree",
                           action="store_true",
                           help="currently unused")
    argparser.add_argument(
        "-m",
        "--markaux",
        action="store_true",
        help="mark auxiliary edges introduced on conversion")
    args = argparser.parse_args()

    # Create the output directory up front so writing does not fail on a
    # fresh --outdir.
    if args.outdir:
        os.makedirs(args.outdir, exist_ok=True)
    extension = "pickle" if args.binary else "xml"

    for pattern in args.filenames:
        filenames = glob.glob(pattern)
        if not filenames:
            raise IOError("Not found: " + pattern)
        for filename in filenames:
            no_ext, ext = os.path.splitext(filename)
            basename = os.path.basename(no_ext)
            # Prefer the first number in the file name as passage ID.
            id_match = re.search(r"\d+", basename)
            passage_id = id_match.group(0) if id_match else basename

            converter = convert.FROM_FORMAT.get(args.format or ext.lstrip("."))
            if converter is None:
                raise IOError(
                    "Unknown extension '%s'. Specify format using -f" % ext)

            with open(filename, encoding="utf-8") as f:
                for passage in converter(f, passage_id, args.split,
                                         args.markaux):
                    outfile = os.path.join(
                        args.outdir,
                        "%s.%s" % (args.prefix + passage.ID, extension))
                    sys.stderr.write("Writing '%s'...\n" % outfile)
                    # Pass ``binary`` by keyword so the flag cannot bind to
                    # another optional positional parameter of passage2file.
                    passage2file(passage, outfile, binary=args.binary)

    sys.exit(0)
Esempio n. 29
0
def main(args):
    """
    Convert files to passages and back, scoring the round trip.

    For every file matched by ``args.filenames`` (a pattern that matches
    nothing is used as a literal path), each passage yielded by the input
    converter is optionally normalized, optionally written to
    ``args.out_dir`` (as XML and in the converted-back format), converted
    back and evaluated against the reference. Per-passage scores are printed
    when verbose; the aggregate is printed at the end.

    :raises ValueError: if converting back or evaluating fails (the original
        exception is chained as the cause)
    """

    def announce(path):
        # Report the file about to be written, but only in verbose mode.
        if args.verbose:
            with ioutil.external_write_mode():
                print("Writing '%s'..." % path, file=sys.stderr, flush=True)

    if args.out_dir:
        os.makedirs(args.out_dir, exist_ok=True)
    all_scores = []
    for pattern in args.filenames:
        matches = glob(pattern) or [pattern]
        for filename in matches:
            per_file = []
            basename, ext = os.path.splitext(os.path.basename(filename))
            passage_format = ext.lstrip(".")
            # Plain-text files carry no format information of their own.
            if passage_format == "txt":
                passage_format = args.format
            in_converter, out_converter = CONVERTERS.get(
                passage_format, CONVERTERS[args.format])
            evaluate = EVALUATORS.get(passage_format, EVALUATORS[args.format])
            with open(filename, encoding="utf-8") as f:
                converted = in_converter(f,
                                         passage_id=basename,
                                         return_original=True)
                description = "Converting '%s'" % filename
                if args.out_dir:
                    description += ", writing to '%s'" % args.out_dir
                progress = tqdm(converted, unit=" passages", desc=description)
                for passage, ref, passage_id in progress:
                    if args.normalize:
                        normalize(passage, extra=args.extra_normalization)
                    if args.out_dir:
                        os.makedirs(args.out_dir, exist_ok=True)
                        xml_path = os.path.join(args.out_dir,
                                                passage.ID + ".xml")
                        announce(xml_path)
                        ioutil.passage2file(passage, xml_path)
                    try:
                        guessed = out_converter(passage,
                                                wikification=args.wikification,
                                                use_original=False)
                    except Exception as e:
                        raise ValueError("Error converting %s back from %s" %
                                         (filename, passage_format)) from e
                    if args.out_dir:
                        # Same base name as the XML file, original extension.
                        back_path = os.path.join(args.out_dir,
                                                 passage.ID + ext)
                        announce(back_path)
                        with open(back_path, "w", encoding="utf-8") as f_out:
                            print("\n".join(guessed), file=f_out)
                    try:
                        score = evaluate(guessed,
                                         ref,
                                         verbose=args.verbose > 1,
                                         units=args.units)
                    except Exception as e:
                        raise ValueError("Error evaluating conversion of %s" %
                                         filename) from e
                    per_file.append(score)
                    if args.verbose:
                        with ioutil.external_write_mode():
                            print(passage_id)
                            score.print()
                    # Show the running average F1 for this file.
                    running_f1 = 100.0 * Scores(per_file).average_f1()
                    progress.set_postfix(F1="%.2f" % running_f1)
            all_scores.extend(per_file)
    print()
    if args.verbose and len(all_scores) > 1:
        print("Aggregated scores:")
    Scores(all_scores).print()
Esempio n. 30
0
def diff_passages(true_passage, pred_passage):
    """
    Debug method to report missing or mistaken attributes, nodes and edges.

    Compares passage attributes, per-layer attributes, and the nodes and
    edges that each passage reports as missing from the other. If any
    difference is found, both passages are also written to
    ``ucca_passage<ID>.xml`` and ``ucca_passage<ID>_pred.xml`` (relative
    paths) for inspection.

    :param true_passage: reference passage
    :param pred_passage: predicted passage; its differing nodes are matched
        to reference nodes through ``node.extra["remarks"]`` (presumably the
        original node ID recorded on conversion -- confirm with the caller)
    :return: a newline followed by the report lines joined by newlines
        (just "\\n" when the passages do not differ)
    """
    lines = []
    if not true_passage._attrib.equals(pred_passage._attrib):
        lines.append("Passage attributes mismatch: %s, %s" %
                     (true_passage._attrib, pred_passage._attrib))
    try:
        for lid, l1 in true_passage._layers.items():
            # Look the layer up in the *predicted* passage: comparing the
            # true passage with itself could never report a mismatch or a
            # missing layer.
            l2 = pred_passage.layer(lid)
            if not l1._attrib.equals(l2._attrib):
                # %s rather than %d: works for any layer ID type.
                lines.append("Layer %s attributes mismatch: %s, %s" %
                             (lid, l1._attrib, l2._attrib))
    except KeyError:  # no layer with same ID found
        lines.append("Missing layer: %s, %s" %
                     (true_passage._layers, pred_passage._layers))
    pred_ids = {
        node.extra["remarks"]: node
        for node in pred_passage.missing_nodes(true_passage)
    }
    true_ids = {
        node.ID: node
        for node in true_passage.missing_nodes(pred_passage)
    }
    # Iterate over a snapshot because matched entries are popped below.
    for pred_id, pred_node in list(pred_ids.items()):
        true_node = true_ids.get(pred_id)
        if true_node:
            # Same node on both sides: report its differences in detail
            # rather than listing it as mistaken/missing.
            pred_ids.pop(pred_id)
            true_ids.pop(pred_id)
            pred_edges = {
                edge.tag + "->" + edge.child.ID: edge
                for edge in pred_node.missing_edges(true_node)
            }
            true_edges = {
                edge.tag + "->" + edge.child.ID: edge
                for edge in true_node.missing_edges(pred_node)
            }
            # Edges with the same tag and child on both sides are not errors.
            intersection = set(pred_edges).intersection(set(true_edges))
            pred_edges = {
                s: edge
                for s, edge in pred_edges.items() if s not in intersection
            }
            true_edges = {
                s: edge
                for s, edge in true_edges.items() if s not in intersection
            }

            node_lines = []
            if not pred_node._attrib.equals(true_node._attrib):
                node_lines.append("  Attributes mismatch: %s, %s" %
                                  (true_node._attrib, pred_node._attrib))
            if pred_edges:
                node_lines.append("  Mistake edges: %s" %
                                  ", ".join(pred_edges))
            if true_edges:
                node_lines.append("  Missing edges: %s" %
                                  ", ".join(true_edges))
            if node_lines:
                lines.append("For node " + pred_id + ":")
                lines.extend(node_lines)
    if pred_ids:
        lines.append("Mistake nodes: %s" % ", ".join(pred_ids))
    if true_ids:
        lines.append("Missing nodes: %s" % ", ".join(true_ids))
    if lines:
        # Dump both passages so the differences can be inspected offline.
        outfile = "ucca_passage%s.xml" % true_passage.ID
        sys.stderr.write("Writing passage '%s'...\n" % outfile)
        passage2file(true_passage, outfile)
        outfile = "ucca_passage%s_pred.xml" % pred_passage.ID
        sys.stderr.write("Writing passage '%s'...\n" % outfile)
        passage2file(pred_passage, outfile)
    return "\n" + "\n".join(lines)