Example #1
0
    def test_anno_from_xml_with_trans_token_mismatch(self):
        """Check ``_create_cora_token`` on the ``her#aws`` mismatch fixture.

        With the importer created without a ``strict`` argument, building the
        token must log at ERROR level.  With ``strict=False`` the token is
        created exactly as the XML describes it.
        """
        anno_element = ET.fromstring("""<token id="t1324" trans="her#aws">
                 <dipl id="t1324_d1" trans="her#aws" utf="heraws"/>
                 <mod id="t1324_m1" trans="her#aws" utf="heraws" ascii="heraws" checked="y" />
               </token>""")

        # strict mode: an error has to be logged (root logger, level ERROR)
        with self.assertLogs(None, 'ERROR'):
            create_importer('coraxml',
                            'anselm')._create_cora_token(anno_element, set())

        # non-strict mode: token is created as given by the XML
        expected_token = CoraToken(
            AnselmParser().parse('her#aws'),
            [
                TokDipl(AnselmParser().parse('her#aws', output_type="dipl"),
                        extid='t1324_d1'),
            ],
            [
                TokAnno(AnselmParser().parse('her#aws', output_type="anno"),
                        extid='t1324_m1',
                        checked=True),
            ],
            extid='t1324')
        lenient_importer = create_importer('coraxml', 'anselm', strict=False)
        # assertEqual: the assertEquals alias was removed in Python 3.12
        self.assertEqual(
            expected_token,
            lenient_importer._create_cora_token(anno_element, set()))
Example #2
0
    def test_anno_with_doubled_tags(self):
        """A ``mod`` element carrying the ``pos`` tag twice must log a warning.

        The check passes when at least one record of level WARN or higher is
        logged on the root logger while the annotation token is built.
        """
        mod_xml = '<mod id="t1_m1" trans="priuilegien" utf="priuilegien" ascii="priuilegien" checked="y"><pos tag="NA"/><pos tag="NA"/></mod>'
        anno_element = ET.fromstring(mod_xml)

        with self.assertLogs(None, 'WARN'):
            importer = create_importer('coraxml')
            parsed_trans = PlainParser().parse('priuilegien',
                                               output_type="dipl")
            importer._create_anno_token(anno_element, parsed_trans)
Example #3
0
def convert(infile, from_, to, parser, strict_parsing, outfile):
    """Import *infile*, export it in another format and write it to *outfile*.

    Args:
        infile: Input passed to the importer's ``import_from_file``.
        from_: Name of the input format, passed to ``create_importer``.
        to: Name of the output format, passed to ``create_exporter``.
        parser: Second positional argument of ``create_importer``.
        strict_parsing: Forwarded as the importer's ``strict`` keyword.
        outfile: File object the result is echoed to.

    Raises:
        SystemExit: With exit status 1 when the importer returns a falsy
            document; an error is logged first.
    """
    importer = create_importer(from_, parser, strict=strict_parsing)
    exporter = create_exporter(to)

    doc = importer.import_from_file(infile)
    if not doc:
        logging.error("Input document invalid")
        # Raise SystemExit directly: the bare ``exit`` helper is injected by
        # the ``site`` module and is missing under ``python -S`` or in frozen
        # applications.
        raise SystemExit(1)

    outdoc = exporter.export(doc)

    # Convert special document types to text; anything else is echoed as is.
    if isinstance(outdoc, dict):
        # json
        outdoc = json.dumps(outdoc)
    elif isinstance(outdoc, etree._ElementTree):
        # xml
        outdoc = etree.tostring(outdoc,
                                xml_declaration=True,
                                pretty_print=True,
                                encoding="utf-8")

    click.echo(outdoc, file=outfile)
Example #4
0
    def test_dipl_from_xml(self):
        """A ``dipl`` element is turned into the equivalent ``TokDipl``.

        The expected token carries the plain-parsed transcription ``test``
        and takes its external id from the element's ``id`` attribute.
        """
        expected_dipl = TokDipl(
            PlainParser().parse('test', output_type="dipl"),
            extid='t1_d1')
        dipl_element = ET.fromstring('<dipl id="t1_d1" trans="test" />')

        importer = create_importer('coraxml')
        parsed_trans = PlainParser().parse('test', output_type="dipl")
        # assertEqual: the assertEquals alias was removed in Python 3.12
        self.assertEqual(
            expected_dipl,
            importer._create_dipl_token(dipl_element, parsed_trans))
Example #5
0
    def test_anno_from_xml_with_transcription_mismatch(self):
        """Check ``_create_cora_token`` on the ``t924`` mismatch fixture.

        The fixture is a token with two ``dipl`` and three ``mod`` children.
        With the importer created without a ``strict`` argument, building the
        token must log at ERROR level.  With ``strict=False`` the token is
        created exactly as the XML describes it.
        """
        anno_element = ET.fromstring(
            """<token id="t924" trans="hin#cz&#xFC;|hin(.)">
                 <dipl id="t924_d1" trans="hin#cz&#xFC;|" utf="hincz&#xFC;"/>
                 <dipl id="t924_d2" trans="hin" utf="hin"/>
                 <mod id="t924_m1" trans="hin#cz&#xFC;|" utf="hincz&#xFC;" ascii="hincz&#xFC;" checked="y" />
                 <mod id="t924_m2" trans="hin" utf="hin" ascii="hin" checked="y" />
                 <mod id="t924_m3" trans="(.)" utf="." ascii="." checked="y" />
               </token>""")

        # strict mode: an error has to be logged (root logger, level ERROR)
        with self.assertLogs(None, 'ERROR'):
            create_importer('coraxml',
                            'anselm')._create_cora_token(anno_element, set())

        # non-strict mode: token is created as given by the XML
        expected_token = CoraToken(
            AnselmParser().parse('hin#czü|hin'),
            [
                TokDipl(AnselmParser().parse('hin#czü|', output_type="dipl"),
                        extid='t924_d1'),
                TokDipl(AnselmParser().parse('hin', output_type="dipl"),
                        extid='t924_d2'),
            ],
            [
                TokAnno(AnselmParser().parse('hin#czü|', output_type="anno"),
                        extid='t924_m1',
                        checked=True),
                TokAnno(AnselmParser().parse('hin', output_type="anno"),
                        extid='t924_m2',
                        checked=True),
                TokAnno(AnselmParser().parse('(.)', output_type="anno"),
                        extid='t924_m3',
                        checked=True),
            ],
            extid='t924')
        lenient_importer = create_importer('coraxml', 'anselm', strict=False)
        # assertEqual: the assertEquals alias was removed in Python 3.12
        self.assertEqual(
            expected_token,
            lenient_importer._create_cora_token(anno_element, set()))
Example #6
0
    def test_cora_token_from_xml(self):
        """A ``token`` element with one dipl and two mods becomes a ``CoraToken``.

        Only the first ``mod`` carries ``checked="y"``, so only the first
        expected ``TokAnno`` is constructed with ``checked=True``.
        """
        expected_trans = PlainParser().parse('test|case')
        expected_dipls = [
            TokDipl(PlainParser().parse('test|case', output_type="dipl"),
                    extid='t1_d1'),
        ]
        expected_annos = [
            TokAnno(PlainParser().parse('test|', output_type="anno"),
                    extid='t1_m1',
                    checked=True),
            TokAnno(PlainParser().parse('case', output_type="anno"),
                    extid='t1_m2'),
        ]
        expected_token = CoraToken(expected_trans,
                                   expected_dipls,
                                   expected_annos,
                                   extid='t1')
        token_element = ET.fromstring(
            '<token id="t1" trans="test|case"><dipl id="t1_d1" trans="test|case" /><mod id="t1_m1" trans="test|" checked="y" /><mod id="t1_m2" trans="case" /></token>'
        )

        # assertEqual: the assertEquals alias was removed in Python 3.12
        self.assertEqual(
            expected_token,
            create_importer('coraxml')._create_cora_token(
                token_element, set()))
Example #7
0
    def test_anno_from_xml(self):
        """A fully annotated ``mod`` element becomes the equivalent ``TokAnno``.

        The expected token takes its tags from the child elements' ``tag``
        attributes, its flags from the ``cora-flag`` names, and
        ``checked=True`` for the ``checked="y"`` attribute.
        """
        expected_anno = TokAnno(
            PlainParser().parse('priuilegien', output_type="anno"),
            tags={
                'lemma': 'privileg',
                'pos': 'NA',
                'morph': 'Fem.Dat.Pl',
                'boundary': 'Satz',
            },
            flags={'lemma verified', 'boundary'},
            checked=True,
            extid='t1_m1')
        anno_element = ET.fromstring(
            '<mod id="t1_m1" trans="priuilegien" utf="priuilegien" ascii="priuilegien" checked="y"><lemma tag="privileg"/><pos tag="NA"/><boundary tag="Satz"/><morph tag="Fem.Dat.Pl"/><cora-flag name="lemma verified"/><cora-flag name="boundary"/></mod>'
        )

        importer = create_importer('coraxml')
        parsed_trans = PlainParser().parse('priuilegien', output_type="anno")
        # assertEqual: the assertEquals alias was removed in Python 3.12
        self.assertEqual(
            expected_anno,
            importer._create_anno_token(anno_element, parsed_trans))
Example #8
0
from coraxml_utils.importer import create_importer
from coraxml_utils.exporter import create_exporter

if __name__ == "__main__":
    # Command-line entry point: read a CorA-XML file, export it as TEI and
    # write the result to the output file, or to stdout when none is given.
    description = "Konvertiert eine CorA-XML-Datei ins TEI-Format."
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("infile", help="Eingabedatei (XML)")
    parser.add_argument("outfile", nargs="?", help="Ausgabedatei (XML)")
    parser.add_argument(
        "-P",
        "--parser",
        choices=["rem", "anselm", "ref", "redi"],
        default="ref",
        help="Token parser to use, default: %(default)s",
    )
    # Unknown extra arguments are tolerated and ignored.
    args, _ = parser.parse_known_args()

    MyImporter = create_importer("coraxml", args.parser)
    MyExporter = create_exporter("tei")

    doc = MyImporter.import_from_file(args.infile)
    tei_doc = MyExporter.export(doc)
    # tostring(..., encoding="unicode") is the documented replacement for the
    # deprecated etree.tounicode().
    ausgabe = etree.tostring(tei_doc, encoding="unicode")
    # Put every <lb>/<pb> on its own line and a blank before each <space>.
    ausgabe = (
        ausgabe.replace("<lb ", "\n<lb ")
        .replace("<pb ", "\n<pb ")
        .replace("<space ", " <space ")
    )

    if args.outfile:
        # Context manager so the file is flushed and closed deterministically.
        with open(args.outfile, "w", encoding="utf-8") as out:
            print(ausgabe, file=out)
    else:
        # "outfile" is optional (nargs="?"); fall back to standard output
        # instead of failing on open(None).
        print(ausgabe)
Example #9
0
#!/usr/bin/env python3
# coding: utf-8

from coraxml_utils.importer import create_importer
from coraxml_utils.exporter import create_exporter
from coraxml_utils.modifier import postprocess, no_postprocess, prepare_for_cora

if __name__ == "__main__":

    # Importer: non-strict anselm dialect, configured with the
    # tok_dipl / tok_anno tag names.
    cora_importer = create_importer(
        "coraxml",
        dialect="anselm",
        strict=False,
        tok_dipl_tag="tok_dipl",
        tok_anno_tag="tok_anno",
    )

    # Exporter: configured with the dipl / mod tag names, so presumably
    # tok_dipl -> dipl and tok_anno -> mod on output (verify against exporter).
    export_options = {
        "dipl_tag_name": "dipl",
        "anno_tag_name": "mod",
    }
    cora_exporter = create_exporter("coraxml", options=export_options)

    # Only prepare_for_cora is passed; no_postprocess is imported but unused.
    postprocess(cora_importer, cora_exporter, prepare_for_cora)
Example #10
0
    def test_unsupported_file_format(self):
        """``create_importer`` must raise ValueError for an unknown format."""
        self.assertRaises(ValueError, create_importer,
                          '"some unknown format"')
Example #11
0
    def test_coraxml_unsupported_dialect(self):
        """``create_importer`` must raise ValueError for an unknown dialect."""
        self.assertRaises(ValueError, create_importer, 'coraxml',
                          '"some unknown dialect"')
Example #12
0
        "--parser",
        choices=["rem", "anselm", "ref", "redi"],
        default="ref",
        help="Token parser to use, default: %(default)s",
    )
    parser.add_argument(
        "--postprocessing",
        choices=["ref"],
        default=None,
        help="Script used to postprocess the xml file",
    )
    # Tolerate unrecognised command-line arguments, but report them.
    # The name "_" is kept because the rest of the script is not visible here.
    args, _ = parser.parse_known_args()
    if _:
        # logging.warn() is a deprecated alias that was removed in
        # Python 3.13; logging.warning() is the supported spelling.
        logging.warning("Unknown args: %s", _)

    MyImporter = create_importer("trans", args.parser)
    MyExporter = create_exporter("coraxml")

    print("~BEGIN CHECK")
    doc = None
    if os.path.splitext(args.infile)[-1].lower() == ".docx":
        trans = importTextFromDocx(Path(args.infile))
        doc = MyImporter.import_from_string(trans)
    else:
        with open(args.infile, "r", encoding="utf-8") as infile:
            doc = MyImporter.import_from_string(infile.read().replace(
                "\ufeff", ""))

    if doc:

        # do postprocessing
Example #13
0
#!/usr/bin/env python3
# coding: utf-8
from coraxml_utils.importer import create_importer
from coraxml_utils.exporter import create_exporter
from coraxml_utils.modifier import postprocess, ref_convert

if __name__ == "__main__":

    # Non-strict CorA-XML importer for the "ref" dialect.
    ref_importer = create_importer("coraxml", dialect="ref", strict=False)
    # CorA-XML exporter with no options given.
    plain_exporter = create_exporter("coraxml")

    postprocess(ref_importer, plain_exporter, ref_convert)
Example #14
0
#!/usr/bin/env python3
# coding: utf-8

from coraxml_utils.importer import create_importer
from coraxml_utils.exporter import create_exporter
from coraxml_utils.modifier import (
    postprocess,
    anselm_postprocess,
    anselm_document_postprocess,
)

if __name__ == "__main__":

    # Non-strict CorA-XML importer for the "anselm" dialect.
    anselm_importer = create_importer("coraxml",
                                      dialect="anselm",
                                      strict=False)

    # Exporter tag names: mod -> tok_anno, dipl -> tok_dipl
    export_options = {
        "dipl_tag_name": "tok_dipl",
        "anno_tag_name": "tok_anno",
    }
    anselm_exporter = create_exporter("coraxml", options=export_options)

    postprocess(
        anselm_importer,
        anselm_exporter,
        anselm_postprocess,
        anselm_document_postprocess,
    )
Example #15
0
    # -q / --nowarnings: boolean switch that silences warnings (see below).
    ap.add_argument(
        "-q",
        "--nowarnings",
        help="Quiet mode: show only errors, no warnings",
        action="store_true",
    )

    args = ap.parse_args()

    # Tagger mode overrides the tokenize and bibinfo settings.
    if args.taggermode:
        args.tokenize = "all"
        args.bibinfo = "none"

    # Quiet mode: configure logging so that only ERROR and above are shown.
    if args.nowarnings:
        logging.basicConfig(level=logging.ERROR)

    # Both directions use the "trans" format; the importer dialect comes
    # from the command line.
    MyImporter = create_importer("trans", dialect=args.parser)
    MyExporter = create_exporter("trans")

    # Read the whole input as UTF-8 and strip every byte-order mark before
    # handing the text to the importer.
    with open(args.inputfile, "r", encoding="utf-8") as infile:
        doc = MyImporter.import_from_string(infile.read().replace(
            "\ufeff", ""))

    export_contents = MyExporter.export(doc, token_form=args.format)
    if not args.output:
        # No output path given: write to standard output.
        sys.stdout.write(export_contents)
    else:
        # Text mode is required here: open() raises ValueError when an
        # encoding is combined with binary mode ("wb"), and the export
        # result is a str (it is also written to sys.stdout above).
        with open(args.output, "w", encoding="utf-8") as outputfile:
            outputfile.write(export_contents)
Example #16
0
#!/usr/bin/env python3
# coding: utf-8
from coraxml_utils.importer import create_importer
from coraxml_utils.exporter import create_exporter
from coraxml_utils.modifier import postprocess, ref_postprocess

if __name__ == "__main__":

    # CorA-XML importer for the "ref" dialect (no strict argument passed).
    ref_importer = create_importer("coraxml", dialect="ref")

    # Exporter tag names: mod -> tok_anno, dipl -> tok_dipl
    export_options = {
        "dipl_tag_name": "tok_dipl",
        "anno_tag_name": "tok_anno",
    }
    ref_exporter = create_exporter("coraxml", options=export_options)

    postprocess(ref_importer, ref_exporter, ref_postprocess)