Esempio n. 1
0
def test_to_standard():
    """Check that the site-format fixture converts to the same standard XML as the reference fixture.

    The reference file is first passed through ``from_standard``/``to_standard``
    and both trees are compared line by line after indentation.
    """
    site_passage = convert.from_site(load_xml("test_files/site3.xml"))
    old_reference = load_xml("test_files/standard3.xml")  # stored in the old XML format
    # Round-trip the reference so that it is expressed in the new XML format.
    expected_root = convert.to_standard(convert.from_standard(old_reference))
    actual_root = convert.to_standard(site_passage)
    expected_lines = textutil.indent_xml(ETree.tostring(expected_root)).splitlines()
    actual_lines = textutil.indent_xml(ETree.tostring(actual_root)).splitlines()
    assert expected_lines == actual_lines
Esempio n. 2
0
def test_preannotate_passage(create, as_array, convert_and_back, partial, monkeypatch):
    """Verify that attribute values planted on a passage are still present after ``textutil.annotate``.

    :param create: callable returning the passage under test
    :param as_array: if true, plant the values in ``l0.extra["doc"]``; otherwise in each terminal's ``extra``
    :param convert_and_back: index choosing the original passage (0/False) or its standard-format round trip (1/True)
    :param partial: if true, blank out the ENT_TYPE value so the passage is only partly pre-annotated
    :param monkeypatch: object whose ``setattr`` is used to replace ``textutil.get_nlp`` in the non-partial case
    """
    if not partial:
        # NOTE(review): presumably the replacement fails the test if the NLP pipeline is requested - confirm.
        monkeypatch.setattr(textutil, "get_nlp", assert_spacy_not_loaded)
    passage = create()
    l0 = passage.layer(layer0.LAYER_ID)
    attr_values = [10 + offset for offset in range(len(textutil.Attr))]
    if partial:
        attr_values[textutil.Attr.ENT_TYPE.value] = ""
    if as_array:
        # One row per terminal in each paragraph, every row referring to the same list of values.
        doc = []
        for paragraph in textutil.break2paragraphs(passage, return_terminals=True):
            doc.append([attr_values] * len(paragraph))
        l0.extra["doc"] = doc
    else:
        for terminal in l0.all:
            for attr, value in zip(textutil.Attr, attr_values):
                if not value:
                    continue  # falsy values (the blanked-out one) are not planted
                terminal.extra[attr.key] = value
    # The round trip is always computed; the flag only selects which passage is used from here on.
    round_trip = convert.from_standard(convert.to_standard(passage))
    passage = (passage, round_trip)[convert_and_back]
    mode = dict(as_array=as_array, as_extra=not as_array)
    if not partial:
        assert textutil.is_annotated(passage, **mode), \
            "Passage %s is not pre-annotated" % passage.ID
    textutil.annotate(passage, **mode)
    assert textutil.is_annotated(passage, **mode), \
        "Passage %s is not annotated" % passage.ID
    # NOTE(review): l0 was taken before the optional round trip, so these are the original passage's terminals.
    for terminal in l0.all:
        for index, (attr, expected) in enumerate(zip(textutil.Attr, attr_values)):
            if not expected:
                continue
            if as_array:
                actual = terminal.tok[index]
            else:
                actual = terminal.extra.get(attr.key)
            assert actual == expected, \
                "Terminal %s has wrong %s" % (terminal, attr.name)
Esempio n. 3
0
def test_preannotate_passage(create, as_array, convert_and_back, partial, monkeypatch):
    """Verify that attribute values planted on a passage are still present after ``textutil.annotate``.

    :param create: callable returning the passage under test
    :param as_array: if true, plant the values in ``l0.extra["doc"]``; otherwise in each terminal's ``extra``
    :param convert_and_back: index choosing the original passage (0/False) or its standard-format round trip (1/True)
    :param partial: if true, blank out the ENT_TYPE value so the passage is only partly pre-annotated
    :param monkeypatch: object whose ``setattr`` is used to replace ``textutil.get_nlp`` in the non-partial case
    """
    if not partial:
        # NOTE(review): presumably the replacement fails the test if the NLP pipeline is requested - confirm.
        monkeypatch.setattr(textutil, "get_nlp", assert_spacy_not_loaded)
    passage = create()
    l0 = passage.layer(layer0.LAYER_ID)
    attr_values = [10 + offset for offset in range(len(textutil.Attr))]
    if partial:
        attr_values[textutil.Attr.ENT_TYPE.value] = ""
    if as_array:
        # One row per terminal in each paragraph, every row referring to the same list of values.
        doc = []
        for paragraph in textutil.break2paragraphs(passage, return_terminals=True):
            doc.append([attr_values] * len(paragraph))
        l0.extra["doc"] = doc
    else:
        for terminal in l0.all:
            for attr, value in zip(textutil.Attr, attr_values):
                if not value:
                    continue  # falsy values (the blanked-out one) are not planted
                terminal.extra[attr.key] = value
    # The round trip is always computed; the flag only selects which passage is used from here on.
    round_trip = convert.from_standard(convert.to_standard(passage))
    passage = (passage, round_trip)[convert_and_back]
    if not partial:
        assert textutil.is_annotated(passage, as_array=as_array), "Passage %s is not pre-annotated" % passage.ID
    textutil.annotate(passage, as_array=as_array)
    assert textutil.is_annotated(passage, as_array=as_array), "Passage %s is not annotated" % passage.ID
    # NOTE(review): l0 was taken before the optional round trip, so these are the original passage's terminals.
    for terminal in l0.all:
        for index, (attr, expected) in enumerate(zip(textutil.Attr, attr_values)):
            if not expected:
                continue
            if as_array:
                actual = terminal.tok[index]
            else:
                actual = terminal.extra.get(attr.key)
            assert actual == expected, \
                "Terminal %s has wrong %s" % (terminal, attr.name)
Esempio n. 4
0
 def test_annotate_all(self):
     """Annotate several passages in bulk and check every terminal, before and after a standard-format round trip."""
     passages = [convert.from_standard(TestUtil.load_xml("test_files/standard3.xml"))]
     passages.append(TestUtil.create_passage())
     passages.append(TestUtil.create_crossing_passage())
     passages.append(TestUtil.create_discontiguous())
     passages.append(TestUtil.create_multi_passage())
     # First pass with default options; the result is only iterated to completion.
     for _ in textutil.annotate_all(passages):
         pass
     paired = ((p, p) for p in passages)
     for passage, compare in textutil.annotate_all(paired, as_array=True, as_tuples=True):
         # With as_tuples=True the second element of each input pair must come back untouched.
         assert passage is compare
         variants = (passage, convert.from_standard(convert.to_standard(passage)))
         for p in variants:
             for array_mode in (True, False):
                 self.assertTrue(is_annotated(p, as_array=array_mode),
                                 "Passage %s is not annotated" % passage.ID)
             for terminal in p.layer(layer0.LAYER_ID).all:
                 for attr in textutil.Attr:
                     missing_attr = "Terminal %s in passage %s has no %s" % (terminal, passage.ID, attr.name)
                     self.assertIn(attr.key, terminal.extra, missing_attr)
                 missing_tok = "Terminal %s in passage %s has no annotation" % (terminal, passage.ID)
                 self.assertIsNotNone(terminal.tok, missing_tok)
                 self.assertEqual(len(terminal.tok), len(textutil.Attr))
Esempio n. 5
0
def parse():
    """Parse the request's ``input`` text and respond with the resulting passage as indented XML.

    :return: a ``Response`` whose body is the indented XML, sent with Content-Type ``application/xml``
    """
    text = request.values["input"]
    print("Parsing text: '%s'" % text)
    in_passage = next(from_text(text))  # only the first passage built from the text is parsed
    # Take the first result produced by the parser; its element 0 is used as the output passage.
    out_passage = next(get_parser().parse(in_passage))[0]
    root = to_standard(out_passage)
    xml = tostring(root).decode()
    # "xml/application" has type and subtype swapped; the XML media type is "application/xml".
    return Response(indent_xml(xml), headers={"Content-Type": "application/xml"})
Esempio n. 6
0
def parse():
    """Parse the request's ``input`` text and respond with the resulting passage as indented XML.

    :return: a ``Response`` whose body is the indented XML, sent with Content-Type ``application/xml``
    """
    text = request.values["input"]
    print("Parsing text: '%s'" % text)
    in_passage = next(from_text(text))  # only the first passage built from the text is parsed
    # Take the first result produced by the parser; its element 0 is used as the output passage.
    out_passage = next(get_parser().parse(in_passage))[0]
    root = to_standard(out_passage)
    xml = tostring(root).decode()
    # "xml/application" has type and subtype swapped; the XML media type is "application/xml".
    return Response(indent_xml(xml),
                    headers={"Content-Type": "application/xml"})
Esempio n. 7
0
def print_passages_to_file(host_name,
                           db_name,
                           paids,
                           write_xml=False,
                           write_site_xml=False,
                           prefix='',
                           start_index=0):
    """
    Prints, for each given passage ID that has at least one annotation, a tab-separated line with
    <passage ID>, <user ID>, <source>, <xml ID>, <timestamp> of its most recent annotation (ordered by ts),
    and optionally writes that annotation to files. Returns nothing.
    host_name, db_name: passed to get_cursor to obtain the database cursor
    paids: iterable of passage IDs to look up
    write_xml: whether to write the annotation, converted to the standard format, to <prefix><passage ID>.xml
    write_site_xml: whether to write the annotation XML as stored in the database to <prefix><passage ID>_site.xml
    prefix: string prepended to the names of the written files
    start_index: passage IDs smaller than this value are skipped
    Annotations that cannot be converted from the site format are reported on stderr and skipped.
    """
    c = get_cursor(host_name, db_name)

    for paid in paids:
        if paid < start_index:  # skipping training passages
            continue
        c.execute("SELECT passage,source FROM passages WHERE id=%s", (paid, ))
        r = c.fetchone()
        if r is None:  # unknown passage ID
            continue
        source = r[1]
        c.execute(
            "SELECT id, xml,uid,ts FROM xmls WHERE paid=%s ORDER BY ts DESC",
            (paid, ))
        r = c.fetchone()  # first row = most recent annotation
        if r is None:  # passage has no annotation
            continue
        xid, site_xml, uid, ts = r[0], r[1], r[2], r[3]
        print('\t'.join(str(field) for field in (paid, uid, source, xid, ts)))

        if write_site_xml:
            # The context manager closes the file even if the write fails.
            with open(prefix + str(paid) + '_site.xml', 'w', encoding='utf-8') as f:
                f.write(site_xml + '\n')
        # Conversion is best-effort: a broken annotation must not abort the whole export.
        # noinspection PyBroadException
        try:
            ucca_dag = convert.from_site(fromstring(site_xml))
        except Exception:
            sys.stderr.write("Skipped xid,paid " + str((xid, paid)) + "\n")
            continue
        if write_xml:
            # Explicit encoding, consistent with the site XML file written above.
            with open(prefix + str(paid) + '.xml', 'w', encoding='utf-8') as f:
                f.write(tostring(convert.to_standard(ucca_dag)).decode())
Esempio n. 8
0
def test_annotate_passage(create, as_array):
    """Annotate a freshly created passage and check every terminal, before and after a standard-format round trip.

    :param create: callable returning the passage under test
    :param as_array: passed to ``textutil.annotate``/``is_annotated``; selects which kind of annotation is checked
    """
    passage = create()
    textutil.annotate(passage, as_array=as_array)
    variants = (passage, convert.from_standard(convert.to_standard(passage)))
    for p in variants:
        assert textutil.is_annotated(p, as_array=as_array), "Passage %s is not annotated" % passage.ID
        for terminal in p.layer(layer0.LAYER_ID).all:
            if not as_array:
                # Per-attribute annotation: every attribute key must be present in the terminal's extra dict.
                for attr in textutil.Attr:
                    assert attr.key in terminal.extra, "Terminal %s has no %s" % (terminal, attr.name)
                continue
            # Array annotation: the terminal exposes one value per attribute.
            assert terminal.tok is not None, "Terminal %s has no annotation" % terminal
            assert len(terminal.tok) == len(textutil.Attr)
Esempio n. 9
0
def test_annotate_passage(create, as_array):
    """Annotate a freshly created passage and check every terminal, before and after a standard-format round trip.

    :param create: callable returning the passage under test
    :param as_array: passed to ``textutil.annotate``/``is_annotated``; selects which kind of annotation is checked
    """
    passage = create()
    textutil.annotate(passage, as_array=as_array)
    variants = (passage, convert.from_standard(convert.to_standard(passage)))
    for p in variants:
        assert textutil.is_annotated(p, as_array=as_array), "Passage %s is not annotated" % passage.ID
        for terminal in p.layer(layer0.LAYER_ID).all:
            if not as_array:
                # Per-attribute annotation: every attribute key must be present in the terminal's extra dict.
                for attr in textutil.Attr:
                    assert attr.key in terminal.extra, "Terminal %s has no %s" % (terminal, attr.name)
                continue
            # Array annotation: the terminal exposes one value per attribute.
            assert terminal.tok is not None, "Terminal %s has no annotation" % terminal
            assert len(terminal.tok) == len(textutil.Attr)
Esempio n. 10
0
def test_annotate_all(as_array, convert_and_back):
    """Annotate all passages built from ``PASSAGES`` in bulk and check every terminal.

    :param as_array: passed to ``textutil.annotate_all``/``is_annotated``; selects which kind of annotation is checked
    :param convert_and_back: index choosing the annotated passage (0/False) or its standard-format round trip (1/True)
    """
    passages = []
    for create in PASSAGES:
        passages.append(create())
    # First pass with default options; the result is only iterated to completion.
    for _ in textutil.annotate_all(passages):
        pass
    paired = ((p, p) for p in passages)
    for passage, compare in textutil.annotate_all(paired, as_array=as_array, as_tuples=True):
        # With as_tuples=True the second element of each input pair must come back untouched.
        assert passage is compare
        # The round trip is always computed; the flag only selects which passage is checked.
        round_trip = convert.from_standard(convert.to_standard(passage))
        p = (passage, round_trip)[convert_and_back]
        assert textutil.is_annotated(p, as_array=as_array), "Passage %s is not annotated" % passage.ID
        for terminal in p.layer(layer0.LAYER_ID).all:
            if not as_array:
                for attr in textutil.Attr:
                    assert attr.key in terminal.extra, "Terminal %s in passage %s has no %s" % (
                        terminal, passage.ID, attr.name)
                continue
            assert terminal.tok is not None, "Terminal %s in passage %s has no annotation" % (terminal, passage.ID)
            assert len(terminal.tok) == len(textutil.Attr)
Esempio n. 11
0
def test_annotate_all(as_array, convert_and_back):
    """Annotate all passages built from ``PASSAGES`` in bulk and check every terminal.

    :param as_array: passed to ``textutil.annotate_all``/``is_annotated``; selects which kind of annotation is checked
    :param convert_and_back: index choosing the annotated passage (0/False) or its standard-format round trip (1/True)
    """
    passages = []
    for create in PASSAGES:
        passages.append(create())
    # First pass with default options; the result is only iterated to completion.
    for _ in textutil.annotate_all(passages):
        pass
    paired = ((p, p) for p in passages)
    for passage, compare in textutil.annotate_all(paired, as_array=as_array, as_tuples=True):
        # With as_tuples=True the second element of each input pair must come back untouched.
        assert passage is compare
        # The round trip is always computed; the flag only selects which passage is checked.
        round_trip = convert.from_standard(convert.to_standard(passage))
        p = (passage, round_trip)[convert_and_back]
        assert textutil.is_annotated(p, as_array=as_array), "Passage %s is not annotated" % passage.ID
        for terminal in p.layer(layer0.LAYER_ID).all:
            if not as_array:
                for attr in textutil.Attr:
                    assert attr.key in terminal.extra, "Terminal %s in passage %s has no %s" % (
                        terminal, passage.ID, attr.name)
                continue
            assert terminal.tok is not None, "Terminal %s in passage %s has no annotation" % (terminal, passage.ID)
            assert len(terminal.tok) == len(textutil.Attr)
Esempio n. 12
0
def passage2file(passage, filename, indent=True, binary=False):
    """Writes a UCCA passage as a standard XML file or a binary pickle
    :param passage: passage object to write
    :param filename: file name to write to
    :param indent: whether to indent each line (XML output only)
    :param binary: whether to write pickle format (or XML)
    """
    if binary:
        with open(filename, 'wb') as h:
            pickle.dump(passage, h)
        return
    # XML output
    root = to_standard(passage)
    xml = tostring(root).decode()
    output = indent_xml(xml) if indent else xml
    # Pin the encoding so the written file does not depend on the platform's default locale.
    with open(filename, 'w', encoding='utf-8') as h:
        h.write(output)
Esempio n. 13
0
 def test_annotate_passage(self):
     """Annotate the standard3 fixture both ways and check every terminal, before and after a round trip."""
     source_xml = TestUtil.load_xml("test_files/standard3.xml")
     passage = convert.from_standard(source_xml)
     # Annotate once with default options and once more as an array.
     textutil.annotate(passage)
     textutil.annotate(passage, as_array=True)
     variants = (passage, convert.from_standard(convert.to_standard(passage)))
     for p in variants:
         for array_mode in (True, False):
             self.assertTrue(is_annotated(p, as_array=array_mode),
                             "Passage %s is not annotated" % passage.ID)
         for terminal in p.layer(layer0.LAYER_ID).all:
             for attr in textutil.Attr:
                 missing_attr = "Terminal %s has no %s" % (terminal, attr.name)
                 self.assertIn(attr.key, terminal.extra, missing_attr)
             missing_tok = "Terminal %s has no annotation" % terminal
             self.assertIsNotNone(terminal.tok, missing_tok)
             self.assertEqual(len(terminal.tok), len(textutil.Attr))
Esempio n. 14
0
def write(graph, input, file):
    """Convert a graph to a passage and write it to ``file`` as indented standard XML.

    :param graph: graph handed to ``graph2passage``
    :param input: second argument handed to ``graph2passage``
    :param file: object whose ``write`` method receives the indented XML string
    """
    root = to_standard(graph2passage(graph, input))
    serialized = ET.tostring(root).decode()
    file.write(textutil.indent_xml(serialized))
Esempio n. 15
0
def test_to_standard():
    """Check that the site-format fixture serializes to exactly the reference standard XML."""
    site_passage = convert.from_site(load_xml("test_files/site3.xml"))
    expected_root = load_xml("test_files/standard3.xml")
    actual_root = convert.to_standard(site_passage)
    expected_bytes = ETree.tostring(expected_root)
    actual_bytes = ETree.tostring(actual_root)
    assert expected_bytes == actual_bytes
Esempio n. 16
0
 def test_to_standard(self):
     """Check that the site-format fixture serializes to exactly the reference standard XML."""
     site_passage = convert.from_site(TestUtil.load_xml("test_files/site3.xml"))
     expected_root = TestUtil.load_xml("test_files/standard3.xml")
     actual_root = convert.to_standard(site_passage)
     expected_bytes = ETree.tostring(expected_root)
     actual_bytes = ETree.tostring(actual_root)
     self.assertEqual(expected_bytes, actual_bytes)
Esempio n. 17
0
 def test_to_standard(self):
     """Check that the site-format fixture serializes to exactly the reference standard XML."""
     site_passage = convert.from_site(self._load_xml('./site3.xml'))
     expected_root = self._load_xml('./standard3.xml')
     actual_root = convert.to_standard(site_passage)
     expected_bytes = ETree.tostring(expected_root)
     actual_bytes = ETree.tostring(actual_root)
     self.assertEqual(expected_bytes, actual_bytes)
Esempio n. 18
0
 def test_to_standard(self):
     """Check that the site-format fixture serializes to exactly the reference standard XML."""
     site_passage = convert.from_site(TestUtil.load_xml("test_files/site3.xml"))
     expected_root = TestUtil.load_xml("test_files/standard3.xml")
     actual_root = convert.to_standard(site_passage)
     expected_bytes = ETree.tostring(expected_root)
     actual_bytes = ETree.tostring(actual_root)
     self.assertEqual(expected_bytes, actual_bytes)
Esempio n. 19
0
def test_to_standard():
    """Check that the site-format fixture converts to the same standard XML as the reference fixture.

    The reference file is first passed through ``from_standard``/``to_standard``
    before the serialized trees are compared.
    """
    site_passage = convert.from_site(load_xml("test_files/site3.xml"))
    old_reference = load_xml("test_files/standard3.xml")  # stored in the old XML format
    # Round-trip the reference so that it is expressed in the new XML format.
    expected_root = convert.to_standard(convert.from_standard(old_reference))
    actual_root = convert.to_standard(site_passage)
    expected_bytes = ETree.tostring(expected_root)
    actual_bytes = ETree.tostring(actual_root)
    assert expected_bytes == actual_bytes