def test_to_standard():
    """Site XML converted to standard XML must equal the re-serialized reference.

    The reference file is in the old XML format, so it is passed through
    ``from_standard``/``to_standard`` first; both trees are then compared
    line by line after ``indent_xml``.
    """
    site_passage = convert.from_site(load_xml("test_files/site3.xml"))
    old_format_ref = load_xml("test_files/standard3.xml")
    # Re-serialize the reference so it is in the new XML format
    expected_root = convert.to_standard(convert.from_standard(old_format_ref))
    actual_root = convert.to_standard(site_passage)
    expected_lines = textutil.indent_xml(ETree.tostring(expected_root)).splitlines()
    actual_lines = textutil.indent_xml(ETree.tostring(actual_root)).splitlines()
    assert expected_lines == actual_lines
def test_preannotate_passage(create, as_array, convert_and_back, partial, monkeypatch):
    """Check that annotate() keeps attribute values that were set beforehand.

    Values are stored either in the layer-0 ``doc`` array (``as_array``) or in
    each terminal's ``extra`` dict. When ``partial`` is set, the ENT_TYPE value
    is left empty; otherwise ``textutil.get_nlp`` is replaced by
    ``assert_spacy_not_loaded`` for the duration of the test.
    """
    if not partial:
        monkeypatch.setattr(textutil, "get_nlp", assert_spacy_not_loaded)
    passage = create()
    l0 = passage.layer(layer0.LAYER_ID)
    attr_values = [10 + offset for offset in range(len(textutil.Attr))]
    if partial:
        attr_values[textutil.Attr.ENT_TYPE.value] = ""

    if as_array:
        paragraphs = textutil.break2paragraphs(passage, return_terminals=True)
        l0.extra["doc"] = [[attr_values] * len(paragraph) for paragraph in paragraphs]
    else:
        # Only non-empty values are written to the terminals
        filled = [(attr, value) for attr, value in zip(textutil.Attr, attr_values) if value]
        for terminal in l0.all:
            for attr, value in filled:
                terminal.extra[attr.key] = value

    # The round trip is always computed; convert_and_back only selects which passage is used
    round_tripped = convert.from_standard(convert.to_standard(passage))
    candidates = (passage, round_tripped)
    passage = candidates[convert_and_back]

    use_extra = not as_array
    if not partial:
        assert textutil.is_annotated(passage, as_array=as_array, as_extra=use_extra), \
            "Passage %s is not pre-annotated" % passage.ID
    textutil.annotate(passage, as_array=as_array, as_extra=use_extra)
    assert textutil.is_annotated(passage, as_array=as_array, as_extra=use_extra), \
        "Passage %s is not annotated" % passage.ID

    # NOTE(review): l0 was obtained before the optional round trip, so with
    # convert_and_back these checks run on the original passage's terminals --
    # confirm this is intended.
    for terminal in l0.all:
        for index, (attr, expected) in enumerate(zip(textutil.Attr, attr_values)):
            if not expected:
                continue
            if as_array:
                actual = terminal.tok[index]
            else:
                actual = terminal.extra.get(attr.key)
            assert actual == expected, "Terminal %s has wrong %s" % (terminal, attr.name)
def test_preannotate_passage(create, as_array, convert_and_back, partial, monkeypatch):
    """Check that annotate() keeps attribute values that were set beforehand.

    Values are stored either in the layer-0 ``doc`` array (``as_array``) or in
    each terminal's ``extra`` dict. When ``partial`` is set, the ENT_TYPE value
    is left empty; otherwise ``textutil.get_nlp`` is replaced by
    ``assert_spacy_not_loaded`` for the duration of the test.
    """
    if not partial:
        monkeypatch.setattr(textutil, "get_nlp", assert_spacy_not_loaded)
    passage = create()
    l0 = passage.layer(layer0.LAYER_ID)
    attr_values = [10 + offset for offset in range(len(textutil.Attr))]
    if partial:
        attr_values[textutil.Attr.ENT_TYPE.value] = ""

    if as_array:
        paragraphs = textutil.break2paragraphs(passage, return_terminals=True)
        l0.extra["doc"] = [[attr_values] * len(paragraph) for paragraph in paragraphs]
    else:
        # Only non-empty values are written to the terminals
        filled = [(attr, value) for attr, value in zip(textutil.Attr, attr_values) if value]
        for terminal in l0.all:
            for attr, value in filled:
                terminal.extra[attr.key] = value

    # The round trip is always computed; convert_and_back only selects which passage is used
    round_tripped = convert.from_standard(convert.to_standard(passage))
    candidates = (passage, round_tripped)
    passage = candidates[convert_and_back]

    if not partial:
        assert textutil.is_annotated(passage, as_array=as_array), "Passage %s is not pre-annotated" % passage.ID
    textutil.annotate(passage, as_array=as_array)
    assert textutil.is_annotated(passage, as_array=as_array), "Passage %s is not annotated" % passage.ID

    # NOTE(review): l0 was obtained before the optional round trip, so with
    # convert_and_back these checks run on the original passage's terminals --
    # confirm this is intended.
    for terminal in l0.all:
        for index, (attr, expected) in enumerate(zip(textutil.Attr, attr_values)):
            if not expected:
                continue
            if as_array:
                actual = terminal.tok[index]
            else:
                actual = terminal.extra.get(attr.key)
            assert actual == expected, "Terminal %s has wrong %s" % (terminal, attr.name)
def test_annotate_all(self):
    """annotate_all() must annotate every passage, both as array and in ``extra``.

    Each passage is checked directly and again after a
    ``to_standard``/``from_standard`` round trip.
    """
    passages = [convert.from_standard(TestUtil.load_xml("test_files/standard3.xml"))]
    passages.append(TestUtil.create_passage())
    passages.append(TestUtil.create_crossing_passage())
    passages.append(TestUtil.create_discontiguous())
    passages.append(TestUtil.create_multi_passage())

    # First pass: exhaust the generator returned for the plain call
    list(textutil.annotate_all(passages))

    # Second pass: (passage, passage) tuples, so the second item must come back unchanged
    paired = ((p, p) for p in passages)
    for passage, compare in textutil.annotate_all(paired, as_array=True, as_tuples=True):
        assert passage is compare
        round_tripped = convert.from_standard(convert.to_standard(passage))
        for p in (passage, round_tripped):
            self.assertTrue(is_annotated(p, as_array=True),
                            "Passage %s is not annotated" % passage.ID)
            self.assertTrue(is_annotated(p, as_array=False),
                            "Passage %s is not annotated" % passage.ID)
            terminals = p.layer(layer0.LAYER_ID).all
            for terminal in terminals:
                for attr in textutil.Attr:
                    missing_attr = "Terminal %s in passage %s has no %s" % (
                        terminal, passage.ID, attr.name)
                    self.assertIn(attr.key, terminal.extra, missing_attr)
                missing_tok = "Terminal %s in passage %s has no annotation" % (
                    terminal, passage.ID)
                self.assertIsNotNone(terminal.tok, missing_tok)
                self.assertEqual(len(terminal.tok), len(textutil.Attr))
def parse():
    """Parse the text given in the ``input`` request value and return it as XML.

    Reads ``input`` from ``request.values``, takes the first passage produced
    by ``from_text`` for it, feeds that to the parser returned by
    ``get_parser()``, and serializes the first element of the first parse
    result with ``to_standard``.

    :return: a ``Response`` whose body is the indented XML string and whose
             ``Content-Type`` header is ``application/xml``
    """
    text = request.values["input"]
    print("Parsing text: '%s'" % text)
    in_passage = next(from_text(text))
    out_passage = next(get_parser().parse(in_passage))[0]
    root = to_standard(out_passage)
    xml = tostring(root).decode()
    # Media types are written type/subtype: the registered XML type is
    # "application/xml" (the previous value had the two parts swapped).
    return Response(indent_xml(xml), headers={"Content-Type": "application/xml"})
def print_passages_to_file(host_name, db_name, paids, write_xml=False, write_site_xml=False, prefix='', start_index=0):
    """Print a summary of the latest XML of each passage and optionally dump it to files.

    For every passage ID in ``paids`` that is not below ``start_index``, has a
    row in ``passages`` and has at least one row in ``xmls``, a tab-separated
    line ``<passage ID> <uid> <source> <xid> <ts>`` is printed for the ``xmls``
    row with the latest ``ts``. Passages without a ``passages`` row or without
    any ``xmls`` row are skipped. If the site XML cannot be converted, a
    message is written to stderr and the standard XML is not written.

    :param host_name: host name passed to get_cursor
    :param db_name: database name passed to get_cursor
    :param paids: iterable of passage IDs to look up
    :param write_xml: whether to write the converted standard XML to
                      <prefix><passage ID>.xml
    :param write_site_xml: whether to write the stored site XML to
                           <prefix><passage ID>_site.xml
    :param prefix: prefix of the output file names
    :param start_index: passage IDs smaller than this are skipped
    """
    c = get_cursor(host_name, db_name)
    for paid in paids:
        if paid < start_index:  # skipping training passages
            continue

        c.execute("SELECT passage,source FROM passages WHERE id=%s", (paid, ))
        passage_row = c.fetchone()
        if passage_row is None:
            # No such passage: skip it rather than report a source that
            # belongs to a previously processed passage
            continue
        source = passage_row[1]

        c.execute(
            "SELECT id, xml,uid,ts FROM xmls WHERE paid=%s ORDER BY ts DESC",
            (paid, ))
        xml_row = c.fetchone()  # first row is the most recent because of ORDER BY ts DESC
        if xml_row is None:
            continue
        xid, site_xml, uid, ts = xml_row
        print('\t'.join(
            [str(paid), str(uid), str(source), str(xid), str(ts)]))

        if write_site_xml:
            # Context manager so the handle is closed even if the write fails
            with open(prefix + str(paid) + '_site.xml', 'w', encoding='utf-8') as f:
                f.write(site_xml + '\n')

        # Conversion is best-effort: any failure is reported and the passage skipped
        # noinspection PyBroadException
        try:
            ucca_dag = convert.from_site(fromstring(site_xml))
        except Exception:
            sys.stderr.write("Skipped xid,paid " + str((xid, paid)) + "\n")
            continue

        if write_xml:
            with open(prefix + str(paid) + '.xml', 'w') as f:
                f.write(tostring(convert.to_standard(ucca_dag)).decode())
def test_annotate_passage(create, as_array):
    """annotate() must leave every terminal annotated, also after an XML round trip."""
    passage = create()
    textutil.annotate(passage, as_array=as_array)
    round_tripped = convert.from_standard(convert.to_standard(passage))
    for p in (passage, round_tripped):
        assert textutil.is_annotated(p, as_array=as_array), "Passage %s is not annotated" % passage.ID
        for terminal in p.layer(layer0.LAYER_ID).all:
            if not as_array:
                # Annotation stored per terminal: every attribute key must be present
                for attr in textutil.Attr:
                    assert attr.key in terminal.extra, "Terminal %s has no %s" % (terminal, attr.name)
                continue
            # Annotation stored as array: one entry per attribute
            assert terminal.tok is not None, "Terminal %s has no annotation" % terminal
            assert len(terminal.tok) == len(textutil.Attr)
def test_annotate_all(as_array, convert_and_back):
    """annotate_all() must annotate every passage built from PASSAGES.

    With ``convert_and_back`` the checks run on the passage after a
    ``to_standard``/``from_standard`` round trip.
    """
    passages = []
    for create in PASSAGES:
        passages.append(create())

    # First pass: exhaust the generator returned for the plain call
    list(textutil.annotate_all(passages))

    # Second pass: (passage, passage) tuples, so the second item must come back unchanged
    paired = ((p, p) for p in passages)
    for passage, compare in textutil.annotate_all(paired, as_array=as_array, as_tuples=True):
        assert passage is compare
        # The round trip is always computed; convert_and_back only selects which one is checked
        round_tripped = convert.from_standard(convert.to_standard(passage))
        candidates = (passage, round_tripped)
        p = candidates[convert_and_back]
        assert textutil.is_annotated(p, as_array=as_array), "Passage %s is not annotated" % passage.ID
        for terminal in p.layer(layer0.LAYER_ID).all:
            if not as_array:
                for attr in textutil.Attr:
                    assert attr.key in terminal.extra, "Terminal %s in passage %s has no %s" % (
                        terminal, passage.ID, attr.name)
                continue
            assert terminal.tok is not None, "Terminal %s in passage %s has no annotation" % (terminal, passage.ID)
            assert len(terminal.tok) == len(textutil.Attr)
def passage2file(passage, filename, indent=True, binary=False):
    """Save a UCCA passage to disk, as standard XML or as a binary pickle
    :param passage: the passage object to save
    :param filename: path of the file to create
    :param indent: for XML output, whether to indent each line
    :param binary: if true, write a pickle of the passage instead of XML
    """
    if binary:
        with open(filename, 'wb') as pickle_file:
            pickle.dump(passage, pickle_file)
        return

    # XML output: serialize the standard-format tree, optionally indented
    serialized = tostring(to_standard(passage)).decode()
    if indent:
        serialized = indent_xml(serialized)
    with open(filename, 'w') as xml_file:
        xml_file.write(serialized)
def test_annotate_passage(self):
    """annotate() must annotate a passage both in ``extra`` and as array.

    The passage is checked directly and again after a
    ``to_standard``/``from_standard`` round trip.
    """
    reference_xml = TestUtil.load_xml("test_files/standard3.xml")
    passage = convert.from_standard(reference_xml)
    textutil.annotate(passage)
    textutil.annotate(passage, as_array=True)
    round_tripped = convert.from_standard(convert.to_standard(passage))
    for p in (passage, round_tripped):
        self.assertTrue(is_annotated(p, as_array=True),
                        "Passage %s is not annotated" % passage.ID)
        self.assertTrue(is_annotated(p, as_array=False),
                        "Passage %s is not annotated" % passage.ID)
        terminals = p.layer(layer0.LAYER_ID).all
        for terminal in terminals:
            for attr in textutil.Attr:
                missing_attr = "Terminal %s has no %s" % (terminal, attr.name)
                self.assertIn(attr.key, terminal.extra, missing_attr)
            missing_tok = "Terminal %s has no annotation" % terminal
            self.assertIsNotNone(terminal.tok, missing_tok)
            self.assertEqual(len(terminal.tok), len(textutil.Attr))
def write(graph, input, file):
    """Convert a graph to a passage and write it to ``file`` as indented standard XML.

    :param graph: graph passed to graph2passage
    :param input: second argument passed to graph2passage
    :param file: object with a ``write`` method that receives the XML string
    """
    root = to_standard(graph2passage(graph, input))
    indented = textutil.indent_xml(ET.tostring(root).decode())
    file.write(indented)
def test_to_standard():
    """Site XML converted to standard XML must serialize exactly like the reference file."""
    site_passage = convert.from_site(load_xml("test_files/site3.xml"))
    reference_root = load_xml("test_files/standard3.xml")
    converted_root = convert.to_standard(site_passage)
    expected_xml = ETree.tostring(reference_root)
    actual_xml = ETree.tostring(converted_root)
    assert expected_xml == actual_xml
def test_to_standard(self):
    """Site XML converted to standard XML must serialize exactly like the reference file."""
    site_passage = convert.from_site(TestUtil.load_xml("test_files/site3.xml"))
    reference_root = TestUtil.load_xml("test_files/standard3.xml")
    converted_root = convert.to_standard(site_passage)
    expected_xml = ETree.tostring(reference_root)
    actual_xml = ETree.tostring(converted_root)
    self.assertEqual(expected_xml, actual_xml)
def test_to_standard(self):
    """Site XML converted to standard XML must serialize exactly like the reference file."""
    site_passage = convert.from_site(self._load_xml('./site3.xml'))
    reference_root = self._load_xml('./standard3.xml')
    converted_root = convert.to_standard(site_passage)
    expected_xml = ETree.tostring(reference_root)
    actual_xml = ETree.tostring(converted_root)
    self.assertEqual(expected_xml, actual_xml)
def test_to_standard():
    """Site XML converted to standard XML must equal the re-serialized reference.

    The reference file is in the old XML format, so it is passed through
    ``from_standard``/``to_standard`` before the serialized trees are compared.
    """
    site_passage = convert.from_site(load_xml("test_files/site3.xml"))
    old_format_ref = load_xml("test_files/standard3.xml")
    # Re-serialize the reference so it is in the new XML format
    expected_root = convert.to_standard(convert.from_standard(old_format_ref))
    actual_root = convert.to_standard(site_passage)
    expected_xml = ETree.tostring(expected_root)
    actual_xml = ETree.tostring(actual_root)
    assert expected_xml == actual_xml