def test_to_sdp(self):
    """Convert a standard-XML passage to SDP lines and back, checking both directions."""
    source = convert.from_standard(TestUtil.load_xml("test_files/standard3.xml"))
    sdp_lines = convert.to_sdp(source)
    with open("test_files/standard3.sdp", encoding="utf-8") as f:
        # To regenerate the reference file: f.write("\n".join(sdp_lines))
        self.assertSequenceEqual(sdp_lines, f.read().splitlines() + [""])
    round_tripped = next(convert.from_sdp(sdp_lines, source.ID))
    # To regenerate: ioutil.passage2file(round_tripped, "test_files/standard3.sdp.xml")
    reference = convert.from_standard(TestUtil.load_xml("test_files/standard3.sdp.xml"))
    self.assertTrue(round_tripped.equals(reference))
def test_to_export(self):
    """Convert a standard-XML passage to export format and back, checking both directions."""
    passage = convert.from_standard(TestUtil.load_xml('test_files/standard3.xml'))
    converted = convert.to_export(passage)
    # Specify the encoding explicitly, matching the sibling to_sdp/to_conll tests;
    # otherwise the platform default may mis-decode the reference file.
    with open('test_files/standard3.export', encoding='utf-8') as f:
        # To regenerate the reference file: f.write("\n".join(converted))
        self.assertSequenceEqual(converted, f.read().splitlines())
    converted_passage = next(convert.from_export(converted, passage.ID))
    # To regenerate: ioutil.passage2file(converted_passage, 'test_files/standard3.export.xml')
    ref = convert.from_standard(TestUtil.load_xml('test_files/standard3.export.xml'))
    self.assertTrue(converted_passage.equals(ref))
def test_to_conll(self):
    """Convert a standard-XML passage to CoNLL and back; also convert a doubled input."""
    source = convert.from_standard(TestUtil.load_xml("test_files/standard3.xml"))
    conll_lines = convert.to_conll(source)
    with open("test_files/standard3.conll", encoding="utf-8") as f:
        # To regenerate the reference file: f.write("\n".join(conll_lines))
        self.assertSequenceEqual(conll_lines, f.read().splitlines() + [""])
    round_tripped = next(convert.from_conll(conll_lines, source.ID))
    # To regenerate: ioutil.passage2file(round_tripped, "test_files/standard3.conll.xml")
    reference = convert.from_standard(TestUtil.load_xml("test_files/standard3.conll.xml"))
    self.assertTrue(round_tripped.equals(reference))
    # Put the same sentence twice and try converting again
    for round_tripped in convert.from_conll(conll_lines * 2, source.ID):
        reference = convert.from_standard(TestUtil.load_xml("test_files/standard3.conll.xml"))
        self.assertTrue(round_tripped.equals(reference))
def test_preannotate_passage(create, as_array, convert_and_back, partial, monkeypatch):
    # Pre-fill annotation attributes on every terminal, then check that
    # textutil.annotate() keeps the pre-set values (and, when `partial`,
    # fills in only the missing one).
    if not partial:
        # Fully pre-annotated input should need no NLP pipeline; the helper's
        # name suggests it fails the test if a spaCy model load is attempted.
        monkeypatch.setattr(textutil, "get_nlp", assert_spacy_not_loaded)
    passage = create()
    l0 = passage.layer(layer0.LAYER_ID)
    # One distinct dummy value (10, 11, ...) per annotation attribute.
    attr_values = list(range(10, 10 + len(textutil.Attr)))
    if partial:
        attr_values[textutil.Attr.ENT_TYPE.value] = ""  # leave one attribute unset
    if as_array:
        # Array form: one row of attribute values per terminal, grouped by paragraph.
        l0.extra["doc"] = [len(p) * [attr_values]
                           for p in textutil.break2paragraphs(passage, return_terminals=True)]
    else:
        # Extra-dict form: set each (truthy) attribute value under its key on every terminal.
        for terminal in l0.all:
            for attr, value in zip(textutil.Attr, attr_values):
                if value:
                    terminal.extra[attr.key] = value
    # Optionally round-trip through standard XML to check annotations survive conversion.
    passage = (passage, convert.from_standard(convert.to_standard(passage)))[convert_and_back]
    if not partial:
        assert textutil.is_annotated(passage, as_array=as_array, as_extra=not as_array), \
            "Passage %s is not pre-annotated" % passage.ID
    textutil.annotate(passage, as_array=as_array, as_extra=not as_array)
    assert textutil.is_annotated(passage, as_array=as_array, as_extra=not as_array), \
        "Passage %s is not annotated" % passage.ID
    # Every pre-set value must be unchanged after annotation.
    for terminal in l0.all:
        for i, (attr, value) in enumerate(zip(textutil.Attr, attr_values)):
            if value:
                assert (terminal.tok[i] if as_array else terminal.extra.get(attr.key)) == value, \
                    "Terminal %s has wrong %s" % (terminal, attr.name)
def __init__(self, f):
    """Build the tree either from an existing root Node or from a standard-XML file path."""
    if isinstance(f, Node):
        self.root = f
        return
    print("Reading '%s'..." % f)
    passage = convert.from_standard(ET.parse(f).getroot())
    self.root = Node('ROOT')
    top_nodes = [self.build(node)
                 for layer in passage.layers
                 for node in layer.all
                 if not node.incoming]
    self.root.set_children_binarized(top_nodes)
def visualize():
    """Render the posted standard-XML passage as a PNG and return it base64-encoded."""
    xml = request.get_data()
    passage = from_standard(fromstring(xml))
    print("Visualizing passage %s: %s" % (passage.ID, passage.layer(layer1.LAYER_ID).heads[0]))
    canvas = FigureCanvasAgg(plt.figure())
    draw(passage)
    buffer = BytesIO()
    canvas.print_png(buffer)
    encoded = b64encode(buffer.getvalue()).decode()
    return Response(quote(encoded.rstrip("\n")))
def test_annotate_passage(create, as_array):
    """annotate() must mark every terminal, and annotations must survive an XML round-trip."""
    passage = create()
    textutil.annotate(passage, as_array=as_array)
    for p in passage, convert.from_standard(convert.to_standard(passage)):
        assert textutil.is_annotated(p, as_array=as_array), "Passage %s is not annotated" % passage.ID
        for terminal in p.layer(layer0.LAYER_ID).all:
            if as_array:
                assert terminal.tok is not None, "Terminal %s has no annotation" % terminal
                assert len(terminal.tok) == len(textutil.Attr)
                continue
            for attr in textutil.Attr:
                assert attr.key in terminal.extra, "Terminal %s has no %s" % (terminal, attr.name)
def file2passage(filename):
    """Opens a file and returns its parsed Passage object

    Tries to read both as a standard XML file and as a binary pickle

    :param filename: path of the file to read (XML or pickle)
    """
    try:
        with open(filename) as f:
            etree = ElementTree().parse(f)
        return from_standard(etree)
    except Exception as e:
        # Not parseable as XML -- fall back to unpickling.
        # NOTE(review): pickle.load is unsafe on untrusted files; assumed local data here.
        try:
            with open(filename, 'rb') as h:
                return pickle.load(h)
        except Exception:
            # If the pickle fallback also fails, surface the original XML error.
            raise e
def test_annotate_all(as_array, convert_and_back):
    """Batch annotation must cover all passages, in both plain and tuple mode."""
    passages = [create() for create in PASSAGES]
    list(textutil.annotate_all(passages))
    annotated = textutil.annotate_all(((p, p) for p in passages), as_array=as_array, as_tuples=True)
    for passage, compare in annotated:
        assert passage is compare
        p = (passage, convert.from_standard(convert.to_standard(passage)))[convert_and_back]
        assert textutil.is_annotated(p, as_array=as_array), "Passage %s is not annotated" % passage.ID
        for terminal in p.layer(layer0.LAYER_ID).all:
            if as_array:
                assert terminal.tok is not None, "Terminal %s in passage %s has no annotation" % (terminal, passage.ID)
                assert len(terminal.tok) == len(textutil.Attr)
            else:
                for attr in textutil.Attr:
                    assert attr.key in terminal.extra, "Terminal %s in passage %s has no %s" % (
                        terminal, passage.ID, attr.name)
def test_from_standard(self):
    """A passage loaded from standard XML must equal the same passage loaded from site XML."""
    from_xml = convert.from_standard(TestUtil.load_xml("test_files/standard3.xml"))
    from_site = convert.from_site(TestUtil.load_xml("test_files/site3.xml"))
    self.assertTrue(from_xml.equals(from_site, ordered=True))
def loaded_valid():
    """Return the passage parsed from the known-valid standard XML test file."""
    xml_root = load_xml("test_files/standard3_valid.xml")
    return convert.from_standard(xml_root)
def loaded():
    """Return the passage parsed from the standard XML test file."""
    xml_root = load_xml("test_files/standard3.xml")
    return convert.from_standard(xml_root)
x.append(par) if I == []: x.append(par) y.append(x) return (y) index = list(range(0, 100)) for t in index: f1 = open('UCCAannotated_source/%s.xml' % t) xml_string1 = f1.read() f1.close() xml_object1 = fromstring(xml_string1) P1 = convert.from_standard(xml_object1) #for semi-automatic SAMSA L1 = get_num_scenes(P1) L2 = get_num_sentences('%s.txt' % t) M1 = get_cmrelations(P1) A1 = get_cparticipants(P1) #print(L1) #print(L2) #print(M1) #print(A1) if L1 < L2: score = 0 elif L1 == L2: f1 = open( 'scene_sentence_alignment_output/a%s.txt' % t
def test_to_site(self):
    """Converting to site format and back must reproduce the original passage."""
    original = convert.from_standard(TestUtil.load_xml("test_files/standard3.xml"))
    site_root = convert.to_site(original)
    restored = convert.from_site(site_root)
    self.assertTrue(original.equals(restored))
return (output) def get_sentences(P): """ P is the output of the simplification system. Return all the sentences in each passage """ dirpath = '/Mypath/System_output' folder = nltk.data.find(dirpath) corpusReader = nltk.corpus.PlaintextCorpusReader(folder, P) d = len(corpusReader.sents()) return (corpusReader.sents()[:d]) index = list(range(0, 100)) for t in index: f1 = open('UCCAannotated_source/%s.xml' % t) xml_string1 = f1.read() f1.close() xml_object1 = fromstring(xml_string1) P1 = convert.from_standard( xml_object1) #from_site for the semi-automatic version of SAMSA L1 = get_scenes(P1) L2 = get_sentences('%s.txt' % t) s = open('s%s.txt' % t, 'w') s.write('%s\n' % L1) s.write('%s\n' % L2) s.close()
def test_to_site(self):
    """Converting to site format and back must reproduce the original passage."""
    original = convert.from_standard(self._load_xml('./standard3.xml'))
    site_root = convert.to_site(original)
    restored = convert.from_site(site_root)
    self.assertTrue(original.equals(restored))
def test_from_standard(self):
    """A passage loaded from standard XML must equal the same passage loaded from site XML."""
    from_xml = convert.from_standard(self._load_xml('./standard3.xml'))
    from_site = convert.from_site(self._load_xml('./site3.xml'))
    self.assertTrue(from_xml.equals(from_site, ordered=True))
def file2passage(filename):
    """Parse a standard xml file and return its Passage object."""
    with open(filename) as f:
        tree = ElementTree().parse(f)
        return convert.from_standard(tree)
def download():
    """Convert the posted XML to the requested format and return it as a response."""
    xml = request.values["input"]
    out_format = request.values["format"]
    print("Converting to " + out_format)
    if out_format == "xml":
        out = xml
    else:
        out = "\n".join(TO_FORMAT[out_format](from_standard(fromstring(xml))))
    return Response(out, headers={"Content-Type": CONTENT_TYPES.get(out_format, "text/plain")})
def test_to_text(self):
    """to_text() yields one joined string when unsplit, one string per sentence otherwise."""
    source = convert.from_standard(TestUtil.load_xml("test_files/standard3.xml"))
    self.assertEqual(convert.to_text(source, False)[0], "1 2 3 4 . 6 7 8 9 10 . 12 13 14 15")
    self.assertSequenceEqual(convert.to_text(source, True),
                             ["1 2 3 4 .", "6 7 8 9 10 .", "12 13 14 15"])
def test_to_site(self):
    """Round-trip through the site format must leave the passage unchanged."""
    passage = convert.from_standard(TestUtil.load_xml("test_files/standard3.xml"))
    converted_back = convert.from_site(convert.to_site(passage))
    self.assertTrue(passage.equals(converted_back))
def test_to_text(self):
    """to_text() yields one joined string when unsplit, one string per sentence otherwise."""
    source = convert.from_standard(self._load_xml('./standard3.xml'))
    self.assertEqual(convert.to_text(source, False)[0], '1 2 3 4 . 6 7 8 9 10 . 12 13 14 15')
    self.assertSequenceEqual(convert.to_text(source, True),
                             ['1 2 3 4 .', '6 7 8 9 10 .', '12 13 14 15'])
def test_from_standard(self):
    """Standard-XML and site-XML loaders must produce equal passages."""
    standard_passage = convert.from_standard(TestUtil.load_xml("test_files/standard3.xml"))
    site_passage = convert.from_site(TestUtil.load_xml("test_files/site3.xml"))
    self.assertTrue(standard_passage.equals(site_passage, ordered=True))
def __init__(self, *args, **kwargs):
    # Initialize the test case, set up the parser configuration, and load
    # the shared test passage used by the parser tests.
    super(ParserTests, self).__init__(*args, **kwargs)
    # Config must be created before loading, to establish the "-m test" settings.
    Config("", "-m", "test")
    self.passage = convert.from_standard(TestUtil.load_xml('test_files/standard3.xml'))
starts = [0, len(tokens)] return [ ' '.join(tokens[starts[i]:starts[i + 1]]) for i in range(len(starts) - 1) ] index = list(range(0, 1500)) for t in index: f1 = open('test.en.tupa_parsed/newsdiscusstest2015-enfr-src_%s.xml' % t) xml_string1 = f1.read() f1.close() xml_object1 = fromstring(xml_string1) P1 = convert.from_standard(xml_object1) L1 = get_Hscenes(P1) L2 = get_EAscenes(P1)[0] C2 = get_EAscenes(P1)[1] T = get_passage(P1) D = to_word_text(P1) split12 = [] for h in L1: D1 = get_difference(h, L2, C2) split12.append(D1) S1 = sum(split12, []) s = open('12r%s.txt' % t, 'w') if S1 != []: s.write('%s\n' % S1)