def fix_punct(conllu_string):
    """Reattach punctuation nodes in a CoNLL-U string using udapi's FixPunct block.

    Takes a CoNLL-U formatted string and returns the corrected CoNLL-U string.
    """
    document = Document()
    document.from_conllu_string(conllu_string)
    FixPunct().process_document(document)
    return document.to_conllu_string()
def fix_punct(conllu_string):
    """Run udapi's FixPunct over a CoNLL-U string and strip udapi sent_id lines.

    Returns the corrected CoNLL-U string with the numeric "# sent_id = N"
    comments (added by udapi) removed.
    """
    document = Document()
    document.from_conllu_string(conllu_string)
    FixPunct().process_document(document)
    fixed = document.to_conllu_string()
    # udapi numbers the sentences itself; drop those generated sent_id comments.
    return re.sub(r'# sent_id = [0-9]+\n', r'', fixed)
def test_print_subtree(self):
    """Test print_subtree() method, which uses udapi.block.write.textmodetrees."""
    doc = Document()
    data_filename = os.path.join(os.path.dirname(__file__), 'data', 'enh_deps.conllu')
    doc.load_conllu(data_filename)
    root = doc.bundles[0].get_tree()
    # Default rendering: sent_id + text comments, then the ASCII-art tree.
    expected1 = ("# sent_id = a-mf920901-001-p1s1A\n"
                 "# text = Slovenská ústava: pro i proti\n"
                 "─┮\n"
                 " │ ╭─╼ Slovenská ADJ amod\n"
                 " ╰─┾ ústava NOUN root\n"
                 " ┡─╼ : PUNCT punct\n"
                 " ╰─┮ pro ADP appos\n"
                 " ┡─╼ i CONJ cc\n"
                 " ╰─╼ proti ADP conj\n"
                 "\n")
    # Custom attributes (form,feats,misc) with the header comments suppressed.
    expected2 = ("─┮\n"
                 " │ ╭─╼ Slovenská Case=Nom|Degree=Pos|Gender=Fem|Negative=Pos|Number=Sing _\n"
                 " ╰─┾ ústava Case=Nom|Gender=Fem|Negative=Pos|Number=Sing SpaceAfter=No\n"
                 " ┡─╼ : _ _\n"
                 " ╰─┮ pro AdpType=Prep|Case=Acc LId=pro-1\n"
                 " ┡─╼ i _ LId=i-1\n"
                 " ╰─╼ proti AdpType=Prep|Case=Dat LId=proti-1\n"
                 "\n")
    # test non-projective tree
    root3 = Root()
    for i in range(1, 5):
        root3.create_child(form=str(i))
    nodes = root3.descendants(add_self=1)
    nodes[1].parent = nodes[3]
    nodes[4].parent = nodes[2]
    # Crossing edges must be drawn with the ╪ crossing glyph.
    expected3 = ("─┮\n"
                 " │ ╭─╼ 1\n"
                 " ┡─╪───┮ 2\n"
                 " ╰─┶ 3 │\n"
                 " ╰─╼ 4\n"
                 "\n")
    try:
        # Capture stdout, since print_subtree() writes directly to it.
        sys.stdout = capture = io.StringIO()
        root.print_subtree(color=False)
        self.assertEqual(capture.getvalue(), expected1)
        capture.seek(0)
        capture.truncate()
        root.print_subtree(color=False, attributes='form,feats,misc',
                           print_sent_id=False, print_text=False)
        self.assertEqual(capture.getvalue(), expected2)
        capture.seek(0)
        capture.truncate()
        root3.print_subtree(color=False, attributes='form', print_sent_id=0, print_text=0)
        self.assertEqual(capture.getvalue(), expected3)
    finally:
        # Always restore the real stdout, even if an assertion fails.
        sys.stdout = sys.__stdout__  # pylint: disable=redefined-variable-type
def fix_punct(conllu_string):
    """FixPunct wrapper that shields possessive apostrophes from re-attachment.

    Possessive "'" tokens tagged PART/POS would otherwise be treated as
    punctuation by udapi's FixPunct block, so they are masked with a
    placeholder entity before processing and unmasked afterwards.
    Returns the corrected CoNLL-U string without udapi's numeric sent_id lines.
    """
    # Mask possessive apostrophes (form "'", UPOS PART, XPOS POS).
    masked = re.sub(r"\t'\t([^\t\n]+\tPART\tPOS)", r'\t&udapi_apos;\t\1',
                    conllu_string, flags=re.MULTILINE)
    document = Document()
    document.from_conllu_string(masked)
    FixPunct().process_document(document)
    result = document.to_conllu_string()
    # Restore the masked apostrophes, then drop udapi-generated sent_id comments.
    result = result.replace('&udapi_apos;', "'")
    return re.sub(r'# sent_id = [0-9]+\n', r'', result)
def execute(self): """Parse given scenario and execute it.""" # Parse the given scenario from the command line. block_names, block_args = _parse_command_line_arguments( self.args.scenario) # Import blocks (classes) and construct block instances. blocks = _import_blocks(block_names, block_args) # Initialize blocks (process_start). for block in blocks: block.process_start() readers = [] for block in blocks: try: block.finished # pylint: disable=pointless-statement readers.append(block) except AttributeError: pass if not readers: logging.info('No reader specified, using read.Conllu') conllu_reader = Conllu() readers = [conllu_reader] blocks = readers + blocks # Apply blocks on the data. finished = False filenames_iterator = 0 # !!! ADDED !!! while not finished: document = Document() logging.info(" ---- ROUND ----") for block in blocks: if (filenames_iterator < len(block.filenames)): # !!! filename = block.filenames[filenames_iterator] # !!! document.set_filename(filename) # ADDED filenames_iterator += 1 # !!! logging.info("Executing block " + block.__class__.__name__) block.before_process_document(document) result = block.process_document(document) if (type(result) == int): init_cluster_id = result block.after_process_document(document) finished = True for reader in readers: finished = finished and reader.finished # 6. close blocks (process_end) for block in blocks: block.process_end()
def process_doc(book_list, outfile_name):
    """Assemble a document from the trees of the given books, apply the
    module-level blocks, and store the result when a filename is given.

    Relies on the module-level `tree_dic` (book -> [(line, tree), ...])
    and `blocks` (udapi block instances).
    """
    ordered_doc = Document()
    for book in book_list:
        for _, tree in tree_dic[book]:
            new_bundle = ordered_doc.create_bundle()
            new_bundle.add_tree(tree)
    for blk in blocks:
        blk.apply_on_document(ordered_doc)
    if outfile_name:
        ordered_doc.store_conllu(outfile_name)
def test_deps_getter(self): """Test enhanced dependencies.""" # Create a path to the test CoNLLU file. data_filename = os.path.join(os.path.dirname(__file__), 'data', 'enh_deps.conllu') # Read a test CoNLLU file. document = Document() reader = Conllu(files=data_filename) reader.process_document(document) # Exactly one bundle should be loaded. self.assertEqual(len(document.bundles), 1) # Obtain the dependency tree and check its sentence ID. root = document.bundles[0].get_tree() self.assertEqual(root.bundle.bundle_id, 'a-mf920901-001-p1s1A') # Check raw secondary dependencies for each node. nodes = root.descendants() self.assertEqual(nodes[0].raw_deps, '0:root|2:amod') self.assertEqual(nodes[1].raw_deps, '0:root') self.assertEqual(nodes[2].raw_deps, '0:root') self.assertEqual(nodes[3].raw_deps, '0:root') self.assertEqual(nodes[4].raw_deps, '1:amod') self.assertEqual(nodes[5].raw_deps, '5:conj') # Check deserialized dependencies. self.assertEqual(nodes[0].deps[0]['parent'], root) self.assertEqual(nodes[0].deps[0]['deprel'], 'root') self.assertEqual(nodes[5].deps[0]['parent'], nodes[4])
def setUpClass(cls):
    """Load udapi's bundled enh_deps.conllu fixture once and cache the first
    tree, its nodes, and an empty node inserted at position 3."""
    data_path = os.path.join(os.path.dirname(udapi.__file__),
                             "core", "tests", "data", "enh_deps.conllu")
    cls.doc = Document()
    cls.data = data_path
    cls.doc.load_conllu(data_path)
    cls.tree = cls.doc.bundles[0].get_tree()
    cls.nodes = cls.tree.descendants
    cls.add_empty_node(cls.tree, 3)
def test_topology(self):
    """Test methods/properties descendants, children, prev_node, next_node, ord."""
    doc = Document()
    data_filename = os.path.join(os.path.dirname(__file__), 'data', 'enh_deps.conllu')
    doc.load_conllu(data_filename)
    self.assertEqual(len(doc.bundles), 1)
    root = doc.bundles[0].get_tree()
    nodes = root.descendants
    nodes2 = root.descendants()
    # descendants() and descendants should return the same sequence of nodes
    self.assertEqual(nodes, nodes2)
    self.assertEqual(len(nodes), 6)
    self.assertEqual(nodes[1].parent, root)
    self.assertEqual(nodes[2].root, root)
    self.assertEqual(len(nodes[1].descendants), 5)
    self.assertEqual(len(nodes[1].children), 3)
    # children() accepts filters; add_self includes the node itself.
    self.assertEqual(len(nodes[1].children(add_self=True)), 4)
    self.assertEqual(len(nodes[1].children(add_self=1, following_only=1)), 3)
    # Linear-order neighbours; the artificial root has no preceding node.
    self.assertEqual(nodes[0].next_node, nodes[1])
    self.assertEqual(nodes[2].prev_node, nodes[1])
    self.assertEqual(nodes[5].next_node, None)
    self.assertEqual(root.prev_node, None)
    # Minimal common treelet where one input node dominates the other.
    (common_ancestor, added_nodes) = find_minimal_common_treelet(nodes[0], nodes[1])
    self.assertEqual(common_ancestor, nodes[1])
    self.assertEqual(list(added_nodes), [])
    input_nodes = [nodes[2], nodes[4], nodes[5]]
    (common_ancestor, added_nodes) = find_minimal_common_treelet(*input_nodes)
    self.assertEqual(common_ancestor, nodes[1])
    self.assertEqual(list(added_nodes), [nodes[1], nodes[3]])
    # ords and reorderings
    self.assertEqual([node.ord for node in nodes], [1, 2, 3, 4, 5, 6])
    # Comparison operators are defined by word order (precedes).
    self.assertTrue(nodes[0].precedes(nodes[1]))
    self.assertTrue(nodes[0] < nodes[1])
    self.assertFalse(nodes[0] > nodes[1])
    self.assertTrue(nodes[0] <= nodes[0])
    # Shifting renumbers ords; the cached `nodes` list keeps its old order,
    # while a fresh descendants() call returns the new surface order.
    nodes[0].shift_after_node(nodes[1])
    self.assertEqual([node.ord for node in nodes], [2, 1, 3, 4, 5, 6])
    self.assertEqual([node.ord for node in root.descendants()], [1, 2, 3, 4, 5, 6])
def setUpClass(cls):
    """Parse the Daphne treebank sample once and cache the tree under test.

    Loads the AGLDT XML fixture relative to the tb2ud package, applies the
    reader to a fresh document, and caches bundle 263's tree and nodes on
    the class for the tests to use.
    """
    cls.doc = Document()
    cls.data = os.path.join(
        os.path.dirname(tb2ud.__file__),
        "../test/data/tlg0011.tlg005.daphne_tb-grc1.xml")
    reader = AgldtReader(cls.data)
    reader.apply_on_document(cls.doc)
    # Removed leftover debug print of len(cls.doc.bundles): it polluted
    # test output on every run and served no assertion.
    # Bundle index 263 is the specific sentence exercised by the tests.
    cls.tree = cls.doc.bundles[263].get_tree()
    cls.nodes = cls.tree.descendants
def setUpClass(cls):
    """Read the artificials CoNLL-U fixture once and prepare a shared writer.

    The conversion state flag starts False; individual tests flip it after
    running the subtree conversion.
    """
    data_path = os.path.join(os.path.dirname(tb2ud.__file__),
                             "../test/data/artificials.conllu")
    cls.doc = Document()
    cls.data = data_path
    cls._reader = ConlluReader(files=data_path)
    cls._reader.apply_on_document(cls.doc)
    cls.writer = ConlluWriter()
    cls._subtreeconverted = False
def fix_punct(conllu_string):
    """FixPunct wrapper masking apostrophes and selected double quotes.

    Possessive "'" tokens (PART/POS) and '"' tokens whose matching later
    column does not begin with 'p' (presumably non-PUNCT uses — confirm
    against the column layout) are masked with placeholder entities so
    udapi's FixPunct does not re-attach them as punctuation. The masks are
    reverted afterwards and udapi's numeric sent_id comments are removed.
    """
    # Mask possessive apostrophes and non-punctuation double quotes.
    masked = re.sub(r"\t'\t([^\t\n]+\tPART\tPOS)", r'\t&udapi_apos;\t\1',
                    conllu_string, flags=re.MULTILINE)
    masked = re.sub(
        r'\t"\t([^\t\n]+\t[^\t\n]+\t[^\t\n]+\t[^\t\n]+\t[^\t\n]+\t[^p])',
        r'\t&udapi_quot;\t\1', masked, flags=re.MULTILINE)
    document = UdapiDocument()
    document.from_conllu_string(masked)
    FixPunct().process_document(document)
    fixed = document.to_conllu_string()
    # Unmask the protected characters, then strip generated sent_id comments.
    fixed = fixed.replace('&udapi_apos;', "'").replace('&udapi_quot;', '"')
    return re.sub(r'# sent_id = [0-9]+\n', r'', fixed)
def main():
    """Parse the module-level conllu_string1, run the shifter and subtree
    converter over it, report the empty-node count, and write the result."""
    doc = Document()
    doc.from_conllu_string(conllu_string1)
    tree = doc.bundles[0].get_tree()
    nodes = tree.descendants
    writer = ConlluWriter()
    # Shift artificial nodes first, then convert subtrees with enhanced deps.
    ShiftArtificials().apply_on_document(doc)
    SubTreeConverter(with_enhanced=True).apply_on_document(doc)
    print(len(tree.empty_nodes))
    writer.apply_on_document(doc)
def extract_senseid_children_collocates(conllu_filename):
    """Collect, per target word and sense id, the bundles in which each
    (deprel, form) child collocate occurs.

    Returns a nested dict: target -> senseid -> deprel -> OrderedDict of
    form -> [bundles], with forms sorted by descending bundle count.
    """
    D = Document()
    D.load_conllu(conllu_filename)  #'Chinese_train_pos.xml.utf8.sentences.conllu.senseid'
    # Vividict auto-creates nested levels on first access.
    target_senseid_deprel_form_bundles = Vividict()  #defaultdict(dict)
    for bundle in D.bundles:
        setattr_words(bundle=bundle)
        # Walk the sentence in surface order via next_node.
        node = bundle.get_tree()
        while node:
            target = node.form
            senseid = node.misc['senseid']
            if senseid:
                # For a verb like 想, list all children of the sense node:
                for child in node.children:
                    # An untouched Vividict leaf compares equal to {}.
                    if target_senseid_deprel_form_bundles[target][senseid][
                            child.deprel][child.form] == {}:
                        target_senseid_deprel_form_bundles[target][senseid][
                            child.deprel][child.form] = [bundle]
                    else:
                        target_senseid_deprel_form_bundles[target][senseid][
                            child.deprel][child.form].append(bundle)
            node = node.next_node
    # To convert back to a common dictionary instance:
    d = dict(target_senseid_deprel_form_bundles)
    for target, senseid_deprel_form_bundles in target_senseid_deprel_form_bundles.items(
    ):
        d[target] = dict(senseid_deprel_form_bundles)
        for senseid, deprel_form_bundles in senseid_deprel_form_bundles.items(
        ):
            d[target][senseid] = dict(deprel_form_bundles)
            for deprel, form_bundles in deprel_form_bundles.items():
                #d[target][senseid][deprel]=dict(form_bundles)
                # Most frequent collocate forms first.
                sorted_form_bundles = sorted(
                    form_bundles.items(),
                    key=lambda form_bundles: len(form_bundles[1]),
                    reverse=True)
                d[target][senseid][deprel] = OrderedDict(sorted_form_bundles)
    return d
def test_topology(self):
    """Test methods/properties descendants, children, prev_node, next_node, ord."""
    doc = Document()
    data_filename = os.path.join(os.path.dirname(__file__), 'data', 'enh_deps.conllu')
    doc.load_conllu(data_filename)
    self.assertEqual(len(doc.bundles), 1)
    root = doc.bundles[0].get_tree()
    nodes = root.descendants
    nodes2 = root.descendants()
    # descendants() and descendants should return the same sequence of nodes
    self.assertEqual(nodes, nodes2)
    self.assertEqual(len(nodes), 6)
    self.assertEqual(nodes[1].parent, root)
    self.assertEqual(nodes[2].root, root)
    self.assertEqual(len(nodes[1].descendants), 5)
    self.assertEqual(len(nodes[1].children), 3)
    # children() accepts filters; add_self includes the node itself.
    self.assertEqual(len(nodes[1].children(add_self=True)), 4)
    self.assertEqual(len(nodes[1].children(add_self=1, following_only=1)), 3)
    # Linear-order neighbours; the artificial root has no preceding node.
    self.assertEqual(nodes[0].next_node, nodes[1])
    self.assertEqual(nodes[2].prev_node, nodes[1])
    self.assertEqual(nodes[5].next_node, None)
    self.assertEqual(root.prev_node, None)
    # Minimal common treelet where one input node dominates the other.
    (common_ancestor, added_nodes) = find_minimal_common_treelet(nodes[0], nodes[1])
    self.assertEqual(common_ancestor, nodes[1])
    self.assertEqual(list(added_nodes), [])
    input_nodes = [nodes[2], nodes[4], nodes[5]]
    (common_ancestor, added_nodes) = find_minimal_common_treelet(*input_nodes)
    self.assertEqual(common_ancestor, nodes[1])
    self.assertEqual(list(added_nodes), [nodes[1], nodes[3]])
    # ords and reorderings
    self.assertEqual([node.ord for node in nodes], [1, 2, 3, 4, 5, 6])
    # Shifting renumbers ords; the cached `nodes` list keeps its old order,
    # while a fresh descendants() call returns the new surface order.
    nodes[0].shift_after_node(nodes[1])
    self.assertEqual([node.ord for node in nodes], [2, 1, 3, 4, 5, 6])
    self.assertEqual([node.ord for node in root.descendants()], [1, 2, 3, 4, 5, 6])
def execute(self):
    """Parse the scenario given on the command line and run it.

    Builds the block pipeline, prepends a default CoNLL-U reader when no
    block acts as a reader, and applies the pipeline document by document
    until every reader reports it is finished.
    """
    # Build block instances from the scenario specification.
    block_names, block_args = _parse_command_line_arguments(self.args.scenario)
    blocks = _import_blocks(block_names, block_args)

    # Initialize blocks (process_start).
    for block in blocks:
        block.process_start()

    # A "reader" is any block exposing a `finished` attribute.
    readers = []
    for block in blocks:
        try:
            block.finished  # pylint: disable=pointless-statement
        except AttributeError:
            continue
        readers.append(block)
    if not readers:
        logging.info('No reader specified, using read.Conllu')
        readers = [Conllu()]
    blocks = readers + blocks

    # Apply blocks on the data, one document per round.
    done = False
    while not done:
        document = Document()
        logging.info(" ---- ROUND ----")
        for block in blocks:
            logging.info("Executing block " + block.__class__.__name__)
            block.apply_on_document(document)
        # Evaluate every reader's flag (no short-circuit), then combine.
        done = all([reader.finished for reader in readers])

    # Close blocks (process_end).
    for block in blocks:
        block.process_end()
def load():
    """Benchmark udapi document operations over 30 runs and print mean/std.

    Phases timed per run: load from disk, attribute reads, a deprel query,
    attribute writes, text recomputation, and saving back to disk.
    """
    from udapi.core.document import Document
    load, read, write, text, relchain, save = [], [], [], [], [], []
    for _ in range(30):
        # Phase 1: load the corpus from disk.
        start = timeit.default_timer()
        document = Document()
        document.load_conllu('cs-ud-train-l.conllu')
        end = timeit.default_timer()
        load.append(end - start)
        # Phase 2: read — touch form and lemma of every node.
        start = timeit.default_timer()
        for bundle in document:
            for root in bundle:
                for node in root.descendants:
                    form_lemma = node.form + node.lemma
        end = timeit.default_timer()
        read.append(end - start)
        # Phase 3: relchain — query case-marking children of nmod heads.
        start = timeit.default_timer()
        for bundle in document:
            for root in bundle:
                chain = [n for n in root.descendants
                         if n.deprel == "case" and n.parent.deprel == "nmod"]
        end = timeit.default_timer()
        relchain.append(end - start)
        # Phase 4: write — overwrite every node's deprel.
        start = timeit.default_timer()
        for bundle in document:
            for root in bundle:
                for node in root.descendants:
                    node.deprel = 'dep'
        end = timeit.default_timer()
        write.append(end - start)
        # Phase 5: recompute the sentence text of every tree.
        start = timeit.default_timer()
        for bundle in document:
            for root in bundle:
                root.compute_text()
        end = timeit.default_timer()
        text.append(end - start)
        # Phase 6: save back to disk.
        start = timeit.default_timer()
        document.store_conllu('hello.conllu')
        end = timeit.default_timer()
        save.append(end - start)
    # Report mean +/- standard deviation for each phase.
    for x, y in [('load', load), ('read', read), ('write', write),
                 ('text', text), ('relchain', relchain), ('save', save)]:
        print("{}\t{} +/- {}".format(x, round(np.mean(y), 2), round(np.std(y), 2)))
def load():
    """Exercise the udapi document workflow: load, read attributes, query,
    mutate, recompute text, and save."""
    from udapi.core.document import Document
    doc = Document()
    doc.load_conllu('cs-ud-train-l.conllu')
    # Read pass: touch form and lemma of every node.
    for bundle in doc:
        for root in bundle:
            for node in root.descendants:
                form_lemma = node.form + node.lemma
    # Query pass: nodes whose parent is a determiner of an object.
    for bundle in doc:
        for root in bundle:
            chain = [n for n in root.descendants
                     if n.parent.deprel == "det" and n.parent.parent.deprel == "obj"]
    # Write pass: overwrite every deprel.
    for bundle in doc:
        for root in bundle:
            for node in root.descendants:
                node.deprel = 'dep'
    # Recompute sentence text, then store.
    for bundle in doc:
        for root in bundle:
            root.compute_text()
    doc.store_conllu('hello.conllu')
def test_iterator(self):
    """Iterating a Document should yield its bundles in order."""
    document = Document()
    document.bundles = ['a', 'b', 'c']
    for item in document:
        print(item)
if outfile_name: ordered_doc.store_conllu(outfile_name) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument("infile", help="Input file") parser.add_argument('-a', '--all', action='store_true', help='create all the 6-book chunks possible') parser.add_argument('-s', '--start', type=int, default=1, help='Starting book') parser.add_argument('-e', '--end', type=int, default=24, help='Ending book') parser.add_argument('-o', '--out', help='Output file') args = parser.parse_args() # outname = args.out doc = Document() reader = AgldtReader(args.infile, fix_cycles=True) reader.apply_on_document(doc) trees = [b.get_tree() for b in doc.bundles] if args.all: start = 1 while 1: stop = start + 5 if stop > 24: break else: tree_dic = get_ordered_trees(trees, start, stop) book_list = sorted(tree_dic.keys()) outf = args.infile.replace('.tb.xml', f'.{start}-{stop}.tb.conllu') process_doc(book_list, outf)
def test_init(self):
    """A fresh Document must construct without raising."""
    document = Document()
from udapi.core.document import Document

# Load the sense-annotated SemEval corpus and, for every sentence bundle,
# collect the word forms in surface order and print them with the bundle id.
D = Document()
D.load_conllu(
    'SemEval-2007/Chinese_train_pos.xml.utf8.sentences.conllu.senseid')
for bundle in D.bundles:
    bundle.words = []
    current = bundle.get_tree()
    # Walk the sentence in linear order via next_node.
    while current:
        bundle.words.append(current.form)
        current = current.next_node
    print(bundle.bundle_id, bundle.words)
def test_print_subtree(self):
    """Test print_subtree() method, which uses udapi.block.write.textmodetrees."""
    doc = Document()
    data_filename = os.path.join(os.path.dirname(__file__), 'data', 'enh_deps.conllu')
    doc.load_conllu(data_filename)
    root = doc.bundles[0].get_tree()
    # Default rendering: sent_id + text comments, then the ASCII-art tree.
    expected1 = ("# sent_id = a-mf920901-001-p1s1A\n"
                 "# text = Slovenská ústava: pro i proti\n"
                 "─┮\n"
                 " │ ╭─╼ Slovenská ADJ amod\n"
                 " ╰─┾ ústava NOUN root\n"
                 " ┡─╼ : PUNCT punct\n"
                 " ╰─┮ pro ADP appos\n"
                 " ┡─╼ i CONJ cc\n"
                 " ╰─╼ proti ADP conj\n"
                 "\n")
    # Custom attributes (form,feats,misc) with the header comments suppressed.
    expected2 = (
        "─┮\n"
        " │ ╭─╼ Slovenská Case=Nom|Degree=Pos|Gender=Fem|Negative=Pos|Number=Sing _\n"
        " ╰─┾ ústava Case=Nom|Gender=Fem|Negative=Pos|Number=Sing SpaceAfter=No\n"
        " ┡─╼ : _ _\n"
        " ╰─┮ pro AdpType=Prep|Case=Acc LId=pro-1\n"
        " ┡─╼ i _ LId=i-1\n"
        " ╰─╼ proti AdpType=Prep|Case=Dat LId=proti-1\n"
        "\n")
    # test non-projective tree
    root3 = Root()
    for i in range(1, 5):
        root3.create_child(form=str(i))
    nodes = root3.descendants(add_self=1)
    nodes[1].parent = nodes[3]
    nodes[4].parent = nodes[2]
    # Crossing edges must be drawn with the ╪ crossing glyph.
    expected3 = ("─┮\n"
                 " │ ╭─╼ 1\n"
                 " ┡─╪───┮ 2\n"
                 " ╰─┶ 3 │\n"
                 " ╰─╼ 4\n"
                 "\n")
    try:
        # Capture stdout, since print_subtree() writes directly to it.
        sys.stdout = capture = io.StringIO()
        root.print_subtree(color=False)
        self.assertEqual(capture.getvalue(), expected1)
        capture.seek(0)
        capture.truncate()
        root.print_subtree(color=False, attributes='form,feats,misc',
                           print_sent_id=False, print_text=False)
        self.assertEqual(capture.getvalue(), expected2)
        capture.seek(0)
        capture.truncate()
        root3.print_subtree(color=False, attributes='form', print_sent_id=0, print_text=0)
        self.assertEqual(capture.getvalue(), expected3)
    finally:
        # Always restore the real stdout, even if an assertion fails.
        sys.stdout = sys.__stdout__  # pylint: disable=redefined-variable-type
maxseed = 2**32; def myrand(modulo): global seed seed = (1103515245 * seed + 12345) % maxseed; return seed % modulo; debug = False if sys.argv[1] == "-d": debug = True sys.argv.pop(1) in_conllu = sys.argv[1] out_conllu = sys.argv[2] print("init") doc = Document() doc.load({'filename':in_conllu}) print("load") if debug: doc.store({'filename':'udapi-load.conllu'}) for bundle in doc: for root in bundle: for node in root.descendants(): pass print("iter") for bundle in doc: for root in bundle: for node in root.descendants():
This script takes an AGDT xml file and generate a half-baked CONLL-U, right before the SetArtificial stage. In this way, we create a test set to verify the problems in the SetArtificial stage. """ from udapi.core.document import Document from udapi.block.agldt.setspaceafter import SetSpaceAfter from udapi.block.read.agldt import Agldt as AgldtReader from tb2ud import * from tb2ud.text.updatetext import UpdateText from tb2ud.postprocess.fixsomepos import FixSomePos from collections import defaultdict import re tst_file = "./data/artificial_sentences.xml" doc = Document() reader = AgldtReader(tst_file, fix_cycles=True) reader.apply_on_document(doc) #trees = [b.get_tree() for b in doc.bundles] blocks = [ SetSpaceAfter(), CreateUpos(), CreateFeats(), SetMember(), ShallowConverter(), ShiftArtificials(), SubTreeConverter(with_enhanced=True), FixObj(), # SetArtificials(), MakeEnhanced(), # COMMENT OUT if you DO NOT want empty nodes and enhanced deps RehangPunct(),
if book_start <= int(bk) <= book_end: d[int(bk)].append((int(ln), tree)) print("reordering the dictionary") for k in d.keys(): d[k].sort(key=lambda x: x[0]) return d parser = argparse.ArgumentParser() parser.add_argument("infile", help="Input file") parser.add_argument('-s', '--start', type=int, default=1, help='Starting book') parser.add_argument('-e', '--end', type=int, default=24, help='Ending book') parser.add_argument('-o', '--out', help='Output file') args = parser.parse_args() doc = Document() reader = AgldtReader(args.infile, fix_cycles=True) reader.apply_on_document(doc) trees = [b.get_tree() for b in doc.bundles] tree_dic = get_ordered_trees(trees, args.start, args.end) book_list = sorted(tree_dic.keys()) ordered_doc = Document() for book in book_list: for _, sent in tree_dic[book]: bund = ordered_doc.create_bundle() bund.add_tree(sent) outname = args.out
def from_connlu(conllu):
    """Parse a CoNLL-U string into a new udapi Document and return it.

    NOTE(review): the name keeps the historical "connlu" spelling because
    renaming would break existing callers.
    """
    document = Document()
    document.from_conllu_string(conllu)
    return document