def test_deps_getter(self):
    """Test enhanced dependencies."""
    # Create a path to the test CoNLLU file.
    data_filename = os.path.join(os.path.dirname(__file__), 'data', 'enh_deps.conllu')

    # Read the test CoNLLU file.
    document = Document()
    reader = Conllu(files=data_filename)
    reader.process_document(document)

    # Exactly one bundle should be loaded.
    self.assertEqual(len(document.bundles), 1)

    # Obtain the dependency tree and check its sentence ID.
    root = document.bundles[0].get_tree()
    self.assertEqual(root.bundle.bundle_id, 'a-mf920901-001-p1s1A')

    # Check raw secondary dependencies for each node.
    nodes = root.descendants()
    self.assertEqual(nodes[0].raw_deps, '0:root|2:amod')
    self.assertEqual(nodes[1].raw_deps, '0:root')
    self.assertEqual(nodes[2].raw_deps, '0:root')
    self.assertEqual(nodes[3].raw_deps, '0:root')
    self.assertEqual(nodes[4].raw_deps, '1:amod')
    self.assertEqual(nodes[5].raw_deps, '5:conj')

    # Check deserialized dependencies.
    self.assertEqual(nodes[0].deps[0]['parent'], root)
    self.assertEqual(nodes[0].deps[0]['deprel'], 'root')
    self.assertEqual(nodes[5].deps[0]['parent'], nodes[4])
def __init__(self, model):
    """Create the UDPipe tool object."""
    self.model = model
    path = require_file(model)
    self.tool = Model.load(path)
    if not self.tool:
        raise IOError("Cannot load model from file '%s'" % path)
    self.error = ProcessingError()
    self.conllu_reader = ConlluReader()
    self.tokenizer = self.tool.newTokenizer(Model.DEFAULT)
def get_ud_analysis(analysis):
    """Return (heads, words) extracted from a CoNLL-U string produced by a UD parser."""
    analysis = Conllu(filehandle=StringIO(analysis)).read_tree()
    heads = []
    words = [node.form for node in analysis.descendants]
    for token in analysis.descendants:
        if token.deprel == 'root':
            # root_id = words.index(token.form)
            heads.append(-1)  # the root has no parent
        else:
            # Note: words.index() returns the first occurrence of the parent's form,
            # so repeated word forms all map to the earliest matching position.
            heads.append(words.index(token.parent.form))
    return heads, words
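# A minimal usage sketch for get_ud_analysis, assuming the Conllu reader accepts a
# bare ten-column sentence; the parse below is hand-written and purely illustrative.
sample = (
    "1\tJohn\tJohn\tPROPN\t_\t_\t2\tnsubj\t_\t_\n"
    "2\tsleeps\tsleep\tVERB\t_\t_\t0\troot\t_\t_\n\n"
)
heads, words = get_ud_analysis(sample)
print(words)  # ['John', 'sleeps']
print(heads)  # [1, -1] -- the root gets -1, other tokens point at their parent's index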
def execute(self):
    """Parse given scenario and execute it."""
    # Parse the given scenario from the command line.
    block_names, block_args = _parse_command_line_arguments(self.args.scenario)

    # Import blocks (classes) and construct block instances.
    blocks = _import_blocks(block_names, block_args)

    # Initialize blocks (process_start).
    for block in blocks:
        block.process_start()

    readers = []
    for block in blocks:
        try:
            block.finished  # pylint: disable=pointless-statement
            readers.append(block)
        except AttributeError:
            pass
    if not readers:
        logging.info('No reader specified, using read.Conllu')
        conllu_reader = Conllu()
        readers = [conllu_reader]
        blocks = readers + blocks

    # Apply blocks on the data.
    finished = False
    filenames_iterator = 0  # !!! ADDED !!!
    while not finished:
        document = Document()
        logging.info(" ---- ROUND ----")
        for block in blocks:
            if filenames_iterator < len(block.filenames):        # !!!
                filename = block.filenames[filenames_iterator]   # !!!
                document.set_filename(filename)                  # ADDED
                filenames_iterator += 1                          # !!!
            logging.info("Executing block " + block.__class__.__name__)
            block.before_process_document(document)
            result = block.process_document(document)
            if type(result) == int:
                init_cluster_id = result
            block.after_process_document(document)
        finished = True
        for reader in readers:
            finished = finished and reader.finished

    # 6. close blocks (process_end)
    for block in blocks:
        block.process_end()
def __call__(self, q_tokens_batch: List[List[str]]) -> List[Tuple[str, str]]:
    entity_rel_list = []
    for q_tokens in q_tokens_batch:
        q_str = '\n'.join(q_tokens)
        s = self.full_ud_model.process(q_str)
        tree = Conllu(filehandle=StringIO(s)).read_tree()
        fnd, detected_entity, detected_rel = self.find_entity(tree, q_tokens)
        if not fnd:
            fnd, detected_entity, detected_rel = self.find_entity_adj(tree)
        detected_entity = detected_entity.replace("первый ", '')
        entity_rel_list.append((detected_entity, detected_rel))

    return entity_rel_list
def _get_groups_from_tree(sentence):
    """Group the dependents of NOUN/PROPN/VERB heads in the UD parse of `sentence`."""
    groups = defaultdict(list)
    to_tree = UDPIPE.process(sentence)
    tree = Conllu(filehandle=StringIO(to_tree)).read_tree()
    nodes = tree.descendants
    for node in nodes:
        parent = node.parent
        parent_form = parent.form
        parent_tag = parent.upos
        _node = node.form
        if parent_tag in ['NOUN', 'PROPN', 'VERB']:
            groups[(parent_form, parent_tag)].append((_node, node.upos))
    return groups
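# A hedged usage sketch for _get_groups_from_tree. It assumes the module-level
# UDPIPE object is already loaded and that UDPIPE.process(sentence) returns
# CoNLL-U text; both are assumptions about the surrounding module.
groups = _get_groups_from_tree("The quick fox jumped over the lazy dog")
for (head_form, head_tag), dependents in groups.items():
    # Each key is a NOUN/PROPN/VERB head form with its UPOS; the value lists
    # that head's dependents together with their UPOS tags.
    print(head_form, head_tag, dependents)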
def execute(self):
    """Parse given scenario and execute it."""
    # Parse the given scenario from the command line.
    block_names, block_args = _parse_command_line_arguments(self.args.scenario)

    # Import blocks (classes) and construct block instances.
    blocks = _import_blocks(block_names, block_args)

    # Initialize blocks (process_start).
    for block in blocks:
        block.process_start()

    readers = []
    for block in blocks:
        try:
            block.finished  # pylint: disable=pointless-statement
            readers.append(block)
        except AttributeError:
            pass
    if not readers:
        logging.info('No reader specified, using read.Conllu')
        conllu_reader = Conllu()
        readers = [conllu_reader]
        blocks = readers + blocks

    # Apply blocks on the data.
    finished = False
    while not finished:
        document = Document()
        logging.info(" ---- ROUND ----")
        for block in blocks:
            logging.info("Executing block " + block.__class__.__name__)
            block.apply_on_document(document)
        finished = True
        for reader in readers:
            finished = finished and reader.finished

    # 6. close blocks (process_end)
    for block in blocks:
        block.process_end()
def from_conllu_string(self, string):
    """Load a document from a conllu-formatted string."""
    reader = ConlluReader(filehandle=io.StringIO(string))
    reader.apply_on_document(self)
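# A small usage sketch, assuming from_conllu_string is a method of the udapi
# Document class (as in the snippets above); the one-token CoNLL-U sentence is
# illustrative.
doc = Document()
doc.from_conllu_string("1\tHello\thello\tINTJ\t_\t_\t0\troot\t_\t_\n\n")
print(len(doc.bundles))  # 1 bundle parsed from the string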
def __call__(self, syntax_tree_batch: List[str],
             positions_batch: List[List[List[int]]]) -> Tuple[List[List[str]], List[Dict[str, str]]]:
    log.debug(f"positions of entity tokens {positions_batch}")
    query_nums_batch = []
    entities_dict_batch = []
    types_dict_batch = []
    questions_batch = []
    count = False
    for syntax_tree, positions in zip(syntax_tree_batch, positions_batch):
        log.debug(f"\n{syntax_tree}")
        tree = Conllu(filehandle=StringIO(syntax_tree)).read_tree()
        root = self.find_root(tree)
        tree_desc = tree.descendants
        unknown_node = ""
        if root:
            log.debug(f"syntax tree info, root: {root.form}")
            unknown_node, unknown_branch = self.find_branch_with_unknown(root)
        positions = [num for position in positions for num in position]
        if unknown_node:
            log.debug(f"syntax tree info, unknown node: {unknown_node.form}, "
                      f"unknown branch: {unknown_branch.form}")
            log.debug(f"wh_leaf: {self.wh_leaf}")
            clause_node, clause_branch = self.find_clause_node(root, unknown_branch)
            modifiers, clause_modifiers = self.find_modifiers_of_unknown(unknown_node)
            log.debug(f"modifiers: {[modifier.form for modifier in modifiers]}")
            if f"{tree_desc[0].form.lower()} {tree_desc[1].form.lower()}" in self.change_root_tokens:
                new_root = root.children[0]
            else:
                new_root = root
            root_desc = defaultdict(list)
            for node in new_root.children:
                if node.deprel not in ["punct", "advmod", "cop", "mark"]:
                    if node == unknown_branch:
                        root_desc[node.deprel].append(node)
                    else:
                        if self.find_entities(node, positions, cut_clause=False) or \
                                (self.find_year_or_number(node) and node.deprel in ["obl", "nummod"]):
                            root_desc[node.deprel].append(node)
            if root.form.lower() == self.how_many or \
                    ("nsubj" in root_desc.keys() and
                     self.how_many in [nd.form.lower() for nd in root_desc["nsubj"]]):
                count = True
            log.debug(f"root_desc {root_desc.keys()}")
            appos_token_nums = sorted(self.find_appos_tokens(root, []))
            appos_tokens = [elem.form for elem in tree_desc if elem.ord in appos_token_nums]
            clause_token_nums = sorted(self.find_clause_tokens(root, clause_node, []))
            clause_tokens = [elem.form for elem in tree_desc if elem.ord in clause_token_nums]
            log.debug(f"appos tokens: {appos_tokens}")
            log.debug(f"clause_tokens: {clause_tokens}")
            self.root_entity = False
            if root.ord - 1 in positions:
                self.root_entity = True
            query_nums, entities_dict, types_dict = self.build_query(
                new_root, unknown_branch, root_desc, unknown_node, modifiers,
                clause_modifiers, clause_node, positions, count=count)
            if self.lang == "rus":
                question = ' '.join([node.form for node in tree.descendants
                                     if (node.ord not in appos_token_nums or
                                         node.ord not in clause_token_nums)])
            else:
                question = ' '.join([node.form for node in tree.descendants])
            log.debug(f"sanitized question: {question}")
            query_nums_batch.append(query_nums)
            entities_dict_batch.append(entities_dict)
            types_dict_batch.append(types_dict)
            questions_batch.append(question)
    return questions_batch, query_nums_batch, entities_dict_batch, types_dict_batch
class UDPipe:
    """Wrapper for UDPipe (more pythonic than ufal.udpipe)."""

    def __init__(self, model):
        """Create the UDPipe tool object."""
        self.model = model
        path = require_file(model)
        self.tool = Model.load(path)
        if not self.tool:
            raise IOError("Cannot load model from file '%s'" % path)
        self.error = ProcessingError()
        self.conllu_reader = ConlluReader()
        self.tokenizer = self.tool.newTokenizer(Model.DEFAULT)

    def tag_parse_tree(self, root):
        """Tag (+lemmatize, fill FEATS) and parse a tree (already tokenized)."""
        descendants = root.descendants
        if not descendants:
            return
        pipeline = Pipeline(self.tool, 'horizontal', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
        in_data = " ".join([n.form for n in descendants])
        out_data = pipeline.process(in_data, self.error)
        if self.error.occurred():
            raise IOError("UDPipe error " + self.error.message)
        self.conllu_reader.files.filehandle = io.StringIO(out_data)
        parsed_root = self.conllu_reader.read_tree()
        nodes = [root] + descendants
        for parsed_node in parsed_root.descendants:
            node = nodes[parsed_node.ord]
            node.parent = nodes[parsed_node.parent.ord]
            for attr in 'upos xpos lemma feats deprel'.split():
                setattr(node, attr, getattr(parsed_node, attr))

        # TODO: benchmark which solution is the fastest one. E.g. we could also do
        # for node, parsed_node in zip(root.descendants, parsed_root.descendants):
        #     parsed_node.misc = node.misc
        # pylint: disable=protected-access
        # root._children, root._descendants = parsed_root._children, parsed_root._descendants

    def tokenize_tag_parse_tree(self, root, resegment=False, tag=True, parse=True):
        """Tokenize, tag (+lemmatize, fill FEATS) and parse the text stored in `root.text`.

        If resegment=True, the returned list of Udapi trees may contain multiple trees.
        """
        if root.children:
            raise ValueError('Tree already contained nodes before tokenization')

        # Tokenize and segment the text (segmentation cannot be turned off in older UDPipe versions).
        self.tokenizer.setText(root.text)
        is_another = True
        u_sentences = []
        while is_another:
            u_sentence = Sentence()
            is_another = self.tokenizer.nextSentence(u_sentence)
            if is_another:
                u_sentences.append(u_sentence)

        # If resegmentation was not required, we need to join the segments.
        if not resegment and len(u_sentences) > 1:
            first_sent = u_sentences[0]
            n_words = first_sent.words.size() - 1
            for other_sent in u_sentences[1:]:
                other_words = other_sent.words.size() - 1
                for i in range(1, other_words + 1):
                    u_w = other_sent.words[i]
                    n_words += 1
                    u_w.id = n_words
                    first_sent.words.append(u_w)
            u_sentences = [first_sent]

        # tagging and parsing
        if tag:
            for u_sentence in u_sentences:
                self.tool.tag(u_sentence, Model.DEFAULT)
                if parse:
                    self.tool.parse(u_sentence, Model.DEFAULT)
        elif parse:
            raise ValueError('Combination parse=True tag=False is not allowed.')

        # converting UDPipe nodes to Udapi nodes
        new_root = root
        trees = []
        for u_sentence in u_sentences:
            if not new_root:
                new_root = Root()
            heads, nodes = [], [new_root]
            u_words = u_sentence.words
            for i in range(1, u_words.size()):
                u_w = u_words[i]
                node = new_root.create_child(
                    form=u_w.form, lemma=u_w.lemma, upos=u_w.upostag,
                    xpos=u_w.xpostag, feats=u_w.feats, deprel=u_w.deprel, misc=u_w.misc,
                )
                if parse:
                    heads.append(u_w.head)
                    nodes.append(node)
            if parse:
                for node in nodes[1:]:
                    head = heads.pop(0)
                    node.parent = nodes[head]
            trees.append(new_root)
            new_root = None
        return trees
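# A hedged usage sketch for the UDPipe wrapper above. The model filename is a
# placeholder (any UDPipe 1.x model accepted by require_file should do) and the
# text is illustrative.
udpipe = UDPipe(model='english-ud-2.0.udpipe')  # hypothetical model file
root = Root()
root.text = "John loves Mary. She loves him too."
trees = udpipe.tokenize_tag_parse_tree(root, resegment=True)
for tree in trees:
    print([(n.form, n.upos, n.deprel) for n in tree.descendants])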
def load_conllu(self, filename):
    """Load a document from a conllu-formatted file."""
    reader = ConlluReader(files=filename)
    reader.process_document(self)
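# A minimal usage sketch, assuming load_conllu is a method of the udapi Document
# class as defined above; the filename is a hypothetical path.
doc = Document()
doc.load_conllu('data/enh_deps.conllu')  # hypothetical path
print(len(doc.bundles))  # number of sentences (bundles) read from the file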
def load_conllu(self, filename=None):
    """Load a document from a conllu-formatted file."""
    reader = ConlluReader(files=filename)
    reader.apply_on_document(self)
def __call__(self, syntax_tree_batch: List[str],
             positions_batch: List[List[List[int]]]) -> Tuple[List[List[str]], List[Dict[str, str]]]:
    log.debug(f"positions of entity tokens {positions_batch}")
    query_nums_batch = []
    entities_dict_batch = []
    types_dict_batch = []
    questions_batch = []
    for syntax_tree, positions in zip(syntax_tree_batch, positions_batch):
        log.debug(f"\n{syntax_tree}")
        tree = Conllu(filehandle=StringIO(syntax_tree)).read_tree()
        root = self.find_root(tree)
        tree_desc = tree.descendants
        log.debug(f"syntax tree info, root: {root.form}")
        unknown_node, unknown_branch = self.find_branch_with_unknown(root)
        positions = [num for position in positions for num in position]
        if unknown_node:
            log.debug(f"syntax tree info, unknown node: {unknown_node.form}, "
                      f"unknown branch: {unknown_branch.form}")
            clause_node, clause_branch = self.find_clause_node(root, unknown_branch)
            modifiers, clause_modifiers = self.find_modifiers_of_unknown(unknown_node)
            log.debug(f"modifiers: {[modifier.form for modifier in modifiers]}")
            if f"{tree_desc[0].form.lower()} {tree_desc[1].form.lower()}" in ["каким был", "какой была"]:
                new_root = root.children[0]
            else:
                new_root = root
            root_desc = defaultdict(list)
            for node in new_root.children:
                if node.deprel not in ["punct", "advmod", "cop"]:
                    if node == unknown_branch:
                        root_desc[node.deprel].append(node)
                    else:
                        if self.find_entities(node, positions, cut_clause=False):
                            root_desc[node.deprel].append(node)
            appos_token_nums = sorted(self.find_appos_tokens(root, []))
            appos_tokens = [elem.form for elem in tree_desc if elem.ord in appos_token_nums]
            clause_token_nums = sorted(self.find_clause_tokens(root, clause_node, []))
            clause_tokens = [elem.form for elem in tree_desc if elem.ord in clause_token_nums]
            log.debug(f"appos tokens: {appos_tokens}")
            log.debug(f"clause tokens: {clause_tokens}")
            query_nums, entities_dict, types_dict = self.build_query(
                new_root, unknown_branch, root_desc, unknown_node,
                modifiers, clause_modifiers, positions)
            question = ' '.join([node.form for node in tree.descendants
                                 if (node.ord not in appos_token_nums or
                                     node.ord not in clause_token_nums)])
            log.debug(f"sanitized question: {question}")
            query_nums_batch.append(query_nums)
            entities_dict_batch.append(entities_dict)
            types_dict_batch.append(types_dict)
            questions_batch.append(question)
    return questions_batch, query_nums_batch, entities_dict_batch, types_dict_batch
class UDPipe:
    """Wrapper for UDPipe (more pythonic than ufal.udpipe)."""

    def __init__(self, model):
        """Create the UDPipe tool object."""
        self.model = model
        path = require_file(model)
        self.tool = Model.load(path)
        if not self.tool:
            raise IOError("Cannot load model from file '%s'" % path)
        self.error = ProcessingError()
        self.conllu_reader = ConlluReader()
        self.tokenizer = self.tool.newTokenizer(Model.DEFAULT)

    def tag_parse_tree(self, root):
        """Tag (+lemmatize, fill FEATS) and parse a tree (already tokenized)."""
        pipeline = Pipeline(self.tool, 'horizontal', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
        in_data = " ".join([n.form for n in root.descendants])
        out_data = pipeline.process(in_data, self.error)
        if self.error.occurred():
            raise IOError("UDPipe error " + self.error.message)
        self.conllu_reader.files.filehandle = io.StringIO(out_data)
        parsed_root = self.conllu_reader.read_tree()
        nodes = [root] + root.descendants
        for parsed_node in parsed_root.descendants:
            node = nodes[parsed_node.ord]
            node.parent = nodes[parsed_node.parent.ord]
            for attr in 'upos xpos lemma feats'.split():
                setattr(node, attr, getattr(parsed_node, attr))

        # TODO: benchmark which solution is the fastest one. E.g. we could also do
        # for node, parsed_node in zip(root.descendants, parsed_root.descendants):
        #     parsed_node.misc = node.misc
        # pylint: disable=protected-access
        # root._children, root._descendants = parsed_root._children, parsed_root._descendants

    def tokenize_tag_parse_tree(self, root):
        """Tokenize, tag (+lemmatize, fill FEATS) and parse the text stored in `root.text`."""
        if root.children:
            raise ValueError('Tree already contained nodes before tokenization')

        # tokenization (I cannot turn off segmenter, so I need to join the segments)
        self.tokenizer.setText(root.text)
        u_sentence = Sentence()
        is_another = self.tokenizer.nextSentence(u_sentence)
        u_words = u_sentence.words
        n_words = u_words.size() - 1
        if is_another:
            u_sent_cont = Sentence()
            while self.tokenizer.nextSentence(u_sent_cont):
                n_cont = u_sent_cont.words.size() - 1
                for i in range(1, n_cont + 1):
                    u_w = u_sent_cont.words[i]
                    n_words += 1
                    u_w.id = n_words
                    u_words.append(u_w)

        # tagging and parsing
        self.tool.tag(u_sentence, Model.DEFAULT)
        self.tool.parse(u_sentence, Model.DEFAULT)

        # converting UDPipe nodes to Udapi nodes
        heads, nodes = [], [root]
        for i in range(1, u_words.size()):
            u_w = u_words[i]
            node = root.create_child(
                form=u_w.form, lemma=u_w.lemma, upos=u_w.upostag,
                xpos=u_w.xpostag, feats=u_w.feats, deprel=u_w.deprel,
            )
            node.misc = u_w.misc
            heads.append(u_w.head)
            nodes.append(node)
        for node in nodes[1:]:
            head = heads.pop(0)
            node.parent = nodes[head]
class UDPipe:
    """Wrapper for UDPipe (more pythonic than ufal.udpipe)."""

    def __init__(self, model):
        """Create the UDPipe tool object."""
        self.model = model
        path = require_file(model)
        self.tool = Model.load(path)
        if not self.tool:
            raise IOError("Cannot load model from file '%s'" % path)
        self.error = ProcessingError()
        self.conllu_reader = ConlluReader()
        self.tokenizer = self.tool.newTokenizer(Model.DEFAULT)

    def tag_parse_tree(self, root):
        """Tag (+lemmatize, fill FEATS) and parse a tree (already tokenized)."""
        descendants = root.descendants
        if not descendants:
            return
        pipeline = Pipeline(self.tool, 'horizontal', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
        in_data = " ".join([n.form for n in descendants])
        out_data = pipeline.process(in_data, self.error)
        if self.error.occurred():
            raise IOError("UDPipe error " + self.error.message)
        self.conllu_reader.files.filehandle = io.StringIO(out_data)
        parsed_root = self.conllu_reader.read_tree()
        nodes = [root] + descendants
        for parsed_node in parsed_root.descendants:
            node = nodes[parsed_node.ord]
            node.parent = nodes[parsed_node.parent.ord]
            for attr in 'upos xpos lemma feats'.split():
                setattr(node, attr, getattr(parsed_node, attr))

        # TODO: benchmark which solution is the fastest one. E.g. we could also do
        # for node, parsed_node in zip(root.descendants, parsed_root.descendants):
        #     parsed_node.misc = node.misc
        # pylint: disable=protected-access
        # root._children, root._descendants = parsed_root._children, parsed_root._descendants

    def tokenize_tag_parse_tree(self, root):
        """Tokenize, tag (+lemmatize, fill FEATS) and parse the text stored in `root.text`."""
        if root.children:
            raise ValueError('Tree already contained nodes before tokenization')

        # tokenization (I cannot turn off segmenter, so I need to join the segments)
        self.tokenizer.setText(root.text)
        u_sentence = Sentence()
        is_another = self.tokenizer.nextSentence(u_sentence)
        u_words = u_sentence.words
        n_words = u_words.size() - 1
        if is_another:
            u_sent_cont = Sentence()
            while self.tokenizer.nextSentence(u_sent_cont):
                n_cont = u_sent_cont.words.size() - 1
                for i in range(1, n_cont + 1):
                    u_w = u_sent_cont.words[i]
                    n_words += 1
                    u_w.id = n_words
                    u_words.append(u_w)

        # tagging and parsing
        self.tool.tag(u_sentence, Model.DEFAULT)
        self.tool.parse(u_sentence, Model.DEFAULT)

        # converting UDPipe nodes to Udapi nodes
        heads, nodes = [], [root]
        for i in range(1, u_words.size()):
            u_w = u_words[i]
            node = root.create_child(
                form=u_w.form, lemma=u_w.lemma, upos=u_w.upostag,
                xpos=u_w.xpostag, feats=u_w.feats, deprel=u_w.deprel,
            )
            node.misc = u_w.misc
            heads.append(u_w.head)
            nodes.append(node)
        for node in nodes[1:]:
            head = heads.pop(0)
            node.parent = nodes[head]