def load_ace_file(textfile, fmt):
    """Yield one chunk tree built from an ACE text file and its annotations.

    The annotation file is expected next to ``textfile`` under the name
    ``textfile + '.tmx.rdc.xml'``.  Only entity mentions whose ``TYPE``
    attribute is ``'NAME'`` are used.

    :param textfile: path of the ACE source text file.
    :param fmt: ``'binary'`` labels every entity ``'NE'``; ``'multiclass'``
        labels each entity with its annotated entity type.
    :return: a generator yielding a single ``nltk.Tree`` rooted at ``'S'``.
    :raises ValueError: if ``fmt`` is neither ``'binary'`` nor
        ``'multiclass'`` (raised when the generator is first advanced).
    """
    print(' - %s' % os.path.split(textfile)[1])
    annfile = textfile + '.tmx.rdc.xml'

    # Read the xml file, and get a list of (start, end, type) entities.
    entities = []
    with open(annfile) as infile:
        xml = ET.parse(infile).getroot()
    for entity in xml.findall('document/entity'):
        typ = entity.find('entity_type').text
        for mention in entity.findall('entity_mention'):
            if mention.get('TYPE') != 'NAME':
                continue  # only NEs
            s = int(mention.find('head/charseq/start').text)
            # The annotated end offset is inclusive; make it exclusive.
            e = int(mention.find('head/charseq/end').text) + 1
            entities.append((s, e, typ))

    # Read the text file, and mark the entities.
    with open(textfile) as infile:
        text = infile.read()

    # Strip XML tags, since they don't count towards the indices.
    text = re.sub(r'<(?!/?TEXT)[^>]+>', '', text)

    # Blank out anything before/after <TEXT>.  The prefix is replaced by
    # spaces (minus the 6 characters of the "<TEXT>" tag itself) so the
    # character offsets of the annotations stay valid.
    def subfunc(m):
        return ' ' * (m.end() - m.start() - 6)

    text = re.sub(r'[\s\S]*<TEXT>', subfunc, text)
    text = re.sub(r'</TEXT>[\s\S]*', '', text)

    # Simplify quotes
    text = re.sub("``", ' "', text)
    text = re.sub("''", '" ', text)

    if fmt not in ('binary', 'multiclass'):
        raise ValueError('bad fmt value')
    binary = fmt == 'binary'

    i = 0
    toks = nltk.Tree('S', [])
    for (s, e, typ) in sorted(entities):
        if s < i:
            s = i  # Overlapping!  Deal with this better?
        if e <= s:
            continue
        toks.extend(nltk.word_tokenize(text[i:s]))
        toks.append(nltk.Tree('NE' if binary else typ, text[s:e].split()))
        i = e
    toks.extend(nltk.word_tokenize(text[i:]))
    yield toks
def build_sentence_tree(tagged_sentence):
    """Build a flat sentence tree from IOB tags for PERSON and DATE.

    Each token is a sequence whose third item (``token[2]``) is the IOB tag
    (``'O'`` or a two-character prefix followed by the entity type) and whose
    remaining leading items (``token[:-1]``) are kept as the word entry.

    Consecutive PERSON (or DATE) tokens are grouped into one ``nltk.Tree``
    labelled with the entity type.  Every other token, including tokens
    tagged with an entity type that is not tracked, is emitted as a plain
    entry so that no word is lost.

    NOTE: the ``B-``/``I-`` prefix itself is not inspected, so two adjacent
    entities of the same type are merged into a single phrase.

    :param tagged_sentence: iterable of tagged tokens.
    :return: list mixing plain word entries and ``nltk.Tree`` phrases.
    """
    tracked_labels = ("PERSON", "DATE")
    token_list = []
    phrase = []
    label = ""

    for token in tagged_sentence:
        iob = token[2]
        word = token[:-1]
        entity = "" if iob == 'O' else iob[2:]

        if entity not in tracked_labels:
            # Outside token: close any open phrase, then keep the word.
            if phrase:
                token_list.append(nltk.Tree(label, phrase))
                phrase = []
            label = ""
            token_list.append(word)
            continue

        if phrase and entity != label:
            # The entity type changed: close the previous phrase first.
            token_list.append(nltk.Tree(label, phrase))
            phrase = []
        # Always remember the label of the phrase being built, so a phrase
        # started right after another one is not emitted with an empty label.
        label = entity
        phrase.append(word)

    if phrase:
        token_list.append(nltk.Tree(label, phrase))
    return token_list
def test_chunk_tagger():
    """Check the pickled chunk tagger on one Spanish sentence.

    Loads the PoS tagger and the chunk tagger from ``tmp/``, tags a
    lower-cased, tokenized sentence and asserts that the resulting chunk
    tree equals the expected tree.
    """
    from nltk.tokenize import word_tokenize

    # NOTE: pickle.load can execute arbitrary code; these files are assumed
    # to be locally produced model dumps, never untrusted input.
    logging.info('Loading PoS tagger')
    with open("tmp/pos_tagger.p", "rb") as model_file:
        pos_tag = pickle.load(model_file).tag

    logging.info('Loading Chunk tagger')
    with open("tmp/chunk_tagger.p", "rb") as model_file:
        chunk_tag = pickle.load(model_file).parse

    loc = nltk.Tree('LOC', [('santander', 'NC')])
    org = nltk.Tree('ORG', [('izquierda', 'NC')])
    test_tree = nltk.Tree('S', [
        org,
        ('unida', 'AQ'),
        ('de', 'SP'),
        loc,
        ('presentó', 'VMI'),
        ('hoy', 'RG'),
        ('su', 'DP'),
        ('nuevo', 'AQ'),
        ('boletín', 'NC'),
        ('trimestral', 'AQ'),
    ])

    # Plain str literals are already unicode text on Python 3, so the
    # Python 2 ``unicode(..., 'utf-8')`` decoding step is not needed.
    string = ("Izquierda Unida de Santander presentó hoy su nuevo boletín "
              "trimestral")

    tokens = [token.lower() for token in word_tokenize(string)]
    pos_tokens = pos_tag(tokens)
    result = chunk_tag(pos_tokens)

    assert result == test_tree
def construct_dependency_tree(parsed):
    """Construct a dependency tree from a list of dependency triplets.

    ``parsed.dependencies`` is iterated as ``(relationship, governor,
    dependent)`` triplets whose governor and dependent expose ``.text``;
    ``parsed.dependencies_root`` is compared against the governor text.
    The tree is built breadth-first: the root's dependents become its
    children, then each child's dependents are attached in turn.

    Nodes are identified by their text only, so repeated words share their
    dependents (first node reached wins).

    :param parsed: object exposing ``dependencies_root`` and ``dependencies``.
    :return: ``nltk.Tree`` whose labels are word texts.  Triplets that cannot
        be reached from the root are left out instead of raising.
    """
    from collections import deque

    root = parsed.dependencies_root
    dependencies = {(rel, gov.text, dep.text)
                    for rel, gov, dep in parsed.dependencies}

    children = [(rel, gov, dep) for (rel, gov, dep) in dependencies
                if gov == root]
    dependencies = dependencies - set(children)

    # The same Tree objects sit both in the result tree and in the work
    # queue, so extending a queued node grows the final tree in place.
    first_level = [nltk.Tree(dep, []) for (rel, gov, dep) in children]
    tree = nltk.Tree(root, first_level)
    remaining_nodes = deque(first_level)

    # Stop when every triplet is attached, or when no node is left to expand
    # (the remaining triplets are unreachable from the root).
    while dependencies and remaining_nodes:
        # find current node and its children
        node = remaining_nodes.popleft()
        children = [(rel, gov, dep) for (rel, gov, dep) in dependencies
                    if gov == node.label()]
        children_nodes = [nltk.Tree(dep, []) for (rel, gov, dep) in children]

        # update counting structures
        remaining_nodes.extend(children_nodes)
        node.extend(children_nodes)
        dependencies = dependencies - set(children)

    return tree
def binarize(tree, binarize_direction='left', dummy_label_manipulating='parent'):
    """Return a binarized deep copy of ``tree``.

    Every child that directly dominates a non-Tree item and has at least one
    sibling is first wrapped in a dummy node.  The copy is then passed to
    ``custom_chomsky_normal_form`` and unary chains are collapsed.

    :param tree: ``nltk.Tree`` to binarize; it is not modified.
    :param binarize_direction: ``'left'`` or ``'right'``.
    :param dummy_label_manipulating: how the dummy wrapper is labelled:
        ``'parent'`` -> ``"<parent label>|<>"``, ``'universal'`` -> ``"|<>"``,
        ``'universal_node_unary'`` -> ``"UNARY|<>"``.
    :return: the binarized tree.
    """
    assert binarize_direction in (
        'left', 'right'
    ), "We only support left/right direction here"
    assert dummy_label_manipulating in (
        'parent', 'universal', 'universal_node_unary'
    ), "We only support parent/universal direction here"

    tree = tree.copy(True)
    pending = [tree]
    while pending:
        current = pending.pop()
        if not isinstance(current, nltk.Tree):
            continue
        # Queue the children as they are now, before any of them is wrapped.
        pending.extend(list(current))
        if len(current) <= 1:
            continue
        for position, kid in enumerate(current):
            if isinstance(kid[0], nltk.Tree):
                continue
            if dummy_label_manipulating == 'parent':
                wrapper_label = "%s|<>" % (current.label(),)
            elif dummy_label_manipulating == 'universal':
                wrapper_label = "|<>"
            elif dummy_label_manipulating == 'universal_node_unary':
                wrapper_label = "UNARY|<>"
            else:
                continue
            current[position] = nltk.Tree(wrapper_label, [kid])

    tree = custom_chomsky_normal_form(tree, binarize_direction,
                                      dummy_label_manipulating, 0, 0)
    tree.collapse_unary()
    return tree
def visual_tree(node):
    """Convert a binary parse node into an ``nltk.Tree``.

    :param node: the root node; it exposes ``parent`` (used as the label),
        ``child1`` and ``child2``.  When ``child2`` is ``None`` the node is
        treated as a leaf production and ``child1`` is used as-is.
    :return: the ``nltk.Tree`` for ``node``.
    """
    if node.child2 is not None:
        kids = [visual_tree(node.child1), visual_tree(node.child2)]
    else:
        kids = [node.child1]
    return nltk.Tree(node.parent, kids)
def list2tree(node):
    """Convert a nested list of strings into an ``nltk.Tree``.

    Lists become ``'<unk>'`` nodes over their converted items and strings
    become ``'<word>'`` nodes over the string.  Any other input yields
    ``None``.
    """
    if isinstance(node, list):
        return nltk.Tree('<unk>', [list2tree(item) for item in node])
    if isinstance(node, str):
        return nltk.Tree('<word>', [node])
    return None
def sent2tree(node):
    """Convert an XML sentence element into an ``nltk.Tree``.

    * ``tok`` elements become a node labelled with their ``cat`` attribute
      over the element text.
    * ``cons`` elements become a node labelled with their ``cat`` attribute
      over their converted children.
    * Any other element is skipped and its first child is converted instead.

    Children are only converted where the result is used; the previous
    version converted every child up front and then converted the first
    child a second time in the fallback branch.

    :param node: element exposing ``tag``, ``attrib``, ``text`` and
        iteration/indexing over children.
    :return: the converted ``nltk.Tree``.
    """
    if node.tag == "tok":
        return nltk.Tree(node.attrib["cat"], [node.text])
    if node.tag == "cons":
        return nltk.Tree(node.attrib["cat"],
                         [sent2tree(child) for child in node])
    # Wrapper element: descend into its first child.
    return sent2tree(node[0])
def list2tree(node):
    """Convert a nested list of ``{'tag': ..., 'word': ...}`` dicts to nltk.Tree.

    Lists become ``'<l>'`` nodes over their converted items; dicts become a
    node labelled ``node['tag']`` over ``node['word']``.  Any other input
    yields ``None``.
    """
    if isinstance(node, list):
        return nltk.Tree('<l>', [list2tree(item) for item in node])
    if isinstance(node, dict):
        return nltk.Tree(node['tag'], [node['word']])
    return None
def build_nltktree(depth, arc, tag, sen, arcdict, tagdict, stagdict, stags=None):
    """Rebuild an ``nltk.Tree`` from split depths, arc labels and tags.

    The sentence is split recursively at the position of the largest value
    in ``depth``.  Labels are looked up through ``arcdict`` / ``tagdict`` and
    may encode a unary chain joined with ``'+'``.  Sub-results labelled
    ``'<empty>'`` are spliced into their parent rather than kept as nodes.

    stags are the stanford predicted tags present in the train/valid/test
    files; when given, the tag looked up in ``stagdict`` is placed directly
    above the word, below the unary chain.

    :return: the reconstructed ``nltk.Tree``.
    """
    assert len(sen) > 0
    assert len(depth) == len(sen) - 1, ("%s_%s" % (len(depth), len(sen)))
    if stags:
        assert len(stags) == len(tag)

    if len(sen) == 1:
        # Single word: wrap it bottom-up in its (reversed) unary tag chain.
        chain = str(tagdict[tag[0]]).split('+')[::-1]
        if stags is not None:
            assert len(stags) > 0
            chain.insert(0, str(stagdict[stags[0]]))
        subtree = str(sen[0])
        for unary_label in chain:
            subtree = nltk.Tree(unary_label, [subtree])
        assert isinstance(subtree, nltk.Tree)
        return subtree

    split = numpy.argmax(depth)
    left = build_nltktree(depth[:split], arc[:split], tag[:split + 1],
                          sen[:split + 1], arcdict, tagdict, stagdict,
                          stags[:split + 1] if stags else None)
    right = build_nltktree(depth[split + 1:], arc[split + 1:],
                           tag[split + 1:], sen[split + 1:], arcdict,
                           tagdict, stagdict,
                           stags[split + 1:] if stags else None)

    # '<empty>' halves contribute their children directly.
    result = []
    for half in (left, right):
        if half.label() == '<empty>':
            result.extend(half)
        else:
            result.append(half)

    # Wrap bottom-up in the (reversed) unary arc chain.
    for unary_label in reversed(str(arcdict[arc[split]]).split('+')):
        kids = [result] if isinstance(result, nltk.Tree) else result
        result = nltk.Tree(unary_label, kids)
    return result
def binarize(cls, tree):
    r"""
    Binarize a constituency tree.

    Children that directly dominate a non-Tree item are first wrapped in a
    dummy ``"<parent label>|<>"`` node (when they have siblings, or when they
    are the only child of the root).  The tree is then converted to
    `Chomsky Normal Form (CNF)`_ via
    :meth:`~nltk.tree.Tree.chomsky_normal_form` with left factoring, and
    finally all unary productions are collapsed with ``::`` as join character.

    Args:
        tree (nltk.tree.Tree):
            The tree to be binarized. It is copied, not modified.

    Returns:
        The binarized tree.

    Examples:
        >>> tree = nltk.Tree.fromstring('''
                                        (TOP
                                          (S
                                            (NP (_ She))
                                            (VP (_ enjoys) (S (VP (_ playing) (NP (_ tennis)))))
                                            (_ .)))
                                        ''')
        >>> print(Tree.binarize(tree))
        (TOP
          (S
            (S|<>
              (NP (_ She))
              (VP
                (VP|<> (_ enjoys))
                (S::VP (VP|<> (_ playing)) (NP (_ tennis)))))
            (S|<> (_ .))))

    .. _Chomsky Normal Form (CNF):
        https://en.wikipedia.org/wiki/Chomsky_normal_form
    """
    tree = tree.copy(True)
    if len(tree) == 1 and not isinstance(tree[0][0], nltk.Tree):
        tree[0] = nltk.Tree(f"{tree.label()}|<>", [tree[0]])

    pending = [tree]
    while pending:
        current = pending.pop()
        if not isinstance(current, nltk.Tree):
            continue
        # Queue the children as they are now, before any of them is wrapped.
        pending.extend(list(current))
        if len(current) <= 1:
            continue
        for position, kid in enumerate(current):
            if isinstance(kid[0], nltk.Tree):
                continue
            current[position] = nltk.Tree(f"{current.label()}|<>", [kid])

    tree.chomsky_normal_form('left', 0, 0)
    tree.collapse_unary(joinChar='::')
    return tree
def track(node):
    """Rebuild the subtrees for the next span taken from ``node``.

    ``node`` is an iterator of ``(i, j, label)`` spans.  A span of width one
    takes ``leaves[i]`` (from the enclosing scope) as its only child; a wider
    span consumes two further sub-spans from the iterator.  Labels ending in
    ``'|<>'`` are dummy nodes whose children are returned directly; other
    labels may hold a unary chain joined with ``'+'``.

    :return: list of subtrees covering the span.
    """
    start, end, span_label = next(node)
    if end == start + 1:
        kids = [leaves[start]]
    else:
        left_part = track(node)
        right_part = track(node)
        kids = left_part + right_part

    if span_label.endswith('|<>'):
        return kids

    chain = span_label.split('+')
    subtree = nltk.Tree(chain[-1], kids)
    # Wrap with the remaining chain labels, innermost first.
    for outer_label in chain[-2::-1]:
        subtree = nltk.Tree(outer_label, [subtree])
    return [subtree]
def to_tree(self, leaves, label_from_index: dict, tag_from_index: dict = None):
    """Convert the stored spans into an ``nltk.Tree`` rooted at ``"TOP"``.

    Reads ``self.tags``, ``self.starts``, ``self.ends`` and ``self.labels``.
    The spans are consumed in order by a recursive helper, which nests every
    following span that lies inside the current one.

    :param leaves: one entry per word: a ``(word, tag)`` tuple, a plain
        string, or a ready-made leaf node.
    :param label_from_index: maps entries of ``self.labels`` to label strings;
        a label may hold a unary chain joined with ``"::"`` and an empty label
        adds no node.
    :param tag_from_index: maps entries of ``self.tags`` to tag strings;
        required when ``self.tags`` is not ``None``.
    :return: the reconstructed tree.
    :raises ValueError: if ``self.tags`` is set but ``tag_from_index`` is
        missing.
    """
    if self.tags is not None:
        if tag_from_index is None:
            raise ValueError(
                "tag_from_index is required to convert predicted pos tags"
            )
        predicted_tags = [tag_from_index[i] for i in self.tags]
        assert len(leaves) == len(predicted_tags)
        # Predicted tags replace whatever tag the input leaves carried.
        leaves = [
            nltk.Tree(tag, [leaf[0] if isinstance(leaf, tuple) else leaf])
            for tag, leaf in zip(predicted_tags, leaves)
        ]
    else:
        leaves = [
            nltk.Tree(leaf[1], [leaf[0]]) if isinstance(leaf, tuple) else
            (nltk.Tree("UNK", [leaf]) if isinstance(leaf, str) else leaf)
            for leaf in leaves
        ]

    idx = -1

    def helper():
        # Consume the next span and return the list of subtrees covering it.
        nonlocal idx
        idx += 1
        i, j, label = (
            self.starts[idx],
            self.ends[idx],
            label_from_index[self.labels[idx]],
        )
        if (i + 1) >= j:
            children = [leaves[i]]
        else:
            children = []
            # Every following span contained in [i, j] is a descendant.
            while ((idx + 1) < len(self.starts)
                   and i <= self.starts[idx + 1]
                   and self.ends[idx + 1] <= j):
                children.extend(helper())

        if label:
            for sublabel in reversed(label.split("::")):
                children = [nltk.Tree(sublabel, children)]

        return children

    children = helper()
    return nltk.Tree("TOP", children)
def _conv_etree2tree(self, node, nodes, label="", include_edgelabels=True):
    """Recursively convert a parsed node into an ``nltk.Tree``.

    :param node: a ``_Terminal`` or ``_NonTerminal`` instance.
    :param nodes: mapping used to resolve the target of each edge.
    :param label: label of the edge leading to ``node``.
    :param include_edgelabels: when true, a terminal's label becomes
        ``"<pos>-<edge label>"`` unless the edge label is ``""`` or ``"--"``.
    :return: the converted tree, or ``None`` for any other node type.
    """
    node_type = type(node)

    if node_type == _Terminal:
        tag = node.pos
        has_edge_label = label not in ["", "--"]
        if include_edgelabels and has_edge_label:
            tag = tag + "-" + label
        return nltk.Tree(tag, [node.word])

    if node_type == _NonTerminal:
        category = node.cat
        subtrees = [
            self._conv_etree2tree(nodes.get(edge[1]), nodes, edge[0],
                                  include_edgelabels)
            for edge in node.edges
        ]
        return nltk.Tree(category, subtrees)

    return None
def convert_to_revised_tokenization(orig_trees, revised_trees):
    """Yield copies of the original trees re-tokenized like the revised trees.

    For every ``(orig_tree, revised_tree)`` pair the leaves of both trees are
    normalized with ``standardize_form`` and aligned with
    ``tokenizations.get_alignments``.  A deep copy of the original tree is
    then edited so that its leaves follow the revised tokenization.

    :param orig_trees: iterable of trees with the original tokenization.
    :param revised_trees: iterable of trees with the revised tokenization,
        paired positionally with ``orig_trees`` via ``zip``.
    :return: generator of converted trees, one per pair.
    """
    for orig_tree, revised_tree in zip(orig_trees, revised_trees):
        orig_words = [standardize_form(word) for word in orig_tree.leaves()]
        revised_words = [
            standardize_form(word) for word in revised_tree.leaves()
        ]
        # o2r[i]: revised indices aligned to original word i;
        # r2o[j]: original indices aligned to revised word j.
        o2r, r2o = tokenizations.get_alignments(orig_words, revised_words)
        # Every original word must align to at least one revised word.
        assert all(len(x) >= 1 for x in o2r)

        # All edits go to the copy; positions are always looked up in the
        # unmodified orig_tree.
        converted_tree = orig_tree.copy(deep=True)

        # Pass 1: several original words merged into one revised word.
        # All but the first of them are replaced by dummy material.
        for j in range(len(revised_words)):
            if len(r2o[j]) > 1:
                for i in r2o[j][1:]:
                    orig_treeposition = orig_tree.leaf_treeposition(i)
                    if len(orig_treeposition) > 1 and len(
                            orig_tree[orig_treeposition[:-1]]) == 1:
                        # The leaf is an only child: replace its parent node.
                        converted_tree[orig_treeposition[:-1]] = nltk.Tree(
                            DUMMY_LABEL, [DUMMY_WORD])
                    else:
                        converted_tree[orig_treeposition] = DUMMY_LABEL

        # Pass 2: rewrite every remaining original leaf.
        for i in range(len(orig_words)):
            # NOTE(review): this compares the leaf with DUMMY_LABEL; leaves
            # replaced through the only-child branch above hold DUMMY_WORD.
            # Whether the two constants are equal is not visible here.
            if converted_tree[orig_tree.leaf_treeposition(i)] == DUMMY_LABEL:
                continue
            elif len(o2r[i]) == 1:
                # One-to-one alignment: take over the revised leaf.
                j = o2r[i][0]
                converted_tree[orig_tree.leaf_treeposition(i)] = revised_tree[
                    revised_tree.leaf_treeposition(j)]
            else:
                # One original word split into several revised words.
                orig_treeposition = orig_tree.leaf_treeposition(i)
                if len(orig_treeposition) > 1 and len(
                        orig_tree[orig_treeposition[:-1]]) == 1:
                    # Only child: replace the parent by a dummy node holding
                    # the parents of the aligned revised leaves.
                    orig_treeposition = orig_treeposition[:-1]
                    revised_leaves = [
                        revised_tree[revised_tree.leaf_treeposition(j)[:-1]]
                        for j in o2r[i]
                    ]
                    assert all(len(x) == 1 for x in revised_leaves)
                    converted_tree[orig_treeposition] = nltk.Tree(
                        DUMMY_LABEL, revised_leaves)
                else:
                    # Otherwise replace the leaf itself by a dummy node over
                    # the aligned revised leaves.
                    converted_tree[orig_treeposition] = nltk.Tree(
                        DUMMY_LABEL, [
                            revised_tree[revised_tree.leaf_treeposition(j)]
                            for j in o2r[i]
                        ])
        yield converted_tree
def to_nltk_tree(node):
    """Convert a token and its dependents into an ``nltk.Tree``.

    A token with no left or right dependents (``n_lefts + n_rights`` not
    above zero) is returned as its ``orth_`` text; otherwise a tree labelled
    with ``orth_`` is built over the converted ``children``.
    """
    has_dependents = node.n_lefts + node.n_rights > 0
    if not has_dependents:
        return node.orth_
    subtrees = [to_nltk_tree(dependent) for dependent in node.children]
    return nltk.Tree(node.orth_, subtrees)
def binarize(line, lan="en"):
    """Binarize the NP and VP nodes of one bracketed tree.

    The tree is parsed from ``line`` and walked top-down.  Nodes with more
    than two children are binarized by the module's helpers: NP nodes with
    ``rightBinarize``; VP nodes with ``vvBinarize`` for English, and for
    Chinese with ``leftBinarize`` / ``rightBinarize`` when the first / last
    child's label is in ``vvTags`` (``vvBinarize`` otherwise).  Other nodes
    are left untouched.

    Uses the NLTK 3 tree API (``Tree.fromstring``, ``label()``,
    ``pformat()``); the NLTK 2 calls used before (``Tree(str)``, ``.node``,
    and ``pprint()`` as a string) no longer work there.

    :param line: bracketed tree string.
    :param lan: ``'en'`` or ``'ch'``.
    :return: the binarized tree on a single line, terminated by a newline.
    """
    assert lan in ['en', 'ch'], "illegal language (en or ch): %s" % lan
    root = nltk.Tree.fromstring(line)
    stack = [root]
    while stack:
        curNode = stack.pop()
        if len(curNode) > 2:
            label = curNode.label()
            if label == 'NP':
                rightBinarize(curNode)
            elif label == 'VP':
                if lan == 'en':
                    vvBinarize(curNode)
                elif lan == 'ch':
                    if curNode[0].label() in vvTags:
                        leftBinarize(curNode)
                    elif curNode[-1].label() in vvTags:
                        rightBinarize(curNode)
                    else:
                        vvBinarize(curNode)
        # Visit the (possibly just rewritten) children; subtrees of height
        # <= 2 are preterminals and need no further work.  Plain string
        # leaves have no height() and are skipped.
        for child in curNode:
            if isinstance(child, nltk.Tree) and child.height() > 2:
                stack.append(child)
    # Collapse the pretty-printed tree onto one line.
    return ' '.join(root.pformat().split()) + '\n'
def adjust_ne(chunks, additional_nes):
    """Wrap extra named entities found in ``chunks`` into ``'NE'`` trees.

    Entries whose type is named ``"Tree"`` are left alone.  Any other entry
    (a tuple) whose first item is in ``additional_nes`` is replaced, in
    place, by ``nltk.Tree('NE', [entry])``.

    :param chunks: mutable sequence of chunk entries; modified in place.
    :param additional_nes: container of words to treat as named entities.
    :return: the same ``chunks`` object.
    """
    for position, entry in enumerate(chunks):
        is_tree = type(entry).__name__ == "Tree"
        if not is_tree and entry[0] in additional_nes:
            chunks[position] = nltk.Tree('NE', [entry])
    return chunks
def toNltkTrees(self, node):
    """Convert a token and its dependents into an ``nltk.Tree``.

    A token with no left or right dependents is returned as its ``orth_``
    text; otherwise a tree labelled with ``orth_`` is built over the
    converted ``children``.
    """
    dependent_count = node.n_lefts + node.n_rights
    if not dependent_count > 0:
        return node.orth_
    subtrees = [self.toNltkTrees(dependent) for dependent in node.children]
    return nltk.Tree(node.orth_, subtrees)
def nltk_tree_handling():
    """Demonstrate building and taking apart ``nltk.Tree`` objects.

    Builds the tree for "Alice chased the rabbit" bottom-up, prints each
    intermediate tree, shows indexing / label / leaves access and finally
    hands the full tree to ``_traverse``.
    """
    # construction
    tree1 = nltk.Tree("NP", ["Alice"])
    print("tree1=", tree1)
    tree2 = nltk.Tree("NP", ["the", "rabbit"])
    print("tree2=", tree2)
    tree3 = nltk.Tree("VP", ["chased", tree2])
    print("tree3=", tree3)
    tree4 = nltk.Tree("S", [tree1, tree3])
    print("tree4=", tree4)

    # deconstruction
    print("tree4[1]=", tree4[1])
    # The node label is read with label(); the ``.node`` attribute named in
    # the printed caption was removed in NLTK 3.
    print("tree4[1].node=", tree4[1].label(),
          "tree4[1].leaves()=", tree4[1].leaves())
    print("tree4[1][1][1]=", tree4[1][1][1])
    _traverse(tree4)
def tree2list(tree, parent_arc=None):
    """Flatten a tree into nested word lists plus arc and tag sequences.

    :param tree: an ``nltk.Tree`` or a leaf (anything else, converted with
        ``str``).
    :param parent_arc: labels of the unary chain above ``tree``; defaults to
        an empty chain.  The argument is never modified.
    :return: ``(root, arc, tag)`` where ``root`` is the nested list of words,
        ``arc`` lists one ``'+'``-joined label chain per split and ``tag``
        lists one ``'+'``-joined chain per word.
    """
    # A fresh list per call: a shared mutable default would leak state
    # between calls.
    if parent_arc is None:
        parent_arc = []

    if isinstance(tree, nltk.Tree):
        label = tree.label()
        if isinstance(tree[0], nltk.Tree):
            # Non-preterminal: keep only the part before the first '-' or '='.
            label = re.split('-|=', tree.label())[0]
        root_arc_list = parent_arc + [label]
        root_arc = '+'.join(root_arc_list)
        if len(tree) == 1:
            # Unary node: pass the accumulated chain down to the child.
            root, arc, tag = tree2list(tree[0], parent_arc=root_arc_list)
        elif len(tree) == 2:
            c0, arc0, tag0 = tree2list(tree[0])
            c1, arc1, tag1 = tree2list(tree[1])
            root = [c0, c1]
            arc = arc0 + [root_arc] + arc1
            tag = tag0 + tag1
        else:
            # More than two children: split off the first child and group
            # the rest under an '<empty>' placeholder node.
            c0, arc0, tag0 = tree2list(tree[0])
            c1, arc1, tag1 = tree2list(nltk.Tree('<empty>', tree[1:]))
            # NOTE(review): ``bin`` is not defined in this function; unless
            # the module rebinds it, this is the builtin and the comparison
            # is always false.
            if bin == 0:
                root = [c0] + c1
            else:
                root = [c0, c1]
            arc = arc0 + [root_arc] + arc1
            tag = tag0 + tag1
        return root, arc, tag
    else:
        # Leaf: work on a copy so the caller's list is left untouched.
        chain = list(parent_arc)
        if len(chain) == 1:
            chain.insert(0, '<empty>')
        # Drop the last label of the chain (the one directly above the word).
        del chain[-1]
        return str(tree), [], ['+'.join(chain)]
def isLegalTree(line, i):
    """Abort the program if ``line`` is not a well-formed bracketed tree.

    The line is parsed both as ``nltk.Tree`` and as ``nltk.ParentedTree``.
    On a ``ValueError`` the tree number and the offending line are written
    to stderr and the process exits with status 1.

    :param line: bracketed tree string.
    :param i: number of the tree, used in the error message only.
    :return: ``None`` when the tree is legal.
    """
    try:
        # NLTK 3 parses bracketed strings with fromstring(); the constructor
        # no longer accepts a string.
        nltk.Tree.fromstring(line)
        nltk.ParentedTree.fromstring(line)
    except ValueError:
        print("illegal tree!!! #" + str(i), file=sys.stderr)
        print(line, file=sys.stderr)
        sys.exit(1)
def to_nltk_tree(self, node):
    """Convert a token and its dependents into an annotated ``nltk.Tree``.

    Every token is rendered as ``'"<orth_>" <tag_> <pos_> <dep_>'``.  A token
    without dependents is returned as that string; otherwise the string
    labels a tree built over the converted ``children``.
    """
    def tok_format(tok):
        quoted = '"%s"' % tok.orth_
        return " ".join([quoted, tok.tag_, tok.pos_, tok.dep_])

    has_dependents = node.n_lefts + node.n_rights > 0
    if not has_dependents:
        return tok_format(node)
    return nltk.Tree(
        tok_format(node),
        [self.to_nltk_tree(dependent) for dependent in node.children])
def firstTree(self):
    '''Build the parse tree rooted at the first label of the top-right cell.

    startSymbol is a Label: the first entry returned by ``labels()`` of
    ``self.matrix[0][self.n - 1]``.  Its two children are expanded with
    ``self.buildSubtrees``.
    '''
    top_right_cell = self.matrix[0][self.n - 1]
    startSymbol = top_right_cell.labels()[0]
    left = self.buildSubtrees(startSymbol.child1())
    right = self.buildSubtrees(startSymbol.child2())
    return nltk.Tree(startSymbol.symbol(), [left, right])
def build_parse_tree(self, node_tup, table):
    """
    Recursively build an NLTK tree for ``node_tup`` from a CKY table.

    ``node_tup`` is a key of ``table[start][stop]`` and holds the parent
    symbol, the start index and the stop index in its first three items.
    A production of length one is a preterminal and is used directly as the
    child list; otherwise its first two entries are expanded recursively.
    """
    parent_sym, start_index, stop_index = node_tup[0], node_tup[1], node_tup[2]
    production = table[start_index][stop_index][node_tup]

    if len(production) != 1:
        # branching node, recurse
        left_tree = self.build_parse_tree(production[0], table)
        right_tree = self.build_parse_tree(production[1], table)
        return nltk.Tree(parent_sym, [left_tree, right_tree])

    # preterminal: build leaf
    return nltk.Tree(parent_sym, production)
def backtrack(triple, back):
    """Rebuild the parse tree for ``triple`` from a table of back-pointers.

    :param triple: ``(low, high, label)`` identifying a span.
    :param back: dict keyed by such triples.  A one-item value names the
        child label of a unary rule; a three-item value is
        ``(split, left, right)`` for a binary rule.
    :return: ``label`` itself when the triple has no back-pointer, the
        rebuilt ``nltk.Tree`` otherwise (``None`` for any other value length).
    """
    low, high, label = triple[0], triple[1], triple[2]
    key = (low, high, label)
    if key not in back:
        return label

    branches = back[key]
    arity = len(branches)
    if arity == 1:
        only_child = backtrack((low, high, branches[0]), back)
        return nltk.Tree(label, [only_child])
    if arity == 3:
        split, left, right = branches
        left_tree = backtrack((low, split, left), back)
        right_tree = backtrack((split, high, right), back)
        return nltk.Tree(label, [left_tree, right_tree])
    return None
def _tagged_to_parse(self, tagged_tokens):
    """
    Convert a list of tagged tokens to a chunk-parse tree.

    :param tagged_tokens: iterable of ``(token, tag)`` pairs where ``tag`` is
        ``'O'``, ``'B-<label>'`` or ``'I-<label>'``.
    :return: ``nltk.Tree`` rooted at ``'S'``.  ``'O'`` tokens are direct
        children; ``'B-'`` starts a new chunk; ``'I-'`` extends the previous
        chunk when it has the same label and starts a new chunk otherwise.
        Tokens with any other tag are skipped.
    """
    sent = nltk.Tree('S', [])
    for (tok, tag) in tagged_tokens:
        if tag == 'O':
            sent.append(tok)
        elif tag.startswith('B-'):
            sent.append(nltk.Tree(tag[2:], [tok]))
        elif tag.startswith('I-'):
            # Use nltk.Tree and label() throughout: the bare ``Tree`` name and
            # the ``.node`` attribute do not match the rest of the code.
            if (sent and isinstance(sent[-1], nltk.Tree)
                    and sent[-1].label() == tag[2:]):
                sent[-1].append(tok)
            else:
                sent.append(nltk.Tree(tag[2:], [tok]))
    return sent
def tokenize_text_and_tag_named_entities(text):
    """Tokenize ``text`` and mark its named entities.

    Each sentence is word-tokenized, PoS-tagged and passed through
    ``nltk.ne_chunk``.  Named-entity chunks become a single-leaf
    ``nltk.Tree`` whose leaf is the space-joined entity text and whose label
    is the chunk label, with ``'GPE'`` renamed to ``'LOCATION'``.  All other
    tokens are returned as plain words.

    :param text: raw text.
    :return: list of words (str) and entity trees, in document order.
    """
    tokens = []
    for sentence in nltk.sent_tokenize(text):
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
        for chunk in nltk.ne_chunk(tagged):
            # Entity chunks are subtrees; everything else is a (word, tag)
            # tuple.  ``hasattr(chunk, 'node')`` is not usable on NLTK 3.
            if isinstance(chunk, nltk.Tree):
                label = chunk.label()
                if label == 'GPE':
                    label = 'LOCATION'
                phrase = ' '.join(c[0] for c in chunk.leaves())
                tokens.append(nltk.Tree(label, [phrase]))
            else:
                tokens.append(chunk[0])
    return tokens
def neTagSentence(self, s):
    """Replace every generic ``"NE"`` chunk of ``s`` by a classified chunk.

    Each element whose type is exactly ``nltk.Tree`` and whose label is
    ``"NE"`` is flattened with ``self.flatten`` and classified with
    ``self.classify``; it is then replaced, in place, by a tree labelled with
    the classification result over the flattened chunk.

    :param s: mutable sequence of sentence elements; modified in place.
    :return: the same ``s`` object.
    """
    for position in range(len(s)):
        if type(s[position]) is not nltk.Tree:
            continue
        if s[position].label() != "NE":
            continue
        flattened = self.flatten(s[position])
        entity_type = self.classify(flattened, s, position)
        s[position] = nltk.Tree(entity_type, [flattened])
    return s
def chunk(self):
    """
    Identify MWEs inside the tree-tagger PoS representation of the words.

    Reads ``self._list_tt`` (entries unpacked as ``[word, tag, _]``), chunks
    it in several regular-expression passes and stores the results in
    ``self._new_list_tt`` and ``self._raw_mwes``.

    NOTE(review): the original indentation of this method was lost; the
    nesting of the two ``else`` branches and the position of the final
    ``return`` are reconstructed - confirm against the upstream file.

    :return: updated tree-tagger list
    """
    if self._list_tt is not None:
        # Flat sentence tree of (word, tag) pairs; the third field is dropped.
        tree = nltk.Tree('S', [(word, tag) for [word, tag, _] in self._list_tt])
        # First pass with the 'PPH' rules, second pass with the 'NC' n-gram
        # rules applied to the result of the first.
        tree_pp = self._parse_rcp(label='PPH', tree=tree, rule_set=self._pp_rule_set)
        tree_nc = self._parse_rcp(label='NC', tree=tree_pp, rule_set=self._nc_ngram_set)
        _reparsed_tree_nc = nltk.Tree('S', [])
        # Re-parse every chunk found so far with each 2-gram rule.
        for rule in self._nc_2gram_set:
            rcp_nc_subtree = nltk.RegexpChunkParser([rule], chunk_label='NC', root_label='NC')
            for child_tree in tree_nc:
                if isinstance(child_tree, nltk.Tree):
                    reparsed_child_tree = rcp_nc_subtree.parse(child_tree)
                    if reparsed_child_tree != child_tree:
                        # The rule changed the chunk: keep the re-parsed
                        # version unless it contains the original chunk.
                        if child_tree not in reparsed_child_tree:
                            _reparsed_tree_nc.append(reparsed_child_tree)
                    else:
                        # Unchanged chunk: keep it as it is.
                        _reparsed_tree_nc.append(child_tree)
                else:
                    # Plain (word, tag) entry: add it once only.
                    if child_tree not in _reparsed_tree_nc:
                        _reparsed_tree_nc.append(child_tree)
        self._new_list_tt, nc_saving_list = self._tree_to_treetaggerlist(
            tree=_reparsed_tree_nc)
        unnested_nc_saving_list = self._unnest_mwes(nc_saving_list)
        self._raw_mwes = self._join_mwes(unnested_nc_saving_list)
        self._count_words()
        self._print_measures()
        # The joined MWEs are replaced by the filtered ones.
        self._raw_mwes = self.filter_mwes()
    return self._new_list_tt