def BP_tree_to_nltk_tree(tree):
    """Recursively convert a BP-tree node into a ``Tree``.

    The new node is labelled with ``str(tree.keys)``.  Children are only
    descended into when ``tree`` is a ``BPnode`` or a ``Node``; any other
    object yields a childless ``Tree``.
    """
    converted = Tree(str(tree.keys), children=[])
    if not (isinstance(tree, BPnode) or isinstance(tree, Node)):
        return converted
    converted.extend(BP_tree_to_nltk_tree(kid) for kid in tree.children)
    return converted
def unfolded_decoding(W_d, b_d, tree, encoded):
    """Decode ``encoded`` top-down along the shape of ``tree``.

    Every node of the returned ``Tree`` is labelled with the encoding it was
    handed.  For a node that is exactly an ``nltk.tree.Tree`` and has
    children, ``encoded`` is passed through ``decode`` and the result is cut
    into consecutive slices of length ``m`` (the second dimension of
    ``W_d``), one slice per child, which are decoded recursively.

    Args:
        W_d: decoder weights; must expose a two-element ``shape``.
        b_d: decoder bias, forwarded unchanged to ``decode``.
        tree: structure to unfold.  Anything that is not exactly an
            ``nltk.tree.Tree``, or that has no children, is a terminal.
        encoded: encoding assigned to this node.

    Returns:
        A ``Tree`` mirroring ``tree``.  Its ``span`` attribute is copied
        from ``tree`` when the source node has one.
    """
    # Only the slice width is needed; unpacking still enforces a 2-D shape.
    _, m = W_d.shape

    decoding_tree = Tree(encoded, [])
    try:
        decoding_tree.span = tree.span
    except AttributeError:
        # Leaves (e.g. plain strings) carry no span; that is fine.
        pass

    if type(tree) == nltk.tree.Tree and len(tree) > 0:
        decoded = decode(W_d, b_d, encoded)
        for i, child in enumerate(tree):
            # NOTE: the branching factor is not assumed, but it must be
            # uniform and len(input layer) == n_children * len(encoding).
            child_encoding = decoded[i * m:(i + 1) * m]
            decoding_tree.append(
                unfolded_decoding(W_d, b_d, child, child_encoding)
            )
    return decoding_tree
def load_ace_file(textfile, fmt):
    """Yield one chunk tree for an ACE document and its annotation file.

    The annotation is read from ``textfile + '.tmx.rdc.xml'``.  Only entity
    mentions whose ``TYPE`` attribute is ``NAME`` are kept.

    Args:
        textfile: path of the SGML text file.
        fmt: ``'binary'`` labels every entity ``NE``; ``'multiclass'`` labels
            each entity with its annotated entity type.

    Yields:
        A single ``Tree('S', ...)`` whose children are word tokens and
        entity subtrees.

    Raises:
        ValueError: if ``fmt`` is neither ``'binary'`` nor ``'multiclass'``
            (raised when the generator is first advanced, after both files
            have been read).
    """
    print(' - %s' % os.path.split(textfile)[1])
    annfile = textfile + '.tmx.rdc.xml'

    # Read the xml file, and get a list of (start, end, type) entities.
    entities = []
    with open(annfile) as ann_fp:
        xml = ET.parse(ann_fp).getroot()
    for entity in xml.findall('document/entity'):
        typ = entity.find('entity_type').text
        for mention in entity.findall('entity_mention'):
            if mention.get('TYPE') != 'NAME':
                continue  # only NEs
            s = int(mention.find('head/charseq/start').text)
            e = int(mention.find('head/charseq/end').text) + 1
            entities.append((s, e, typ))

    # Read the text file, and mark the entities.
    with open(textfile) as text_fp:
        text = text_fp.read()

    # Strip XML tags, since they don't count towards the indices.
    text = re.sub('<(?!/?TEXT)[^>]+>', '', text)

    # Blank out anything before/after <TEXT>.  The prefix is replaced by
    # spaces (minus the 6 characters of "<TEXT>") so offsets stay aligned.
    def subfunc(m):
        return ' ' * (m.end() - m.start() - 6)

    text = re.sub(r'[\s\S]*<TEXT>', subfunc, text)
    text = re.sub(r'</TEXT>[\s\S]*', '', text)

    # Simplify quotes.
    text = re.sub("``", ' "', text)
    text = re.sub("''", '" ', text)

    if fmt not in ('binary', 'multiclass'):
        raise ValueError('bad fmt value')

    i = 0
    toks = Tree('S', [])
    for (s, e, typ) in sorted(entities):
        if s < i:
            s = i  # Overlapping! Deal with this better?
        if e <= s:
            continue
        toks.extend(word_tokenize(text[i:s]))
        label = 'NE' if fmt == 'binary' else typ
        toks.append(Tree(label, text[s:e].split()))
        i = e
    toks.extend(word_tokenize(text[i:]))
    yield toks
def __build_tree(self, node_num):
    """Return the dependency subtree rooted at word ``node_num``.

    The node label is element 1 of ``self.words[node_num]``.  Every entry
    found under ``self.dependencies[node_num]`` contributes one child, built
    recursively from the entry's first element.
    """
    label = self.words[node_num][1]
    subtree = Tree(label, [])
    dependents = self.dependencies.get(node_num)
    if dependents is None:
        return subtree
    for dep in dependents:
        subtree.append(self.__build_tree(dep[0]))
    return subtree
def __str2BguTree(self, text):
    """Build a flat ``Tree('s', ...)`` from tab-separated tagger output.

    Each non-empty line of ``text`` is split on tabs; the first field is the
    word and the second is converted with ``bguTag``.  The resulting
    ``(word, tag)`` pairs become the children of the tree.
    """
    tree = Tree('s', [])
    for row in text.split('\n'):
        if row != '':
            fields = row.split("\t")
            tree.append((fields[0], bguTag(fields[1])))
    return tree
def postag_tree(tree):
    """Return a copy of a chunk tree whose leaves are ``(word, pos)`` pairs.

    All leaves are tagged in one ``pos_tag`` call; the tags are then handed
    out in order while the top-level children (and the children of each
    top-level subtree) are rebuilt under a new root labelled ``'S'``.
    """
    # Part-of-speech tagging.
    tags = (tag for (_word, tag) in pos_tag(tree.leaves()))

    tagged = Tree('S', [])
    for node in tree:
        if not isinstance(node, Tree):
            tagged.append((node, next(tags)))
            continue
        chunk = Tree(node.label(), [])
        tagged.append(chunk)
        for leaf in node:
            chunk.append((leaf, next(tags)))
    return tagged
def tags2tree(sentence, root_label='S', strict=False):
    """Convert ``(word, postag, chunktag)`` triples into a chunk tree.

    Tags starting with ``B`` open a chunk labelled ``chunktag[2:]``; tags
    starting with ``I`` extend the previous chunk when its label matches;
    ``O`` (and, in lenient mode, ``None``) adds a bare ``(word, postag)``
    leaf.  With ``strict=True`` a ``None`` tag or an ``I`` tag that does not
    continue the previous chunk raises ``ValueError``; otherwise the latter
    starts a new chunk.  Any other tag raises ``ValueError``.
    """
    tree = Tree(root_label, [])
    for word, postag, chunktag in sentence:
        leaf = (word, postag)

        if chunktag is None:
            if strict:
                raise ValueError("Bad tag sequence")
            tree.append(leaf)  # Treat as O
            continue

        if chunktag.startswith('B'):
            tree.append(Tree(chunktag[2:], [leaf]))
            continue

        if chunktag.startswith('I'):
            continues_chunk = (
                len(tree) != 0
                and isinstance(tree[-1], Tree)
                and tree[-1].label() == chunktag[2:]
            )
            if continues_chunk:
                tree[-1].append(leaf)
            elif strict:
                raise ValueError("Bad tag sequence")
            else:
                tree.append(Tree(chunktag[2:], [leaf]))  # Treat as B-*
            continue

        if chunktag == 'O':
            tree.append(leaf)
            continue

        raise ValueError("Bad tag %r" % chunktag)
    return tree
def conlltags2tree(sentence, chunk_types=("NP", "PP", "VP"), root_label="S", strict=False):
    """Build a chunk tree from CoNLL IOB ``(word, postag, chunktag)`` triples.

    ``B-X`` opens a chunk labelled ``X``; ``I-X`` extends the previous chunk
    when that chunk is labelled ``X``; ``O`` adds a bare ``(word, postag)``
    leaf.  In lenient mode a ``None`` tag is treated as ``O`` and a dangling
    ``I-X`` as ``B-X``; with ``strict=True`` both raise ``ValueError``.  Any
    other tag always raises ``ValueError``.

    ``chunk_types`` is accepted for interface compatibility and is not
    consulted by this implementation.
    """
    tree = Tree(root_label, [])
    for word, postag, chunktag in sentence:
        leaf = (word, postag)

        if chunktag is None:
            if strict:
                raise ValueError("Bad conll tag sequence")
            tree.append(leaf)  # Treat as O
            continue

        if chunktag.startswith("B-"):
            tree.append(Tree(chunktag[2:], [leaf]))
            continue

        if chunktag.startswith("I-"):
            continues_chunk = (
                len(tree) != 0
                and isinstance(tree[-1], Tree)
                and tree[-1].label() == chunktag[2:]
            )
            if continues_chunk:
                tree[-1].append(leaf)
            elif strict:
                raise ValueError("Bad conll tag sequence")
            else:
                tree.append(Tree(chunktag[2:], [leaf]))  # Treat as B-*
            continue

        if chunktag == "O":
            tree.append(leaf)
            continue

        raise ValueError(f"Bad conll tag {chunktag!r}")
    return tree
def conlltags2tree(sentence, chunk_types=('NP', 'PP', 'VP'), root_label='S', strict=False):
    """Build a chunk tree from CoNLL IOB ``(word, postag, chunktag)`` triples.

    ``B-X`` opens a chunk labelled ``X``; ``I-X`` extends the previous chunk
    when that chunk is labelled ``X``; ``O`` adds a bare ``(word, postag)``
    leaf.  In lenient mode a ``None`` tag is treated as ``O`` and a dangling
    ``I-X`` as ``B-X``; with ``strict=True`` both raise ``ValueError``.  Any
    other tag always raises ``ValueError``.

    ``chunk_types`` is accepted for interface compatibility and is not
    consulted by this implementation.
    """
    tree = Tree(root_label, [])
    for word, postag, chunktag in sentence:
        leaf = (word, postag)

        if chunktag is None:
            if strict:
                raise ValueError("Bad conll tag sequence")
            tree.append(leaf)  # Treat as O
            continue

        if chunktag.startswith('B-'):
            tree.append(Tree(chunktag[2:], [leaf]))
            continue

        if chunktag.startswith('I-'):
            continues_chunk = (
                len(tree) != 0
                and isinstance(tree[-1], Tree)
                and tree[-1].label() == chunktag[2:]
            )
            if continues_chunk:
                tree[-1].append(leaf)
            elif strict:
                raise ValueError("Bad conll tag sequence")
            else:
                tree.append(Tree(chunktag[2:], [leaf]))  # Treat as B-*
            continue

        if chunktag == 'O':
            tree.append(leaf)
            continue

        raise ValueError("Bad conll tag {0!r}".format(chunktag))
    return tree
def tags2tree(sentence, root_label='S', strict=False):
    """Turn a sequence of ``(word, postag, chunktag)`` triples into a tree.

    * tag starts with ``B``: start a new chunk labelled ``chunktag[2:]``;
    * tag starts with ``I``: append to the last chunk if its label equals
      ``chunktag[2:]``, otherwise start a new chunk (or raise when strict);
    * tag is ``O``: add the ``(word, postag)`` pair directly under the root;
    * tag is ``None``: same as ``O`` unless ``strict`` is set, then raise.

    Raises:
        ValueError: on an unrecognised tag, or on a malformed sequence when
            ``strict`` is true.
    """
    tree = Tree(root_label, [])
    for word, postag, chunktag in sentence:
        pair = (word, postag)
        if chunktag is None:
            if strict:
                raise ValueError("Bad tag sequence")
            # Treat as O
            tree.append(pair)
        elif chunktag.startswith('B'):
            tree.append(Tree(chunktag[2:], [pair]))
        elif chunktag.startswith('I'):
            extends_last = (
                len(tree) != 0
                and isinstance(tree[-1], Tree)
                and tree[-1].label() == chunktag[2:]
            )
            if extends_last:
                tree[-1].append(pair)
            elif strict:
                raise ValueError("Bad tag sequence")
            else:
                # Treat as B-*
                tree.append(Tree(chunktag[2:], [pair]))
        elif chunktag == 'O':
            tree.append(pair)
        else:
            raise ValueError("Bad tag %r" % chunktag)
    return tree
def conlltags2tree(sentence, chunk_types=('NP','PP','VP'), root_label='S', strict=False):
    """Turn CoNLL IOB ``(word, postag, chunktag)`` triples into a tree.

    * ``B-X``: start a new chunk labelled ``X``;
    * ``I-X``: append to the last chunk if it is labelled ``X``, otherwise
      start a new chunk (or raise when ``strict``);
    * ``O``: add the ``(word, postag)`` pair directly under the root;
    * ``None``: same as ``O`` unless ``strict`` is set, then raise.

    ``chunk_types`` is part of the signature but is not used here.

    Raises:
        ValueError: on an unrecognised tag, or on a malformed sequence when
            ``strict`` is true.
    """
    tree = Tree(root_label, [])
    for word, postag, chunktag in sentence:
        pair = (word, postag)
        if chunktag is None:
            if strict:
                raise ValueError("Bad conll tag sequence")
            # Treat as O
            tree.append(pair)
        elif chunktag.startswith('B-'):
            tree.append(Tree(chunktag[2:], [pair]))
        elif chunktag.startswith('I-'):
            extends_last = (
                len(tree) != 0
                and isinstance(tree[-1], Tree)
                and tree[-1].label() == chunktag[2:]
            )
            if extends_last:
                tree[-1].append(pair)
            elif strict:
                raise ValueError("Bad conll tag sequence")
            else:
                # Treat as B-*
                tree.append(Tree(chunktag[2:], [pair]))
        elif chunktag == 'O':
            tree.append(pair)
        else:
            raise ValueError("Bad conll tag {0!r}".format(chunktag))
    return tree
def postag_tree(tree):
    """Return a copy of a chunk tree whose leaves are ``(word, pos)`` pairs.

    The leaves are tagged in one call to
    ``ner_pipeline.part_of_speech_tagging``; the tags are then consumed in
    order while the top-level children (and the children of each top-level
    subtree) are rebuilt under a new root labelled ``'S'``.
    """
    # Part-of-speech tagging.
    tagged_words = ner_pipeline.part_of_speech_tagging(tree.leaves())
    tags = (tag for (_word, tag) in tagged_words)

    tagged = Tree('S', [])
    for node in tree:
        if not isinstance(node, Tree):
            tagged.append((node, next(tags)))
            continue
        chunk = Tree(node.label(), [])
        tagged.append(chunk)
        for leaf in node:
            chunk.append((leaf, next(tags)))
    return tagged
def binarize(tree):
    """Return a copy of ``tree`` in which no node has more than two children.

    Non-``Tree`` inputs are returned unchanged.  Children are binarized
    first; while more than two remain, the last two are folded into a new
    node built from ``'(' + label + 'bar)'``.  The result node is built from
    ``'(' + label + ')'``.
    """
    if not isinstance(tree, Tree):
        return tree

    kids = [binarize(kid) for kid in tree]
    # NOTE(review): Tree is constructed from a single bracketed string here;
    # confirm the Tree class in use accepts a one-argument call.
    while len(kids) > 2:
        folded = Tree('(' + tree.label() + 'bar)')
        folded.append(kids[-2])
        folded.append(kids[-1])
        kids[-2:] = [folded]

    result = Tree('(' + tree.label() + ')')
    result.extend(kids)
    return result
def binarize(tree):
    """Right-fold a tree so that every node has at most two children.

    Anything that is not a ``Tree`` is handed back as is.  After the
    children have been binarized recursively, the two right-most children
    are repeatedly merged under a node built from ``'(' + label + 'bar)'``
    until at most two remain; these become the children of a node built
    from ``'(' + label + ')'``.
    """
    if not isinstance(tree, Tree):
        return tree

    pending = [binarize(sub) for sub in tree]
    # NOTE(review): Tree is constructed from a single bracketed string here;
    # confirm the Tree class in use accepts a one-argument call.
    while len(pending) > 2:
        bar_node = Tree('(' + tree.label() + 'bar)')
        bar_node.append(pending[-2])
        bar_node.append(pending[-1])
        pending = pending[:-2]
        pending.append(bar_node)

    binarized = Tree('(' + tree.label() + ')')
    for sub in pending:
        binarized.append(sub)
    return binarized
def _build_tree(self, words, back, i, j, node):
    """Rebuild the parse tree for span ``(i, j)`` from the back-pointer table.

    A span of width one gets the word ``words[j - 1]`` as its only child.
    For wider spans, ``back[i, j, node]`` (when present) supplies the split
    point and the two child symbols, which are expanded recursively.  If no
    back-pointer exists the node is returned without children.
    """
    subtree = Tree(node.symbol(), children=[])
    if i == j - 1:
        subtree.append(words[j - 1])
    elif (i, j, node) in back:
        split, left_sym, right_sym = back[i, j, node]
        subtree.append(self._build_tree(words, back, i, split, left_sym))
        subtree.append(self._build_tree(words, back, split, j, right_sym))
    return subtree
def split_tree_tokens(tree):
    """Process a chunk-parse Tree, splitting leaves of the form "token/POS".

    Returns a similar tree (same root label, same top-level chunk labels)
    in which each leaf has been replaced by the tuple obtained from
    splitting it on ``'/'``, e.g. ``("token", "TAG")``.
    """
    # The built-in next() works on both Python 2.6+ and Python 3, unlike
    # the generator's Python-2-only .next() method.
    token_iter = (tuple(token.split('/')) for token in tree.leaves())

    # NOTE(review): `.node` is the pre-3.0 NLTK label accessor; confirm the
    # NLTKParseTree in use still provides it.
    newtree = NLTKParseTree(tree.node, [])
    for child in tree:
        if isinstance(child, NLTKParseTree):
            newtree.append(NLTKParseTree(child.node, []))
            for subchild in child:
                newtree[-1].append(next(token_iter))
        else:
            newtree.append(next(token_iter))
    return newtree
def simplify(tree):
    """Simplify a tree bottom-up by applying the ``RULES`` rewrites.

    String leaves are returned unchanged.  Otherwise the children are
    simplified first (children that simplify to ``None`` are dropped; if
    none survive, a single ``'None'`` leaf is inserted).  Each
    ``(cond, modif)`` pair in ``RULES`` is then tried in order, replacing
    the node with ``modif(node)`` whenever ``cond(node)`` holds.  Rule
    application stops as soon as a rewrite yields ``None``, which is also
    what is returned in that case.
    """
    if isinstance(tree, str):
        return tree

    node = Tree(tree.label(), [])
    for sub in tree:
        reduced = simplify(sub)
        if reduced is not None:
            node.append(reduced)
    if len(node) == 0:
        node.append('None')

    for applies, rewrite in RULES:
        if not applies(node):
            continue
        node = rewrite(node)
        if node is None:
            break
    return node
def simplify(tree):
    """Return a simplified copy of ``tree`` according to ``RULES``.

    A ``str`` is a leaf and comes back untouched.  For a tree, every child
    is simplified recursively and kept unless the result is ``None``; an
    otherwise empty node receives the placeholder leaf ``'None'``.  The
    rebuilt node is then run through each ``(cond, modif)`` rule in turn;
    a rule whose rewrite produces ``None`` ends the process and ``None`` is
    returned.
    """
    if isinstance(tree, str):
        return tree

    rebuilt = Tree(tree.label(), [])
    rebuilt.extend(
        kept for kept in (simplify(sub) for sub in tree) if kept is not None
    )
    if not len(rebuilt):
        rebuilt.append('None')

    for cond, modif in RULES:
        if cond(rebuilt):
            rebuilt = modif(rebuilt)
            if rebuilt is None:
                return None
    return rebuilt
def add_node_to_tree(self, chart, selected_rule, next_cell, root):
    """Attach the subtree for ``selected_rule`` beneath ``root`` (in place).

    ``next_cell[selected_rule]`` supplies a chain of unary symbols plus the
    left and right back-references.  A node for the rule is appended to
    ``root``, followed by one nested node per unary symbol.  When the left
    reference has ``j == 0`` its first element is appended as a leaf and
    the recursion stops; otherwise both referenced chart cells are expanded
    recursively under the deepest node.
    """
    _, unary_chain, left_ref, right_ref = next_cell[selected_rule]

    attach_point = Tree(selected_rule.symbol(), [])
    root.append(attach_point)
    for unary in unary_chain:
        link = Tree(unary.symbol(), [])
        attach_point.append(link)
        attach_point = link

    left_rule, left_i, left_j = left_ref
    if left_j == 0:
        attach_point.append(left_rule)
        return

    left_cell = chart[left_i][left_j]
    right_rule, right_i, right_j = right_ref
    right_cell = chart[right_i][right_j]
    for rule, cell in ((left_rule, left_cell), (right_rule, right_cell)):
        self.add_node_to_tree(chart, rule, cell, attach_point)
def _tagged_to_parse(tagged_tokens):
    """Convert ``((token, pos), tag)`` items into a two-level chunk tree.

    The root is labelled ``TEXT`` and holds one ``S`` subtree per sentence.
    A sentence is closed when an ``O``-tagged token has the POS ``'.'``;
    a trailing non-empty sentence is appended at the end.  ``B-X`` starts a
    chunk ``X``; ``I-X`` extends the previous chunk if it is labelled ``X``
    and otherwise starts a new one.  Other tags are ignored.
    """
    text = NLTKParseTree('TEXT', [])
    current = NLTKParseTree('S', [])

    for (token, pos), tag in tagged_tokens:
        leaf = (token, pos)

        if tag == 'O':
            current.append(leaf)
            if pos == '.':
                # End of sentence: file it and start a fresh one.
                text.append(current)
                current = NLTKParseTree('S', [])
            continue

        if tag.startswith('B-'):
            current.append(NLTKParseTree(tag[2:], [leaf]))
            continue

        if tag.startswith('I-'):
            extends_last = (
                current
                and isinstance(current[-1], NLTKParseTree)
                and current[-1].node == tag[2:]
            )
            if extends_last:
                current[-1].append(leaf)
            else:
                current.append(NLTKParseTree(tag[2:], [leaf]))

    if current:
        text.append(current)
    return text
def recursive_build(parse_chart, i, j):
    """Build the tree for span ``[i, j]`` from ``parse_chart``.

    The first value stored in ``parse_chart[i][j]`` is a triple
    ``(a, b, k)``: ``a`` labels the root, ``b`` (when not ``-1``) labels an
    intermediate node placed directly under the root, and ``k`` is the
    split point.  A single-position span gets the leaf ``-1``; wider spans
    recurse into ``[i, k]`` and ``[k + 1, j]``.
    """
    head, unary, split = next(iter(parse_chart[i][j].values()))
    assert (i == j or (i <= split and split < j))

    root = Tree(head, [])
    if unary == -1:
        foot = root
    else:
        foot = Tree(unary, [])
        root.append(foot)

    if i == j:
        foot.append(-1)
    else:
        foot.append(recursive_build(parse_chart, i, split))
        foot.append(recursive_build(parse_chart, split + 1, j))
    return root
def build_tree(self, node, partition_key):
    """Rebuild the parse tree for ``node`` over the span ``partition_key``.

    ``partition_key`` has the form ``"start-end"``.  The entry returned by
    ``self.get_entry(partition_key, node)`` is a ``(production,
    probability)`` pair; when there is no entry, ``(node, -1)`` is returned.

    For a span of length one the production must be a single symbol, which
    is either the node itself (added as a leaf) or another symbol expanded
    over the same span.  For longer spans the production must consist of
    two ``symbol:key`` parts, each expanded recursively.

    Returns:
        ``(tree, probability)``, or ``(node, -1)`` if no entry exists.

    Raises:
        Exception: if the production has an unexpected number of parts.
    """
    bounds = [int(piece) for piece in partition_key.split("-")]
    start, end = bounds

    entry = self.get_entry(partition_key, node)
    if entry is None:
        return node, -1
    production, probability = entry

    tree = Tree(node, [])
    parts = production.split(" ")
    expected_parts = 1 if end - start == 1 else 2
    if len(parts) != expected_parts:
        raise Exception("Un Expected Rule!!")

    if expected_parts == 1:
        symbol = parts[0]
        if symbol == node:
            tree.append(symbol)
        else:
            tree.append(self.build_tree(symbol, partition_key)[0])
    else:
        for part in parts:
            child_node, child_key = part.rsplit(":", 1)
            tree.append(self.build_tree(child_node, child_key)[0])
    return tree, probability
def _tagged_to_parse(self, tagged_tokens):
    """Assemble ``(token, IOB-tag)`` pairs into a flat chunk tree.

    ``O`` tokens hang directly off the ``S`` root.  ``B-X`` opens a chunk
    labelled ``X``.  ``I-X`` is appended to the preceding chunk when that
    chunk is labelled ``X``; otherwise it opens a new ``X`` chunk.  Tokens
    with any other tag are skipped.
    """
    sent = Tree("S", [])
    for tok, tag in tagged_tokens:
        if tag == "O":
            sent.append(tok)
            continue
        if tag.startswith("B-"):
            sent.append(Tree(tag[2:], [tok]))
            continue
        if not tag.startswith("I-"):
            continue
        label = tag[2:]
        if sent and isinstance(sent[-1], Tree) and sent[-1].label() == label:
            sent[-1].append(tok)
        else:
            sent.append(Tree(label, [tok]))
    return sent
def _tagged_to_parse(self, tagged_tokens):
    """Turn a sequence of ``(token, tag)`` pairs into a chunk-parse tree.

    The result is a ``Tree`` labelled ``S``.  Tokens tagged ``O`` become
    direct children; ``B-X`` tokens start a subtree ``X``; ``I-X`` tokens
    join the last child if it is a subtree labelled ``X`` and start a new
    ``X`` subtree otherwise.  Unrecognised tags contribute nothing.
    """
    sent = Tree("S", [])
    for tok, tag in tagged_tokens:
        if tag == "O":
            sent.append(tok)
        elif tag.startswith("B-"):
            sent.append(Tree(tag[2:], [tok]))
        elif tag.startswith("I-"):
            joins_previous = (
                sent
                and isinstance(sent[-1], Tree)
                and sent[-1].label() == tag[2:]
            )
            if not joins_previous:
                sent.append(Tree(tag[2:], [tok]))
            else:
                sent[-1].append(tok)
    return sent
def IOB_to_tree(iob_tagged):
    """Group ``(word, pos, tag)`` triples into a flat entity tree.

    Tokens tagged ``'O'`` become ``(word, pos)`` leaves under the ``S``
    root.  Any other tag is used as a chunk label: the token joins the
    previous child when that child is a ``Tree`` carrying the same label,
    and otherwise starts a new chunk.

    Based on:
    https://stackoverflow.com/questions/27629130/chunking-stanford-named-entity-recognizer-ner-outputs-from-nltk-format
    https://stackoverflow.com/questions/30664677/extract-list-of-persons-and-organizations-using-stanford-ner-tagger-in-nltk
    """
    root = Tree('S', [])
    for token in iob_tagged:
        tag = token[2]
        leaf = (token[0], token[1])
        if tag == 'O':
            root.append(leaf)
            continue
        # Explicit checks replace the former bare ``except:``, which relied
        # on IndexError (empty root) / AttributeError (previous child is a
        # plain leaf) and would also have hidden unrelated failures.
        joins_previous = (
            len(root) > 0
            and isinstance(root[-1], Tree)
            and root[-1].label() == tag
        )
        if joins_previous:
            root[-1].append(leaf)
        else:
            root.append(Tree(tag, [leaf]))
    return root
def _tagged_to_parse(self, tagged_tokens):
    """Assemble ``(token, IOB-tag)`` pairs into a flat chunk tree.

    ``O`` tokens hang directly off the ``S`` root.  ``B-X`` opens a chunk
    labelled ``X``.  ``I-X`` is appended to the preceding chunk when that
    chunk's ``node`` equals ``X``; otherwise it opens a new ``X`` chunk.
    Tokens with any other tag are skipped.
    """
    # NOTE(review): `.node` is the pre-3.0 NLTK label accessor; confirm the
    # Tree class in use still provides it.
    sent = Tree('S', [])
    for tok, tag in tagged_tokens:
        if tag == 'O':
            sent.append(tok)
            continue
        if tag.startswith('B-'):
            sent.append(Tree(tag[2:], [tok]))
            continue
        if not tag.startswith('I-'):
            continue
        label = tag[2:]
        if sent and isinstance(sent[-1], Tree) and sent[-1].node == label:
            sent[-1].append(tok)
        else:
            sent.append(Tree(label, [tok]))
    return sent
def _tagged_to_parse(self, tagged_tokens):
    """Turn a sequence of ``(token, tag)`` pairs into a chunk-parse tree.

    The result is a ``Tree`` labelled ``S``.  Tokens tagged ``O`` become
    direct children; ``B-X`` tokens start a subtree ``X``; ``I-X`` tokens
    join the last child if it is a subtree whose ``node`` is ``X`` and
    start a new ``X`` subtree otherwise.  Unrecognised tags contribute
    nothing.
    """
    # NOTE(review): `.node` is the pre-3.0 NLTK label accessor; confirm the
    # Tree class in use still provides it.
    sent = Tree('S', [])
    for tok, tag in tagged_tokens:
        if tag == 'O':
            sent.append(tok)
        elif tag.startswith('B-'):
            sent.append(Tree(tag[2:], [tok]))
        elif tag.startswith('I-'):
            joins_previous = (
                sent
                and isinstance(sent[-1], Tree)
                and sent[-1].node == tag[2:]
            )
            if not joins_previous:
                sent.append(Tree(tag[2:], [tok]))
            else:
                sent[-1].append(tok)
    return sent
def RemoveNIChunks(structured_sentence):
    '''
    Collapse chunks whose label contains "NI" (NO INFORMATION).

    Tuples (leaves) are returned untouched.  For any other node the
    children are processed first, in place.  If the node's own label
    contains "NI", the node is replaced by a new Tree whose label has "NI"
    removed and whose children are the old children, except that any child
    subtree with "Clause" in its label is replaced by its own children.

    Intended to run on a sentence after UncertainGrammar has been applied.

    Parameters:
    -----------
    structured_sentence: Tree
        A sentence that has been parsed using nltk grammarparser

    Return:
    --------
    structured_sentence: Tree
        The same sentence with the "NI" chunks simplified
    '''
    if type(structured_sentence) == tuple:
        return structured_sentence

    for position, child in enumerate(structured_sentence):
        structured_sentence[position] = RemoveNIChunks(child)

    if "NI" not in structured_sentence.label():
        return structured_sentence

    # "NI" means NO INFORMATION: the extra chunk level adds nothing.
    flattened = Tree(structured_sentence.label().replace("NI", ""), [])
    for piece in structured_sentence:
        if type(piece) != tuple and "Clause" in piece.label():
            for inner in piece:
                flattened.append(inner)
        else:
            flattened.append(piece)
    return flattened
def load_ace_file(textfile, fmt):
    """Yield one chunk tree for an ACE document and its annotation file.

    The annotation is read from ``textfile + ".tmx.rdc.xml"``; only entity
    mentions whose ``TYPE`` attribute is ``NAME`` are used.  With
    ``fmt == "binary"`` every entity subtree is labelled ``NE``; with
    ``fmt == "multiclass"`` it is labelled with the annotated entity type.
    Any other ``fmt`` raises ``ValueError`` once the generator is advanced
    (after both files have been read).
    """
    print(" - {0}".format(os.path.split(textfile)[1]))

    # Collect (start, end, type) character spans of the NAME mentions.
    with open(textfile + ".tmx.rdc.xml", "r") as ann_fp:
        ann_root = ET.parse(ann_fp).getroot()
    spans = []
    for entity_el in ann_root.findall("document/entity"):
        entity_type = entity_el.find("entity_type").text
        for mention_el in entity_el.findall("entity_mention"):
            if mention_el.get("TYPE") == "NAME":  # only NEs
                first = int(mention_el.find("head/charseq/start").text)
                last = int(mention_el.find("head/charseq/end").text)
                spans.append((first, last + 1, entity_type))

    with open(textfile, "r") as text_fp:
        raw = text_fp.read()

    def pad(match):
        # Keep offsets aligned: replace the prefix by spaces, minus the
        # six characters of "<TEXT>" itself.
        return " " * (match.end() - match.start() - 6)

    # Tags don't count towards the indices, so strip all but <TEXT>,
    # blank out what surrounds the <TEXT> body, then simplify quotes.
    raw = re.sub("<(?!/?TEXT)[^>]+>", "", raw)
    raw = re.sub(r"[\s\S]*<TEXT>", pad, raw)
    raw = re.sub(r"</TEXT>[\s\S]*", "", raw)
    raw = re.sub("``", ' "', raw)
    raw = re.sub("''", '" ', raw)

    if fmt == "binary":
        binary = True
    elif fmt == "multiclass":
        binary = False
    else:
        raise ValueError("bad fmt value")

    cursor = 0
    sentence = Tree("S", [])
    for begin, stop, entity_type in sorted(spans):
        if begin < cursor:
            begin = cursor  # Overlapping! Deal with this better?
        if stop <= begin:
            continue
        sentence.extend(word_tokenize(raw[cursor:begin]))
        sentence.append(
            Tree("NE" if binary else entity_type, raw[begin:stop].split())
        )
        cursor = stop
    sentence.extend(word_tokenize(raw[cursor:]))
    yield sentence
def chunk(self, tree, rule, depth):
    """Apply ``rule`` to ``tree`` in place, grouping matched siblings.

    For the first tree position whose sibling group is matched by
    ``self.rule_to_children``, the matched siblings are relabelled, moved
    under a new subtree built from ``"(" + rule.type + ")"`` and removed
    from their old positions.  The method then either recurses with
    ``depth - 1`` or returns ``tree``.  ``tree`` is returned unchanged when
    ``depth`` is 0, when ``rule.find_brothers`` returns an empty dict, or
    when no sibling group matches.
    """
    ruleContents = rule.contents
    ruleName = rule.type
    if depth==0: #maximum recursion set by depth
        return tree
    children = tree.treepositions('postorder') #get tuples for all locations in tree
    string = ""  # never read in this method
    # presumably filled in by rule.find_brothers (it is indexed by child
    # below) -- confirm against find_brothers
    parent = {}
    subtrees = {} #key->new subtree to add to tree; value->location to place in treepositions()
                  # (never read in this method)
    dictBrothers = rule.find_brothers(children, parent)
    # returns a dict. of those children in the tree who have the same parent,
    # such that a rule MIGHT apply to them
    if dictBrothers == dict(): # no possible application of rule
        return tree
    #now we have dictBrothers which is a list of all children who have the same parent,
    #we check to see which list of brothers corresponds to ruleContents
    #such that tree will need to be altered at that location
    for child in children:
        # look for a child in tree for whom it both (1) has brothers and (2) rule applies (rule_to_children(tree, brothers, rule))
        # otherwise, just "continue"
        if not parent[child] in dictBrothers:
            continue
        tempBrothers = dictBrothers[parent[child]]
        # NOTE: this local shadows the built-in `tuple` for the rest of the body
        tuple = self.rule_to_children(tree, tempBrothers)
        if tuple == (-1,-1):  # (-1, -1) signals "rule does not apply here"
            continue
        #found a rule applies for certain children
        #now set up new tree
        #and re-arrange tree to fit
        #then recursively call chunker with depth-1
        start = tuple[0]
        end = tuple[1]
        # NOTE(review): Tree is constructed from a single bracketed string;
        # confirm the Tree class in use accepts a one-argument call.
        newTree = Tree("("+ruleName+")")
        for i in range(end-start): #set up new tree
            newChild = tempBrothers[i+start]
            ruleList = ruleContents.split()  # never read in this method
            typeOf = type(tree[newChild])
            if typeOf is Tree:
                # subtree: wrap its label in <...>
                modifiedName = "<"+tree[newChild].node+">"
                tree[newChild].node = modifiedName
            else:
                #ruleList = ruleContents.split()
                #subst="-->"
                #for i in range(len(rule)):
                #subst+="<"+ruleList[i]+"> "
                #add this so we know how tree was derived
                # leaf: keep the first element, wrap the last one in <...>
                newTuple = (tree[newChild][0], "<"+str(tree[newChild][-1])+">")
                tree[newChild] = newTuple
            newTree.append(tree[newChild])
        tree[tempBrothers[start]] = newTree #attach new tree at left-most child (start)
        #then remove old children except for #0/start, which is the new tree
        for i in range(end-start):
            if i != 0:
                tree[tempBrothers[i+start]] = "REMOVE"
        # replacing first and deleting afterwards keeps the stored tree
        # positions valid while the matched siblings are being marked
        while "REMOVE" in tree:
            tree.remove("REMOVE")
        for subtree in tree.subtrees():
            if "REMOVE" in subtree:
                subtree.remove("REMOVE")
        #now recursively chunk if there are more brothers
        #to whom rule applies
        if len(dictBrothers)>1 or len(dictBrothers[parent[child]])>len(ruleContents.split()):
            return self.chunk(tree, rule, depth-1)
        else:
            return tree
    #found no children for whom rule applies, so just return tree
    return tree
def load_ace_file(textfile, fmt):
    """Yield one chunk tree for an ACE document and its annotation file.

    The annotation is read from ``textfile + ".tmx.rdc.xml"``.  Only entity
    mentions whose ``TYPE`` attribute is ``NAME`` are kept.

    Args:
        textfile: path of the SGML text file.
        fmt: ``"binary"`` labels every entity ``NE``; ``"multiclass"`` labels
            each entity with its annotated entity type.

    Yields:
        A single ``Tree("S", ...)`` whose children are word tokens and
        entity subtrees.

    Raises:
        ValueError: if ``fmt`` is neither ``"binary"`` nor ``"multiclass"``
            (raised when the generator is first advanced, after both files
            have been read).
    """
    print(" - %s" % os.path.split(textfile)[1])
    annfile = textfile + ".tmx.rdc.xml"

    # Read the xml file, and get a list of (start, end, type) entities.
    # The handle is closed deterministically instead of being leaked.
    entities = []
    with open(annfile) as ann_fp:
        xml = ET.parse(ann_fp).getroot()
    for entity in xml.findall("document/entity"):
        typ = entity.find("entity_type").text
        for mention in entity.findall("entity_mention"):
            if mention.get("TYPE") != "NAME":
                continue  # only NEs
            s = int(mention.find("head/charseq/start").text)
            e = int(mention.find("head/charseq/end").text) + 1
            entities.append((s, e, typ))

    # Read the text file, and mark the entities.
    with open(textfile) as fp:
        text = fp.read()

    # Strip XML tags, since they don't count towards the indices.
    text = re.sub("<(?!/?TEXT)[^>]+>", "", text)

    # Blank out anything before/after <TEXT>.  The prefix is replaced by
    # spaces (minus the 6 characters of "<TEXT>") so offsets stay aligned.
    def subfunc(m):
        return " " * (m.end() - m.start() - 6)

    # Raw strings: "\s" in a plain literal is an invalid escape sequence.
    text = re.sub(r"[\s\S]*<TEXT>", subfunc, text)
    text = re.sub(r"</TEXT>[\s\S]*", "", text)

    # Simplify quotes.
    text = re.sub("``", ' "', text)
    text = re.sub("''", '" ', text)

    if fmt not in ("binary", "multiclass"):
        raise ValueError("bad fmt value")

    i = 0
    toks = Tree("S", [])
    for (s, e, typ) in sorted(entities):
        if s < i:
            s = i  # Overlapping! Deal with this better?
        if e <= s:
            continue
        toks.extend(word_tokenize(text[i:s]))
        label = "NE" if fmt == "binary" else typ
        toks.append(Tree(label, text[s:e].split()))
        i = e
    toks.extend(word_tokenize(text[i:]))
    yield toks
def load_ace_file(textfile, fmt):
    """Yield one chunk tree for an ACE document and its annotation file.

    The annotation is read from ``textfile + '.tmx.rdc.xml'``.  Only entity
    mentions whose ``TYPE`` attribute is ``NAME`` are kept.

    Args:
        textfile: path of the SGML text file.
        fmt: ``'binary'`` labels every entity ``NE``; ``'multiclass'`` labels
            each entity with its annotated entity type.

    Yields:
        A single ``Tree('S', ...)`` whose children are word tokens and
        entity subtrees.

    Raises:
        ValueError: if ``fmt`` is neither ``'binary'`` nor ``'multiclass'``
            (raised when the generator is first advanced, after both files
            have been read).
    """
    print(' - {0}'.format(os.path.split(textfile)[1]))
    annfile = textfile + '.tmx.rdc.xml'

    # Read the xml file, and get a list of (start, end, type) entities.
    entities = []
    with open(annfile, 'r') as infile:
        xml = ET.parse(infile).getroot()
    for entity in xml.findall('document/entity'):
        typ = entity.find('entity_type').text
        for mention in entity.findall('entity_mention'):
            if mention.get('TYPE') != 'NAME':
                continue  # only NEs
            s = int(mention.find('head/charseq/start').text)
            e = int(mention.find('head/charseq/end').text) + 1
            entities.append((s, e, typ))

    # Read the text file, and mark the entities.
    with open(textfile, 'r') as infile:
        text = infile.read()

    # Strip XML tags, since they don't count towards the indices.
    text = re.sub('<(?!/?TEXT)[^>]+>', '', text)

    # Blank out anything before/after <TEXT>.  The prefix is replaced by
    # spaces (minus the 6 characters of "<TEXT>") so offsets stay aligned.
    def subfunc(m):
        return ' ' * (m.end() - m.start() - 6)

    # Raw strings: "\s" in a plain literal is an invalid escape sequence.
    text = re.sub(r'[\s\S]*<TEXT>', subfunc, text)
    text = re.sub(r'</TEXT>[\s\S]*', '', text)

    # Simplify quotes.
    text = re.sub("``", ' "', text)
    text = re.sub("''", '" ', text)

    if fmt not in ('binary', 'multiclass'):
        raise ValueError('bad fmt value')

    i = 0
    toks = Tree('S', [])
    for (s, e, typ) in sorted(entities):
        if s < i:
            s = i  # Overlapping! Deal with this better?
        if e <= s:
            continue
        toks.extend(word_tokenize(text[i:s]))
        label = 'NE' if fmt == 'binary' else typ
        toks.append(Tree(label, text[s:e].split()))
        i = e
    toks.extend(word_tokenize(text[i:]))
    yield toks
def parse_string_tree(s, start):
    """Parse the parenthesised expression of ``s`` that opens at ``start``.

    ``s[start]`` must be ``'('``.  Returns ``(tree, idx)`` where ``idx`` is
    the index just past the ``')'`` that closes this expression.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed source; confirm it against the original layout.
    """
    idx = start
    assert s[idx] == '('
    idx = idx + 1
    # skip blanks after the opening bracket
    while s[idx] == ' ':
        idx = idx + 1
    if s[idx] == '(':
        # the expression starts with a nested sub-expression
        tl, idx = parse_string_tree(s, idx)
        while s[idx] == ' ':
            idx = idx + 1
        if s[idx] == '(':
            t, idx = parse_string_tree(s, idx)
            # is a leaf
            # the first sub-expression becomes the first child of the second
            t.insert(0, tl)
            # match closing bracket
            while s[idx] != ')':
                idx = idx + 1
            idx = idx + 1
            return t, idx
        # NOTE(review): when the first sub-expression is not followed by
        # another '(' nothing is returned here (implicit None) -- confirm
        # this is intended.
    else:
        # there is an input element
        # read the word up to the next blank or bracket
        aux = idx + 1
        while s[aux] != ' ' and s[aux] != ')' and s[aux] != '(':
            aux = aux + 1
        w = s[idx:aux]
        idx = aux
        t = Tree(w, [])
        while s[idx] == ' ':
            idx = idx + 1
        if s[idx] != '(':
            if s[idx] != ')':
                # another word
                aux = idx + 1
                while s[aux] != ' ' and s[aux] != ')' and s[aux] != '(':
                    aux = aux + 1
                wr = s[idx:aux]
                idx = aux
                tr = Tree(wr, [])
                t.append(tr)
            # match closing bracket
            while s[idx] != ')':
                idx = idx + 1
            idx = idx + 1
            return t, idx
        else:
            # the word is followed by a nested sub-expression
            new_t, idx = parse_string_tree(s, idx)
            if len(t.label()) == 1:
                # is a variable
                # single-character word: it becomes the first child of the
                # nested tree, which is what gets returned
                new_t.insert(0, t)
                t = new_t
            else:
                # is an operator
                t.append(new_t)
            # match closing bracket
            while s[idx] != ')':
                idx = idx + 1
            idx = idx + 1
            return t, idx