Ejemplo n.º 1
0
Archivo: gui.py Proyecto: ndm25/eecs405
def BP_tree_to_nltk_tree(tree):
    """Mirror a B+ tree node, and everything below it, as an nltk ``Tree``.

    Every produced node is labelled with ``str(tree.keys)``.  Only ``BPnode``
    and ``Node`` instances are descended into (through ``tree.children``);
    any other object becomes a node without children.
    """
    converted = Tree(str(tree.keys), children=[])
    if not (isinstance(tree, BPnode) or isinstance(tree, Node)):
        return converted

    for sub in tree.children:
        converted.append(BP_tree_to_nltk_tree(sub))
    return converted
Ejemplo n.º 2
0
def unfolded_decoding(W_d, b_d, tree, encoded):
    """Recursively decode ``encoded`` along the shape of ``tree``.

    Returns a ``Tree`` labelled with ``encoded``.  When ``tree`` is a
    non-empty ``nltk.tree.Tree``, ``encoded`` is passed through
    ``decode(W_d, b_d, encoded)`` and the result is cut into consecutive
    slices of width ``W_d.shape[1]``, one per child of ``tree``; each slice
    is decoded recursively and appended as a child.  If ``tree`` has a
    ``span`` attribute it is copied onto the returned node.
    """
    # Unpacking still requires W_d to be two-dimensional; only the second
    # dimension is needed (it is the slice width used below).
    _, m = W_d.shape

    # store all a_e results in tree structure
    decoding_tree = Tree(encoded, [])
    try:
        decoding_tree.span = tree.span
    except AttributeError:
        # Nodes without a span annotation are simply left unannotated.
        pass

    # if the given node (root) has children, decode the node's encoding, split it,
    # and use this as the children's encoding (output) to recurse back, until terminal
    # nodes are reached
    if type(tree) == nltk.tree.Tree and len(tree) > 0:
        decoded = decode(W_d, b_d, encoded)
        for i, child in enumerate(tree):
            # NOTE: the number of branchings n is NOT assumed, but that it is uniform and that
            # len(input layer) = n*len(encoding) IS assumed
            child_encoding = decoded[i * m:(i + 1) * m]
            decoding_tree.append(unfolded_decoding(W_d, b_d, child, child_encoding))

    return decoding_tree
Ejemplo n.º 3
0
def load_ace_file(textfile, fmt):
    """Yield one chunk tree for an ACE text file and its entity annotations.

    Annotations are read from ``textfile + '.tmx.rdc.xml'``; only mentions
    whose ``TYPE`` attribute is ``NAME`` are kept.  The text outside the
    entities is tokenized with ``word_tokenize`` and every entity head becomes
    a subtree labelled ``'NE'`` (``fmt == 'binary'``) or with its entity type
    (``fmt == 'multiclass'``).

    This is a generator: nothing runs until it is iterated, and an unknown
    ``fmt`` raises ``ValueError`` at that point, after both files were read.
    """
    # The single-argument parenthesised form prints the same text under both
    # the print statement and the print function.
    print('  - %s' % os.path.split(textfile)[1])
    annfile = textfile+'.tmx.rdc.xml'

    # Read the xml file, and get a list of entities
    entities = []
    with open(annfile) as annotations:
        xml = ET.parse(annotations).getroot()
    for entity in xml.findall('document/entity'):
        typ = entity.find('entity_type').text
        for mention in entity.findall('entity_mention'):
            if mention.get('TYPE') != 'NAME':
                continue  # only NEs
            s = int(mention.find('head/charseq/start').text)
            # +1 so that (s, e) can be used directly as a slice of the text
            e = int(mention.find('head/charseq/end').text)+1
            entities.append((s, e, typ))

    # Read the text file, and mark the entities.
    with open(textfile) as source:
        text = source.read()

    # Strip XML tags, since they don't count towards the indices
    text = re.sub(r'<(?!/?TEXT)[^>]+>', '', text)

    # Blank out anything before <TEXT> (replaced by spaces, six fewer than the
    # matched length) and drop anything from </TEXT> on.
    def subfunc(m):
        return ' '*(m.end()-m.start()-6)
    text = re.sub(r'[\s\S]*<TEXT>', subfunc, text)
    text = re.sub(r'</TEXT>[\s\S]*', '', text)

    # Simplify quotes
    text = re.sub("``", ' "', text)
    text = re.sub("''", '" ', text)

    if fmt not in ('binary', 'multiclass'):
        raise ValueError('bad fmt value')

    i = 0
    toks = Tree('S', [])
    for (s, e, typ) in sorted(entities):
        if s < i:
            s = i  # Overlapping!  Deal with this better?
        if e <= s:
            continue
        toks.extend(word_tokenize(text[i:s]))
        # Binary distinction (NE or not NE) vs. multiclass (NE type)
        label = 'NE' if fmt == 'binary' else typ
        toks.append(Tree(label, text[s:e].split()))
        i = e
    toks.extend(word_tokenize(text[i:]))
    yield toks
    def __build_tree(self, node_num):
        """Return the dependency subtree rooted at word number ``node_num``.

        The node label is element 1 of ``self.words[node_num]``.  Children
        come from ``self.dependencies.get(node_num)``: for every entry, its
        element 0 is taken as the number of a dependent word and expanded
        recursively.  A word without an entry becomes a childless node.
        """
        label = self.words[node_num][1]
        subtree = Tree(label, [])

        dependents = self.dependencies.get(node_num)
        if dependents is None:
            return subtree

        for dependent in dependents:
            subtree.append(self.__build_tree(dependent[0]))
        return subtree
Ejemplo n.º 5
0
 def __str2BguTree(self,text):
     """Turn tab-separated ``word<TAB>raw-tag`` lines into a flat tree.

     Empty lines are skipped.  Each remaining line contributes a
     ``(word, bguTag(raw))`` leaf to a tree labelled ``'s'``; only the first
     two tab-separated fields of a line are used.
     """
     rows = text.split('\n')
     tree = Tree('s',[])
     for row in rows:
         if row != '':
             fields = row.split("\t")
             token, raw_tag = fields[0], fields[1]
             tree.append((token, bguTag(raw_tag)))
     return tree
def postag_tree(tree):
    """Return a copy of a chunk tree whose leaves are ``(word, pos)`` pairs.

    All leaves are tagged in one ``pos_tag`` call; the tags are then handed
    out in leaf order.  The result is rooted at ``'S'``.  Children that are
    ``Tree`` instances are rebuilt with the same label (one level deep);
    other children are tagged in place.
    """
    tags = (tag for (_token, tag) in pos_tag(tree.leaves()))
    tagged = Tree('S', [])
    for node in tree:
        if not isinstance(node, Tree):
            tagged.append((node, next(tags)))
            continue
        chunk = Tree(node.label(), [])
        tagged.append(chunk)
        for leaf in node:
            chunk.append((leaf, next(tags)))
    return tagged
Ejemplo n.º 7
0
 def tags2tree(sentence, root_label='S', strict=False):
     """Build a chunk tree from ``(word, postag, chunktag)`` triples.

     A tag starting with ``B`` opens a new chunk labelled ``chunktag[2:]``.
     A tag starting with ``I`` extends the previous child when that child is
     a chunk with the same label.  ``O`` puts the token directly under the
     root.  When ``strict`` is false, a ``None`` tag is handled like ``O``
     and a dangling ``I`` tag like ``B``; when it is true both raise
     ``ValueError``.  Any other tag always raises ``ValueError``.
     """
     tree = Tree(root_label, [])
     for (word, postag, chunktag) in sentence:
         leaf = (word, postag)

         if chunktag is None:
             if strict:
                 raise ValueError("Bad tag sequence")
             # Treat as O
             tree.append(leaf)
             continue

         if chunktag.startswith('B'):
             tree.append(Tree(chunktag[2:], [leaf]))
             continue

         if chunktag.startswith('I'):
             label = chunktag[2:]
             extends_previous = (len(tree) != 0
                                 and isinstance(tree[-1], Tree)
                                 and tree[-1].label() == label)
             if extends_previous:
                 tree[-1].append(leaf)
             elif strict:
                 raise ValueError("Bad tag sequence")
             else:
                 # Treat as B-*
                 tree.append(Tree(label, [leaf]))
             continue

         if chunktag != 'O':
             raise ValueError("Bad tag %r" % chunktag)
         tree.append(leaf)
     return tree
Ejemplo n.º 8
0
Archivo: util.py Proyecto: tchangw/nltk
def conlltags2tree(sentence,
                   chunk_types=("NP", "PP", "VP"),
                   root_label="S",
                   strict=False):
    """
    Assemble a chunk tree from CoNLL IOB ``(word, postag, chunktag)`` triples.

    ``B-X`` opens a chunk ``X``; ``I-X`` extends the previous child when it
    is a chunk labelled ``X``; ``O`` leaves the token directly under the
    root.  With ``strict`` false, ``None`` is handled like ``O`` and a
    dangling ``I-X`` like ``B-X``; with ``strict`` true both raise
    ``ValueError``.  Any other tag raises ``ValueError``.

    ``chunk_types`` is accepted but not consulted by this function.
    """
    tree = Tree(root_label, [])
    for (word, postag, chunktag) in sentence:
        token = (word, postag)

        if chunktag is None:
            if strict:
                raise ValueError("Bad conll tag sequence")
            # Treat as O
            tree.append(token)
            continue
        if chunktag == "O":
            tree.append(token)
            continue

        begins_chunk = chunktag.startswith("B-")
        if not begins_chunk and not chunktag.startswith("I-"):
            raise ValueError(f"Bad conll tag {chunktag!r}")

        label = chunktag[2:]
        if not begins_chunk:
            if (len(tree) != 0 and isinstance(tree[-1], Tree)
                    and tree[-1].label() == label):
                tree[-1].append(token)
                continue
            if strict:
                raise ValueError("Bad conll tag sequence")
            # Otherwise fall through and treat as B-*
        tree.append(Tree(label, [token]))
    return tree
Ejemplo n.º 9
0
def conlltags2tree(sentence,
                   chunk_types=('NP', 'PP', 'VP'),
                   root_label='S',
                   strict=False):
    """
    Turn CoNLL IOB ``(word, postag, chunktag)`` triples into a chunk tree.

    ``B-X`` starts a chunk ``X``, ``I-X`` continues the chunk that was added
    last if it carries the same label, and ``O`` keeps the token at the top
    level.  In non-strict mode a ``None`` tag counts as ``O`` and an ``I-X``
    that continues nothing counts as ``B-X``; in strict mode either raises
    ``ValueError``.  Unknown tags always raise ``ValueError``.

    ``chunk_types`` is not used here.
    """
    tree = Tree(root_label, [])
    for (word, postag, chunktag) in sentence:
        pair = (word, postag)

        if chunktag is None:
            if strict:
                raise ValueError("Bad conll tag sequence")
            # Treat as O
            tree.append(pair)
            continue

        if chunktag.startswith('B-'):
            tree.append(Tree(chunktag[2:], [pair]))
            continue

        if chunktag.startswith('I-'):
            wanted = chunktag[2:]
            continues_chunk = (len(tree) != 0
                               and isinstance(tree[-1], Tree)
                               and tree[-1].label() == wanted)
            if continues_chunk:
                tree[-1].append(pair)
            elif strict:
                raise ValueError("Bad conll tag sequence")
            else:
                # Treat as B-*
                tree.append(Tree(wanted, [pair]))
            continue

        if chunktag != 'O':
            raise ValueError("Bad conll tag {0!r}".format(chunktag))
        tree.append(pair)
    return tree
Ejemplo n.º 10
0
 def tags2tree(sentence, root_label='S', strict=False):
     """Convert ``(word, postag, chunktag)`` triples into a chunk tree.

     Tags starting with ``B`` open a chunk labelled ``chunktag[2:]``; tags
     starting with ``I`` extend the last child when it is a chunk with that
     label; ``O`` adds the token at the top level.  If ``strict`` is false a
     ``None`` tag behaves like ``O`` and an ``I`` tag with nothing to extend
     behaves like ``B``; if it is true both raise ``ValueError``.  Every
     other tag raises ``ValueError``.
     """
     tree = Tree(root_label, [])
     for (word, postag, chunktag) in sentence:
         token = (word, postag)

         if chunktag is None:
             if strict:
                 raise ValueError("Bad tag sequence")
             # Treat as O
             tree.append(token)
             continue

         opens_chunk = chunktag.startswith('B')
         if not opens_chunk and not chunktag.startswith('I'):
             if chunktag == 'O':
                 tree.append(token)
                 continue
             raise ValueError("Bad tag %r" % chunktag)

         label = chunktag[2:]
         if not opens_chunk:
             if (len(tree) != 0 and isinstance(tree[-1], Tree)
                     and tree[-1].label() == label):
                 tree[-1].append(token)
                 continue
             if strict:
                 raise ValueError("Bad tag sequence")
             # Otherwise fall through and treat as B-*
         tree.append(Tree(label, [token]))
     return tree
Ejemplo n.º 11
0
Archivo: util.py Proyecto: DrDub/nltk
def conlltags2tree(sentence, chunk_types=('NP','PP','VP'),
                   root_label='S', strict=False):
    """
    Build a chunk tree out of CoNLL IOB ``(word, postag, chunktag)`` triples.

    ``B-X`` opens chunk ``X``; ``I-X`` is appended to the last child if that
    child is a chunk labelled ``X``; ``O`` places the token under the root.
    When ``strict`` is false, ``None`` is accepted as ``O`` and an ``I-X``
    without a matching open chunk as ``B-X``; when true, both raise
    ``ValueError``.  Anything else raises ``ValueError``.

    ``chunk_types`` is ignored by this implementation.
    """
    tree = Tree(root_label, [])

    def add_plain(token):
        # Token that sits directly under the root.
        tree.append(token)

    def open_chunk(label, token):
        # New chunk holding just this token.
        tree.append(Tree(label, [token]))

    for (word, postag, chunktag) in sentence:
        token = (word, postag)
        if chunktag is None:
            if strict:
                raise ValueError("Bad conll tag sequence")
            add_plain(token)  # Treat as O
        elif chunktag.startswith('B-'):
            open_chunk(chunktag[2:], token)
        elif chunktag.startswith('I-'):
            label = chunktag[2:]
            if (len(tree) != 0 and isinstance(tree[-1], Tree)
                    and tree[-1].label() == label):
                tree[-1].append(token)
            elif strict:
                raise ValueError("Bad conll tag sequence")
            else:
                open_chunk(label, token)  # Treat as B-*
        elif chunktag == 'O':
            add_plain(token)
        else:
            raise ValueError("Bad conll tag {0!r}".format(chunktag))
    return tree
Ejemplo n.º 12
0
def postag_tree(tree):
    """Re-create a chunk tree with ``(word, pos)`` leaves.

    The leaves are tagged in a single call to
    ``ner_pipeline.part_of_speech_tagging`` and the tags are consumed in leaf
    order.  The new tree is rooted at ``'S'``; ``Tree`` children keep their
    label (one level deep) and other children are tagged directly.
    """
    tagged_words = ner_pipeline.part_of_speech_tagging(tree.leaves())
    pos_tags = (pos for (_word, pos) in tagged_words)

    def attach_tag(leaf):
        # Pair a leaf with the next unused tag.
        return (leaf, next(pos_tags))

    newtree = Tree('S', [])
    for child in tree:
        if isinstance(child, Tree):
            chunk = Tree(child.label(), [])
            newtree.append(chunk)
            for leaf in child:
                chunk.append(attach_tag(leaf))
        else:
            newtree.append(attach_tag(child))
    return newtree
Ejemplo n.º 13
0
def binarize(tree):
    """Return a copy of ``tree`` in which no node has more than two children.

    Non-``Tree`` values are returned unchanged.  Children are binarized
    first; while more than two remain, the last two are wrapped in a node
    built from ``'(' + label + 'bar)'``.  The node itself is rebuilt from
    ``'(' + label + ')'``.
    """
    if not isinstance(tree, Tree):
        return tree

    kids = [binarize(kid) for kid in tree]
    while len(kids) > 2:
        bar = Tree('(' + tree.label() + 'bar)')
        right = kids.pop()
        left = kids.pop()
        bar.append(left)
        bar.append(right)
        kids.append(bar)

    rebuilt = Tree('(' + tree.label() + ')')
    for kid in kids:
        rebuilt.append(kid)
    return rebuilt
Ejemplo n.º 14
0
def binarize(tree):
    """Binarize ``tree`` by folding surplus children from the right.

    Anything that is not a ``Tree`` is returned as is.  After the children
    have been binarized, the two right-most ones are repeatedly merged under
    a node created from ``'(' + label + 'bar)'`` until at most two children
    are left; these are attached to a node created from
    ``'(' + label + ')'``.
    """
    if isinstance(tree, Tree):
        pending = [binarize(sub) for sub in tree]
        while len(pending) > 2:
            merged = Tree('(' + tree.label() + 'bar)')
            for sub in pending[-2:]:
                merged.append(sub)
            pending[-2:] = [merged]

        result = Tree('(' + tree.label() + ')')
        for sub in pending:
            result.append(sub)
        return result

    return tree
Ejemplo n.º 15
0
 def _build_tree(self, words, back, i, j, node):
     """
     Rebuild the subtree for ``node`` over the span ``(i, j)`` from ``back``.

     A span of width one yields a node holding ``words[j - 1]``.  Otherwise
     ``back[i, j, node]`` is expected to hold ``(k, b, c)``: the split point
     and the two child symbols, which are expanded recursively.  When the
     table has no such entry the node is returned without children.
     """
     subtree = Tree(node.symbol(), children=[])
     if i == j - 1:
         subtree.append(words[j - 1])
     elif (i, j, node) in back:
         split, left_sym, right_sym = back[i, j, node]
         subtree.append(self._build_tree(words, back, i, split, left_sym))
         subtree.append(self._build_tree(words, back, split, j, right_sym))
     return subtree
def split_tree_tokens(tree):
    """Process a chunk-parse Tree, splitting nodes in the form "token/POS".

    Returns a similar tree in which the leaves are PoS tagged tokens in the
    form:
    ("token", "TAG")

    Every leaf is split on each '/' it contains.  Chunks are copied one level
    deep: a child that is itself a tree is rebuilt with the same node label
    and receives one split token per element it contains.
    """
    token_iter = (tuple(token.split('/')) for token in tree.leaves())
    newtree = NLTKParseTree(tree.node, [])
    for child in tree:
        if isinstance(child, NLTKParseTree):
            chunk = NLTKParseTree(child.node, [])
            newtree.append(chunk)
            # The elements themselves are not inspected; they only determine
            # how many tokens belong to this chunk.
            for _ in child:
                chunk.append(next(token_iter))
        else:
            # builtin next() instead of the Python 2-only .next() method
            newtree.append(next(token_iter))
    return newtree
Ejemplo n.º 17
0
def simplify(tree):
    """Simplify ``tree`` bottom-up using the rewrite rules in ``RULES``.

    Strings are returned unchanged.  For a tree, the children are simplified
    first and those that came back as ``None`` are dropped; if none remain,
    the string ``'None'`` is inserted as the only child.  Each
    ``(cond, modif)`` pair of ``RULES`` is then tried in order and ``modif``
    is applied whenever ``cond`` holds.  As soon as a rule produces ``None``
    the result is ``None``.
    """
    if isinstance(tree, str):
        return tree

    node = Tree(tree.label(), [])
    for simplified in map(simplify, tree):
        if simplified is not None:
            node.append(simplified)
    if not node:
        node.append('None')

    for applies, rewrite in RULES:
        if not applies(node):
            continue
        node = rewrite(node)
        if node is None:
            return None
    return node
Ejemplo n.º 18
0
def simplify(tree):
    """Return a simplified version of ``tree``, or ``None`` if a rule drops it.

    A string is its own simplification.  Otherwise every child is simplified
    recursively, children that simplify to ``None`` are left out, and a
    childless result receives the placeholder leaf ``'None'``.  The
    ``(cond, modif)`` pairs in ``RULES`` are then applied in order; the first
    ``modif`` that returns ``None`` ends the process with ``None``.
    """
    if isinstance(tree, str):
        return tree

    result = Tree(tree.label(), [])
    kept = [sub for sub in (simplify(ch) for ch in tree) if sub is not None]
    for sub in kept:
        result.append(sub)
    if not kept:
        result.append('None')

    for matches, transform in RULES:
        if matches(result):
            result = transform(result)
            if result is None:
                return None
    return result
Ejemplo n.º 19
0
    def add_node_to_tree(self, chart, selected_rule, next_cell, root):
        """Attach the subtree for ``selected_rule`` below ``root`` (in place).

        ``next_cell[selected_rule]`` must unpack into four items; the last
        three are a path of symbols, a left back-pointer and a right
        back-pointer.  A node for ``selected_rule`` is appended to ``root``,
        followed by a chain of single-child nodes for the path.  Each
        back-pointer is ``(rule, i, j)``: if the left one has ``j == 0`` its
        rule is appended as a leaf and the method stops; otherwise both sides
        are expanded recursively from ``chart[i][j]``.
        """
        _, u_path, left, right = next_cell[selected_rule]

        tip = Tree(selected_rule.symbol(), [])
        root.append(tip)
        # Descend along the path so that further children hang off its end.
        for symbol in u_path:
            deeper = Tree(symbol.symbol(), [])
            tip.append(deeper)
            tip = deeper

        left_rule, left_i, left_j = left
        if left_j == 0:
            tip.append(left_rule)
            return

        left_cell = chart[left_i][left_j]
        right_rule, right_i, right_j = right
        right_cell = chart[right_i][right_j]

        for rule, cell in ((left_rule, left_cell), (right_rule, right_cell)):
            self.add_node_to_tree(chart, rule, cell, tip)
 def _tagged_to_parse(tagged_tokens):
     """Convert ``((token, pos), tag)`` items into a 'TEXT' tree of sentences.

     Tokens are collected into 'S' subtrees.  An ``O`` token whose ``pos`` is
     ``'.'`` closes the current sentence.  ``B-X`` opens a chunk ``X`` and
     ``I-X`` extends the last chunk when its label matches (otherwise it opens
     a new one).  Items with any other tag are skipped.  A non-empty trailing
     sentence is added at the end.
     """
     tree = NLTKParseTree('TEXT', [])
     sent = NLTKParseTree('S', [])
     for ((token, pos), tag) in tagged_tokens:
         leaf = (token, pos)

         if tag == 'O':
             sent.append(leaf)
             if pos == '.':
                 # End of sentence: store it and start a fresh one.
                 tree.append(sent)
                 sent = NLTKParseTree('S', [])
             continue

         if tag.startswith('B-'):
             open_new = True
         elif tag.startswith('I-'):
             open_new = not (sent and isinstance(sent[-1], NLTKParseTree)
                             and sent[-1].node == tag[2:])
         else:
             continue

         if open_new:
             sent.append(NLTKParseTree(tag[2:], [leaf]))
         else:
             sent[-1].append(leaf)

     if sent:
         tree.append(sent)
     return tree
Ejemplo n.º 21
0
def recursive_build(parse_chart, i, j):
    """Build the tree for span ``(i, j)`` from the first entry of its chart cell.

    The first value of ``parse_chart[i][j]`` is read as ``(a, b, k)``.  ``a``
    labels the root; when ``b`` is not ``-1`` a node labelled ``b`` is placed
    between the root and its content.  For ``i == j`` the content is the
    single leaf ``-1``; otherwise it is the pair of subtrees for ``(i, k)``
    and ``(k + 1, j)``.
    """
    top_label, inner_label, split = next(iter(parse_chart[i][j].values()))
    assert i == j or i <= split < j

    root = Tree(top_label, [])
    if inner_label == -1:
        attach_to = root
    else:
        attach_to = Tree(inner_label, [])
        root.append(attach_to)

    if i == j:
        attach_to.append(-1)
    else:
        attach_to.append(recursive_build(parse_chart, i, split))
        attach_to.append(recursive_build(parse_chart, split + 1, j))
    return root
Ejemplo n.º 22
0
 def build_tree(self, node, partition_key):
     """Return ``(tree, probability)`` for ``node`` over ``partition_key``.

     ``partition_key`` has the form ``"start-end"``.  If
     ``self.get_entry(partition_key, node)`` returns ``None`` the result is
     ``(node, -1)``.  Otherwise the entry is ``(production, probability)``.
     For a span of length one the production must be a single symbol, added
     as a leaf when it equals ``node`` and expanded over the same span
     otherwise.  For longer spans it must be two ``symbol:key`` parts, each
     expanded over its own key.  Any other production shape raises
     ``Exception``.
     """
     start, end = [int(bound) for bound in partition_key.split("-")]
     entry = self.get_entry(partition_key, node)
     if entry is None:
         return node, -1

     production, probability = entry
     tree = Tree(node, [])
     symbols = production.split(" ")
     single_word = (end - start) == 1

     if len(symbols) != (1 if single_word else 2):
         raise Exception("Un Expected Rule!!")

     if single_word:
         symbol = symbols[0]
         if symbol == node:
             tree.append(symbol)
         else:
             tree.append(self.build_tree(symbol, partition_key)[0])
     else:
         for part in symbols:
             child_node, child_key = part.rsplit(":", 1)
             tree.append(self.build_tree(child_node, child_key)[0])
     return tree, probability
Ejemplo n.º 23
0
    def _tagged_to_parse(self, tagged_tokens):
        """
        Build a flat chunk tree rooted at "S" from ``(token, tag)`` pairs.

        ``O`` tokens go directly under the root, ``B-X`` opens a chunk ``X``
        and ``I-X`` extends the last child when it is a chunk labelled ``X``
        (otherwise it opens a new chunk).  Pairs with any other tag are
        skipped.
        """
        sent = Tree("S", [])

        for (tok, tag) in tagged_tokens:
            if tag == "O":
                sent.append(tok)
                continue

            if tag.startswith("B-"):
                extend_last = False
            elif tag.startswith("I-"):
                extend_last = bool(sent and isinstance(sent[-1], Tree)
                                   and sent[-1].label() == tag[2:])
            else:
                continue

            if extend_last:
                sent[-1].append(tok)
            else:
                sent.append(Tree(tag[2:], [tok]))
        return sent
Ejemplo n.º 24
0
    def _tagged_to_parse(self, tagged_tokens):
        """
        Turn ``(token, tag)`` pairs into a chunk tree with root label "S".

        Tokens tagged ``O`` become direct children of the root.  ``B-X``
        always starts a new chunk ``X``; ``I-X`` joins the last child if that
        is a chunk labelled ``X`` and starts a new chunk otherwise.  Other
        tags are ignored.
        """
        sent = Tree("S", [])

        for (tok, tag) in tagged_tokens:
            if tag == "O":
                sent.append(tok)
            elif tag.startswith(("B-", "I-")):
                label = tag[2:]
                joins_last = (tag[0] == "I" and sent
                              and isinstance(sent[-1], Tree)
                              and sent[-1].label() == label)
                if joins_last:
                    sent[-1].append(tok)
                else:
                    sent.append(Tree(label, [tok]))
        return sent
Ejemplo n.º 25
0
def IOB_to_tree(iob_tagged):
    """Group consecutive equally-tagged tokens of a tagged sequence into chunks.

    Each item of ``iob_tagged`` is indexed as ``(word, pos, tag)``.  Items
    tagged ``'O'`` are added to the root ``'S'`` as ``(word, pos)`` leaves.
    Any other item is appended to the preceding child when that child is a
    chunk labelled with the same tag, and otherwise starts a new chunk with
    that label.
    """
    # https://stackoverflow.com/questions/27629130/chunking-stanford-named-entity-recognizer-ner-outputs-from-nltk-format
    # https://stackoverflow.com/questions/30664677/extract-list-of-persons-and-organizations-using-stanford-ner-tagger-in-nltk
    root = Tree('S', [])
    for token in iob_tagged:
        label = token[2]
        leaf = (token[0], token[1])
        if label == 'O':
            root.append(leaf)
        elif root and isinstance(root[-1], Tree) and root[-1].label() == label:
            # Explicit test instead of a bare ``except``: an empty root or a
            # plain leaf in last position simply means "start a new chunk".
            root[-1].append(leaf)
        else:
            root.append(Tree(label, [leaf]))

    return root
Ejemplo n.º 26
0
 def _tagged_to_parse(self, tagged_tokens):
     """
     Assemble a chunk tree rooted at 'S' from ``(token, tag)`` pairs.

     ``O`` tokens are added at the top level.  ``B-X`` opens chunk ``X``;
     ``I-X`` extends the last child when it is a chunk whose ``node`` is
     ``X`` and opens a new chunk otherwise.  Pairs carrying any other tag
     are dropped.
     """
     sent = Tree('S', [])

     for (tok, tag) in tagged_tokens:
         if tag == 'O':
             sent.append(tok)
         elif tag.startswith(('B-', 'I-')):
             label = tag[2:]
             joins_last = (tag[0] == 'I' and sent
                           and isinstance(sent[-1], Tree)
                           and sent[-1].node == label)
             if joins_last:
                 sent[-1].append(tok)
             else:
                 sent.append(Tree(label, [tok]))
     return sent
Ejemplo n.º 27
0
    def _tagged_to_parse(self, tagged_tokens):
        """
        Convert ``(token, tag)`` pairs into a chunk tree with root 'S'.

        A token tagged ``O`` is placed directly under the root.  ``B-X``
        starts a chunk ``X``.  ``I-X`` is appended to the last child if that
        child is a chunk whose ``node`` equals ``X``; if not, it starts a new
        chunk.  Pairs with other tags are skipped.
        """
        sent = Tree('S', [])

        for (tok, tag) in tagged_tokens:
            if tag == 'O':
                sent.append(tok)
                continue

            if tag.startswith('B-'):
                extend_last = False
            elif tag.startswith('I-'):
                extend_last = bool(sent and isinstance(sent[-1], Tree)
                                   and sent[-1].node == tag[2:])
            else:
                continue

            if extend_last:
                sent[-1].append(tok)
            else:
                sent.append(Tree(tag[2:], [tok]))
        return sent
Ejemplo n.º 28
0
def RemoveNIChunks(structured_sentence):
    '''
    Remove chunks that add no information to the final parse, i.e. those
    whose label contains "NI" (NO INFORMATION).

    Takes in the parsed sentence after UncertainGrammar has been applied and
    relies on the labels assigned there.

    Parameters:
    -----------
    structured_sentence: Tree or tuple
        A parsed sentence (or a sub-part of one).  Tuples are treated as
        leaves.

    Return:
    --------
    structured_sentence: Tree or tuple
        A tuple is returned unchanged.  A tree has its children cleaned in
        place first.  If its own label contains "NI", a new tree is returned
        whose label has "NI" removed and in which the children of any direct
        child labelled "...Clause..." are spliced in, in place of that child.
        Otherwise the (mutated) input tree is returned.
    '''
    # Leaves carry no label and have nothing to clean.
    if type(structured_sentence) is tuple:
        return structured_sentence

    # Clean the children first so that nested NI chunks are already resolved
    # when this node is inspected.
    for index, child in enumerate(structured_sentence):
        structured_sentence[index] = RemoveNIChunks(child)

    if "NI" not in structured_sentence.label():
        return structured_sentence

    label = structured_sentence.label().replace("NI", "")
    inside = Tree(label, [])
    for chunk in structured_sentence:
        if type(chunk) is not tuple and "Clause" in chunk.label():
            # Lift the clause's children up one level.
            for sub_chunk in chunk:
                inside.append(sub_chunk)
        else:
            inside.append(chunk)
    return inside
Ejemplo n.º 29
0
def load_ace_file(textfile, fmt):
    """Yield a single chunk tree built from an ACE text file.

    Entity annotations come from ``textfile + ".tmx.rdc.xml"``; only mentions
    with ``TYPE == "NAME"`` are used.  Text between entities is tokenized
    with ``word_tokenize``; each entity head becomes a subtree labelled
    ``"NE"`` when ``fmt`` is ``"binary"`` or with its entity type when
    ``fmt`` is ``"multiclass"``.

    Being a generator, the function does no work until iterated; any other
    ``fmt`` raises ``ValueError`` at that point, after the files were read.
    """
    print("  - {0}".format(os.path.split(textfile)[1]))
    annfile = textfile + ".tmx.rdc.xml"

    # Collect (start, end, type) for every NAME mention in the annotations.
    with open(annfile, "r") as infile:
        xml = ET.parse(infile).getroot()
    entities = []
    for entity in xml.findall("document/entity"):
        typ = entity.find("entity_type").text
        for mention in entity.findall("entity_mention"):
            if mention.get("TYPE") == "NAME":  # only NEs
                begin = int(mention.find("head/charseq/start").text)
                stop = int(mention.find("head/charseq/end").text) + 1
                entities.append((begin, stop, typ))

    with open(textfile, "r") as infile:
        text = infile.read()

    # Strip XML tags, since they don't count towards the indices
    text = re.sub("<(?!/?TEXT)[^>]+>", "", text)

    # Everything up to <TEXT> is replaced by spaces (six fewer than the
    # matched length); everything from </TEXT> on is removed.
    text = re.sub(r"[\s\S]*<TEXT>",
                  lambda m: " " * (m.end() - m.start() - 6),
                  text)
    text = re.sub(r"</TEXT>[\s\S]*", "", text)

    # Simplify quotes
    text = text.replace("``", ' "')
    text = text.replace("''", '" ')

    if fmt == "binary":
        # Binary distinction (NE or not NE)
        def label_for(entity_type):
            return "NE"
    elif fmt == "multiclass":
        # Multiclass distinction (NE type)
        def label_for(entity_type):
            return entity_type
    else:
        raise ValueError("bad fmt value")

    cursor = 0
    toks = Tree("S", [])
    for (begin, stop, typ) in sorted(entities):
        begin = max(begin, cursor)  # Overlapping!  Deal with this better?
        if stop <= begin:
            continue
        toks.extend(word_tokenize(text[cursor:begin]))
        toks.append(Tree(label_for(typ), text[begin:stop].split()))
        cursor = stop
    toks.extend(word_tokenize(text[cursor:]))
    yield toks
Ejemplo n.º 30
0
	def chunk(self, tree, rule, depth):
		"""Apply ``rule`` to ``tree`` once and recurse while it may apply again.

		``rule.type`` names the new node and ``rule.contents`` is a
		whitespace-separated string.  The first group of siblings for which
		``self.rule_to_children`` reports a match (anything other than
		``(-1, -1)``) is moved under a new node placed at the position of the
		left-most matched sibling.  Matched subtrees get their ``node`` label
		wrapped in ``<...>``; other matched items are replaced by a pair of
		their first element and their last element wrapped in ``<...>``.
		``tree`` is modified in place and also returned.

		``depth`` bounds the recursion: at 0 the tree is returned untouched.
		"""
		ruleContents = rule.contents
		ruleName = rule.type

		if depth == 0:  # maximum recursion set by depth
			return tree

		# get tuples for all locations in tree
		children = tree.treepositions('postorder')

		# ``parent`` is passed in empty and indexed below, so it is presumably
		# filled in by find_brothers -- confirm against that method.
		parent = {}

		# returns a dict. of those children in the tree who have the same
		# parent, such that a rule MIGHT apply to them
		dictBrothers = rule.find_brothers(children, parent)

		if dictBrothers == {}:  # no possible application of rule
			return tree

		# Check which list of brothers corresponds to the rule, i.e. where the
		# tree needs to be altered.
		for child in children:
			if parent[child] not in dictBrothers:
				continue
			tempBrothers = dictBrothers[parent[child]]
			span = self.rule_to_children(tree, tempBrothers)
			if span == (-1, -1):
				continue

			# The rule applies to tempBrothers[start:end]: set up the new
			# tree and re-arrange the old one to fit.
			start = span[0]
			end = span[1]

			newTree = Tree("(" + ruleName + ")")

			for i in range(end - start):
				newChild = tempBrothers[i + start]
				if type(tree[newChild]) is Tree:
					tree[newChild].node = "<" + tree[newChild].node + ">"
				else:
					tree[newChild] = (tree[newChild][0], "<" + str(tree[newChild][-1]) + ">")
				newTree.append(tree[newChild])

			# attach new tree at left-most child (start) ...
			tree[tempBrothers[start]] = newTree

			# ... then mark every other matched child for removal
			for i in range(1, end - start):
				tree[tempBrothers[i + start]] = "REMOVE"

			while "REMOVE" in tree:
				tree.remove("REMOVE")

			for subtree in tree.subtrees():
				if "REMOVE" in subtree:
					subtree.remove("REMOVE")

			# now recursively chunk if there are more brothers
			# to whom rule applies
			if len(dictBrothers) > 1 or len(dictBrothers[parent[child]]) > len(ruleContents.split()):
				return self.chunk(tree, rule, depth - 1)
			else:
				return tree

		# found no children for whom rule applies, so just return tree
		return tree
Ejemplo n.º 31
0
def load_ace_file(textfile, fmt):
    """Yield one chunk tree for an ACE text file and its annotations.

    The entity annotations are read from ``textfile + ".tmx.rdc.xml"`` and
    restricted to mentions with ``TYPE == "NAME"``.  The text between
    entities is tokenized with ``word_tokenize``; each entity head becomes a
    subtree labelled ``"NE"`` (``fmt == "binary"``) or with its entity type
    (``fmt == "multiclass"``).

    The function is a generator, so the work happens on iteration; an
    unsupported ``fmt`` raises ``ValueError`` then, after both files have
    been read.
    """
    print("  - %s" % os.path.split(textfile)[1])
    annfile = textfile + ".tmx.rdc.xml"

    # Read the xml file, and get a list of entities
    entities = []
    with open(annfile) as fp:
        xml = ET.parse(fp).getroot()
    for entity in xml.findall("document/entity"):
        typ = entity.find("entity_type").text
        for mention in entity.findall("entity_mention"):
            if mention.get("TYPE") != "NAME":
                continue  # only NEs
            s = int(mention.find("head/charseq/start").text)
            # +1 so that (s, e) can be used directly as a slice of the text
            e = int(mention.find("head/charseq/end").text) + 1
            entities.append((s, e, typ))

    # Read the text file, and mark the entities.
    with open(textfile) as fp:
        text = fp.read()

    # Strip XML tags, since they don't count towards the indices
    text = re.sub(r"<(?!/?TEXT)[^>]+>", "", text)

    # Blank out anything before <TEXT> (spaces, six fewer than the matched
    # length) and remove anything from </TEXT> on.
    def subfunc(m):
        return " " * (m.end() - m.start() - 6)

    text = re.sub(r"[\s\S]*<TEXT>", subfunc, text)
    text = re.sub(r"</TEXT>[\s\S]*", "", text)

    # Simplify quotes
    text = re.sub("``", ' "', text)
    text = re.sub("''", '" ', text)

    if fmt not in ("binary", "multiclass"):
        raise ValueError("bad fmt value")

    i = 0
    toks = Tree("S", [])
    for (s, e, typ) in sorted(entities):
        if s < i:
            s = i  # Overlapping!  Deal with this better?
        if e <= s:
            continue
        toks.extend(word_tokenize(text[i:s]))
        # Binary distinction (NE or not NE) vs. multiclass (NE type)
        toks.append(Tree("NE" if fmt == "binary" else typ, text[s:e].split()))
        i = e
    toks.extend(word_tokenize(text[i:]))
    yield toks
Ejemplo n.º 32
0
def load_ace_file(textfile, fmt):
    """Yield one chunk tree combining an ACE text file with its entities.

    Entities are taken from ``textfile + '.tmx.rdc.xml'`` (mentions with
    ``TYPE == 'NAME'`` only).  Non-entity text is tokenized with
    ``word_tokenize``; entity heads become subtrees labelled ``'NE'`` for
    ``fmt == 'binary'`` or with the entity type for ``fmt == 'multiclass'``.

    As a generator it only runs when iterated; any other ``fmt`` raises
    ``ValueError`` at that time, after both files have been read.
    """
    print('  - {0}'.format(os.path.split(textfile)[1]))
    annfile = textfile + '.tmx.rdc.xml'

    # Read the xml file, and get a list of entities
    entities = []
    with open(annfile, 'r') as infile:
        xml = ET.parse(infile).getroot()
    for entity in xml.findall('document/entity'):
        typ = entity.find('entity_type').text
        for mention in entity.findall('entity_mention'):
            if mention.get('TYPE') != 'NAME':
                continue  # only NEs
            s = int(mention.find('head/charseq/start').text)
            # +1 so that (s, e) can be used directly as a slice of the text
            e = int(mention.find('head/charseq/end').text) + 1
            entities.append((s, e, typ))

    # Read the text file, and mark the entities.
    with open(textfile, 'r') as infile:
        text = infile.read()

    # Strip XML tags, since they don't count towards the indices
    text = re.sub(r'<(?!/?TEXT)[^>]+>', '', text)

    # Blank out anything before <TEXT> (spaces, six fewer than the matched
    # length) and remove anything from </TEXT> on.
    def subfunc(m):
        return ' ' * (m.end() - m.start() - 6)

    # Raw strings: '\s' in a plain literal is an invalid escape sequence.
    text = re.sub(r'[\s\S]*<TEXT>', subfunc, text)
    text = re.sub(r'</TEXT>[\s\S]*', '', text)

    # Simplify quotes
    text = re.sub("``", ' "', text)
    text = re.sub("''", '" ', text)

    if fmt == 'binary':
        binary = True      # Binary distinction (NE or not NE)
    elif fmt == 'multiclass':
        binary = False     # Multiclass distinction (NE type)
    else:
        raise ValueError('bad fmt value')

    i = 0
    toks = Tree('S', [])
    for (s, e, typ) in sorted(entities):
        if s < i:
            s = i  # Overlapping!  Deal with this better?
        if e <= s:
            continue
        toks.extend(word_tokenize(text[i:s]))
        toks.append(Tree('NE' if binary else typ, text[s:e].split()))
        i = e
    toks.extend(word_tokenize(text[i:]))
    yield toks
Ejemplo n.º 33
0
def _skip_blanks(s, pos):
    """Return the first index at or after ``pos`` whose character is not a space."""
    while s[pos] == ' ':
        pos += 1
    return pos


def _read_symbol(s, pos):
    """Read the symbol starting at ``pos``; return ``(symbol, end_index)``.

    The character at ``pos`` always belongs to the symbol; scanning stops at
    the next space or bracket.
    """
    end = pos + 1
    while s[end] not in (' ', ')', '('):
        end += 1
    return s[pos:end], end


def _past_closing_bracket(s, pos):
    """Return the index just after the next ``)`` found at or after ``pos``."""
    while s[pos] != ')':
        pos += 1
    return pos + 1


def parse_string_tree(s, start):
    """Parse the bracketed expression that opens at ``s[start]``.

    ``s[start]`` must be ``'('`` (checked with ``assert``).  Returns
    ``(tree, index)`` where ``index`` is the position just after the matching
    closing bracket.

    * ``((...) (...))``: the first sub-expression is inserted as the first
      child of the second one, which is returned.
    * ``(w)`` / ``(w v)``: a node labelled ``w``, with a childless node ``v``
      as its only child when a second symbol is present.
    * ``(w (...))``: if ``w`` is a single character the sub-expression is
      returned with ``w`` inserted as its first child; otherwise the
      sub-expression is appended below ``w``.

    Running off the end of ``s`` raises ``IndexError``.
    """
    assert s[start] == '('
    pos = _skip_blanks(s, start + 1)

    if s[pos] == '(':
        left, pos = parse_string_tree(s, pos)
        pos = _skip_blanks(s, pos)
        if s[pos] == '(':
            node, pos = parse_string_tree(s, pos)
        # NOTE: when the first sub-expression is not followed by '(' there is
        # nothing bound to ``node`` and the next line raises
        # UnboundLocalError (the "is a leaf" case is not handled).
        node.insert(0, left)
        return node, _past_closing_bracket(s, pos)

    # there is an input element
    head, pos = _read_symbol(s, pos)
    node = Tree(head, [])
    pos = _skip_blanks(s, pos)

    if s[pos] == '(':
        sub, pos = parse_string_tree(s, pos)
        if len(node.label()) == 1:
            # is a variable
            sub.insert(0, node)
            node = sub
        else:
            # is an operator
            node.append(sub)
    elif s[pos] != ')':
        # another word
        word, pos = _read_symbol(s, pos)
        node.append(Tree(word, []))

    # match closing bracket
    return node, _past_closing_bracket(s, pos)