def makeEntityGraph(self, entities, interactions, entityToDuplicates=None):
    """
    Build a graph with the given entities as nodes and interactions as edges.

    For every ordered pair of entities, the interactions between the pair
    itself and between any of their duplicates are attached to the pair as
    edges. Only the first interaction of each "type" is kept per pair.

    @param entities: the entities that become the nodes of the graph
    @type entities: list of cElementTree.Element objects
    @param interactions: interaction elements; their "e1" and "e2" attributes
                         are looked up in self.entitiesById (an unknown id
                         raises KeyError)
    @type interactions: list of cElementTree.Element objects
    @param entityToDuplicates: maps each entity to the list of its duplicate
                               entities; when None, no entity has duplicates
    @type entityToDuplicates: dict or None
    @return: the entity graph
    """
    graph = Graph()
    graph.addNodes(entities)

    # Helper map: source entity -> target entity -> list of interactions
    bySource = {}
    for element in interactions:
        source = self.entitiesById[element.get("e1")]
        target = self.entitiesById[element.get("e2")]
        bySource.setdefault(source, {}).setdefault(target, []).append(element)

    if entityToDuplicates is None:
        entityToDuplicates = dict((entity, []) for entity in entities)

    for primary1 in entities:
        group1 = [primary1] + entityToDuplicates[primary1]
        for primary2 in entities:
            group2 = [primary2] + entityToDuplicates[primary2]
            seenTypes = set()  # edge types already attached to this pair
            for member1 in group1:
                targets = bySource.get(member1)
                if targets is None:
                    continue
                for member2 in group2:
                    for element in targets.get(member2, ()):
                        edgeType = element.get("type")
                        if edgeType not in seenTypes:
                            # Edges of duplicates are attached to the primary pair
                            graph.addEdge(primary1, primary2, element)
                            seenTypes.add(edgeType)
    return graph
def __init__(self, sentenceElement, tokenElements, dependencyElements):
    """
    Set up the syntactic (dependency) graph of the sentence. The semantic
    part is added afterwards by calling mapInteractions.

    @param sentenceElement: interaction-XML sentence-element
    @type sentenceElement: cElementTree.Element
    @param tokenElements: interaction-XML syntactic token elements
    @type tokenElements: list of cElementTree.Element objects
    @param dependencyElements: interaction-XML syntactic dependency elements
    @type dependencyElements: list of cElementTree.Element objects
    """
    # Syntactic input
    self.sentenceElement = sentenceElement
    self.tokens = tokenElements
    self.dependencies = dependencyElements

    # Semantic structures, filled in by mapInteractions
    self.interactions = None
    self.entities = None
    self.interactionGraph = None
    self.entityGraph = None
    self.duplicateInteractionEdgesRemoved = 0
    self.tokenHeadScores = None

    # Merged graph, filled in by mergeInteractionGraph
    self.mergedEntities = None
    self.mergedEntityToDuplicates = None
    self.mergedEntityGraph = None

    # Token elements are the nodes, dependency elements the edge data
    self.dependencyGraph = Graph()
    self.tokensById = {}
    for token in self.tokens:
        self.tokensById[token.get("id")] = token
    self.dependencyGraph.addNodes(self.tokens)
    for dependency in self.dependencies:
        firstToken = self.tokensById.get(dependency.get("t1"))
        secondToken = self.tokensById.get(dependency.get("t2"))
        # Both ends of a dependency must be tokens of this sentence
        assert firstToken is not None and secondToken is not None, (
            firstToken, secondToken, self.tokensById.keys())
        self.dependencyGraph.addEdge(firstToken, secondToken, dependency)
def makeEntityGraph(self, entities, interactions, entityToDuplicates=None):
    """
    Build a graph with entities as nodes and interactions as edges.

    For every ordered pair of entities, the interactions between the pair
    itself and between any of their duplicates are attached to the pair as
    edges. Only the first interaction of each "type" is kept per pair.

    An interaction whose "e2" id is not in self.entitiesById is treated as
    an intersentence interaction: the id string itself is used as a dummy
    node. An unknown "e1" id raises KeyError.

    NOTE(review): the dummy ids are appended to the caller's "entities" list
    and added to a caller-supplied "entityToDuplicates" dict, i.e. both
    arguments are modified in place. Confirm that callers expect this.

    @param entities: merged entities (nodes of the graph)
    @type entities: list of cElementTree.Element objects
    @param interactions: all original interactions
    @type interactions: list of cElementTree.Element objects
    @param entityToDuplicates: maps a merged entity to the list of entities
                               that are its duplicates; when None, no entity
                               has duplicates
    @type entityToDuplicates: dict or None
    @return: the entity graph
    """
    graph = Graph()
    graph.addNodes(entities)

    # Without a duplicate map, every entity simply has no duplicates
    if entityToDuplicates is None:
        entityToDuplicates = dict((entity, []) for entity in entities)

    # Helper map: bySource[e1][e2] lists the interactions from e1 to e2
    bySource = {}
    for element in interactions:
        source = self.entitiesById[element.get("e1")]
        targetId = element.get("e2")
        if targetId in self.entitiesById:
            target = self.entitiesById[targetId]
        else:
            # Intersentence interaction: the bare id acts as a dummy node
            if targetId not in entities:
                entities.append(targetId)
                entityToDuplicates[targetId] = []
            target = targetId
        bySource.setdefault(source, {}).setdefault(target, []).append(element)

    for primary1 in entities:
        group1 = [primary1] + entityToDuplicates[primary1]
        for primary2 in entities:
            group2 = [primary2] + entityToDuplicates[primary2]
            seenTypes = set()  # edge types already attached to this pair
            for member1 in group1:
                targets = bySource.get(member1)
                if targets is None:
                    continue
                for member2 in group2:
                    for element in targets.get(member2, ()):
                        edgeType = element.get("type")
                        if edgeType not in seenTypes:
                            # Edges of duplicates are attached to the primary pair
                            graph.addEdge(primary1, primary2, element)
                            seenTypes.add(edgeType)
    return graph
def mapInteractions(self, entityElements, interactionElements, verbose=False):
    """
    Maps the semantic interactions to the syntactic graph.

    Syntactic dependencies are defined between tokens. Semantic edges
    (interactions) are defined between annotated entities. To utilize the
    correlation of the dependency parse with the semantic interactions, the
    graphs must be aligned by mapping the interaction graph's nodes
    (entities) to the syntactic graph's nodes (tokens). This is done by
    determining the head tokens of the entities.

    Duplicated interactions are skipped: between two head tokens only one
    interaction of each type is kept, and every skipped one increments
    self.duplicateInteractionEdgesRemoved. Interactions whose "e1" or "e2"
    is not a mapped entity of this sentence are ignored.

    The entityElements list is modified in place: elements with an empty
    charOffset, elements whose tag is not "entity" and entities for which
    no head token is found are removed from it.

    @param entityElements: the semantic nodes (triggers and named entities)
    @type entityElements: list of cElementTree.Element objects
    @param interactionElements: the semantic edges (e.g. Cause and Theme for GENIA)
    @type interactionElements: list of cElementTree.Element objects
    @param verbose: Print selected head tokens on screen
    @type verbose: boolean
    @raise Exception: if an entity without a head token does not overlap
                      with the sentence span
    """
    self.interactions = interactionElements
    self.entities = entityElements
    # Entities that have no text binding can not be mapped and are therefore removed
    for entity in self.entities[:]:
        if entity.get("charOffset") == "":
            self.entities.remove(entity)

    self.interactionGraph = Graph()
    self.interactionGraph.addNodes(self.tokens)
    self.entitiesByToken = {}  # a mapping for fast access
    self.entitiesById = {}
    self.entityHeadTokenByEntity = {}
    # for validating the entity offsets
    sentenceSpan = (0, len(self.sentenceElement.get("text")))
    for entity in self.entities[:]:
        headToken = self.mapEntity(entity, verbose)
        if entity.tag != "entity":
            self.entities.remove(entity)
        elif headToken is not None:
            self.entityHeadTokenByEntity[entity] = headToken
            self.entitiesById[entity.get("id")] = entity
        else:
            # Check that the entity is within the sentence
            entitySpan = Range.charOffsetToSingleTuple(entity.get("charOffset"))
            if not Range.overlap(entitySpan, sentenceSpan):
                raise Exception("Entity " + entity.get("id") +
                                ", charOffset " + entity.get("charOffset") +
                                ", does not overlap with sentence " +
                                self.sentenceElement.get("id") +
                                ", length " + str(sentenceSpan[1]))
            # Assume there simply is no token corresponding to the entity
            self.entities.remove(entity)
    self._markNamedEntities()

    for interaction in self.interactions:
        e1Id = interaction.get("e1")
        e2Id = interaction.get("e2")
        if e1Id not in self.entitiesById:
            continue  # e1 is outside of this sentence
        if e2Id not in self.entitiesById:
            continue  # e2 is outside of this sentence
        token1 = self.entityHeadTokenByEntity[self.entitiesById[e1Id]]
        token2 = self.entityHeadTokenByEntity[self.entitiesById[e2Id]]

        # Keep only one edge of each type between a pair of head tokens
        interactionType = interaction.get("type")
        found = False
        for edge in self.interactionGraph.getEdges(token1, token2):
            if edge[2].get("type") == interactionType:
                found = True
                break
        if not found:
            self.interactionGraph.addEdge(token1, token2, interaction)
        else:
            # TODO: "skipped" would be better than "removed"
            self.duplicateInteractionEdgesRemoved += 1
class SentenceGraph:
    """
    The main purpose of SentenceGraph is to connect the syntactic dependency
    parse (a graph where dependencies are edges and tokens are nodes) to the
    semantic interactions (which form a graph where interactions are edges
    and entities are nodes). Additionally, SentenceGraph provides several
    dictionaries that e.g. map element ids to their corresponding elements.
    """

    def __init__(self, sentenceElement, tokenElements, dependencyElements):
        """
        Creates the syntactic graph part of the SentenceGraph. The semantic
        graph can be added with mapInteractions.

        @param sentenceElement: interaction-XML sentence-element
        @type sentenceElement: cElementTree.Element
        @param tokenElements: interaction-XML syntactic token elements
        @type tokenElements: list of cElementTree.Element objects
        @param dependencyElements: interaction-XML syntactic dependency elements
        @type dependencyElements: list of cElementTree.Element objects
        """
        self.sentenceElement = sentenceElement
        self.tokens = tokenElements
        self.dependencies = dependencyElements
        self.dependencyGraph = Graph()
        self.interactions = None
        self.entities = None
        self.interactionGraph = None
        self.entityGraph = None
        self.duplicateInteractionEdgesRemoved = 0
        self.tokenHeadScores = None
        # Merged graph
        self.mergedEntities = None
        self.mergedEntityToDuplicates = None
        self.mergedEntityGraph = None

        self.tokensById = {}
        for token in self.tokens:
            self.tokensById[token.get("id")] = token
        self.dependencyGraph.addNodes(self.tokens)
        # Build the dependency graph using token-elements as nodes and
        # dependency-elements as edge data
        for dependency in self.dependencies:
            t1 = self.tokensById.get(dependency.get("t1"))
            t2 = self.tokensById.get(dependency.get("t2"))
            assert t1 is not None and t2 is not None, (
                t1, t2, self.tokensById.keys())
            self.dependencyGraph.addEdge(t1, t2, dependency)

    def getSentenceId(self):
        """
        Return the "id" attribute of the sentence element.
        """
        return self.sentenceElement.get("id")

    def makeEntityGraph(self, entities, interactions, entityToDuplicates=None):
        """
        Build a graph with entities as nodes and interactions as edges.

        For every ordered pair of entities, the interactions between the
        pair itself and between any of their duplicates are attached to the
        pair as edges. Only the first interaction of each "type" is kept
        per pair.

        An interaction whose "e2" id is not in self.entitiesById is treated
        as an intersentence interaction: the id string itself is used as a
        dummy node. An unknown "e1" id raises KeyError.

        NOTE(review): the dummy ids are appended to the caller's "entities"
        list and added to a caller-supplied "entityToDuplicates" dict, i.e.
        both arguments are modified in place. When called through
        getInteractions/getOutInteractions this means self.entities can end
        up containing plain id strings. Confirm that this is intended.

        @param entities: merged entities (nodes of the graph)
        @type entities: list of cElementTree.Element objects
        @param interactions: all original interactions
        @type interactions: list of cElementTree.Element objects
        @param entityToDuplicates: maps a merged entity to the list of
                                   entities that are its duplicates; when
                                   None, no entity has duplicates
        @type entityToDuplicates: dict or None
        @return: the entity graph
        """
        graph = Graph()
        graph.addNodes(entities)
        # make a dummy duplicate map if it's not required
        if entityToDuplicates is None:
            entityToDuplicates = dict((e, []) for e in entities)

        # Helper map: interactionMap[e1][e2] lists the interactions from e1 to e2
        interactionMap = {}
        for interaction in interactions:
            e1 = self.entitiesById[interaction.get("e1")]
            e2Id = interaction.get("e2")
            if e2Id not in self.entitiesById:  # intersentence interaction
                if e2Id not in entities:
                    entities.append(e2Id)
                    entityToDuplicates[e2Id] = []
                e2 = e2Id  # make a dummy node
            else:
                e2 = self.entitiesById[e2Id]
            interactionMap.setdefault(e1, {}).setdefault(e2, []).append(interaction)

        # make the graph
        for e1 in entities:
            group1 = [e1] + entityToDuplicates[e1]
            for e2 in entities:
                group2 = [e2] + entityToDuplicates[e2]
                interactionTypes = set()  # edge types already attached to this pair
                for d1 in group1:
                    targets = interactionMap.get(d1)
                    if targets is None:
                        continue
                    for d2 in group2:
                        for interaction in targets.get(d2, ()):
                            interactionType = interaction.get("type")
                            # skip edges with the same type that another edge already had
                            if interactionType not in interactionTypes:
                                # primary and duplicate edges go to the main entity pair
                                graph.addEdge(e1, e2, interaction)
                                interactionTypes.add(interactionType)
        return graph

    def _getEntityGraph(self, merged):
        """
        Return the (lazily built and cached) entity graph.

        @param merged: if True, return the graph of merged entities, running
                       mergeInteractionGraph first when it has not been run;
                       otherwise return the graph of all entities
        @type merged: boolean
        """
        if merged:
            if self.mergedEntityToDuplicates is None:
                self.mergeInteractionGraph(True)
            if self.mergedEntityGraph is None:
                self.mergedEntityGraph = self.makeEntityGraph(
                    self.mergedEntities, self.interactions,
                    self.mergedEntityToDuplicates)
            return self.mergedEntityGraph
        if self.entityGraph is None:
            self.entityGraph = self.makeEntityGraph(self.entities,
                                                    self.interactions)
        return self.entityGraph

    # TODO: This method shouldn't be needed anymore
    def getInteractions(self, entity1, entity2, merged=False):
        """
        Return the edges of the entity graph which represent directed
        interactions from entity1 to entity2.

        @param entity1: a semantic node (trigger or named entity)
        @type entity1: cElementTree.Element
        @param entity2: a semantic node (trigger or named entity)
        @type entity2: cElementTree.Element
        @param merged: use the graph of merged entities
        @type merged: boolean
        """
        return self._getEntityGraph(merged).getEdges(entity1, entity2)

    def getOutInteractions(self, entity, merged=False):
        """
        Return the outgoing edges of the entity in the entity graph.

        @param entity: a semantic node (trigger or named entity)
        @type entity: cElementTree.Element
        @param merged: use the graph of merged entities
        @type merged: boolean
        """
        return self._getEntityGraph(merged).getOutEdges(entity)

    def mapInteractions(self, entityElements, interactionElements, verbose=False):
        """
        Maps the semantic interactions to the syntactic graph.

        Syntactic dependencies are defined between tokens. Semantic edges
        (interactions) are defined between annotated entities. To utilize
        the correlation of the dependency parse with the semantic
        interactions, the graphs must be aligned by mapping the interaction
        graph's nodes (entities) to the syntactic graph's nodes (tokens).
        This is done by determining the head tokens of the entities.

        Duplicated interactions are skipped: between two head tokens only
        one interaction of each type is kept, and every skipped one
        increments self.duplicateInteractionEdgesRemoved. Interactions whose
        "e1" or "e2" is not a mapped entity of this sentence are ignored.

        The entityElements list is modified in place: elements with an empty
        charOffset, elements whose tag is not "entity" and entities for
        which no head token is found are removed from it.

        @param entityElements: the semantic nodes (triggers and named entities)
        @type entityElements: list of cElementTree.Element objects
        @param interactionElements: the semantic edges (e.g. Cause and Theme for GENIA)
        @type interactionElements: list of cElementTree.Element objects
        @param verbose: Print selected head tokens on screen
        @type verbose: boolean
        @raise Exception: if an entity without a head token does not overlap
                          with the sentence span
        """
        self.interactions = interactionElements
        self.entities = entityElements
        # Entities that have no text binding can not be mapped and are therefore removed
        for entity in self.entities[:]:
            if entity.get("charOffset") == "":
                self.entities.remove(entity)

        self.interactionGraph = Graph()
        self.interactionGraph.addNodes(self.tokens)
        self.entitiesByToken = {}  # a mapping for fast access
        self.entitiesById = {}
        self.entityHeadTokenByEntity = {}
        # for validating the entity offsets
        sentenceSpan = (0, len(self.sentenceElement.get("text")))
        for entity in self.entities[:]:
            headToken = self.mapEntity(entity, verbose)
            if entity.tag != "entity":
                self.entities.remove(entity)
            elif headToken is not None:
                self.entityHeadTokenByEntity[entity] = headToken
                self.entitiesById[entity.get("id")] = entity
            else:
                # Check that the entity is within the sentence
                entitySpan = Range.charOffsetToSingleTuple(
                    entity.get("charOffset"))
                if not Range.overlap(entitySpan, sentenceSpan):
                    raise Exception("Entity " + entity.get("id") +
                                    ", charOffset " + entity.get("charOffset") +
                                    ", does not overlap with sentence " +
                                    self.sentenceElement.get("id") +
                                    ", length " + str(sentenceSpan[1]))
                # Assume there simply is no token corresponding to the entity
                self.entities.remove(entity)
        self._markNamedEntities()

        for interaction in self.interactions:
            e1Id = interaction.get("e1")
            e2Id = interaction.get("e2")
            if e1Id not in self.entitiesById:
                continue  # e1 is outside of this sentence
            if e2Id not in self.entitiesById:
                continue  # e2 is outside of this sentence
            token1 = self.entityHeadTokenByEntity[self.entitiesById[e1Id]]
            token2 = self.entityHeadTokenByEntity[self.entitiesById[e2Id]]

            # Keep only one edge of each type between a pair of head tokens
            interactionType = interaction.get("type")
            found = False
            for edge in self.interactionGraph.getEdges(token1, token2):
                if edge[2].get("type") == interactionType:
                    found = True
                    break
            if not found:
                self.interactionGraph.addEdge(token1, token2, interaction)
            else:
                # TODO: "skipped" would be better than "removed"
                self.duplicateInteractionEdgesRemoved += 1

    def mapEntity(self, entityElement, verbose=False):
        """
        Determine the head token for a named entity or trigger. The head
        token is the token closest to the root for the subtree of the
        dependency parse spanned by the text of the element.

        When a head token is found, the entity's "headOffset" attribute is
        set to the token's charOffset and the entity is appended to
        self.entitiesByToken[token]. Otherwise a warning is written to
        sys.stderr.

        @param entityElement: a semantic node (trigger or named entity)
        @type entityElement: cElementTree.Element
        @param verbose: Print selected head tokens on screen
        @type verbose: boolean
        @return: the head token, or None if no token overlaps the entity
        """
        headOffset = None
        if entityElement.get("headOffset") is not None:
            headOffset = Range.charOffsetToSingleTuple(
                entityElement.get("headOffset"))
        if entityElement.get("charOffset") != "":
            charOffsets = Range.charOffsetToTuples(
                entityElement.get("charOffset"))
        else:
            charOffsets = []
        # A head token can already be defined in the headOffset-attribute.
        # However, depending on the tokenization, even this range may
        # contain multiple tokens. Still, it can always be assumed that
        # if headOffset is defined, the correct head token is in this range.
        # For Binding entities the predefined headOffset is not used.
        useHeadOffset = (headOffset is not None and
                         entityElement.get("type") != "Binding")

        # Each entity can consist of multiple syntactic tokens, covered by its
        # charOffset-range. One of these must be chosen as the head token.
        headTokens = []  # potential head tokens
        for token in self.tokens:
            tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
            if useHeadOffset:
                if Range.overlap(headOffset, tokenOffset):
                    headTokens.append(token)
            else:
                # A token is added once for each entity span it overlaps
                for offset in charOffsets:
                    if Range.overlap(offset, tokenOffset):
                        headTokens.append(token)

        if len(headTokens) == 1:  # An unambiguous head token was found
            token = headTokens[0]
        else:  # One head token must be chosen from the candidates
            selHead = None
            if entityElement.get("type") == "Binding":
                # Prefer the first candidate containing a binding-related word.
                # NOTE(review): "h**o" looks like a masked "homo" (it is listed
                # next to "hetero" and "dimer"); as written it is unlikely to
                # match any token text. Confirm against the upstream source.
                for t in headTokens:
                    compText = t.get("text").lower()
                    for bindWord in ("bind", "complex", "h**o", "hetero", "dimer"):
                        if bindWord in compText:
                            selHead = t
                            break
                    if selHead is not None:
                        break
            if selHead is None:
                token = self.findHeadToken(headTokens)
            else:
                token = selHead
            # findHeadToken returns None when there are no candidates, so the
            # token must be checked before it is printed
            if verbose and token is not None:
                sys.stderr.write("Selected head: %s %s\n" %
                                 (token.get("id"), token.get("text")))

        if token is not None:
            # The ElementTree entity-element is modified by setting the headOffset attribute
            if entityElement.get("headOffset") is None or \
               entityElement.get("headOffset") != token.get("charOffset"):
                entityElement.set("headOffset", token.get("charOffset"))
            if token not in self.entitiesByToken:
                self.entitiesByToken[token] = []
            self.entitiesByToken[token].append(entityElement)
        else:
            sys.stderr.write("Warning, no tokens for entity %s\n" %
                             (entityElement.get("id"),))
        return token

    def findHeadToken(self, candidateTokens):
        """
        Select the candidate token that is closest to the root of the
        subtree of the dependency parse to which the candidate tokens belong
        to. See getTokenHeadScores method for the algorithm.

        @param candidateTokens: the list of syntactic tokens from which the
                                head token is selected
        @type candidateTokens: list of cElementTree.Element objects
        @return: the last candidate that has the highest head score, or None
                 if there are no candidates
        """
        tokenHeadScores = self.getTokenHeadScores()
        if len(candidateTokens) == 0:
            return None
        # Scores are never lower than -1, so this is below every real score
        highestScore = -9999999
        for token in candidateTokens:
            if tokenHeadScores[token] > highestScore:
                highestScore = tokenHeadScores[token]
        bestTokens = [token for token in candidateTokens
                      if tokenHeadScores[token] == highestScore]
        # If several tokens share the highest score, the last one is chosen
        return bestTokens[-1]

    def getTokenHeadScores(self):
        """
        A head token is chosen using a heuristic that prefers tokens closer
        to the root of the dependency parse. In a list of candidate tokens,
        the one with the highest score is the head token. The return value
        of this method is a dictionary that maps token elements to their
        scores.

        The scores are computed on the first call and cached in
        self.tokenHeadScores. Each token element also gets its score as the
        "headScore" attribute.

        @return: token element -> integer score
        """
        # Token head scores are cached the first time this function is called
        if self.tokenHeadScores is not None:
            return self.tokenHeadScores

        # The scores are built in a local dictionary and cached only when
        # complete, so a failure can't leave a partial cache behind.
        scores = {}

        # Ids of all tokens that take part in at least one dependency
        connectedIds = set()
        for dependency in self.dependencies:
            connectedIds.add(dependency.get("t1"))
            connectedIds.add(dependency.get("t2"))

        # Give all tokens initial scores
        tokenById = {}
        for token in self.tokens:
            tokenId = token.get("id")
            assert tokenId not in tokenById
            tokenById[tokenId] = token
            # zero for an unconnected token, one for a token connected by a dependency
            scores[token] = 1 if tokenId in connectedIds else 0

        # Give a low score for tokens that clearly can't be head and are
        # probably produced by hyphen-splitter
        for token in self.tokens:
            if token.get("text") in ("\\", "/", "-"):
                scores[token] = -1

        # Loop over all dependencies and increase the scores of all governor
        # tokens until each governor token has a higher score than its
        # dependent token. Some dependencies might form a loop, so the number
        # of passes is limited.
        modifiedScores = True
        loopCount = 0  # loopcount for devel set approx. 2-4
        while modifiedScores:  # loop until the scores no longer change
            if loopCount > 20:  # survive loops
                break
            modifiedScores = False
            for dep in self.dependencies:
                token1 = tokenById[dep.get("t1")]
                token2 = tokenById[dep.get("t2")]
                # The governor token of the dependency must have a higher
                # score than the dependent token.
                if scores[token1] <= scores[token2]:
                    scores[token1] = scores[token2] + 1
                    modifiedScores = True
            loopCount += 1

        # Add scores to tokens
        for token in self.tokens:
            token.set("headScore", str(scores[token]))
        self.tokenHeadScores = scores
        return self.tokenHeadScores

    def _markNamedEntities(self):
        """
        This method is used to define which tokens belong to _named_
        entities. Named entities are sometimes masked when testing learning
        of interactions, to prevent the system making a trivial decision
        based on commonly interacting names.

        Fills three dictionaries keyed by token: tokenIsEntity (the token
        overlaps some entity), tokenIsName (the token overlaps an entity
        whose "given" attribute is "True") and tokenIsEntityHead (list of
        the entities whose headOffset overlaps the token).
        """
        self.tokenIsName = {}
        self.tokenIsEntity = {}
        self.tokenIsEntityHead = {}
        # Initialize the dictionaries
        for token in self.tokens:
            self.tokenIsName[token] = False
            self.tokenIsEntity[token] = False
            self.tokenIsEntityHead[token] = []
        for entity in self.entities:
            entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
            entityHeadOffset = Range.charOffsetToSingleTuple(
                entity.get("headOffset"))
            isGiven = entity.get("given") == "True"
            for token in self.tokens:
                tokenOffset = Range.charOffsetToSingleTuple(
                    token.get("charOffset"))
                for entityOffset in entityOffsets:
                    if Range.overlap(entityOffset, tokenOffset):
                        self.tokenIsEntity[token] = True
                        if isGiven:
                            self.tokenIsName[token] = True
                if Range.overlap(entityHeadOffset, tokenOffset):
                    self.tokenIsEntityHead[token].append(entity)

    def getTokenText(self, token):
        """
        Returns the text of a token, and masks it as "NAMED_ENT" if the
        token is marked in self.tokenIsName.

        @param token: interaction-XML syntactic token.
        @type token: cElementTree.Element
        """
        if self.tokenIsName[token]:
            return "NAMED_ENT"
        return token.get("text")

    def getCleared(self):
        """
        Return a new SentenceGraph for the same sentence, tokens and
        dependencies that contains only the entities whose "given" attribute
        is "True", and no interactions.
        """
        c = SentenceGraph(self.sentenceElement, self.tokens, self.dependencies)
        namedEntities = [entity for entity in self.entities
                         if entity.get("given") == "True"]
        c.mapInteractions(namedEntities, [])
        return c

    def mergeInteractionGraph(self, merge=True):
        """
        For merging duplicate entities. Fills self.mergedEntities (the
        entities that are kept) and self.mergedEntityToDuplicates (kept
        entity -> list of the entities merged into it).

        @param merge: if False, no entities are merged: every entity is kept
                      and has an empty duplicate list. This allows the same
                      code to be used for merged and unmerged cases.
        @type merge: boolean
        """
        self.mergedEntities = []
        self.mergedEntityToDuplicates = {}
        if not merge:  # no entities are filtered
            # Create dummy structures
            for entity in self.entities:
                self.mergedEntities.append(entity)
                self.mergedEntityToDuplicates[entity] = []
            return
        # Mark all duplicates after the first one in the list for removal
        entities = self.entities
        removeEntities = [False] * len(entities)
        for i, entity in enumerate(entities):
            if removeEntities[i]:  # entity has been already removed
                continue
            self.mergedEntities.append(entity)
            duplicates = []
            self.mergedEntityToDuplicates[entity] = duplicates
            if entity.get("given") == "True":  # named entities are never merged
                continue
            # NOTE(review): a later entity with given == "True" can still be
            # marked as a duplicate of this one if its type and charOffset
            # match. Confirm whether that is intended.
            entityType = entity.get("type")
            entityOffset = entity.get("charOffset")
            # Entities coming after entity "i" are its duplicates if they have
            # the same type and the same charOffset.
            for j in range(i + 1, len(entities)):
                if entities[j].get("type") == entityType and \
                   entities[j].get("charOffset") == entityOffset:
                    removeEntities[j] = True
                    duplicates.append(entities[j])
def mapInteractions(self, entityElements, interactionElements, verbose=False):
    """
    Maps the semantic interactions to the syntactic graph.

    Syntactic dependencies are defined between tokens. Semantic edges
    (interactions) are defined between annotated entities. To align the two
    graphs, each entity is mapped to a head token (see mapEntity), and each
    interaction becomes an edge between the head tokens of its entities in
    self.interactionGraph.

    Side effects visible in this method:
      - entityElements is stored as self.entities and modified in place:
        entities with an empty "charOffset" and entities for which no head
        token is found are removed from the list.
      - self.entitiesByToken, self.entitiesById and
        self.entityHeadTokenByEntity are rebuilt.
      - interactions whose "e1" or "e2" is not a mapped entity are skipped.
      - an interaction is skipped, and self.duplicateInteractionEdgesRemoved
        incremented, when an edge of the same type already connects the same
        pair of head tokens.

    @param entityElements: the semantic nodes (triggers and named entities)
    @type entityElements: list of cElementTree.Element objects
    @param interactionElements: the semantic edges (e.g. Cause and Theme for GENIA)
    @type interactionElements: list of cElementTree.Element objects
    @param verbose: Print selected head tokens on screen
    @type verbose: boolean
    """
    self.interactions = interactionElements
    self.entities = entityElements
    # Entities that have no text binding can not be mapped and are therefore
    # removed. A copy is iterated because the list is modified in the loop.
    for entity in self.entities[:]:
        if entity.get("charOffset") == "":
            self.entities.remove(entity)

    self.interactionGraph = Graph()
    self.interactionGraph.addNodes(self.tokens)

    self.entitiesByToken = {} # a mapping for fast access
    self.entitiesById = {}
    self.entityHeadTokenByEntity = {}
    for entity in self.entities[:]:
        headToken = self.mapEntity(entity, verbose)
        if headToken is not None:
            self.entityHeadTokenByEntity[entity] = headToken
            self.entitiesById[entity.get("id")] = entity
        else:
            self.entities.remove(entity)
    self._markNamedEntities()

    for interaction in self.interactions:
        # "in" replaces dict.has_key, which no longer exists in Python 3
        if interaction.get("e1") not in self.entitiesById:
            continue # e1 is outside of this sentence
        if interaction.get("e2") not in self.entitiesById:
            continue # e2 is outside of this sentence
        token1 = self.entityHeadTokenByEntity[self.entitiesById[interaction.get("e1")]]
        token2 = self.entityHeadTokenByEntity[self.entitiesById[interaction.get("e2")]]

        # The third item of an edge is the interaction element stored with it
        interactionType = interaction.get("type")
        found = any(edge[2].get("type") == interactionType
                    for edge in self.interactionGraph.getEdges(token1, token2))
        if not found:
            self.interactionGraph.addEdge(token1, token2, interaction)
        else:
            # TODO: "skipped" would be better than "removed"
            self.duplicateInteractionEdgesRemoved += 1
class SentenceGraph:
    """
    The main purpose of SentenceGraph is to connect the syntactic dependency
    parse (a graph where dependencies are edges and tokens are nodes) to the
    semantic interactions (which form a graph where interactions are edges
    and entities are nodes). Additionally, SentenceGraph provides several
    dictionaries that e.g. map element ids to their corresponding elements.
    """

    def __init__(self, sentenceElement, tokenElements, dependencyElements):
        """
        Creates the syntactic graph part of the SentenceGraph. The semantic
        graph can be added with mapInteractions.

        A dependency whose "t1" or "t2" is not the id of a given token raises
        a KeyError.

        @param sentenceElement: interaction-XML sentence-element
        @type sentenceElement: cElementTree.Element
        @param tokenElements: interaction-XML syntactic token elements
        @type tokenElements: list of cElementTree.Element objects
        @param dependencyElements: interaction-XML syntactic dependency elements
        @type dependencyElements: list of cElementTree.Element objects
        """
        self.sentenceElement = sentenceElement
        self.tokens = tokenElements
        self.dependencies = dependencyElements
        self.dependencyGraph = Graph()
        self.interactions = None
        self.entities = None
        self.interactionGraph = None
        self.entityGraph = None
        self.duplicateInteractionEdgesRemoved = 0
        self.tokenHeadScores = None
        # Merged graph
        self.mergedEntities = None
        self.mergedEntityToDuplicates = None
        self.mergedEntityGraph = None

        self.tokensById = {}
        for token in self.tokens:
            self.tokensById[token.get("id")] = token
        self.dependencyGraph.addNodes(self.tokens)
        # Build the dependency graph using token-elements as nodes and
        # dependency-elements as edge data
        for dependency in self.dependencies:
            self.dependencyGraph.addEdge(self.tokensById[dependency.get("t1")],
                                         self.tokensById[dependency.get("t2")],
                                         dependency)

    def getSentenceId(self):
        """
        Return the "id" attribute of the sentence element.
        """
        return self.sentenceElement.get("id")

    def makeEntityGraph(self, entities, interactions, entityToDuplicates=None):
        """
        Build a graph with the given entities as nodes and interactions as
        edges.

        The edges of an entity pair also include the interactions between the
        duplicates of the two entities. For each ordered entity pair at most
        one edge per interaction type is added.

        Every interaction must refer to entities known to self.entitiesById,
        otherwise a KeyError is raised.

        @param entities: the nodes of the graph
        @type entities: list of cElementTree.Element objects
        @param interactions: the interactions to add as edges
        @type interactions: list of cElementTree.Element objects
        @param entityToDuplicates: maps each entity to the list of entities
        merged into it. If None, no entity has duplicates.
        @type entityToDuplicates: dictionary
        @return: the entity graph
        """
        graph = Graph()
        graph.addNodes(entities)
        # initialize a helper map: e1 -> e2 -> list of interactions
        interactionMap = {}
        for interaction in interactions:
            e1 = self.entitiesById[interaction.get("e1")]
            e2 = self.entitiesById[interaction.get("e2")]
            interactionMap.setdefault(e1, {}).setdefault(e2, []).append(interaction)
        if entityToDuplicates is None:
            entityToDuplicates = {}
            for e in entities:
                entityToDuplicates[e] = []
        # make the graph
        for e1 in entities:
            for e2 in entities:
                interactionTypes = set() # types already added for this pair
                for d1 in [e1] + entityToDuplicates[e1]:
                    if d1 not in interactionMap:
                        continue
                    for d2 in [e2] + entityToDuplicates[e2]:
                        for interaction in interactionMap[d1].get(d2, []):
                            interactionType = interaction.get("type")
                            # skip edges with a type that another edge already had
                            if interactionType not in interactionTypes:
                                # primary and duplicate edges are all added
                                # for the main entity pair
                                graph.addEdge(e1, e2, interaction)
                                interactionTypes.add(interactionType)
        return graph

    def _getEntityGraph(self, merged):
        """
        Return the cached merged or unmerged entity graph, building it (and
        the merged entity structures, if needed) on first use.
        """
        if merged:
            if self.mergedEntityToDuplicates is None:
                self.mergeInteractionGraph(True)
            if self.mergedEntityGraph is None:
                self.mergedEntityGraph = self.makeEntityGraph(self.mergedEntities,
                                                              self.interactions,
                                                              self.mergedEntityToDuplicates)
            return self.mergedEntityGraph
        if self.entityGraph is None:
            self.entityGraph = self.makeEntityGraph(self.entities, self.interactions)
        return self.entityGraph

    # TODO: This method shouldn't be needed anymore
    def getInteractions(self, entity1, entity2, merged=False):
        """
        Return the edges of the entity graph that represent directed
        interactions from entity1 to entity2.

        @param entity1: a semantic node (trigger or named entity)
        @type entity1: cElementTree.Element
        @param entity2: a semantic node (trigger or named entity)
        @type entity2: cElementTree.Element
        @param merged: use the graph of merged entities instead of the graph
        of all entities
        @type merged: boolean
        """
        return self._getEntityGraph(merged).getEdges(entity1, entity2)

    def getOutInteractions(self, entity, merged=False):
        """
        Return the outgoing edges of an entity in the entity graph.

        @param entity: a semantic node (trigger or named entity)
        @type entity: cElementTree.Element
        @param merged: use the graph of merged entities instead of the graph
        of all entities
        @type merged: boolean
        """
        return self._getEntityGraph(merged).getOutEdges(entity)

    def mapInteractions(self, entityElements, interactionElements, verbose=False):
        """
        Maps the semantic interactions to the syntactic graph.

        Syntactic dependencies are defined between tokens. Semantic edges
        (interactions) are defined between annotated entities. To align the
        two graphs, each entity is mapped to a head token (see mapEntity),
        and each interaction becomes an edge between the head tokens of its
        entities in self.interactionGraph.

        The entityElements list is stored as self.entities and modified in
        place: entities with an empty "charOffset" and entities for which no
        head token is found are removed from it. Interactions whose "e1" or
        "e2" is not a mapped entity are skipped. An interaction is also
        skipped, and self.duplicateInteractionEdgesRemoved incremented, when
        an edge of the same type already connects the same head tokens.

        @param entityElements: the semantic nodes (triggers and named entities)
        @type entityElements: list of cElementTree.Element objects
        @param interactionElements: the semantic edges (e.g. Cause and Theme for GENIA)
        @type interactionElements: list of cElementTree.Element objects
        @param verbose: Print selected head tokens on screen
        @type verbose: boolean
        """
        self.interactions = interactionElements
        self.entities = entityElements
        # Entities that have no text binding can not be mapped and are
        # therefore removed. A copy is iterated because the list is modified.
        for entity in self.entities[:]:
            if entity.get("charOffset") == "":
                self.entities.remove(entity)

        self.interactionGraph = Graph()
        self.interactionGraph.addNodes(self.tokens)

        self.entitiesByToken = {} # a mapping for fast access
        self.entitiesById = {}
        self.entityHeadTokenByEntity = {}
        for entity in self.entities[:]:
            headToken = self.mapEntity(entity, verbose)
            if headToken is not None:
                self.entityHeadTokenByEntity[entity] = headToken
                self.entitiesById[entity.get("id")] = entity
            else:
                self.entities.remove(entity)
        self._markNamedEntities()

        for interaction in self.interactions:
            if interaction.get("e1") not in self.entitiesById:
                continue # e1 is outside of this sentence
            if interaction.get("e2") not in self.entitiesById:
                continue # e2 is outside of this sentence
            token1 = self.entityHeadTokenByEntity[self.entitiesById[interaction.get("e1")]]
            token2 = self.entityHeadTokenByEntity[self.entitiesById[interaction.get("e2")]]

            # The third item of an edge is the interaction element stored with it
            interactionType = interaction.get("type")
            found = any(edge[2].get("type") == interactionType
                        for edge in self.interactionGraph.getEdges(token1, token2))
            if not found:
                self.interactionGraph.addEdge(token1, token2, interaction)
            else:
                # TODO: "skipped" would be better than "removed"
                self.duplicateInteractionEdgesRemoved += 1

    def mapEntity(self, entityElement, verbose=False):
        """
        Determine the head token for a named entity or trigger. The head
        token is chosen among the tokens overlapping the entity's headOffset
        (if defined, and the entity is not of type "Binding") or otherwise
        its charOffset. When there are several candidates, findHeadToken
        selects one; for "Binding" entities the first candidate whose text
        contains "bind" or "complex" is preferred.

        The entity's "headOffset" attribute is set to the charOffset of the
        selected token, and the entity is added to self.entitiesByToken.

        @param entityElement: a semantic node (trigger or named entity)
        @type entityElement: cElementTree.Element
        @param verbose: Print selected head tokens on screen
        @type verbose: boolean
        @return: the head token, or None if no token overlaps the entity
        """
        headOffset = None
        if entityElement.get("headOffset") is not None:
            headOffset = Range.charOffsetToSingleTuple(entityElement.get("headOffset"))
        if entityElement.get("charOffset") != "":
            charOffsets = Range.charOffsetToTuples(entityElement.get("charOffset"))
        else:
            charOffsets = []
        isBinding = entityElement.get("type") == "Binding"
        # Each entity can consist of multiple syntactic tokens, covered by its
        # charOffset-range. One of these must be chosen as the head token.
        headTokens = [] # potential head tokens
        for token in self.tokens:
            tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
            if headOffset is not None and not isBinding:
                # A head token can already be defined in the headOffset-attribute.
                # However, depending on the tokenization, even this range may
                # contain multiple tokens. Still, it can always be assumed that
                # if headOffset is defined, the correct head token is in this range.
                if Range.overlap(headOffset, tokenOffset):
                    headTokens.append(token)
            else:
                for offset in charOffsets:
                    if Range.overlap(offset, tokenOffset):
                        headTokens.append(token)

        if len(headTokens) == 1: # An unambiguous head token was found
            token = headTokens[0]
        else: # One head token must be chosen from the candidates
            selHead = None
            if isBinding:
                for t in headTokens:
                    compText = t.get("text").lower()
                    if compText.find("bind") != -1 or compText.find("complex") != -1:
                        selHead = t
                        entityElement.set("headOffset", selHead.get("charOffset"))
                        break
            if selHead is None:
                token = self.findHeadToken(headTokens)
            else:
                token = selHead
            # token is None when there were no candidates; the warning below
            # reports that case, so it must not crash here first.
            if verbose and token is not None:
                print >> sys.stderr, "Selected head:", token.get("id"), token.get("text")

        if token is not None:
            # The ElementTree entity-element is modified by setting the headOffset attribute
            if entityElement.get("headOffset") is None or entityElement.get("headOffset") != token.get("charOffset"):
                entityElement.set("headOffset", token.get("charOffset"))
            if token not in self.entitiesByToken:
                self.entitiesByToken[token] = []
            self.entitiesByToken[token].append(entityElement)
        else:
            print >> sys.stderr, "Warning, no tokens for entity", entityElement.get("id")
        return token

    def findHeadToken(self, candidateTokens):
        """
        Select the candidate token that is closest to the root of the subtree
        of the dependency parse to which the candidate tokens belong to. See
        getTokenHeadScores method for the algorithm. If several candidates
        share the highest score, the last of them is returned.

        @param candidateTokens: the list of syntactic tokens from which the head token is selected
        @type candidateTokens: list of cElementTree.Element objects
        @return: the selected token, or None if there are no candidates
        """
        tokenHeadScores = self.getTokenHeadScores()
        if len(candidateTokens) == 0:
            return None

        highestScore = -9999999
        for token in candidateTokens:
            if tokenHeadScores[token] > highestScore:
                highestScore = tokenHeadScores[token]
        bestTokens = [token for token in candidateTokens
                      if tokenHeadScores[token] == highestScore]
        return bestTokens[-1]

    def getTokenHeadScores(self):
        """
        A head token is chosen using a heuristic that prefers tokens closer
        to the root of the dependency parse. In a list of candidate tokens,
        the one with the highest score is the head token. The return value of
        this method is a dictionary that maps token elements to their scores.

        The scores are computed on the first call and cached. Each score is
        also stored in the "headScore" attribute of its token.
        """
        # Token head scores are cached the first time this function is called
        if self.tokenHeadScores is not None:
            return self.tokenHeadScores
        self.tokenHeadScores = {}

        # Give all tokens initial scores: one for a token that is connected
        # by any dependency, zero for an unconnected token
        connectedIds = set()
        for dependency in self.dependencies:
            connectedIds.add(dependency.get("t1"))
            connectedIds.add(dependency.get("t2"))
        for token in self.tokens:
            if token.get("id") in connectedIds:
                self.tokenHeadScores[token] = 1
            else:
                self.tokenHeadScores[token] = 0
        # Give a low score for tokens that clearly can't be head and are
        # probably produced by hyphen-splitter
        for token in self.tokens:
            if token.get("text") in ("\\", "/", "-"):
                self.tokenHeadScores[token] = -1

        # Some dependencies might form a loop so a list is used to define
        # those dependency types used in determining head scores.
        depTypesToInclude = ["prep", "nn", "det", "hyphen", "num", "amod", "nmod", "appos", "measure", "dep", "partmod"]
        includedDependencies = [dep for dep in self.dependencies
                                if dep.get("type") in depTypesToInclude]
        # The (governor, dependent) token pairs do not change while the scores
        # are updated, so they are collected once. The nesting order of the
        # loops is kept, because the update order can affect the result when
        # the iteration limit below is reached.
        governedPairs = []
        for token1 in self.tokens:
            for token2 in self.tokens:
                for dep in includedDependencies:
                    if dep.get("t1") == token1.get("id") and dep.get("t2") == token2.get("id"):
                        governedPairs.append((token1, token2))

        # Increase the scores of all governor tokens until each governor
        # token has a higher score than its dependent token.
        modifiedScores = True
        loopCount = 0 # loopcount for devel set approx. 2-4
        while modifiedScores: # loop until the scores no longer change
            if loopCount > 20: # survive loops
                print >> sys.stderr, "Warning, possible loop in parse for sentence", self.getSentenceId()
                break
            modifiedScores = False
            for governor, dependent in governedPairs:
                if self.tokenHeadScores[governor] <= self.tokenHeadScores[dependent]:
                    self.tokenHeadScores[governor] = self.tokenHeadScores[dependent] + 1
                    modifiedScores = True
            loopCount += 1

        # Add scores to tokens
        for token in self.tokens:
            token.set("headScore", str(self.tokenHeadScores[token]))

        return self.tokenHeadScores

    def _markNamedEntities(self):
        """
        This method is used to define which tokens belong to _named_ entities.
        Named entities are sometimes masked when testing learning of
        interactions, to prevent the system making a trivial decision based
        on commonly interacting names.

        Fills self.tokenIsName, self.tokenIsEntity and self.tokenIsEntityHead.
        An entity that overlaps a token and has no "isName" attribute gets
        the attribute set to "True".
        """
        self.tokenIsName = {}
        self.tokenIsEntity = {}
        self.tokenIsEntityHead = {}
        # Initialize the dictionaries
        for token in self.tokens:
            self.tokenIsName[token] = False
            self.tokenIsEntity[token] = False
            self.tokenIsEntityHead[token] = []
        for entity in self.entities:
            entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
            entityHeadOffset = Range.charOffsetToSingleTuple(entity.get("headOffset"))
            for token in self.tokens:
                tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
                for entityOffset in entityOffsets:
                    if Range.overlap(entityOffset, tokenOffset):
                        self.tokenIsEntity[token] = True
                        if entity.get("isName") is not None:
                            if entity.get("isName") == "True":
                                self.tokenIsName[token] = True
                        else:
                            entity.set("isName", "True")
                            self.tokenIsName[token] = True
                if Range.overlap(entityHeadOffset, tokenOffset):
                    self.tokenIsEntityHead[token].append(entity)

    def getTokenText(self, token):
        """
        Returns the text of a token, and masks it as "NAMED_ENT" if the token
        is marked in self.tokenIsName.

        @param token: interaction-XML syntactic token.
        @type token: cElementTree.Element
        """
        if self.tokenIsName[token]:
            return "NAMED_ENT"
        else:
            return token.get("text")

    def getCleared(self):
        """
        Build a new SentenceGraph over the same sentence, tokens and
        dependencies, keeping only the entities whose "isName" attribute is
        "True" and no interactions.

        @return: the new graph, already mapped with mapInteractions
        @rtype: SentenceGraph
        """
        c = SentenceGraph(self.sentenceElement, self.tokens, self.dependencies)
        namedEntities = [entity for entity in self.entities
                         if entity.get("isName") == "True"]
        c.mapInteractions(namedEntities, [])
        return c

    def mergeInteractionGraph(self, merge=True):
        """
        Fill self.mergedEntities and self.mergedEntityToDuplicates.

        Two entities are treated as duplicates when both their "type" and
        their "charOffset" attributes are equal. The first such entity in
        self.entities is kept in self.mergedEntities, and the later ones are
        listed under it in self.mergedEntityToDuplicates. Entities whose
        "isName" attribute is "True" never absorb later entities.

        @param merge: when False, nothing is merged: every entity is kept and
        gets an empty duplicate list, so that the same calling code can be
        used for the merged and unmerged cases
        @type merge: boolean
        """
        self.mergedEntities = []
        self.mergedEntityToDuplicates = {}
        if not merge: # no entities are filtered
            # Create dummy structures. The original code also wrote to an
            # undefined "mergedIds" dictionary here, raising a NameError.
            for entity in self.entities:
                self.mergedEntities.append(entity)
                self.mergedEntityToDuplicates[entity] = []
            return
        # Mark all duplicates after the first one in the list for removal
        removeEntities = [False] * len(self.entities)
        for i in range(len(self.entities)):
            if removeEntities[i]: # already merged into an earlier entity
                continue
            entity = self.entities[i]
            self.mergedEntities.append(entity)
            self.mergedEntityToDuplicates[entity] = []
            if entity.get("isName") == "True": # named entities are never merged
                continue
            for j in range(i + 1, len(self.entities)):
                candidate = self.entities[j]
                # Only the type and the charOffset are compared; the head
                # token comparison is disabled in the original code.
                if entity.get("type") == candidate.get("type") and \
                   entity.get("charOffset") == candidate.get("charOffset"):
                    removeEntities[j] = True
                    self.mergedEntityToDuplicates[entity].append(candidate)