class UnmergedEdgeExampleBuilder(ExampleBuilder): def __init__(self, style=["typed","directed","headsOnly"], length=None, types=[], featureSet=None, classSet=None): if featureSet == None: featureSet = IdSet() if classSet == None: classSet = IdSet(1) else: classSet = classSet assert( classSet.getId("neg") == 1 ) ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet) self.styles = style self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet) if "noAnnType" in self.styles: self.multiEdgeFeatureBuilder.noAnnType = True if "noMasking" in self.styles: self.multiEdgeFeatureBuilder.maskNamedEntities = False if "maxFeatures" in self.styles: self.multiEdgeFeatureBuilder.maximum = True self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet) self.pathLengths = length assert(self.pathLengths == None) self.types = types if "random" in self.styles: from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet) #self.outFile = open("exampleTempFile.txt","wt") @classmethod def run(cls, input, output, parse, tokenization, style, idFileTag=None): classSet, featureSet = cls.getIdSets(idFileTag) if style == None: e = UnmergedEdgeExampleBuilder(classSet=classSet, featureSet=featureSet) else: e = UnmergedEdgeExampleBuilder(style=style, classSet=classSet, featureSet=featureSet) sentences = cls.getSentences(input, parse, tokenization) e.buildExamplesForSentences(sentences, output, idFileTag) print e.classSet.Ids def definePredictedValueRange(self, sentences, elementName): self.multiEdgeFeatureBuilder.definePredictedValueRange(sentences, elementName) def getPredictedValueRange(self): return self.multiEdgeFeatureBuilder.predictedRange def filterEdgesByType(self, edges, typesToInclude): if len(typesToInclude) == 0: return edges edgesToKeep = [] for edge in edges: if edge.get("type") in typesToInclude: edgesToKeep.append(edge) return edgesToKeep def getCategoryName(self, 
sentenceGraph, e1, e2, directed=True): # Dummies are potential entities that do not exist in the # training data. If both entities of an interaction are dummies # it can't exist in the training data and is therefore a negative if e1[2] or e2[2]: return "neg" e1 = e1[0] e2 = e2[0] interactions = sentenceGraph.getInteractions(e1, e2) if not directed: interactions.extend(sentenceGraph.getInteractions(e2, e1)) types = set() for interaction in interactions: types.add(interaction.attrib["type"]) types = list(types) types.sort() categoryName = "" for name in types: if categoryName != "": categoryName += "---" categoryName += name if categoryName != "": return categoryName else: return "neg" def preProcessExamples(self, allExamples): if "normalize" in self.styles: print >> sys.stderr, " Normalizing feature vectors" ExampleUtils.normalizeFeatureVectors(allExamples) return allExamples def isPotentialGeniaInteraction(self, e1, e2): if e1.get("isName") == "True": return False else: return True def nxMultiDiGraphToUndirected(self, graph): undirected = NX10.MultiGraph(name=graph.name) undirected.add_nodes_from(graph) undirected.add_edges_from(graph.edges_iter()) return undirected def getInteractionEdgeLengths(self, sentenceGraph, paths): """ Return dependency and linear length of all interaction edges (measured between the two tokens). 
""" interactionLengths = {} for interaction in sentenceGraph.interactions: # Calculated interaction edge dep and lin length e1 = sentenceGraph.entitiesById[interaction.get("e1")] e2 = sentenceGraph.entitiesById[interaction.get("e2")] t1 = sentenceGraph.entityHeadTokenByEntity[e1] t2 = sentenceGraph.entityHeadTokenByEntity[e2] # Get dep path length if t1 != t2 and paths.has_key(t1) and paths[t1].has_key(t2): pathLength = len(paths[t1][t2]) else: # no dependencyPath pathLength = 999999 # more than any real path # Linear distance t1Pos = -1 t2Pos = -1 for i in range(len(sentenceGraph.tokens)): if sentenceGraph.tokens[i] == t1: t1Pos = i if t2Pos != -1: break if sentenceGraph.tokens[i] == t2: t2Pos = i if t1Pos != -1: break linLength = abs(t1Pos - t2Pos) interactionLengths[interaction] = (pathLength, linLength) return interactionLengths def getPrecedenceLevels(self, sentenceGraph, paths): """ Get overlapping entity precedence """ interactionLengths = self.getInteractionEdgeLengths(sentenceGraph, paths) interactionsByEntity = {} # Convenience mapping entityPrecedenceValues = {} for entity in sentenceGraph.entities: interactionsByEntity[entity] = [] eId = entity.get("id") # Add access to interactions argDepDist = 0 # Sum of lengths of shortest paths argLinDist = 0 # Sum of linear distances for interaction in sentenceGraph.interactions: if interaction.get("e1") == eId: # An argument of the entity defined by the node interactionsByEntity[entity].append(interaction) argDepDist += interactionLengths[interaction][0] argLinDist += interactionLengths[interaction][1] # Store precedence counts (num args, sum of dep lengths, sum of lin lengths) entityPrecedenceValues[entity] = (len(interactionsByEntity), argDepDist, argLinDist, entity) # Determine level of entity from precedence counts levelByEntity = {} # slot number #levelByInteraction = {} # slot number of parent node # There is one slot group per token, per type for token in sentenceGraph.tokens: # per token entitiesByType = 
{} for entity in sentenceGraph.tokenIsEntityHead[token]: # per type if entity.get("isName") == "True": # Names can never have duplicates assert not levelByEntity.has_key(entity) levelByEntity[entity] = 0 continue eType = entity.get("type") if eType == "neg": continue if not entitiesByType.has_key(eType): entitiesByType[eType] = [] entitiesByType[eType].append(entity) for eType in sorted(entitiesByType.keys()): # Slot ordering by precedence sortedEntities = [] for entity in entitiesByType[eType]: sortedEntities.append(entityPrecedenceValues[entity]) sortedEntities.sort(compareEntityPrecedence) level = 0 for precedenceTuple in sortedEntities: entity = precedenceTuple[3] assert not levelByEntity.has_key(entity) levelByEntity[entity] = level # Interactions have the same slot as their parent entity #for interaction in interactionsByEntity[entity]: # assert not levelByInteraction.has_key(interaction) # levelByInteraction[interaction] = level level += 1 return levelByEntity#, levelByInteraction def buildExamples(self, sentenceGraph): examples = [] exampleIndex = 0 #undirected = sentenceGraph.getUndirectedDependencyGraph() undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph) ##undirected = sentenceGraph.dependencyGraph.to_undirected() ###undirected = NX10.MultiGraph(sentenceGraph.dependencyGraph) This didn't work paths = NX10.all_pairs_shortest_path(undirected, cutoff=999) # Determine overlapping entity precedence #levelByEntity, levelByInteraction = self.getPrecedenceLevels(sentenceGraph, paths) levelByEntity = self.getPrecedenceLevels(sentenceGraph, paths) entities = [] # There is one entity group for each token, for each type of entity for token in sentenceGraph.tokens: # per token entitiesByType = {} for entity in sentenceGraph.tokenIsEntityHead[token]: # per type if entity.get("isName") == "True": # Names can never have duplicates entities.append( (entity, 0, False) ) continue eType = entity.get("type") if eType == "neg": continue if not 
entitiesByType.has_key(eType): entitiesByType[eType] = [] entitiesByType[eType].append(entity) # Create slot groups for tokens for which exists at least one entity eTypes = sorted(entitiesByType.keys()) if len(eTypes) == 0: continue # Create slot groups and insert GS data there for eType in eTypes: # Use first entity of a type as the dummy entity for unfilled slots dummyEntity = entitiesByType[eType][0] # Define entity slots entityGroup = [None, None, None, None] #entityGroup = [None, None] # Insert existing entities into slots for entity in entitiesByType[eType]: if levelByEntity.has_key(entity): level = levelByEntity[entity] if level < len(entityGroup): entityGroup[level] = (entity, level, False) # Create dummies for potential entities for i in range(len(entityGroup)): if entityGroup[i] == None: entityGroup[i] = (dummyEntity, i, True) # Put all slots into one potential entity list #print entityGroup for e in entityGroup: entities.append(e) # Generate examples based on interactions between entities for i in range(len(entities)-1): for j in range(i+1,len(entities)): eI = entities[i][0] eJ = entities[j][0] tI = sentenceGraph.entityHeadTokenByEntity[eI] tJ = sentenceGraph.entityHeadTokenByEntity[eJ] # define forward example categoryName = self.getCategoryName(sentenceGraph, entities[i], entities[j], True) if (not "genia_limits" in self.styles) or self.isPotentialGeniaInteraction(eI, eJ): examples.append( self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, entities[i], entities[j]) ) exampleIndex += 1 # define reverse categoryName = self.getCategoryName(sentenceGraph, entities[j], entities[i], True) if (not "genia_limits" in self.styles) or self.isPotentialGeniaInteraction(eJ, eI): examples.append( self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, entities[j], entities[i]) ) exampleIndex += 1 return examples def buildExample(self, token1, token2, paths, sentenceGraph, categoryName, exampleIndex, e1=None, e2=None): 
entity1=e1[0] entity2=e2[0] # define features features = {} features[self.featureSet.getId("gov_level")] = e1[1] features[self.featureSet.getId("gov_level_"+str(e1[1]))] = 1 features[self.featureSet.getId("dep_level")] = e2[1] features[self.featureSet.getId("dep_level_"+str(e2[1]))] = 1 features[self.featureSet.getId("level_pair_"+str(e1[1])+"_"+str(e2[1]))] = 1 if True: #token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2): if token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2): path = paths[token1][token2] else: path = [token1, token2] assert(self.pathLengths == None) if self.pathLengths == None or len(path)-1 in self.pathLengths: if not "no_dependency" in self.styles: if token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2): edges = self.multiEdgeFeatureBuilder.getEdges(sentenceGraph.dependencyGraph, path) else: edges = None if "entity_type" in self.styles: features[self.featureSet.getId("e1_"+entity1.attrib["type"])] = 1 features[self.featureSet.getId("e2_"+entity2.attrib["type"])] = 1 features[self.featureSet.getId("distance_"+str(len(path)))] = 1 if not "no_dependency" in self.styles: self.multiEdgeFeatureBuilder.setFeatureVector(features, entity1, entity2) #self.multiEdgeFeatureBuilder.buildStructureFeatures(sentenceGraph, paths) # remove for fast if not "disable_entity_features" in self.styles: self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph) self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path) if not "disable_terminus_features" in self.styles: self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast if not "disable_single_element_features" in self.styles: self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, edges, sentenceGraph) if not "disable_ngram_features" in self.styles: self.multiEdgeFeatureBuilder.buildPathGrams(2, path, edges, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams(3, path, 
edges, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams(4, path, edges, sentenceGraph) # remove for fast #self.buildEdgeCombinations(path, edges, sentenceGraph, features) # remove for fast #if edges != None: # self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[0], edges[0][1]+edges[1][0], "t1", sentenceGraph) # remove for fast # self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[-1], edges[len(path)-1][len(path)-2]+edges[len(path)-2][len(path)-1], "t2", sentenceGraph) # remove for fast if not "disable_path_edge_features" in self.styles: self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, edges, sentenceGraph) self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph) self.multiEdgeFeatureBuilder.setFeatureVector(None) if not "no_linear" in self.styles: self.tokenFeatureBuilder.setFeatureVector(features) for i in range(len(sentenceGraph.tokens)): if sentenceGraph.tokens[i] == token1: token1Index = i if sentenceGraph.tokens[i] == token2: token2Index = i linearPreTag = "linfw_" if token1Index > token2Index: token1Index, token2Index = token2Index, token1Index linearPreTag = "linrv_" self.tokenFeatureBuilder.buildLinearOrderFeatures(token1Index, sentenceGraph, 2, 2, preTag="linTok1") self.tokenFeatureBuilder.buildLinearOrderFeatures(token2Index, sentenceGraph, 2, 2, preTag="linTok2") # Before, middle, after # self.tokenFeatureBuilder.buildTokenGrams(0, token1Index-1, sentenceGraph, "bf") # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, "bw") # self.tokenFeatureBuilder.buildTokenGrams(token2Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, "af") # before-middle, middle, middle-after # self.tokenFeatureBuilder.buildTokenGrams(0, token2Index-1, sentenceGraph, linearPreTag+"bf", max=2) # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, linearPreTag+"bw", max=2) # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, len(sentenceGraph.tokens)-1, 
sentenceGraph, linearPreTag+"af", max=2) self.tokenFeatureBuilder.setFeatureVector(None) if "random" in self.styles: self.randomFeatureBuilder.setFeatureVector(features) self.randomFeatureBuilder.buildRandomFeatures(100, 0.01) self.randomFeatureBuilder.setFeatureVector(None) if "genia_limits" in self.styles: e1Type = entity1.get("type") e2Type = entity2.get("type") assert(entity1.get("isName") == "False") if entity2.get("isName") == "True": features[self.featureSet.getId("GENIA_target_protein")] = 1 else: features[self.featureSet.getId("GENIA_nested_event")] = 1 if e1Type.find("egulation") != -1: # leave r out to avoid problems with capitalization if entity2.get("isName") == "True": features[self.featureSet.getId("GENIA_regulation_of_protein")] = 1 else: features[self.featureSet.getId("GENIA_regulation_of_event")] = 1 else: features[self.featureSet.getId("always_negative")] = 1 if "subset" in self.styles: features[self.featureSet.getId("out_of_scope")] = 1 else: features[self.featureSet.getId("always_negative")] = 1 if "subset" in self.styles: features[self.featureSet.getId("out_of_scope")] = 1 path = [token1, token2] # define extra attributes if int(path[0].attrib["id"].split("_")[-1]) < int(path[-1].attrib["id"].split("_")[-1]): #extra = {"xtype":"edge","type":"i","t1":path[0],"t2":path[-1]} extra = {"xtype":"ue","type":"i","t1":path[0].get("id"),"t2":path[-1].get("id")} extra["deprev"] = False else: #extra = {"xtype":"edge","type":"i","t1":path[-1],"t2":path[0]} extra = {"xtype":"ue","type":"i","t1":path[-1].get("id"),"t2":path[0].get("id")} extra["deprev"] = True if entity1 != None: extra["e1"] = entity1.get("id") extra["l1"] = str(e1[1]) extra["d1"] = str(e1[2])[0] # is a dummy node (an entity not in existing triggers) if entity2 != None: extra["e2"] = entity2.get("id") extra["l2"] = str(e2[1]) extra["d2"] = str(e2[2])[0] # is a dummy node (an entity not in existing triggers) extra["categoryName"] = categoryName sentenceOrigId = 
sentenceGraph.sentenceElement.get("origId") if sentenceOrigId != None: extra["SOID"] = sentenceOrigId # make example if "binary" in self.styles: if categoryName != "neg": category = 1 else: category = -1 categoryName = "i" else: category = self.classSet.getId(categoryName) return (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)
class EventExampleBuilder(ExampleBuilder):
    """
    Builds trigger-event examples: for each predicted event trigger,
    candidate (Theme, Cause) argument combinations are classified as
    real events ("pos") or not ("neg") against the gold interactions.

    NOTE(review): this module defines EventExampleBuilder a second time
    further down; that later definition shadows this one at import time —
    the duplicate should be removed.
    """
    def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
        # NOTE(review): original signature used mutable defaults
        # (style=["typed","directed","headsOnly"], types=[]); replaced with
        # None-sentinels to avoid sharing list objects across instances.
        if style == None:
            style = ["typed", "directed", "headsOnly"]
        if types == None:
            types = []
        if featureSet == None:
            featureSet = IdSet()
        if classSet == None:
            classSet = IdSet(1)
        # Class id 1 must always be the negative class.
        assert(classSet.getId("neg") == 1)
        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        self.styles = style
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        if "noAnnType" in self.styles:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True
        self.pathLengths = length
        # Path-length filtering is not supported by this builder.
        assert(self.pathLengths == None)
        self.types = types

    @classmethod
    def run(cls, input, output, parse, tokenization, style, idFileTag=None):
        """Build examples for all sentences of the input corpus and write them to output."""
        classSet, featureSet = cls.getIdSets(idFileTag)
        e = EventExampleBuilder(style=style, classSet=classSet, featureSet=featureSet)
        sentences = cls.getSentences(input, parse, tokenization)
        e.buildExamplesForSentences(sentences, output, idFileTag)

    def definePredictedValueRange(self, sentences, elementName):
        self.multiEdgeFeatureBuilder.definePredictedValueRange(sentences, elementName)

    def getPredictedValueRange(self):
        return self.multiEdgeFeatureBuilder.predictedRange

    def preProcessExamples(self, allExamples):
        if "normalize" in self.styles:
            print >> sys.stderr, " Normalizing feature vectors"
            ExampleUtils.normalizeFeatureVectors(allExamples)
        return allExamples

    def isPotentialGeniaInteraction(self, e1, e2):
        # A name entity can never govern an argument.
        if e1.get("isName") == "True" and e2.get("isName") == "True":
            return False
        elif e1.get("isName") == "True" and e2.get("isName") == "False":
            return False
        else:
            return True

    def getArgumentEntities(self, sentenceGraph, entityNode):
        """Return the gold (themeNodes, causeNodes) argument entities of entityNode."""
        eId = entityNode.get("id")
        assert(eId != None)
        themeNodes = []
        causeNodes = []
        for edge in sentenceGraph.interactions:
            if edge.get("e1") == eId:
                edgeType = edge.get("type")
                assert edgeType in ["Theme", "Cause"], edgeType
                if edgeType == "Theme":
                    themeNodes.append(sentenceGraph.entitiesById[edge.get("e2")])
                elif edgeType == "Cause":
                    causeNodes.append(sentenceGraph.entitiesById[edge.get("e2")])
        return themeNodes, causeNodes

    def makeGSEvents(self, sentenceGraph):
        """Index gold events as [head token] -> [event type] -> list of argument sets."""
        self.gsEvents = {} # [token]->[event-type]->[1-n argument sets]
        for token in sentenceGraph.tokens:
            self.gsEvents[token] = {}
        for entity in sentenceGraph.entities:
            if entity.get("type") == "neg":
                continue
            eId = entity.get("id")
            eType = entity.get("type")
            arguments = set()
            for interaction in sentenceGraph.interactions:
                if interaction.get("e1") == eId:
                    arguments.add((interaction.get("type"), interaction.get("e2")))
            eHeadToken = sentenceGraph.entityHeadTokenByEntity[entity]
            if not self.gsEvents[eHeadToken].has_key(eType):
                self.gsEvents[eHeadToken][eType] = []
            self.gsEvents[eHeadToken][eType].append(arguments)

    def isGSEvent(self, sentenceGraph, entity, themeNodes, causeNodes):
        """True if (entity, themeNodes, causeNodes) exactly matches a gold event."""
        eHeadToken = sentenceGraph.entityHeadTokenByEntity[entity]
        eType = entity.get("type")
        if not self.gsEvents[eHeadToken].has_key(eType):
            return False
        argumentSet = set()
        for themeNode in themeNodes:
            if themeNode != None:
                argumentSet.add(("Theme", themeNode.get("id")))
        for causeNode in causeNodes:
            if causeNode != None:
                argumentSet.add(("Cause", causeNode.get("id")))
        if argumentSet in self.gsEvents[eHeadToken][eType]:
            return True
        else:
            return False

    def buildExamples(self, sentenceGraph):
        """Build event examples for one sentence, enumerating candidate argument sets per trigger."""
        self.makeGSEvents(sentenceGraph)
        eventNodes = []
        nameNodes = []
        for entity in sentenceGraph.entities:
            if entity.get("type") == "neg":
                continue
            if entity.get("isName") == "True":
                nameNodes.append(entity)
            else:
                eventNodes.append(entity)
        allNodes = eventNodes + nameNodes
        examples = []
        exampleIndex = 0
        undirected = sentenceGraph.dependencyGraph.to_undirected()
        paths = NX.all_pairs_shortest_path(undirected, cutoff=999)
        for eventNode in eventNodes:
            eventType = eventNode.get("type")
            if eventType in ["Gene_expression", "Transcription", "Protein_catabolism", "Localization", "Phosphorylation"]:
                # Simple events: exactly one Theme, which must be a name.
                for nameNode in nameNodes:
                    if self.isPotentialGeniaInteraction(eventNode, nameNode):
                        examples.append(self.buildExample(exampleIndex, sentenceGraph, paths, eventNode, nameNode))
                        exampleIndex += 1
            elif eventType in ["Regulation", "Positive_regulation", "Negative_regulation"]:
                # Regulations: a Theme and an optional Cause, each a name or another event.
                combinations = combine.combine(allNodes + [None], allNodes + [None])
                for combination in combinations:
                    if combination[0] == combination[1]:
                        continue
                    if combination[0] == eventNode or combination[1] == eventNode:
                        continue
                    if combination[0] != None and not self.isPotentialGeniaInteraction(eventNode, combination[0]):
                        continue
                    if combination[1] != None and not self.isPotentialGeniaInteraction(eventNode, combination[1]):
                        continue
                    examples.append(self.buildExample(exampleIndex, sentenceGraph, paths, eventNode, combination[0], combination[1]))
                    exampleIndex += 1
            elif eventType in ["Binding"]: # Binding events are not handled by this builder
                continue
            else:
                assert False, eventType
        self.gsEvents = None # release the per-sentence gold index
        return examples

    def buildExample(self, exampleIndex, sentenceGraph, paths, eventNode, themeNode, causeNode=None):
        """Build one (trigger, theme[, cause]) example tuple (id, class, features, extra)."""
        features = {}
        if self.isGSEvent(sentenceGraph, eventNode, [themeNode], [causeNode]):
            category = self.classSet.getId("pos")
        else:
            category = self.classSet.getId("neg")
        if themeNode != None:
            self.buildArgumentFeatures(sentenceGraph, paths, features, eventNode, themeNode, "theme_")
        if causeNode != None:
            self.buildArgumentFeatures(sentenceGraph, paths, features, eventNode, causeNode, "cause_")
        # define extra attributes
        extra = {"xtype": "trigger-event", "type": eventNode.get("type")}
        extra["e"] = eventNode.get("id")
        eventToken = sentenceGraph.entityHeadTokenByEntity[eventNode]
        extra["et"] = eventToken.get("id")
        if themeNode != None:
            extra["t"] = themeNode.get("id")
            themeToken = sentenceGraph.entityHeadTokenByEntity[themeNode]
            extra["tt"] = themeToken.get("id")
        if causeNode != None:
            extra["c"] = causeNode.get("id")
            causeToken = sentenceGraph.entityHeadTokenByEntity[causeNode]
            extra["ct"] = causeToken.get("id")
        sentenceOrigId = sentenceGraph.sentenceElement.get("origId")
        if sentenceOrigId != None:
            extra["SOID"] = sentenceOrigId
        # make example
        return (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra)

    def buildArgumentFeatures(self, sentenceGraph, paths, features, eventNode, argNode, tag):
        """Add path-based features for one (event, argument) pair, prefixed with tag."""
        eventToken = sentenceGraph.entityHeadTokenByEntity[eventNode]
        argToken = sentenceGraph.entityHeadTokenByEntity[argNode]
        if eventToken != argToken and paths.has_key(eventToken) and paths[eventToken].has_key(argToken):
            path = paths[eventToken][argToken]
            edges = self.multiEdgeFeatureBuilder.getEdges(sentenceGraph.dependencyGraph, path)
        else:
            path = [eventToken, argToken]
            edges = None
        self.multiEdgeFeatureBuilder.tag = tag
        self.multiEdgeFeatureBuilder.setFeatureVector(features, eventNode, argNode)
        if not "disable_entity_features" in self.styles:
            self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph)
        self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path)
        if not "disable_terminus_features" in self.styles:
            self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast
        if not "disable_single_element_features" in self.styles:
            self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, edges, sentenceGraph)
        if not "disable_ngram_features" in self.styles:
            self.multiEdgeFeatureBuilder.buildPathGrams(2, path, edges, sentenceGraph) # remove for fast
            self.multiEdgeFeatureBuilder.buildPathGrams(3, path, edges, sentenceGraph) # remove for fast
            self.multiEdgeFeatureBuilder.buildPathGrams(4, path, edges, sentenceGraph) # remove for fast
        if not "disable_path_edge_features" in self.styles:
            self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, edges, sentenceGraph)
        self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph)
        self.multiEdgeFeatureBuilder.setFeatureVector(None)
        self.multiEdgeFeatureBuilder.tag = ""
class EventExampleBuilder(ExampleBuilder):
    """
    Builds trigger-event examples: for each predicted event trigger,
    candidate (Theme, Cause) argument combinations are classified as
    real events ("pos") or not ("neg") against the gold interactions.

    NOTE(review): this is the second, duplicate definition of
    EventExampleBuilder in this module; it shadows the earlier identical
    one at import time. The duplicate should be removed.
    """
    def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
        # NOTE(review): original signature used mutable defaults
        # (style=["typed","directed","headsOnly"], types=[]); replaced with
        # None-sentinels to avoid sharing list objects across instances.
        if style == None:
            style = ["typed", "directed", "headsOnly"]
        if types == None:
            types = []
        if featureSet == None:
            featureSet = IdSet()
        if classSet == None:
            classSet = IdSet(1)
        # Class id 1 must always be the negative class.
        assert classSet.getId("neg") == 1
        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        self.styles = style
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        if "noAnnType" in self.styles:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True
        self.pathLengths = length
        # Path-length filtering is not supported by this builder.
        assert self.pathLengths == None
        self.types = types

    @classmethod
    def run(cls, input, output, parse, tokenization, style, idFileTag=None):
        """Build examples for all sentences of the input corpus and write them to output."""
        classSet, featureSet = cls.getIdSets(idFileTag)
        e = EventExampleBuilder(style=style, classSet=classSet, featureSet=featureSet)
        sentences = cls.getSentences(input, parse, tokenization)
        e.buildExamplesForSentences(sentences, output, idFileTag)

    def definePredictedValueRange(self, sentences, elementName):
        self.multiEdgeFeatureBuilder.definePredictedValueRange(sentences, elementName)

    def getPredictedValueRange(self):
        return self.multiEdgeFeatureBuilder.predictedRange

    def preProcessExamples(self, allExamples):
        if "normalize" in self.styles:
            print >> sys.stderr, " Normalizing feature vectors"
            ExampleUtils.normalizeFeatureVectors(allExamples)
        return allExamples

    def isPotentialGeniaInteraction(self, e1, e2):
        # A name entity can never govern an argument.
        if e1.get("isName") == "True" and e2.get("isName") == "True":
            return False
        elif e1.get("isName") == "True" and e2.get("isName") == "False":
            return False
        else:
            return True

    def getArgumentEntities(self, sentenceGraph, entityNode):
        """Return the gold (themeNodes, causeNodes) argument entities of entityNode."""
        eId = entityNode.get("id")
        assert eId != None
        themeNodes = []
        causeNodes = []
        for edge in sentenceGraph.interactions:
            if edge.get("e1") == eId:
                edgeType = edge.get("type")
                assert edgeType in ["Theme", "Cause"], edgeType
                if edgeType == "Theme":
                    themeNodes.append(sentenceGraph.entitiesById[edge.get("e2")])
                elif edgeType == "Cause":
                    causeNodes.append(sentenceGraph.entitiesById[edge.get("e2")])
        return themeNodes, causeNodes

    def makeGSEvents(self, sentenceGraph):
        """Index gold events as [head token] -> [event type] -> list of argument sets."""
        self.gsEvents = {} # [token]->[event-type]->[1-n argument sets]
        for token in sentenceGraph.tokens:
            self.gsEvents[token] = {}
        for entity in sentenceGraph.entities:
            if entity.get("type") == "neg":
                continue
            eId = entity.get("id")
            eType = entity.get("type")
            arguments = set()
            for interaction in sentenceGraph.interactions:
                if interaction.get("e1") == eId:
                    arguments.add((interaction.get("type"), interaction.get("e2")))
            eHeadToken = sentenceGraph.entityHeadTokenByEntity[entity]
            if not self.gsEvents[eHeadToken].has_key(eType):
                self.gsEvents[eHeadToken][eType] = []
            self.gsEvents[eHeadToken][eType].append(arguments)

    def isGSEvent(self, sentenceGraph, entity, themeNodes, causeNodes):
        """True if (entity, themeNodes, causeNodes) exactly matches a gold event."""
        eHeadToken = sentenceGraph.entityHeadTokenByEntity[entity]
        eType = entity.get("type")
        if not self.gsEvents[eHeadToken].has_key(eType):
            return False
        argumentSet = set()
        for themeNode in themeNodes:
            if themeNode != None:
                argumentSet.add(("Theme", themeNode.get("id")))
        for causeNode in causeNodes:
            if causeNode != None:
                argumentSet.add(("Cause", causeNode.get("id")))
        if argumentSet in self.gsEvents[eHeadToken][eType]:
            return True
        else:
            return False

    def buildExamples(self, sentenceGraph):
        """Build event examples for one sentence, enumerating candidate argument sets per trigger."""
        self.makeGSEvents(sentenceGraph)
        eventNodes = []
        nameNodes = []
        for entity in sentenceGraph.entities:
            if entity.get("type") == "neg":
                continue
            if entity.get("isName") == "True":
                nameNodes.append(entity)
            else:
                eventNodes.append(entity)
        allNodes = eventNodes + nameNodes
        examples = []
        exampleIndex = 0
        undirected = sentenceGraph.dependencyGraph.to_undirected()
        paths = NX.all_pairs_shortest_path(undirected, cutoff=999)
        for eventNode in eventNodes:
            eventType = eventNode.get("type")
            if eventType in ["Gene_expression", "Transcription", "Protein_catabolism", "Localization", "Phosphorylation"]:
                # Simple events: exactly one Theme, which must be a name.
                for nameNode in nameNodes:
                    if self.isPotentialGeniaInteraction(eventNode, nameNode):
                        examples.append(self.buildExample(exampleIndex, sentenceGraph, paths, eventNode, nameNode))
                        exampleIndex += 1
            elif eventType in ["Regulation", "Positive_regulation", "Negative_regulation"]:
                # Regulations: a Theme and an optional Cause, each a name or another event.
                combinations = combine.combine(allNodes + [None], allNodes + [None])
                for combination in combinations:
                    if combination[0] == combination[1]:
                        continue
                    if combination[0] == eventNode or combination[1] == eventNode:
                        continue
                    if combination[0] != None and not self.isPotentialGeniaInteraction(eventNode, combination[0]):
                        continue
                    if combination[1] != None and not self.isPotentialGeniaInteraction(eventNode, combination[1]):
                        continue
                    examples.append(self.buildExample(exampleIndex, sentenceGraph, paths, eventNode, combination[0], combination[1]))
                    exampleIndex += 1
            elif eventType in ["Binding"]: # Binding events are not handled by this builder
                continue
            else:
                assert False, eventType
        self.gsEvents = None # release the per-sentence gold index
        return examples

    def buildExample(self, exampleIndex, sentenceGraph, paths, eventNode, themeNode, causeNode=None):
        """Build one (trigger, theme[, cause]) example tuple (id, class, features, extra)."""
        features = {}
        if self.isGSEvent(sentenceGraph, eventNode, [themeNode], [causeNode]):
            category = self.classSet.getId("pos")
        else:
            category = self.classSet.getId("neg")
        if themeNode != None:
            self.buildArgumentFeatures(sentenceGraph, paths, features, eventNode, themeNode, "theme_")
        if causeNode != None:
            self.buildArgumentFeatures(sentenceGraph, paths, features, eventNode, causeNode, "cause_")
        # define extra attributes
        extra = {"xtype": "trigger-event", "type": eventNode.get("type")}
        extra["e"] = eventNode.get("id")
        eventToken = sentenceGraph.entityHeadTokenByEntity[eventNode]
        extra["et"] = eventToken.get("id")
        if themeNode != None:
            extra["t"] = themeNode.get("id")
            themeToken = sentenceGraph.entityHeadTokenByEntity[themeNode]
            extra["tt"] = themeToken.get("id")
        if causeNode != None:
            extra["c"] = causeNode.get("id")
            causeToken = sentenceGraph.entityHeadTokenByEntity[causeNode]
            extra["ct"] = causeToken.get("id")
        sentenceOrigId = sentenceGraph.sentenceElement.get("origId")
        if sentenceOrigId != None:
            extra["SOID"] = sentenceOrigId
        # make example
        return (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra)

    def buildArgumentFeatures(self, sentenceGraph, paths, features, eventNode, argNode, tag):
        """Add path-based features for one (event, argument) pair, prefixed with tag."""
        eventToken = sentenceGraph.entityHeadTokenByEntity[eventNode]
        argToken = sentenceGraph.entityHeadTokenByEntity[argNode]
        if eventToken != argToken and paths.has_key(eventToken) and paths[eventToken].has_key(argToken):
            path = paths[eventToken][argToken]
            edges = self.multiEdgeFeatureBuilder.getEdges(sentenceGraph.dependencyGraph, path)
        else:
            path = [eventToken, argToken]
            edges = None
        self.multiEdgeFeatureBuilder.tag = tag
        self.multiEdgeFeatureBuilder.setFeatureVector(features, eventNode, argNode)
        if not "disable_entity_features" in self.styles:
            self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph)
        self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path)
        if not "disable_terminus_features" in self.styles:
            self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast
        if not "disable_single_element_features" in self.styles:
            self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, edges, sentenceGraph)
        if not "disable_ngram_features" in self.styles:
            self.multiEdgeFeatureBuilder.buildPathGrams(2, path, edges, sentenceGraph) # remove for fast
            self.multiEdgeFeatureBuilder.buildPathGrams(3, path, edges, sentenceGraph) # remove for fast
            self.multiEdgeFeatureBuilder.buildPathGrams(4, path, edges, sentenceGraph) # remove for fast
        if not "disable_path_edge_features" in self.styles:
            self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, edges, sentenceGraph)
        self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph)
        self.multiEdgeFeatureBuilder.setFeatureVector(None)
        self.multiEdgeFeatureBuilder.tag = ""
class Round2TriggerExampleBuilder(ExampleBuilder):
    """
    Builds one classification example per sentence token for trigger-word
    detection ("round 2"): the class is the (possibly merged) gold entity
    type at the token's head offset, and the features combine token-level,
    linear-order, dependency-chain and unmerging-style argument features.
    """

    def nxMultiDiGraphToUndirected(self, graph):
        # Convert a NetworkX 1.x multi-digraph into an undirected MultiGraph
        # so that shortest paths ignore dependency edge direction.
        undirected = NX10.MultiGraph(name=graph.name)
        undirected.add_nodes_from(graph)
        undirected.add_edges_from(graph.edges_iter())
        return undirected

    def getPredictionStrength(self, element):
        # Parse the "predictions" attribute ("Class1:score,Class2:score,...")
        # and return the score of the element's own type, or 0 if absent.
        eType = element.get("type")
        predictions = element.get("predictions")
        if predictions == None:
            return 0
        predictions = predictions.split(",")
        for prediction in predictions:
            predClass, predStrength = prediction.split(":")
            if predClass == eType:
                predStrength = float(predStrength)
                return predStrength
        return 0

    def getInteractionEdgeLengths(self, sentenceGraph, paths):
        """
        Return dependency and linear length of all interaction edges
        (measured between the two tokens).
        """
        interactionLengths = {}
        for interaction in sentenceGraph.interactions:
            # Calculated interaction edge dep and lin length
            e1 = sentenceGraph.entitiesById[interaction.get("e1")]
            e2 = sentenceGraph.entitiesById[interaction.get("e2")]
            t1 = sentenceGraph.entityHeadTokenByEntity[e1]
            t2 = sentenceGraph.entityHeadTokenByEntity[e2]
            # Get dep path length
            if t1 != t2 and paths.has_key(t1) and paths[t1].has_key(t2):
                pathLength = len(paths[t1][t2])
            else: # no dependencyPath
                pathLength = 999999 # more than any real path
            # Linear distance
            t1Pos = -1
            t2Pos = -1
            for i in range(len(sentenceGraph.tokens)):
                if sentenceGraph.tokens[i] == t1:
                    t1Pos = i
                    if t2Pos != -1:
                        break
                if sentenceGraph.tokens[i] == t2:
                    t2Pos = i
                    if t1Pos != -1:
                        break
            linLength = abs(t1Pos - t2Pos)
            # Value is a tuple so the collection can be sorted with
            # compareInteractionPrecedence later.
            interactionLengths[interaction] = (interaction, pathLength, linLength, t2Pos)
        return interactionLengths

    def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None, skiplist=None):
        # Class id 1 is reserved for the negative class by convention.
        if classSet == None:
            classSet = IdSet(1)
        assert (classSet.getId("neg") == 1)
        if featureSet == None:
            featureSet = IdSet()
        ExampleBuilder.__init__(self, classSet, featureSet)
        #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
        if gazetteerFileName != None:
            self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
            print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
        else:
            print >> sys.stderr, "No gazetteer loaded"
            self.gazetteer = None
        self.styles = style
        # Optional file of sentence origIds to skip, one per line.
        self.skiplist = set()
        if skiplist != None:
            f = open(skiplist, "rt")
            for line in f.readlines():
                self.skiplist.add(line.strip())
            f.close()
        # NOTE(review): the 'style' argument assigned above is unconditionally
        # overridden by this hard-coded list — confirm this is intentional.
        self.styles = [
            "trigger_features", "typed", "directed", "no_linear", "entities",
            "genia_limits", "noMasking", "maxFeatures"
        ]
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        if "graph_kernel" in self.styles:
            from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
            self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
        if "noAnnType" in self.styles:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True
        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)

    @classmethod
    def run(cls, input, gold, output, parse, tokenization, style, idFileTag=None, append=False):
        """
        An interface for running the example builder without needing to create a class
        """
        classSet, featureSet = cls.getIdSets(idFileTag)
        if style != None:
            e = Round2TriggerExampleBuilder(style=style, classSet=classSet, featureSet=featureSet)
        else:
            e = Round2TriggerExampleBuilder(classSet=classSet, featureSet=featureSet)
        sentences = cls.getSentences(input, parse, tokenization)
        # Gold sentences (if given) provide the reference classes for training.
        if gold != None:
            goldSentences = cls.getSentences(gold, parse, tokenization)
        else:
            goldSentences = None
        e.buildExamplesForSentences(sentences, goldSentences, output, idFileTag, append=append)

    def buildExamplesForSentences(self, sentences, goldSentences, output, idFileTag=None, append=False):
        """
        Build and write examples for every sentence, streaming them to the
        output file, then optionally save class/feature name mappings.
        """
        examples = []
        counter = ProgressCounter(len(sentences), "Build examples")
        if append:
            outfile = open(output, "at")
        else:
            outfile = open(output, "wt")
        exampleCount = 0
        for i in range(len(sentences)):
            sentence = sentences[i]
            # sentence[0] is the sentence graph; goldSentence mirrors that
            # structure, defaulting to [None] when no gold data is available.
            goldSentence = [None]
            if goldSentences != None:
                goldSentence = goldSentences[i]
            counter.update(1, "Building examples (" + sentence[0].getSentenceId() + "): ")
            examples = self.buildExamples(sentence[0], goldSentence[0], append=append)
            exampleCount += len(examples)
            examples = self.preProcessExamples(examples)
            ExampleUtils.appendExamples(examples, outfile)
        outfile.close()
        print >> sys.stderr, "Examples built:", exampleCount
        print >> sys.stderr, "Features:", len(self.featureSet.getNames())
        #IF LOCAL
        if self.exampleStats.getExampleCount() > 0:
            self.exampleStats.printStats()
        #ENDIF
        # Save Ids
        if idFileTag != None:
            print >> sys.stderr, "Saving class names to", idFileTag + ".class_names"
            self.classSet.write(idFileTag + ".class_names")
            print >> sys.stderr, "Saving feature names to", idFileTag + ".feature_names"
            self.featureSet.write(idFileTag + ".feature_names")

    def preProcessExamples(self, allExamples):
        # Optional in-place normalization of feature vectors before writing.
        if "normalize" in self.styles:
            print >> sys.stderr, " Normalizing feature vectors"
            ExampleUtils.normalizeFeatureVectors(allExamples)
        return allExamples

    def getMergedEntityType(self, entities):
        """
        If a single token belongs to multiple entities of different types,
        a new, composite type is defined. This type is the alphabetically
        ordered types of these entities joined with '---'.
        """
        types = set()
        for entity in entities:
            types.add(entity.get("type"))
        types = list(types)
        types.sort()
        typeString = ""
        for type in types:
            # In all_tokens mode Proteins are not trigger classes.
            if type == "Protein" and "all_tokens" in self.styles:
                continue
            if typeString != "":
                typeString += "---"
            typeString += type
        if typeString == "":
            return "neg"
        if "limit_merged_types" in self.styles:
            if typeString.find("---") != -1:
                # Only one specific merged type is kept; others collapse to
                # their alphabetically first component.
                if typeString == "Gene_expression---Positive_regulation":
                    return typeString
                else:
                    return typeString.split("---")[0]
            else:
                return typeString
        return typeString

    def getTokenFeatures(self, token, sentenceGraph):
        """
        Returns a list of features based on the attributes of a token.
        These can be used to define more complex features.
        """
        # These features are cached when this method is first called
        # for a token.
        if self.tokenFeatures.has_key(token):
            return self.tokenFeatures[token]
        tokTxt = sentenceGraph.getTokenText(token)
        features = {}
        features["_txt_" + tokTxt] = 1
        # F 69.35 -> 68.22
        #normalizedText = tokTxt.replace("-","").replace("/","").replace(",","").replace("\\","").replace(" ","").lower()
        #features["_norTxt_"+normalizedText]=1
        #features["_norStem_" + PorterStemmer.stem(normalizedText)]=1
        features["_POS_" + token.get("POS")] = 1
        if sentenceGraph.tokenIsName[token]:
            features["_isName"] = 1
            for entity in sentenceGraph.tokenIsEntityHead[token]:
                if entity.get("isName") == "True":
                    features["_annType_" + entity.get("type")] = 1
        # Filip's gazetteer based features (can be used separately from exclude_gazetteer)
        if "gazetteer_features" in self.styles:
            tokTxtLower = tokTxt.lower()
            if "stem_gazetteer" in self.styles:
                tokTxtLower = PorterStemmer.stem(tokTxtLower)
            if self.gazetteer and tokTxtLower in self.gazetteer:
                for label, weight in self.gazetteer[tokTxtLower].items():
                    features["_knownLabel_" + label] = weight # 1 performs slightly worse
        self.tokenFeatures[token] = features
        return features

    def buildLinearOrderFeatures(self, sentenceGraph, index, tag, features):
        """
        Linear features are built by marking token features with a tag
        that defines their relative position in the linear order.
        """
        tag = "linear_" + tag
        for tokenFeature, w in self.getTokenFeatures(sentenceGraph.tokens[index], sentenceGraph).iteritems():
            features[self.featureSet.getId(tag + tokenFeature)] = w

    def buildExamples(self, sentenceGraph, goldGraph, append=False):
        # Wrapper around buildExamplesInner that also tallies entity vs.
        # example counts per type (the comparison itself is commented out).
        examples = self.buildExamplesInner(sentenceGraph, goldGraph)
        entityCounts = {}
        exampleCounts = {}
        for entity in sentenceGraph.entities:
            eType = entity.get("type")
            if eType == "Protein":
                continue
            if not entityCounts.has_key(eType):
                entityCounts[eType] = 0
                exampleCounts[eType] = 0
            entityCounts[eType] += 1
        for example in examples:
            eTypes = self.classSet.getName(example[1]).split("---")
            for eType in eTypes:
                if not exampleCounts.has_key(eType):
                    exampleCounts[eType] = 0
                exampleCounts[eType] += 1
        #for key in sorted(entityCounts.keys()):
        #    if entityCounts[key] != exampleCounts[key]:
        #        print >> sys.stderr, "Warning, sentence", sentenceGraph.getSentenceId(), "example", key, "diff", entityCounts[key] - exampleCounts[key]
        return examples

    def buildExamplesInner(self, sentenceGraph, goldGraph):
        """
        Build one example for each token of the sentence
        """
        if sentenceGraph.sentenceElement.get("origId") in self.skiplist:
            print >> sys.stderr, "Skipping sentence", sentenceGraph.sentenceElement.get("origId")
            return []
        self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
        self.triggerFeatureBuilder.initSentence(sentenceGraph)
        # Shortest undirected dependency paths between all token pairs.
        undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
        # Get argument order
        self.interactionLengths = self.getInteractionEdgeLengths(sentenceGraph, paths)
        self.interactionLengths = self.interactionLengths.values()
        self.interactionLengths.sort(compareInteractionPrecedence)
        # Map tokens to entities
        tokenByOffset = {}
        for i in range(len(sentenceGraph.tokens)):
            token = sentenceGraph.tokens[i]
            if goldGraph != None:
                # Tokenizations must match between predicted and gold graphs.
                goldToken = goldGraph.tokens[i]
                assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset")
            tokenByOffset[token.get("charOffset")] = token.get("id")
        # Map gold entities to their head offsets
        goldEntitiesByOffset = {}
        for token in sentenceGraph.tokens:
            goldEntitiesByOffset[token.get("charOffset")] = []
        entityToGold = {}
        for entity in sentenceGraph.entities:
            entityToGold[entity] = []
        if goldGraph != None:
            for entity in goldGraph.entities:
                offset = entity.get("headOffset")
                assert offset != None
                goldEntitiesByOffset[offset].append(entity)
        # Map predicted entities to gold entities
        for entity in sentenceGraph.entities:
            eType = entity.get("type")
            eOffset = entity.get("headOffset")
            for goldEntity in goldEntitiesByOffset[eOffset]:
                if goldEntity.get("type") == eType:
                    entityToGold[entity].append(goldEntity)
        # Map entities to interactions
        #interactionsByEntityId = {}
        #for entity in sentenceGraph.entities:
        #    interactionsByEntityId[entity.get("id")] = []
        # Map tokens to interactions
        interactionsByToken = {}
        for token in sentenceGraph.tokens:
            interactionsByToken[token] = []
        for interactionTuple in self.interactionLengths:
            interaction = interactionTuple[0]
            if interaction.get("type") == "neg":
                continue
            e1Id = interaction.get("e1")
            token = sentenceGraph.entityHeadTokenByEntity[sentenceGraph.entitiesById[e1Id]]
            interactionsByToken[token].append(interaction)
        examples = []
        exampleIndex = 0
        # Per-sentence cache used by getTokenFeatures.
        self.tokenFeatures = {}
        #namedEntityNorStrings = set()
        namedEntityHeadTokens = []
        if not "names" in self.styles:
            namedEntityCount = 0
            for entity in sentenceGraph.entities:
                if entity.get("isName") == "True": # known data which can be used for features
                    namedEntityCount += 1
                    #namedEntityNorStrings.add( entity.get("text").replace("-","").replace("/","").replace(",","").replace("\\","").replace(" ","").lower() )
            namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
            #if namedEntityCount == 0: # no names, no need for triggers
            #    return []
            if "pos_pairs" in self.styles:
                namedEntityHeadTokens = self.getNamedEntityHeadTokens(sentenceGraph)
        #neFeatures = {} # F: 69.35 -> 69.14
        #for norString in namedEntityNorStrings:
        #    neFeatures[self.featureSet.getId("norNE_" + norString)] = 1
        # Bag-of-words over the sentence; named-entity tokens are counted
        # twice, once with an additional "ne_" prefix.
        bagOfWords = {}
        for token in sentenceGraph.tokens:
            text = "bow_" + token.get("text")
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
            if sentenceGraph.tokenIsName[token]:
                text = "ne_" + text
                if not bagOfWords.has_key(text):
                    bagOfWords[text] = 0
                bagOfWords[text] += 1
        bowFeatures = {}
        for k, v in bagOfWords.iteritems():
            bowFeatures[self.featureSet.getId(k)] = v
        # Pre-sorted dependency edges attached to each token; edge tuples are
        # (fromToken, toToken, dependencyElement).
        self.inEdgesByToken = {}
        self.outEdgesByToken = {}
        self.edgeSetByToken = {}
        for token in sentenceGraph.tokens:
            inEdges = sentenceGraph.dependencyGraph.in_edges(token, data=True)
            fixedInEdges = []
            for edge in inEdges:
                fixedInEdges.append((edge[0], edge[1], edge[2]["element"]))
            inEdges = fixedInEdges
            inEdges.sort(compareDependencyEdgesById)
            self.inEdgesByToken[token] = inEdges
            outEdges = sentenceGraph.dependencyGraph.out_edges(token, data=True)
            fixedOutEdges = []
            for edge in outEdges:
                fixedOutEdges.append((edge[0], edge[1], edge[2]["element"]))
            outEdges = fixedOutEdges
            outEdges.sort(compareDependencyEdgesById)
            self.outEdgesByToken[token] = outEdges
            self.edgeSetByToken[token] = set(inEdges + outEdges)
        for i in range(len(sentenceGraph.tokens)):
            token = sentenceGraph.tokens[i]
            # Recognize only non-named entities (i.e. interaction words)
            if sentenceGraph.tokenIsName[token] and not "names" in self.styles and not "all_tokens" in self.styles:
                continue
            # CLASS
            #if len(sentenceGraph.tokenIsEntityHead[token]) > 0:
            #    category = self.classSet.getId(self.getMergedEntityType(sentenceGraph.tokenIsEntityHead[token]))
            #else:
            #    category = 1
            # Class comes from gold entities whose head offset matches this token.
            offset = token.get("charOffset")
            if len(goldEntitiesByOffset[offset]) > 0:
                category = self.classSet.getId(self.getMergedEntityType(goldEntitiesByOffset[offset]))
            else:
                category = 1
            tokenText = token.get("text").lower()
            if "stem_gazetteer" in self.styles:
                tokenText = PorterStemmer.stem(tokenText)
            # Tokens outside the gazetteer can be emitted as stub examples.
            if ("exclude_gazetteer" in self.styles) and self.gazetteer and tokenText not in self.gazetteer:
                features = {}
                features[self.featureSet.getId("exclude_gazetteer")] = 1
                extra = {"xtype": "token", "t": token.get("id"), "excluded": "True"}
                examples.append((sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra))
                exampleIndex += 1
                continue
            # FEATURES
            features = {}
            # Shared with the feature-builder helpers via self.setFeature.
            self.features = features
            if not "names" in self.styles:
                features[self.featureSet.getId(namedEntityCountFeature)] = 1
            #for k,v in bagOfWords.iteritems():
            #    features[self.featureSet.getId(k)] = v
            # pre-calculate bow _features_
            features.update(bowFeatures)
            #features.update(neFeatures)
            # for j in range(len(sentenceGraph.tokens)):
            #     text = "bow_" + sentenceGraph.tokens[j].get("text")
            #     if j < i:
            #         features[self.featureSet.getId("bf_" + text)] = 1
            #     elif j > i:
            #         features[self.featureSet.getId("af_" + text)] = 1
            # Main features
            text = token.get("text")
            features[self.featureSet.getId("txt_" + text)] = 1
            features[self.featureSet.getId("POS_" + token.get("POS"))] = 1
            stem = PorterStemmer.stem(text)
            features[self.featureSet.getId("stem_" + stem)] = 1
            features[self.featureSet.getId("nonstem_" + text[len(stem):])] = 1
            # Normalized versions of the string (if same as non-normalized, overlap without effect)
            normalizedText = text.replace("-", "").replace("/", "").replace(",", "").replace("\\", "").replace(" ", "").lower()
            if normalizedText == "bound": # should be for all irregular verbs
                normalizedText = "bind"
            features[self.featureSet.getId("txt_" + normalizedText)] = 1
            norStem = PorterStemmer.stem(normalizedText)
            features[self.featureSet.getId("stem_" + norStem)] = 1
            features[self.featureSet.getId("nonstem_" + normalizedText[len(norStem):])] = 1
            if "gazetteer_features_maintoken" in self.styles:
                tokTxtLower = text.lower()
                if "stem_gazetteer" in self.styles:
                    tokTxtLower = PorterStemmer.stem(tokTxtLower)
                if self.gazetteer and tokTxtLower in self.gazetteer:
                    for label, weight in self.gazetteer[tokTxtLower].items():
                        features[self.featureSet.getId("gaz_knownLabel_" + label)] = weight # 1 performs slightly worse
            # Linear order features
            #for index in [-3,-2,-1,1,2,3,4,5]: # 69.35 -> 68.97
            for index in [-3, -2, -1, 1, 2, 3]:
                if i + index > 0 and i + index < len(sentenceGraph.tokens):
                    self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)
            # Content
            if i > 0 and text[0].isalpha() and text[0].isupper():
                features[self.featureSet.getId("upper_case_start")] = 1
            for j in range(len(text)):
                if j > 0 and text[j].isalpha() and text[j].isupper():
                    features[self.featureSet.getId("upper_case_middle")] = 1
                # numbers and special characters
                if text[j].isdigit():
                    features[self.featureSet.getId("has_digits")] = 1
                    if j > 0 and text[j - 1] == "-":
                        features[self.featureSet.getId("has_hyphenated_digit")] = 1
                elif text[j] == "-":
                    features[self.featureSet.getId("has_hyphen")] = 1
                elif text[j] == "/":
                    features[self.featureSet.getId("has_fslash")] = 1
                elif text[j] == "\\":
                    features[self.featureSet.getId("has_bslash")] = 1
                # duplets
                if j > 0:
                    features[self.featureSet.getId("dt_" + text[j - 1:j + 1].lower())] = 1
                # triplets
                if j > 1:
                    features[self.featureSet.getId("tt_" + text[j - 2:j + 1].lower())] = 1
            # Attached edges (Hanging in and out edges)
            t1InEdges = self.inEdgesByToken[token]
            for edge in t1InEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HIn_" + edgeType)] = 1
                features[self.featureSet.getId("t1HIn_" + edge[0].get("POS"))] = 1
                features[self.featureSet.getId("t1HIn_" + edgeType + "_" + edge[0].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[0])
                features[self.featureSet.getId("t1HIn_" + tokenText)] = 1
                features[self.featureSet.getId("t1HIn_" + edgeType + "_" + tokenText)] = 1
            t1OutEdges = self.outEdgesByToken[token]
            for edge in t1OutEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HOut_" + edgeType)] = 1
                features[self.featureSet.getId("t1HOut_" + edge[1].get("POS"))] = 1
                features[self.featureSet.getId("t1HOut_" + edgeType + "_" + edge[1].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[1])
                features[self.featureSet.getId("t1HOut_" + tokenText)] = 1
                features[self.featureSet.getId("t1HOut_" + edgeType + "_" + tokenText)] = 1
            extra = {"xtype": "token", "t": token.get("id")}
            examples.append((sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra))
            exampleIndex += 1
            # chains
            # NOTE(review): the features dict was already appended above; these
            # builders keep mutating the same dict object — confirm intended.
            self.buildChains(token, sentenceGraph, features)
            if "pos_pairs" in self.styles:
                self.buildPOSPairs(token, namedEntityHeadTokens, features)
            self.buildPredictionFeatures(sentenceGraph, paths, token, interactionsByToken[token])
        return examples

    def buildChains(self, token, sentenceGraph, features, depthLeft=3, chain="", visited=None):
        """
        Recursively walk dependency edges up to depthLeft hops from token,
        adding chain features named by the traversed edge types and
        direction (-frw_ for in-edges, -rev_ for out-edges).
        """
        if depthLeft == 0:
            return
        strDepthLeft = "dist_" + str(depthLeft)
        if visited == None:
            visited = set()
        inEdges = self.inEdgesByToken[token]
        outEdges = self.outEdgesByToken[token]
        # Edges seen at this node are excluded from deeper recursion.
        edgeSet = visited.union(self.edgeSetByToken[token])
        for edge in inEdges:
            if not edge in visited:
                edgeType = edge[2].get("type")
                # NOTE(review): in-edges use prefix "dep_" while out-edges
                # below use "dep_dist_" — possibly unintended asymmetry.
                features[self.featureSet.getId("dep_" + strDepthLeft + edgeType)] = 1
                nextToken = edge[0]
                for tokenFeature, w in self.getTokenFeatures(nextToken, sentenceGraph).iteritems():
                    features[self.featureSet.getId(strDepthLeft + tokenFeature)] = w
                # for entity in sentenceGraph.tokenIsEntityHead[nextToken]:
                #     if entity.get("isName") == "True":
                #         features[self.featureSet.getId("name_dist_"+strDepthLeft)] = 1
                #         features[self.featureSet.getId("name_dist_"+strDepthLeft+entity.get("type"))] = 1
                # features[self.featureSet.getId("POS_dist_"+strDepthLeft+nextToken.get("POS"))] = 1
                # tokenText = sentenceGraph.getTokenText(nextToken)
                # features[self.featureSet.getId("text_dist_"+strDepthLeft+tokenText)] = 1
                if sentenceGraph.tokenIsName[nextToken]:
                    features[self.featureSet.getId("name_chain_dist_" + strDepthLeft + chain + "-frw_" + edgeType)] = 1
                features[self.featureSet.getId("chain_dist_" + strDepthLeft + chain + "-frw_" + edgeType)] = 1
                self.buildChains(nextToken, sentenceGraph, features, depthLeft - 1, chain + "-frw_" + edgeType, edgeSet)
        for edge in outEdges:
            if not edge in visited:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("dep_dist_" + strDepthLeft + edgeType)] = 1
                nextToken = edge[1]
                for tokenFeature, w in self.getTokenFeatures(nextToken, sentenceGraph).iteritems():
                    features[self.featureSet.getId(strDepthLeft + tokenFeature)] = w
                # for entity in sentenceGraph.tokenIsEntityHead[nextToken]:
                #     if entity.get("isName") == "True":
                #         features[self.featureSet.getId("name_dist_"+strDepthLeft)] = 1
                #         features[self.featureSet.getId("name_dist_"+strDepthLeft+entity.get("type"))] = 1
                # features[self.featureSet.getId("POS_dist_"+strDepthLeft+nextToken.get("POS"))] = 1
                # tokenText = sentenceGraph.getTokenText(nextToken)
                # features[self.featureSet.getId("text_dist_"+strDepthLeft+tokenText)] = 1
                if sentenceGraph.tokenIsName[nextToken]:
                    features[self.featureSet.getId("name_chain_dist_" + strDepthLeft + chain + "-rev_" + edgeType)] = 1
                features[self.featureSet.getId("chain_dist_" + strDepthLeft + chain + "-rev_" + edgeType)] = 1
                self.buildChains(nextToken, sentenceGraph, features, depthLeft - 1, chain + "-rev_" + edgeType, edgeSet)

    def getNamedEntityHeadTokens(self, sentenceGraph):
        # Head tokens of all named (known) entities in the sentence.
        headTokens = []
        for entity in sentenceGraph.entities:
            if entity.get("isName") == "True": # known data which can be used for features
                headTokens.append(sentenceGraph.entityHeadTokenByEntity[entity])
        return headTokens

    def buildPOSPairs(self, token, namedEntityHeadTokens, features):
        # Pair the candidate token's POS with the POS of each named-entity head.
        tokenPOS = token.get("POS")
        assert tokenPOS != None
        for headToken in namedEntityHeadTokens:
            headPOS = headToken.get("POS")
            features[self.featureSet.getId("POS_pair_NE_" + tokenPOS + "-" + headPOS)] = 1

    ######################################################
    # Unmerging-style features
    ######################################################
    def buildPredictionFeatures(self, sentenceGraph, paths, token, interactions): #themeEntities, causeEntities=None):
        """
        Build unmerging-style features for a token from the interactions
        leaving its entities: per-argument path features (tagged argTheme*/
        argCause*), argument counts, and trigger features per event entity.
        """
        # NOTE!!!! TODO
        # add also features for arguments present, but not in this combination
        self.buildInterArgumentBagOfWords(interactions, sentenceGraph)
        if sentenceGraph.entitiesByToken.has_key(token):
            for eventEntity in sentenceGraph.entitiesByToken[token]:
                eventEntityType = eventEntity.get("type")
                self.setFeature("rootType_" + eventEntity.get("type"), 1)
                self.setFeature("predStrength" + eventEntityType, self.getPredictionStrength(eventEntity))
                self.triggerFeatureBuilder.setFeatureVector(self.features)
                self.triggerFeatureBuilder.tag = "trg" + eventEntityType + "_"
                self.triggerFeatureBuilder.buildFeatures(token)
                self.triggerFeatureBuilder.tag = None
        argThemeCount = 0
        argCauseCount = 0
        # Current example's edge combination
        for i in range(len(interactions)):
            arg = interactions[i]
            if arg.get("type") == "Theme":
                argThemeCount += 1
                # Both a shared tag and a position-specific tag are built.
                self.buildArgumentFeatures(sentenceGraph, paths, self.features, token, arg, "argTheme")
                self.buildArgumentFeatures(sentenceGraph, paths, self.features, token, arg, "argTheme" + str(i))
            else: # Cause
                argCauseCount += 1
                self.buildArgumentFeatures(sentenceGraph, paths, self.features, token, arg, "argCause")
                self.buildArgumentFeatures(sentenceGraph, paths, self.features, token, arg, "argCause" + str(i))
        self.setFeature("argCount", len(interactions))
        self.setFeature("argCount_" + str(len(interactions)), 1)
        self.setFeature("argThemeCount", argThemeCount)
        self.setFeature("argThemeCount_" + str(argThemeCount), 1)
        self.setFeature("argCauseCount", argCauseCount)
        self.setFeature("argCauseCount_" + str(argCauseCount), 1)
        self.triggerFeatureBuilder.tag = ""
        self.triggerFeatureBuilder.setFeatureVector(None)

    def buildArgumentFeatures(self, sentenceGraph, paths, features, eventToken, arg, tag):
        """
        Build features for one interaction argument: edge (path) features
        between the event token and the argument's head token, trigger
        features for the argument token, and Protein/Event indicator features.
        """
        argEntity = sentenceGraph.entitiesById[arg.get("e2")]
        argToken = sentenceGraph.entityHeadTokenByEntity[argEntity]
        self.buildEdgeFeatures(sentenceGraph, paths, features, eventToken, argToken, tag)
        self.triggerFeatureBuilder.tag = tag + "trg_"
        self.triggerFeatureBuilder.buildFeatures(argToken)
        if argEntity.get("isName") == "True":
            self.setFeature(tag + "Protein", 1)
        else:
            # Argument is itself an event -> this is a nesting structure.
            self.setFeature(tag + "Event", 1)
            self.setFeature("nestingEvent", 1)
        self.setFeature(tag + "_" + argEntity.get("type"), 1)

    def buildEdgeFeatures(self, sentenceGraph, paths, features, eventToken, argToken, tag):
        """
        Build dependency-path features between two tokens with the shared
        MultiEdgeFeatureBuilder; feature names are prefixed with tag + '_'.
        """
        #eventToken = sentenceGraph.entityHeadTokenByEntity[eventNode]
        #argToken = sentenceGraph.entityHeadTokenByEntity[argNode]
        self.multiEdgeFeatureBuilder.tag = tag + "_"
        self.multiEdgeFeatureBuilder.setFeatureVector(features, None, None, False)
        self.setFeature(tag + "_present", 1)
        # Shortest undirected dependency path, or a two-token fallback.
        if eventToken != argToken and paths.has_key(eventToken) and paths[eventToken].has_key(argToken):
            path = paths[eventToken][argToken]
            edges = self.multiEdgeFeatureBuilder.getEdges(sentenceGraph.dependencyGraph, path)
        else:
            path = [eventToken, argToken]
            edges = None
        if not "disable_entity_features" in self.styles:
            self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph)
        self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path)
        #if not "disable_terminus_features" in self.styles:
        #    self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast
        if not "disable_single_element_features" in self.styles:
            self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, edges, sentenceGraph)
        if not "disable_ngram_features" in self.styles:
            self.multiEdgeFeatureBuilder.buildPathGrams(2, path, edges, sentenceGraph) # remove for fast
            self.multiEdgeFeatureBuilder.buildPathGrams(3, path, edges, sentenceGraph) # remove for fast
            self.multiEdgeFeatureBuilder.buildPathGrams(4, path, edges, sentenceGraph) # remove for fast
        if not "disable_path_edge_features" in self.styles:
            self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, edges, sentenceGraph)
        #self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph)
        # Detach the shared builder from this feature dict and reset its tag.
        self.multiEdgeFeatureBuilder.setFeatureVector(None, None, None, False)
        self.multiEdgeFeatureBuilder.tag = ""

    def buildInterArgumentBagOfWords(self, arguments, sentenceGraph):
        """
        Bag-of-words features over the tokens strictly between the outermost
        argument head tokens (skipping entity heads and named entities).
        Only built when there are at least two arguments.
        """
        if len(arguments) < 2:
            return
        indexByToken = {}
        for i in range(len(sentenceGraph.tokens)):
            indexByToken[sentenceGraph.tokens[i]] = i
        argTokenIndices = set()
        for arg in arguments:
            argEntity = sentenceGraph.entitiesById[arg.get("e2")]
            argToken = sentenceGraph.entityHeadTokenByEntity[argEntity]
            argTokenIndices.add(indexByToken[argToken])
        minIndex = min(argTokenIndices)
        maxIndex = max(argTokenIndices)
        self.setFeature("argBoWRange", (maxIndex - minIndex))
        self.setFeature("argBoWRange_" + str(maxIndex - minIndex), 1)
        bow = set()
        for i in range(minIndex + 1, maxIndex):
            token = sentenceGraph.tokens[i]
            if len(sentenceGraph.tokenIsEntityHead[token]) == 0 and not sentenceGraph.tokenIsName[token]:
                bow.add(token.get("text"))
        bow = sorted(list(bow))
        for word in bow:
            self.setFeature("argBoW_" + word, 1)
            if word in ["/", "-"]:
                self.setFeature("argBoW_slashOrHyphen", 1)
        if len(bow) == 1:
            self.setFeature("argBoWonly_" + bow[0], 1)
            if bow[0] in ["/", "-"]:
                self.setFeature("argBoWonly_slashOrHyphen", 1)
class AsymmetricEventExampleBuilder(ExampleBuilder): def __init__(self, style=["typed", "directed"], length=None, types=[], featureSet=None, classSet=None): if featureSet == None: featureSet = IdSet() if classSet == None: classSet = IdSet(1) else: classSet = classSet assert (classSet.getId("neg") == 1) ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet) if style.find(",") != -1: style = style.split(",") self.styles = style self.negFrac = None self.posPairGaz = POSPairGazetteer() for s in style: if s.find("negFrac") != -1: self.negFrac = float(s.split("_")[-1]) print >> sys.stderr, "Downsampling negatives to", self.negFrac self.negRand = random.Random(15) elif s.find("posPairGaz") != -1: self.posPairGaz = POSPairGazetteer( loadFrom=s.split("_", 1)[-1]) self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet) self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet) if "graph_kernel" in self.styles: from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder( self.featureSet) if "noAnnType" in self.styles: self.multiEdgeFeatureBuilder.noAnnType = True if "noMasking" in self.styles: self.multiEdgeFeatureBuilder.maskNamedEntities = False if "maxFeatures" in self.styles: self.multiEdgeFeatureBuilder.maximum = True self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet) if "ontology" in self.styles: self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder( self.featureSet) if "nodalida" in self.styles: self.nodalidaFeatureBuilder = NodalidaFeatureBuilder( self.featureSet) #IF LOCAL if "bioinfer_limits" in self.styles: self.bioinferOntologies = OntologyUtils.getBioInferTempOntology() #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName) #ENDIF self.pathLengths = length assert (self.pathLengths == None) self.types = types if "random" in self.styles: from 
FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet) #self.outFile = open("exampleTempFile.txt","wt") @classmethod def run(cls, input, output, parse, tokenization, style, idFileTag=None): classSet, featureSet = cls.getIdSets(idFileTag) if style != None: e = cls(style=style, classSet=classSet, featureSet=featureSet) else: e = cls(classSet=classSet, featureSet=featureSet) sentences = cls.getSentences(input, parse, tokenization) e.buildExamplesForSentences(sentences, output, idFileTag) if "printClassIds" in e.styles: print >> sys.stderr, e.classSet.Ids def definePredictedValueRange(self, sentences, elementName): self.multiEdgeFeatureBuilder.definePredictedValueRange( sentences, elementName) def getPredictedValueRange(self): return self.multiEdgeFeatureBuilder.predictedRange def filterEdgesByType(self, edges, typesToInclude): if len(typesToInclude) == 0: return edges edgesToKeep = [] for edge in edges: if edge.get("type") in typesToInclude: edgesToKeep.append(edge) return edgesToKeep def getCategoryNameFromTokens(self, sentenceGraph, t1, t2, directed=True): types = set() themeE1Types = set() intEdges = [] if sentenceGraph.interactionGraph.has_edge(t1, t2): intEdges = sentenceGraph.interactionGraph.get_edge_data(t1, t2, default={}) # NOTE: Only works if keys are ordered integers for i in range(len(intEdges)): types.add(intEdges[i]["element"].get("type")) # if (not directed) and sentenceGraph.interactionGraph.has_edge(t2, t1): # intEdgesReverse = sentenceGraph.interactionGraph.get_edge(t2, t1, default={}) # # NOTE: Only works if keys are ordered integers # for i in range(len(intEdgesReverse)): # intElement = intEdgesReverse[i]["element"] # intType = intElement.get("type") # types.add(intType) # intEdges.extend(intEdgesReverse) for i in range(len(intEdges)): intElement = intEdges[i]["element"] intType = intElement.get("type") if intType == "Theme": e1Entity = 
sentenceGraph.entitiesById[intElement.get("e1")] themeE1Types.add(e1Entity.get("type")) #types.add(intType) if len(themeE1Types) != 0: themeE1Types = list(themeE1Types) themeE1Types.sort() categoryName = "" for name in themeE1Types: if categoryName != "": categoryName += "---" categoryName += name return categoryName else: types = list(types) types.sort() categoryName = "" for name in types: if categoryName != "": categoryName += "---" categoryName += name if categoryName != "": return categoryName else: return "neg" def getCategoryName(self, sentenceGraph, e1, e2, directed=True): interactions = sentenceGraph.getInteractions(e1, e2) if not directed: interactions.extend(sentenceGraph.getInteractions(e2, e1)) types = set() for interaction in interactions: types.add(interaction.attrib["type"]) types = list(types) types.sort() categoryName = "" for name in types: if categoryName != "": categoryName += "---" categoryName += name if categoryName != "": return categoryName else: return "neg" def preProcessExamples(self, allExamples): # Duplicates cannot be removed here, as they should only be removed from the training set. This is done # in the classifier. 
        # Tail of preProcessExamples(): the "def" line precedes this chunk.
        # Duplicate removal kept disabled:
        # if "no_duplicates" in self.styles:
        #     count = len(allExamples)
        #     print >> sys.stderr, " Removing duplicates,",
        #     allExamples = ExampleUtils.removeDuplicates(allExamples)
        #     print >> sys.stderr, "removed", count - len(allExamples)
        if "normalize" in self.styles:
            print >> sys.stderr, " Normalizing feature vectors"
            ExampleUtils.normalizeFeatureVectors(allExamples)
        return allExamples

    def isPotentialGeniaInteraction(self, e1, e2):
        """
        GENIA filter: an interaction can never start from a named entity
        (only the e1 side is checked here; e2 is unrestricted).
        """
        if e1.get("isName") == "True":
            return False
        else:
            return True

    #IF LOCAL
    def getBioInferParentType(self, eType):
        """
        Map a BioInfer entity type to one of the three coarse ontology
        groups "Physical", "Property" or "Process" via OntologyUtils.
        Asserts (fails hard) on a type that fits none of the groups.
        """
        if eType == "Physical_entity" or OntologyUtils.hasParent(eType, "Physical_entity", self.bioinferOntologies):
            return "Physical"
        elif eType == "Property_entity" or OntologyUtils.hasParent(eType, "Property_entity", self.bioinferOntologies):
            return "Property"
        elif OntologyUtils.hasParent(eType, "Relationship", self.bioinferOntologies):
            return "Process"
        else:
            assert False, eType
        # Earlier variant kept for reference:
        # if self.bioinferOntologies["Entity"].has_key(eType):
        #     if OntologyUtils.hasParent(eType, "Physical_entity", self.bioinferOntologies):
        #         assert not OntologyUtils.hasParent(eType, "Property_entity", self.bioinferOntologies), eType
        #         return "Physical"
        #     else:
        #         assert OntologyUtils.hasParent(eType, "Property_entity", self.bioinferOntologies), eType
        #         return "Property"
        # else:
        #     assert self.bioinferOntologies.has_key(eType), eType
        #     #assert OntologyUtils.hasParent(eType, "Process_entity", self.bioinferOntologies["Relationship"]), eType
        #     return "Process"

    def isPotentialBioInferInteraction(self, e1, e2, categoryName):
        """
        BioInfer filter: allow Process/Property starting points and
        Physical->Physical / Physical->Process pairs; anything else must
        already be a negative (asserted), and is rejected.
        """
        e1Type = self.getBioInferParentType(e1.get("type"))
        e2Type = self.getBioInferParentType(e2.get("type"))
        if e1Type == "Process" or e1Type == "Property":
            return True
        elif e1Type == "Physical" and e2Type == "Physical":
            return True
        elif e1Type == "Physical" and e2Type == "Process":  # hack
            return True
        else:
            assert (categoryName == "neg"), categoryName + " category for " + e1Type + " and " + e2Type
            return False
    #ENDIF

    def nxMultiDiGraphToUndirected(self, graph):
        """
        Copy a NetworkX multi-di-graph into an undirected MultiGraph with
        the same nodes and edges (needed for shortest-path search).
        """
        undirected = NX10.MultiGraph(name=graph.name)
        undirected.add_nodes_from(graph)
        undirected.add_edges_from(graph.edges_iter())
        return undirected

    def buildExamples(self, sentenceGraph):
        """
        Build directed classification examples for every ordered pair of
        tokens (or entity head tokens when the "entities" style is on) in
        the sentence, including self-pairs. Negatives can be downsampled
        via self.negFrac and filtered via exampleStats.
        """
        examples = []
        exampleIndex = 0
        clearGraph = sentenceGraph.getCleared()
        #undirected = sentenceGraph.getUndirectedDependencyGraph()
        undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
        ##undirected = sentenceGraph.dependencyGraph.to_undirected()
        ###undirected = NX10.MultiGraph(sentenceGraph.dependencyGraph) This didn't work
        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
        # NOTE(review): self.triggerFeatureBuilder is used here but is not
        # visibly created in this class's __init__ — confirm it is set up
        # elsewhere before buildExamples is called.
        self.triggerFeatureBuilder.initSentence(clearGraph)
        # Generate examples based on interactions between entities or interactions between tokens
        if "entities" in self.styles:
            loopRange = len(sentenceGraph.entities)
        else:
            loopRange = len(sentenceGraph.tokens)
        #for i in range(loopRange-1):
        for i in range(loopRange):  # allow self-interactions
            #for j in range(i+1,loopRange):
            for j in range(i, loopRange):  # allow self-interactions
                eI = None
                eJ = None
                if "entities" in self.styles:
                    eI = sentenceGraph.entities[i]
                    eJ = sentenceGraph.entities[j]
                    tI = sentenceGraph.entityHeadTokenByEntity[eI]
                    tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                    #if "no_ne_interactions" in self.styles and eI.get("isName") == "True" and eJ.get("isName") == "True":
                    #    continue
                    if eI.get("type") == "neg" or eJ.get("type") == "neg":
                        continue
                else:
                    tI = sentenceGraph.tokens[i]
                    tJ = sentenceGraph.tokens[j]
                # # only consider paths between entities (NOTE! entities, not only named entities)
                # if "headsOnly" in self.styles:
                #     if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0):
                #         continue
                if "directed" in self.styles:
                    # define forward
                    if "entities" in self.styles:
                        categoryName = self.getCategoryName(sentenceGraph, eI, eJ, True)
                    else:
                        categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, True)
                    self.exampleStats.beginExample(categoryName)
                    # negFrac downsampling: keep all positives, sample negatives
                    if self.negFrac == None or categoryName != "neg" or (categoryName == "neg" and self.negRand.random() < self.negFrac):
                        makeExample = True
                        # NOTE(review): the forward direction lacks the
                        # "bioinfer_limits" check that the reverse
                        # direction below applies — confirm intentional.
                        if ("genia_limits" in self.styles) and not self.isPotentialGeniaInteraction(eI, eJ):
                            makeExample = False
                            self.exampleStats.filter("genia_limits")
                        if self.posPairGaz.getNegFrac((tI.get("POS"), tJ.get("POS"))) == 1.0:
                            makeExample = False
                            self.exampleStats.filter("pos_pair")
                        if makeExample:
                            if not sentenceGraph.tokenIsName[tI]:
                                examples.append(self.buildExample(tI, tJ, paths, clearGraph, categoryName, exampleIndex, eI, eJ))
                                exampleIndex += 1
                            else:
                                self.exampleStats.filter("genia_token_limits")
                    else:
                        self.exampleStats.filter("neg_frac")
                    self.exampleStats.endExample()
                    # define reverse
                    if "entities" in self.styles:
                        categoryName = self.getCategoryName(sentenceGraph, eJ, eI, True)
                    else:
                        categoryName = self.getCategoryNameFromTokens(sentenceGraph, tJ, tI, True)
                    self.exampleStats.beginExample(categoryName)
                    if self.negFrac == None or categoryName != "neg" or (categoryName == "neg" and self.negRand.random() < self.negFrac):
                        makeExample = True
                        if ("genia_limits" in self.styles) and not self.isPotentialGeniaInteraction(eJ, eI):
                            makeExample = False
                            self.exampleStats.filter("genia_limits")
                        if ("bioinfer_limits" in self.styles) and not self.isPotentialBioInferInteraction(eJ, eI, categoryName):
                            makeExample = False
                            self.exampleStats.filter("bioinfer_limits")
                        if self.posPairGaz.getNegFrac((tJ.get("POS"), tI.get("POS"))) == 1.0:
                            makeExample = False
                            self.exampleStats.filter("pos_pair")
                        if makeExample:
                            if not sentenceGraph.tokenIsName[tJ]:
                                examples.append(self.buildExample(tJ, tI, paths, clearGraph, categoryName, exampleIndex, eJ, eI))
                                exampleIndex += 1
                            else:
                                self.exampleStats.filter("genia_token_limits")
                    else:
                        self.exampleStats.filter("neg_frac")
                    self.exampleStats.endExample()
                # Undirected variant kept disabled:
                # else:
                #     if "entities" in self.styles:
                #         categoryName = self.getCategoryName(sentenceGraph, eI, eJ, False)
                #     else:
                #         categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, False)
                #     forwardExample = self.buildExample(tI, tJ, paths, clearGraph, categoryName, exampleIndex, eI, eJ)
                #     if not "graph_kernel" in self.styles:
                #         reverseExample = self.buildExample(tJ, tI, paths, clearGraph, categoryName, exampleIndex, eJ, eI)
                #         forwardExample[2].update(reverseExample[2])
                #     examples.append(forwardExample)
                #     exampleIndex += 1
        return examples

    def buildExample(self, token1, token2, paths, sentenceGraph, categoryName, exampleIndex, entity1=None, entity2=None):
        """
        Build one (id, class, features, extra) example for the directed
        token pair token1 -> token2, using the shortest dependency path
        between them (or the trivial [token1, token2] path when none).
        """
        # define features
        features = {}
        if True:  #token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2):
            if token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2):
                path = paths[token1][token2]
            else:
                path = [token1, token2]
            assert (self.pathLengths == None)
            if self.pathLengths == None or len(path) - 1 in self.pathLengths:
                # NOTE(review): 'not "no_trigger"' is 'not <non-empty
                # string>' i.e. always False, so this branch is dead; the
                # condition was presumably meant to test self.styles. If
                # enabled as-is it would also fail: 'eventToken' is
                # undefined in this method and 'self.features' is not set
                # here — confirm before reviving.
                if not "no_trigger":
                    self.triggerFeatureBuilder.setFeatureVector(self.features)
                    self.triggerFeatureBuilder.tag = "trg_t1_"
                    self.triggerFeatureBuilder.buildFeatures(eventToken)
                    self.triggerFeatureBuilder.tag = "trg_t2_"
                    self.triggerFeatureBuilder.buildFeatures(eventToken)
                # if not "no_ontology" in self.styles:
                #     self.ontologyFeatureBuilder.setFeatureVector(features)
                #     self.ontologyFeatureBuilder.buildOntologyFeaturesForPath(sentenceGraph, path)
                #     self.ontologyFeatureBuilder.setFeatureVector(None)
                if "graph_kernel" in self.styles or not "no_dependency" in self.styles:
                    if token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2):
                        edges = self.multiEdgeFeatureBuilder.getEdges(sentenceGraph.dependencyGraph, path)
                    else:
                        edges = None
                if "graph_kernel" in self.styles:
                    self.graphKernelFeatureBuilder.setFeatureVector(features, entity1, entity2)
                    self.graphKernelFeatureBuilder.buildGraphKernelFeatures(sentenceGraph, path, edges)
                    self.graphKernelFeatureBuilder.setFeatureVector(None)
                if "entity_type" in self.styles:
                    features[self.featureSet.getId("e1_" + entity1.attrib["type"])] = 1
                    features[self.featureSet.getId("e2_" + entity2.attrib["type"])] = 1
                features[self.featureSet.getId("distance_" + str(len(path)))] = 1
                if not "no_dependency" in self.styles:
                    if token1 == token2:
                        features[self.featureSet.getId("tokenSelfLoop")] = 1
                    self.multiEdgeFeatureBuilder.setFeatureVector(features, entity1, entity2)
                    #self.multiEdgeFeatureBuilder.buildStructureFeatures(sentenceGraph, paths) # remove for fast
                    if not "disable_entity_features" in self.styles:
                        self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph)
                    self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path)
                    if not "disable_terminus_features" in self.styles:
                        self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph)  # remove for fast
                    if not "disable_single_element_features" in self.styles:
                        self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, edges, sentenceGraph)
                    if not "disable_ngram_features" in self.styles:
                        self.multiEdgeFeatureBuilder.buildPathGrams(2, path, edges, sentenceGraph)  # remove for fast
                        self.multiEdgeFeatureBuilder.buildPathGrams(3, path, edges, sentenceGraph)  # remove for fast
                        self.multiEdgeFeatureBuilder.buildPathGrams(4, path, edges, sentenceGraph)  # remove for fast
                    #self.buildEdgeCombinations(path, edges, sentenceGraph, features) # remove for fast
                    #if edges != None:
                    #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[0], edges[0][1]+edges[1][0], "t1", sentenceGraph) # remove for fast
                    #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[-1], edges[len(path)-1][len(path)-2]+edges[len(path)-2][len(path)-1], "t2", sentenceGraph) # remove for fast
                    if not "disable_path_edge_features" in self.styles:
                        self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, edges, sentenceGraph)
                    self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph)
                    self.multiEdgeFeatureBuilder.setFeatureVector(None)
                if "nodalida" in self.styles:
                    self.nodalidaFeatureBuilder.setFeatureVector(features, entity1, entity2)
                    shortestPaths = self.nodalidaFeatureBuilder.buildShortestPaths(sentenceGraph.dependencyGraph, path)
                    # NOTE(review): debug print left in — writes to stdout
                    # for every nodalida-style example.
                    print shortestPaths
                    if len(shortestPaths) > 0:
                        self.nodalidaFeatureBuilder.buildNGrams(shortestPaths, sentenceGraph)
                    self.nodalidaFeatureBuilder.setFeatureVector(None)
                if not "no_linear" in self.styles:
                    self.tokenFeatureBuilder.setFeatureVector(features)
                    # Locate the linear positions of both tokens.
                    # NOTE(review): token1Index/token2Index stay unbound if
                    # a token is not found in sentenceGraph.tokens —
                    # presumably both always are; confirm.
                    for i in range(len(sentenceGraph.tokens)):
                        if sentenceGraph.tokens[i] == token1:
                            token1Index = i
                        if sentenceGraph.tokens[i] == token2:
                            token2Index = i
                    linearPreTag = "linfw_"
                    if token1Index > token2Index:
                        token1Index, token2Index = token2Index, token1Index
                        linearPreTag = "linrv_"
                    self.tokenFeatureBuilder.buildLinearOrderFeatures(token1Index, sentenceGraph, 2, 2, preTag="linTok1")
                    self.tokenFeatureBuilder.buildLinearOrderFeatures(token2Index, sentenceGraph, 2, 2, preTag="linTok2")
                    # Before, middle, after
                    # self.tokenFeatureBuilder.buildTokenGrams(0, token1Index-1, sentenceGraph, "bf")
                    # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, "bw")
                    # self.tokenFeatureBuilder.buildTokenGrams(token2Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, "af")
                    # before-middle, middle, middle-after
                    # self.tokenFeatureBuilder.buildTokenGrams(0, token2Index-1, sentenceGraph, linearPreTag+"bf", max=2)
                    # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, linearPreTag+"bw", max=2)
                    # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, linearPreTag+"af", max=2)
                    self.tokenFeatureBuilder.setFeatureVector(None)
                if "random" in self.styles:
                    self.randomFeatureBuilder.setFeatureVector(features)
                    self.randomFeatureBuilder.buildRandomFeatures(100, 0.01)
                    self.randomFeatureBuilder.setFeatureVector(None)
                if "genia_limits" in self.styles:
                    e1Type = entity1.get("type")
                    e2Type = entity2.get("type")
                    assert (entity1.get("isName") == "False")
                    if entity2.get("isName") == "True":
                        features[self.featureSet.getId("GENIA_target_protein")] = 1
                    else:
                        features[self.featureSet.getId("GENIA_nested_event")] = 1
                    if e1Type.find("egulation") != -1:  # leave r out to avoid problems with capitalization
                        if entity2.get("isName") == "True":
                            features[self.featureSet.getId("GENIA_regulation_of_protein")] = 1
                        else:
                            features[self.featureSet.getId("GENIA_regulation_of_event")] = 1
            else:
                # Path length outside the allowed lengths: force-negative
                # marker features only.
                features[self.featureSet.getId("always_negative")] = 1
                if "subset" in self.styles:
                    features[self.featureSet.getId("out_of_scope")] = 1
        else:
            # Dead branch (the guard above is 'if True:'); originally the
            # no-dependency-path case.
            features[self.featureSet.getId("always_negative")] = 1
            if "subset" in self.styles:
                features[self.featureSet.getId("out_of_scope")] = 1
            path = [token1, token2]
        self.triggerFeatureBuilder.tag = ""
        self.triggerFeatureBuilder.setFeatureVector(None)
        # define extra attributes
        # if int(path[0].attrib["id"].split("_")[-1]) < int(path[-1].attrib["id"].split("_")[-1]):
        #     #extra = {"xtype":"edge","type":"i","t1":path[0],"t2":path[-1]}
        #     extra = {"xtype":"asym","type":"i","t1":path[0].get("id"),"t2":path[-1].get("id")}
        #     extra["deprev"] = False
        # else:
        #     #extra = {"xtype":"edge","type":"i","t1":path[-1],"t2":path[0]}
        #     extra = {"xtype":"asym","type":"i","t1":path[-1].get("id"),"t2":path[0].get("id")}
        #     extra["deprev"] = True
        extra = {"xtype": "asym", "type": "i", "t1": token1.get("id"), "t2": token2.get("id")}
        if entity1 != None:
            #extra["e1"] = entity1
            extra["e1"] = entity1.get("id")
        if entity2 != None:
            #extra["e2"] = entity2
            extra["e2"] = entity2.get("id")
        extra["categoryName"] = categoryName
        sentenceOrigId = sentenceGraph.sentenceElement.get("origId")
        if sentenceOrigId != None:
            extra["SOID"] = sentenceOrigId
        # make example
        if "binary" in self.styles:
            if categoryName != "neg":
                category = 1
            else:
                category = -1
            categoryName = "i"
        else:
            category = self.classSet.getId(categoryName)
        return (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra)
class DirectEventExampleBuilder(ExampleBuilder):
    """
    Builds event-classification examples directly from trigger-word
    candidates (selected via a token gazetteer) and their potential
    Theme/Cause argument combinations, comparing each candidate against
    the gold-standard events of the sentence.
    """
    def __init__(self, style=["typed", "directed", "headsOnly"], length=None, types=[], featureSet=None, classSet=None, gazetteer=None, pathGazetteer=None, negFrac=None):
        if featureSet == None:
            featureSet = IdSet()
        if classSet == None:
            classSet = IdSet(1)
        else:
            classSet = classSet
        # "neg" must always map to class id 1
        assert (classSet.getId("neg") == 1)
        if gazetteer != None:
            print >> sys.stderr, "Loading gazetteer from", gazetteer
            self.gazetteer = Gazetteer.loadGztr(gazetteer)
        else:
            print >> sys.stderr, "No gazetteer loaded"
            self.gazetteer = None
        self.pathGazetteer = None
        self.pathGazetteerDependencies = None
        self.pathGazetteerPairs = None
        if pathGazetteer != None:
            print >> sys.stderr, "Loading path gazetteer from", pathGazetteer
            self.pathGazetteer = PathGazetteer.load(pathGazetteer)
            self.pathGazetteerDependencies = PathGazetteer.getDependencies(self.pathGazetteer)
            self.pathGazetteerPairs = PathGazetteer.getPairs(self.pathGazetteer)
        else:
            print >> sys.stderr, "No path gazetteer loaded"
        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        self.styles = style
        # negFrac: probability of keeping a negative example (None = keep all)
        self.negFrac = negFrac
        print >> sys.stderr, "Downsampling negatives to", negFrac
        self.negRand = random.Random()
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        if True:  #"noAnnType" in self.styles:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True
        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
        #self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        #if "ontology" in self.styles:
        #    self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
        self.pathLengths = length
        assert (self.pathLengths == None)
        self.types = types
        # Bookkeeping for printStats(): events and tokens by original id,
        # per-type built/skipped counters and skip reasons.
        self.eventsByOrigId = {}
        self.headTokensByOrigId = {}
        self.interSentenceEvents = set()
        self.examplesByEventOrigId = {}
        self.skippedByType = {}
        self.skippedByTypeAndReason = {}
        self.builtByType = {}
        self.gazMatchCache = {}
        #self.outFile = open("exampleTempFile.txt","wt")

    @classmethod
    def run(cls, input, output, parse, tokenization, style, idFileTag=None, gazetteer=None, pathGazetteer=None, negFrac=None):
        """Build examples for all sentences of 'input' and write to 'output'."""
        classSet, featureSet = cls.getIdSets(idFileTag)
        if style != None:
            e = DirectEventExampleBuilder(style=style, classSet=classSet, featureSet=featureSet, gazetteer=gazetteer, pathGazetteer=pathGazetteer, negFrac=negFrac)
        else:
            e = DirectEventExampleBuilder(classSet=classSet, featureSet=featureSet, gazetteer=gazetteer, pathGazetteer=pathGazetteer, negFrac=negFrac)
        sentences = cls.getSentences(input, parse, tokenization)
        e.buildExamplesForSentences(sentences, output, idFileTag)
        e.printStats()

    def getGazetteerMatch(self, string):
        """
        Return the gazetteer key that 'string' matches, trying (in order)
        the (optionally stemmed) string itself, the string with hyphens
        removed, and the part after the last hyphen. Returns None when
        nothing matches. Results are memoized in self.gazMatchCache.
        """
        if string in self.gazMatchCache:
            return self.gazMatchCache[string]
        origString = string
        if "stem_gazetteer" in self.styles:
            string = PorterStemmer.stem(string)
        if string in self.gazetteer:
            self.gazMatchCache[origString] = string
            return string
        elif string.find("-") != -1:
            replaced = string.replace("-", "")
        else:
            self.gazMatchCache[origString] = None
            return None
        if replaced in self.gazetteer:
            self.gazMatchCache[origString] = replaced
            return replaced
        else:
            splitted = string.rsplit("-", 1)[-1]
            if splitted in self.gazetteer:
                self.gazMatchCache[origString] = splitted
                return splitted
            else:
                self.gazMatchCache[origString] = None
                return None

    def isInGazetteer(self, string):
        """True if 'string' matches the gazetteer via getGazetteerMatch."""
        return self.getGazetteerMatch(string) != None

    def printStats(self):
        """
        Write a "missed-events" report file and print per-type statistics
        (missed events split into other/intersentence/non-gazetteer, and
        built/skipped example counts with skip reasons) to stderr.
        """
        eventsByType = {}
        for event in self.eventsByOrigId.values():
            eventsByType[event.get("type")] = eventsByType.get(event.get("type"), 0) + 1
        f = open("missed-events", "wt")
        # Events for which no example was generated, grouped by type.
        missedEvents = {}
        for key in self.examplesByEventOrigId.keys():
            if self.examplesByEventOrigId[key] == 0:
                if not missedEvents.has_key(self.eventsByOrigId[key].get("type")):
                    missedEvents[self.eventsByOrigId[key].get("type")] = []
                missedEvents[self.eventsByOrigId[key].get("type")].append(key)
        for key in sorted(missedEvents.keys()):
            f.write(key + "\n")
            for id in sorted(missedEvents[key]):
                f.write(" " + id + " ")
                if id in self.interSentenceEvents:
                    f.write("intersentence ")
                text = self.headTokensByOrigId[id].get("text").lower()
                if not self.isInGazetteer(text):
                    text = self.headTokensByOrigId[id].get("text").lower()
                    # BUGFIX: 'stemmed' was previously unbound (NameError)
                    # when "stem_gazetteer" was not in self.styles.
                    stemmed = text
                    if "stem_gazetteer" in self.styles:
                        stemmed = PorterStemmer.stem(text)
                    f.write("not-in-gazetteer (" + text + " / " + stemmed + ")")
                f.write("\n")
        f.close()
        print >> sys.stderr, "Example selection missed events (other, intersentence, non-gazetteer)"
        for key in sorted(eventsByType.keys()):
            inter = 0
            other = 0
            nongaz = 0
            if missedEvents.has_key(key):
                for id in missedEvents[key]:
                    tokText = self.headTokensByOrigId[id].get("text").lower()
                    if id in self.interSentenceEvents:
                        inter += 1
                    elif not self.isInGazetteer(tokText):
                        nongaz += 1
                    else:
                        other += 1
            if inter == other == nongaz == 0:
                print >> sys.stderr, " " + key + " (" + str(eventsByType[key]) + "): missed none"
            else:
                print >> sys.stderr, " " + key + " (" + str(eventsByType[key]) + "): " + str(other) + ", " + str(inter) + ", " + str(nongaz)
        print >> sys.stderr, "Example generation (total, built/skipped)"
        for key in sorted(list(set(self.skippedByType.keys() + self.builtByType.keys()))):
            string = " " + key + ": (" + str(self.builtByType.get(key, 0) + self.skippedByType.get(key, 0)) + ", " + str(self.builtByType.get(key, 0)) + "/" + str(self.skippedByType.get(key, 0)) + ") ["
            for key2 in sorted(self.skippedByTypeAndReason[key].keys()):
                string += key2 + ":" + str(self.skippedByTypeAndReason[key][key2]) + " "
            string += "]"
            print >> sys.stderr, string

    def definePredictedValueRange(self, sentences, elementName):
        self.multiEdgeFeatureBuilder.definePredictedValueRange(sentences, elementName)

    def getPredictedValueRange(self):
        return self.multiEdgeFeatureBuilder.predictedRange

    def preProcessExamples(self, allExamples):
        """Optionally normalize all feature vectors (style "normalize")."""
        if "normalize" in self.styles:
            print >> sys.stderr, " Normalizing feature vectors"
            ExampleUtils.normalizeFeatureVectors(allExamples)
        return allExamples

    # def isPotentialGeniaInteraction(self, e1, e2):
    #     if e1.get("isName") == "True" and e2.get("isName") == "True":
    #         return False
    #     elif e1.get("isName") == "True" and e2.get("isName") == "False":
    #         return False
    #     else:
    #         return True

    def getArgumentEntities(self, sentenceGraph, entityNode):
        """Return ([Theme entities], [Cause entities]) outgoing from entityNode."""
        eId = entityNode.get("id")
        assert (eId != None)
        themeNodes = []
        causeNodes = []
        for edge in sentenceGraph.interactions:
            if edge.get("e1") == eId:
                edgeType = edge.get("type")
                assert (edgeType in ["Theme", "Cause"]), edgeType
                if edgeType == "Theme":
                    themeNodes.append(sentenceGraph.entitiesById[edge.get("e2")])
                elif edgeType == "Cause":
                    causeNodes.append(sentenceGraph.entitiesById[edge.get("e2")])
        return themeNodes, causeNodes

    def makeGSEvents(self, sentenceGraph):
        """
        Index the sentence's gold-standard events as
        self.gsEvents[headToken][eventType][argumentTuple] -> [origIds],
        and collect named-entity head token ids into
        self.namedEntityHeadTokenIds.
        """
        self.namedEntityHeadTokenIds = set()
        self.gsEvents = {}  # [token]->[event-type]->[1-n argument sets]
        for token in sentenceGraph.tokens:
            self.gsEvents[token] = {}
        for entity in sentenceGraph.entities:
            if entity.get("type") == "neg":
                continue
            elif entity.get("isName") == "True":
                self.namedEntityHeadTokenIds.add(sentenceGraph.entityHeadTokenByEntity[entity].get("id"))
                continue
            eId = entity.get("id")
            eOrigId = entity.get("origId")
            assert not self.eventsByOrigId.has_key(eOrigId)
            self.eventsByOrigId[eOrigId] = entity
            if not self.examplesByEventOrigId.has_key(eOrigId):
                self.examplesByEventOrigId[eOrigId] = 0
            if len(sentenceGraph.interSentenceInteractions) > 0:
                for interaction in sentenceGraph.interSentenceInteractions:
                    if interaction.get("e1") == eId:
                        self.interSentenceEvents.add(eOrigId)
            eType = entity.get("type")
            # Arguments are identified by (interaction type, head token id)
            # so that argument sets can be compared across entities.
            arguments = set()
            for interaction in sentenceGraph.interactions:
                if interaction.get("e1") == eId:
                    e2 = sentenceGraph.entitiesById[interaction.get("e2")]
                    e2TokenId = sentenceGraph.entityHeadTokenByEntity[e2].get("id")
                    arguments.add((interaction.get("type"), e2TokenId))
                    #arguments.add( (interaction.get("type"), interaction.get("e2") ) )
            arguments = tuple(sorted(list(arguments)))
            eHeadToken = sentenceGraph.entityHeadTokenByEntity[entity]
            self.headTokensByOrigId[eOrigId] = eHeadToken
            if not self.gsEvents[eHeadToken].has_key(eType):
                self.gsEvents[eHeadToken][eType] = {}
            if len(arguments) > 0:
                if not self.gsEvents[eHeadToken][eType].has_key(arguments):
                    self.gsEvents[eHeadToken][eType][arguments] = []
                self.gsEvents[eHeadToken][eType][arguments].append(eOrigId)

    def getGSEventType(self, sentenceGraph, eHeadToken, themeTokens, causeTokens):
        """
        Return (categoryName, eventOrigIds) for the candidate event at
        eHeadToken with the given theme/cause tokens, by matching against
        the gold-standard index built by makeGSEvents. Multiple matching
        types are merged into a "---"-joined category name.
        """
        #eHeadToken = sentenceGraph.entityHeadTokenByEntity[entity]
        #eType = entity.get("type")
        if len(self.gsEvents[eHeadToken]) == 0:
            return "neg", []
        argumentSet = set()
        for themeNode in themeTokens:
            if themeNode != None:
                argumentSet.add(("Theme", themeNode.get("id")))
        for causeNode in causeTokens:
            if causeNode != None:
                argumentSet.add(("Cause", causeNode.get("id")))
        argumentSet = tuple(sorted(list(argumentSet)))
        gsTypes = set()
        eventIds = []
        for eventType in sorted(self.gsEvents[eHeadToken].keys()):
            if argumentSet in self.gsEvents[eHeadToken][eventType].keys():
                gsTypes.add(eventType)
                eventIds.extend(self.gsEvents[eHeadToken][eventType][argumentSet])
        if len(gsTypes) == 0:
            return "neg", eventIds
        elif len(gsTypes) == 1:
            return list(gsTypes)[0], eventIds
        else:
            gsTypes = sorted(list(gsTypes))
            string = gsTypes[0]
            for gsType in gsTypes[1:]:
                string += "---" + gsType
            return string, eventIds

    def nxMultiDiGraphToUndirected(self, graph):
        """
        Copy a NetworkX multi-di-graph into an undirected MultiGraph with
        the same nodes and edges (needed for shortest-path search).
        """
        undirected = NX10.MultiGraph(name=graph.name)
        undirected.add_nodes_from(graph)
        undirected.add_edges_from(graph.edges_iter())
        return undirected

    def buildExamples(self, sentenceGraph):
        """
        Build event examples for every gazetteer-matched trigger token,
        pairing it with candidate (theme, cause) argument combinations;
        regulation triggers get full token-pair combinations, binding
        triggers additionally get name-token pairs as two-theme events.
        """
        self.makeGSEvents(sentenceGraph)
        self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
        self.triggerFeatureBuilder.initSentence(sentenceGraph)
        examples = []
        exampleIndex = 0
        #undirected = sentenceGraph.dependencyGraph.to_undirected()
        undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
        eventTokens = []
        nameTokens = []
        # Gazetteer category map per token; None key used for None arguments.
        gazCategories = {None: {"neg": -1}}
        #stems = {}
        for token in sentenceGraph.tokens:
            gazText = self.getGazetteerMatch(token.get("text").lower())
            if gazText != None:
                gazCategories[token] = self.gazetteer[gazText]
            else:
                gazCategories[token] = {"neg": -1}
            if token.get("id") in self.namedEntityHeadTokenIds:
                nameTokens.append(token)
            elif gazText != None:
                eventTokens.append(token)
        allTokens = eventTokens + nameTokens
        #if len(nameTokens) == 0: # there can be no events in this sentence
        #    self.gsEvents = None
        #    return []
        for token in eventTokens:
            #gazCategories = self.gazetteer[token.get("text").lower()]
            #print token.get("text").lower(), gazCategories
            #multiargument = False
            potentialRegulation = False
            potentialBinding = False
            for key in gazCategories[token].keys():
                if key in ["Regulation", "Positive_regulation", "Negative_regulation"]:
                    #multiargument = True
                    potentialRegulation = True
                    break
            for key in gazCategories[token].keys():
                if key in ["Binding"]:
                    #multiargument = True
                    potentialBinding = True
                    break
            if potentialRegulation:
                # (theme, cause) candidates: all ordered token pairs, cause may be None
                combinations = combine.combine(allTokens, allTokens + [None])
            else:
                combinations = []
                for t2 in nameTokens:  #allTokens:
                    combinations.append((t2, None))
            if potentialBinding:
                # Unordered name-token pairs become two-theme binding candidates.
                for i in range(len(nameTokens) - 1):
                    for j in range(i + 1, len(nameTokens)):
                        combinations.append(((nameTokens[i], nameTokens[j]), None))
            for combination in combinations:
                theme2Binding = False
                if type(combination[0]) == types.ListType or type(combination[0]) == types.TupleType:
                    theme2Binding = True
                    categoryName, eventIds = self.getGSEventType(sentenceGraph, token, combination[0], [combination[1]])
                else:
                    categoryName, eventIds = self.getGSEventType(sentenceGraph, token, [combination[0]], [combination[1]])
                for id in eventIds:
                    self.examplesByEventOrigId[id] += 1
                # Decide whether to skip this candidate; record the reason.
                skip = False
                s = self.skippedByTypeAndReason
                if not s.has_key(categoryName):
                    s[categoryName] = {}
                if gazCategories[token].get("neg", -1) > 0.99:
                    pass
                if combination[0] == combination[1]:
                    pass  #skip = True
                if combination[0] == token or combination[1] == token:
                    # Trigger used as its own argument ("duparg"), allowed
                    # only for gazetteer-positive Positive_regulation.
                    if theme2Binding or gazCategories[combination[0]].get("Positive_regulation", -1) < 0:
                        skip = True
                        s[categoryName]["duparg"] = s[categoryName].get("duparg", 0) + 1
                if combination[0] == None and combination[1] == None:
                    skip = True
                    s[categoryName]["noncmb"] = s[categoryName].get("noncmb", 0) + 1
                validCat = self.isValidEvent(paths, sentenceGraph, token, combination)
                if validCat != "OK":  #not self.isValidEvent(paths, sentenceGraph, token, combination):
                    skip = True
                    #s[categoryName]["valid"] = s[categoryName].get("valid", 0) + 1
                    s[categoryName][validCat] = s[categoryName].get(validCat, 0) + 1
                if len(nameTokens) == 0:
                    skip = True
                    s[categoryName]["non"] = s[categoryName].get("non", 0) + 1
                if theme2Binding:
                    if gazCategories[combination[0][0]].get("neg", -1) > 0.99 or gazCategories[combination[0][1]].get("neg", -1) > 0.99:
                        skip = True
                        s[categoryName]["gazarg"] = s[categoryName].get("gazarg", 0) + 1
                else:
                    if gazCategories[combination[0]].get("neg", -1) > 0.99 or gazCategories[combination[1]].get("neg", -1) > 0.99:
                        skip = True
                        s[categoryName]["gazarg"] = s[categoryName].get("gazarg", 0) + 1
                # With negFrac set, only negatives are actually skipped;
                # skipped positives are still built (recall-preserving).
                if (skip and self.negFrac == None) or (skip and self.negFrac != None and categoryName == "neg"):
                    self.skippedByType[categoryName] = self.skippedByType.get(categoryName, 0) + 1
                else:
                    if self.negFrac == None or categoryName != "neg" or (categoryName == "neg" and self.negRand.random() < self.negFrac):
                        self.builtByType[categoryName] = self.builtByType.get(categoryName, 0) + 1
                        if theme2Binding:
                            newExample = self.buildExample(exampleIndex, sentenceGraph, paths, token, combination[0], [combination[1]])
                        else:
                            newExample = self.buildExample(exampleIndex, sentenceGraph, paths, token, [combination[0]], [combination[1]])
                        if len(eventIds) > 0:
                            newExample[3]["numEv"] = str(len(eventIds))
                        examples.append(newExample)
                        exampleIndex += 1
        self.gsEvents = None
        return examples

    def isValidEvent(self, paths, sentenceGraph, eventToken, argTokens):
        """
        Path-gazetteer validity check for a candidate event. Returns "OK"
        or a short rejection-reason string. The path-based filters are
        currently disabled ('if False:' blocks), so only the trivial
        one-token / no-paths cases can reject.
        """
        # This one lets through Positive_regulations that are
        # excluded from the duparg-rule
        oneTokenEvent = True
        for argToken in argTokens:
            if argToken != None and eventToken != argToken:
                oneTokenEvent = False
                break
        if oneTokenEvent:
            return "OK"  #True
        if not paths.has_key(eventToken):
            return "nopaths"  #False
        # Flatten binding-style (tuple/list) arguments.
        newArgTokens = []
        for argToken in argTokens:
            if type(argToken) == types.ListType or type(argToken) == types.TupleType:
                newArgTokens.extend(argToken)
            else:
                newArgTokens.append(argToken)
        argTokens = newArgTokens
        oneArgValid = True
        if False:  # disabled: require at least one gazetteer-attested dependency path
            oneArgValid = False
            for argToken in argTokens:
                if argToken == None:
                    continue
                if paths[eventToken].has_key(argToken):
                    path = paths[eventToken][argToken]
                else:
                    #print argToken, argToken.get("text")
                    #return False
                    continue
                depPaths = self.multiEdgeFeatureBuilder.getEdgeCombinations(sentenceGraph.dependencyGraph, path)
                validArg = False
                for p in depPaths:
                    if p in self.pathGazetteer and self.pathGazetteer[p][0] > 0:
                        validArg = True
                        break
                if validArg:
                    oneArgValid = True
        # The first and last dependency of a path
        if False:  # disabled: require an attested (first, last) dependency pair
            oneEdgeValid = False
            for argToken in argTokens:
                if argToken == None:
                    continue
                if paths[eventToken].has_key(argToken):
                    path = paths[eventToken][argToken]
                else:
                    #print argToken, argToken.get("text")
                    #return False
                    continue
                depPaths = self.multiEdgeFeatureBuilder.getEdgeCombinations(sentenceGraph.dependencyGraph, path)
                validArg = False
                for p in depPaths:
                    p = p.replace("<", "")
                    p = p.replace(">", "")
                    p = p.split(".")
                    pair = (p[0], p[-1])
                    if pair in self.pathGazetteerPairs:
                        validArg = True
                        break
                if validArg:
                    oneEdgeValid = True
                    break
            if not oneEdgeValid:
                return "pair"
        # Event must not have unseen dependencies in any of its paths
        if False:  # disabled: reject paths containing unseen dependency types
            for argToken in argTokens:
                if argToken == None:
                    continue
                if paths[eventToken].has_key(argToken):
                    path = paths[eventToken][argToken]
                else:
                    continue
                deps = self.multiEdgeFeatureBuilder.getEdgeSet(sentenceGraph.dependencyGraph, path)
                for d in deps:
                    if d[2].get("type") not in self.pathGazetteerDependencies:
                        #print "Unk", d[2].get("type")
                        return "unkdep"
        # validArg = True
        # for p in depPaths:
        #     if p in self.pathGazetteer and self.pathGazetteer[p][0] == 0:
        #         validArg = False
        #         break
        # if not validArg:
        #     return False
        if not oneArgValid:
            return "novalidarg"  #False
        return "OK"  #True

    def setGazetteerFeatures(self, token, tag):
        """Add gazetteer category features (prefixed by 'tag') for token."""
        gazText = self.getGazetteerMatch(token.get("text").lower())
        if gazText != None:
            gazCategories = self.gazetteer[gazText]
            for k, v in gazCategories.iteritems():
                self.setFeature(tag + "gaz_event_value_" + k, v)
                self.setFeature(tag + "gaz_event_" + k, 1)
                if k.find("egulation") != -1:
                    self.setFeature(tag + "potReg", 1)
        else:
            self.setFeature(tag + "notInGaz", 1)

    def buildExample(self, exampleIndex, sentenceGraph, paths, eventToken, themeTokens, causeTokens=None):
        """
        Build one (id, class, features, extra) example for the candidate
        event at eventToken with the given theme and cause tokens.
        """
        features = {}
        self.features = features
        categoryName, eventIds = self.getGSEventType(sentenceGraph, eventToken, themeTokens, causeTokens)
        category = self.classSet.getId(categoryName)
        potentialRegulation = False
        eventTokenText = eventToken.get("text").lower()
        gazText = self.getGazetteerMatch(eventTokenText)
        gazCategories = self.gazetteer[gazText]
        for k, v in gazCategories.iteritems():
            if k.find("egulation") != -1:
                potentialRegulation = True
        self.setGazetteerFeatures(eventToken, "")
        self.triggerFeatureBuilder.setFeatureVector(self.features)
        self.triggerFeatureBuilder.tag = "trg_"
        self.triggerFeatureBuilder.buildFeatures(eventToken)
        themeEntities = []
        hasTheme = False
        if len(themeTokens) > 1:
            self.setFeature("multiTheme", 1)
            potentialRegulation = False
        for themeToken in themeTokens:
            if themeToken != None:
                hasTheme = True
                self.setGazetteerFeatures(themeToken, "theme_")
                self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, themeToken, "theme_")
                self.triggerFeatureBuilder.tag = "ttrg_"
                self.triggerFeatureBuilder.buildFeatures(themeToken)
                themeEntity = None
                if sentenceGraph.entitiesByToken.has_key(themeToken):
                    for themeEntity in sentenceGraph.entitiesByToken[themeToken]:
                        if themeEntity.get("isName") == "True":
                            self.setFeature("themeProtein", 1)
                            if potentialRegulation:
                                self.setFeature("regulationThemeProtein", 1)
                            themeEntities.append(themeEntity)
                            break
                if not features.has_key("themeProtein"):
                    self.setFeature("themeEvent", 1)
                    self.setFeature("nestingEvent", 1)
                    if potentialRegulation:
                        self.setFeature("regulationThemeEvent", 1)
        # BUGFIX: was 'if hasTheme:', which set the "noTheme" marker on
        # examples that DO have a theme; inverted to match the parallel
        # 'if not hasCause:' handling below.
        if not hasTheme:
            self.setFeature("noTheme", 1)
        causeEntities = []
        hasCause = False
        for causeToken in causeTokens:
            if causeToken != None:
                hasCause = True
                self.setGazetteerFeatures(causeToken, "cause_")
                self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, causeToken, "cause_")
                self.triggerFeatureBuilder.tag = "ctrg_"
                self.triggerFeatureBuilder.buildFeatures(causeToken)
                causeEntity = None
                if sentenceGraph.entitiesByToken.has_key(causeToken):
                    for causeEntity in sentenceGraph.entitiesByToken[causeToken]:
                        if causeEntity.get("isName") == "True":
                            self.setFeature("causeProtein", 1)
                            if potentialRegulation:
                                self.setFeature("regulationCauseProtein", 1)
                            causeEntities.append(causeEntity)
                            break
                if not features.has_key("causeProtein"):
                    self.setFeature("causeEvent", 1)
                    self.setFeature("nestingEvent", 1)
                    if potentialRegulation:
                        self.setFeature("regulationCauseEvent", 1)
        if not hasCause:
            self.setFeature("noCause", 1)
        self.triggerFeatureBuilder.tag = ""
        self.triggerFeatureBuilder.setFeatureVector(None)
        # Common features
        # if e1Type.find("egulation") != -1: # leave r out to avoid problems with capitalization
        #     if entity2.get("isName") == "True":
        #         features[self.featureSet.getId("GENIA_regulation_of_protein")] = 1
        #     else:
        #         features[self.featureSet.getId("GENIA_regulation_of_event")] = 1
        # define extra attributes
        extra = {"xtype": "event", "type": categoryName}
        extra["et"] = eventToken.get("id")
        if len(eventIds) > 0:
            eventIds.sort()
            extra["eids"] = ""
            for eventId in eventIds:
                extra["eids"] += str(eventId) + ","
            extra["eids"] = extra["eids"][:-1]
        for themeToken in themeTokens:
            if themeToken != None:
                if extra.has_key("tt"):
                    extra["tt"] = extra["tt"] + "," + themeToken.get("id")
                else:
                    extra["tt"] = themeToken.get("id")
        for themeEntity in themeEntities:
            if extra.has_key("t"):
                extra["t"] = extra["t"] + "," + themeEntity.get("id")
            else:
                extra["t"] = themeEntity.get("id")
        # NOTE(review): unlike the theme handling above, this always stores
        # causeTokens[0], not each causeToken — fine for single-cause
        # events; confirm multiple causes never occur here.
        for causeToken in causeTokens:
            if causeToken != None:
                extra["ct"] = causeTokens[0].get("id")
        if len(causeEntities) > 0:
            extra["c"] = causeEntities[0].get("id")
        sentenceOrigId = sentenceGraph.sentenceElement.get("origId")
        if sentenceOrigId != None:
            extra["SOID"] = sentenceOrigId
        # make example
        #assert (category == 1 or category == -1)
        self.features = None
        return (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra)

    def buildArgumentFeatures(self, sentenceGraph, paths, features, eventToken, argToken, tag):
        """
        Add dependency-path features (prefixed by 'tag') for the
        eventToken -> argToken argument edge.
        """
        #eventToken = sentenceGraph.entityHeadTokenByEntity[eventNode]
        #argToken = sentenceGraph.entityHeadTokenByEntity[argNode]
        self.multiEdgeFeatureBuilder.tag = tag
        self.multiEdgeFeatureBuilder.setFeatureVector(features, None, None, False)
        if eventToken != argToken and paths.has_key(eventToken) and paths[eventToken].has_key(argToken):
            path = paths[eventToken][argToken]
            edges = self.multiEdgeFeatureBuilder.getEdges(sentenceGraph.dependencyGraph, path)
        else:
            path = [eventToken, argToken]
            edges = None
        # if not "disable_entity_features" in self.styles:
        #     # doesn't improve beyond 52.32
        #     self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph)
        # # buildPathLengthFeatures 52.32 -> 51-51
        # self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path)
        # if not "disable_terminus_features" in self.styles:
        #     # didn't improve from 52.32
        #     self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast
        if not "disable_single_element_features" in self.styles:
            # 50.74 -> 52.32
            self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, edges, sentenceGraph)
        if not "disable_ngram_features" in self.styles:
            # ngrams alone - 50.74
            self.multiEdgeFeatureBuilder.buildPathGrams(2, path, edges, sentenceGraph)  # remove for fast
            self.multiEdgeFeatureBuilder.buildPathGrams(3, path, edges, sentenceGraph)  # remove for fast
            self.multiEdgeFeatureBuilder.buildPathGrams(4, path, edges, sentenceGraph)  # remove for fast
        # disabling length 4 drops performance
        # if not "disable_path_edge_features" in self.styles:
        #     self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, edges, sentenceGraph)
        # self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph)
        # buildSentenceFeatures seems to decrease performance by 8 %-points
        self.multiEdgeFeatureBuilder.setFeatureVector(None, None, None, False)
        self.multiEdgeFeatureBuilder.tag = ""
class UnmergedEdgeExampleBuilder(ExampleBuilder): def __init__(self, style=["typed", "directed", "headsOnly"], length=None, types=[], featureSet=None, classSet=None): if featureSet == None: featureSet = IdSet() if classSet == None: classSet = IdSet(1) else: classSet = classSet assert (classSet.getId("neg") == 1) ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet) self.styles = style self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet) if "noAnnType" in self.styles: self.multiEdgeFeatureBuilder.noAnnType = True if "noMasking" in self.styles: self.multiEdgeFeatureBuilder.maskNamedEntities = False if "maxFeatures" in self.styles: self.multiEdgeFeatureBuilder.maximum = True self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet) self.pathLengths = length assert (self.pathLengths == None) self.types = types if "random" in self.styles: from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet) #self.outFile = open("exampleTempFile.txt","wt") @classmethod def run(cls, input, output, parse, tokenization, style, idFileTag=None): classSet, featureSet = cls.getIdSets(idFileTag) if style == None: e = UnmergedEdgeExampleBuilder(classSet=classSet, featureSet=featureSet) else: e = UnmergedEdgeExampleBuilder(style=style, classSet=classSet, featureSet=featureSet) sentences = cls.getSentences(input, parse, tokenization) e.buildExamplesForSentences(sentences, output, idFileTag) print e.classSet.Ids def definePredictedValueRange(self, sentences, elementName): self.multiEdgeFeatureBuilder.definePredictedValueRange( sentences, elementName) def getPredictedValueRange(self): return self.multiEdgeFeatureBuilder.predictedRange def filterEdgesByType(self, edges, typesToInclude): if len(typesToInclude) == 0: return edges edgesToKeep = [] for edge in edges: if edge.get("type") in typesToInclude: edgesToKeep.append(edge) return edgesToKeep def getCategoryName(self, 
sentenceGraph, e1, e2, directed=True): # Dummies are potential entities that do not exist in the # training data. If both entities of an interaction are dummies # it can't exist in the training data and is therefore a negative if e1[2] or e2[2]: return "neg" e1 = e1[0] e2 = e2[0] interactions = sentenceGraph.getInteractions(e1, e2) if not directed: interactions.extend(sentenceGraph.getInteractions(e2, e1)) types = set() for interaction in interactions: types.add(interaction.attrib["type"]) types = list(types) types.sort() categoryName = "" for name in types: if categoryName != "": categoryName += "---" categoryName += name if categoryName != "": return categoryName else: return "neg" def preProcessExamples(self, allExamples): if "normalize" in self.styles: print >> sys.stderr, " Normalizing feature vectors" ExampleUtils.normalizeFeatureVectors(allExamples) return allExamples def isPotentialGeniaInteraction(self, e1, e2): if e1.get("isName") == "True": return False else: return True def nxMultiDiGraphToUndirected(self, graph): undirected = NX10.MultiGraph(name=graph.name) undirected.add_nodes_from(graph) undirected.add_edges_from(graph.edges_iter()) return undirected def getInteractionEdgeLengths(self, sentenceGraph, paths): """ Return dependency and linear length of all interaction edges (measured between the two tokens). 
""" interactionLengths = {} for interaction in sentenceGraph.interactions: # Calculated interaction edge dep and lin length e1 = sentenceGraph.entitiesById[interaction.get("e1")] e2 = sentenceGraph.entitiesById[interaction.get("e2")] t1 = sentenceGraph.entityHeadTokenByEntity[e1] t2 = sentenceGraph.entityHeadTokenByEntity[e2] # Get dep path length if t1 != t2 and paths.has_key(t1) and paths[t1].has_key(t2): pathLength = len(paths[t1][t2]) else: # no dependencyPath pathLength = 999999 # more than any real path # Linear distance t1Pos = -1 t2Pos = -1 for i in range(len(sentenceGraph.tokens)): if sentenceGraph.tokens[i] == t1: t1Pos = i if t2Pos != -1: break if sentenceGraph.tokens[i] == t2: t2Pos = i if t1Pos != -1: break linLength = abs(t1Pos - t2Pos) interactionLengths[interaction] = (pathLength, linLength) return interactionLengths def getPrecedenceLevels(self, sentenceGraph, paths): """ Get overlapping entity precedence """ interactionLengths = self.getInteractionEdgeLengths( sentenceGraph, paths) interactionsByEntity = {} # Convenience mapping entityPrecedenceValues = {} for entity in sentenceGraph.entities: interactionsByEntity[entity] = [] eId = entity.get("id") # Add access to interactions argDepDist = 0 # Sum of lengths of shortest paths argLinDist = 0 # Sum of linear distances for interaction in sentenceGraph.interactions: if interaction.get( "e1" ) == eId: # An argument of the entity defined by the node interactionsByEntity[entity].append(interaction) argDepDist += interactionLengths[interaction][0] argLinDist += interactionLengths[interaction][1] # Store precedence counts (num args, sum of dep lengths, sum of lin lengths) entityPrecedenceValues[entity] = (len(interactionsByEntity), argDepDist, argLinDist, entity) # Determine level of entity from precedence counts levelByEntity = {} # slot number #levelByInteraction = {} # slot number of parent node # There is one slot group per token, per type for token in sentenceGraph.tokens: # per token entitiesByType 
= {} for entity in sentenceGraph.tokenIsEntityHead[token]: # per type if entity.get( "isName") == "True": # Names can never have duplicates assert not levelByEntity.has_key(entity) levelByEntity[entity] = 0 continue eType = entity.get("type") if eType == "neg": continue if not entitiesByType.has_key(eType): entitiesByType[eType] = [] entitiesByType[eType].append(entity) for eType in sorted(entitiesByType.keys()): # Slot ordering by precedence sortedEntities = [] for entity in entitiesByType[eType]: sortedEntities.append(entityPrecedenceValues[entity]) sortedEntities.sort(compareEntityPrecedence) level = 0 for precedenceTuple in sortedEntities: entity = precedenceTuple[3] assert not levelByEntity.has_key(entity) levelByEntity[entity] = level # Interactions have the same slot as their parent entity #for interaction in interactionsByEntity[entity]: # assert not levelByInteraction.has_key(interaction) # levelByInteraction[interaction] = level level += 1 return levelByEntity #, levelByInteraction def buildExamples(self, sentenceGraph): examples = [] exampleIndex = 0 #undirected = sentenceGraph.getUndirectedDependencyGraph() undirected = self.nxMultiDiGraphToUndirected( sentenceGraph.dependencyGraph) ##undirected = sentenceGraph.dependencyGraph.to_undirected() ###undirected = NX10.MultiGraph(sentenceGraph.dependencyGraph) This didn't work paths = NX10.all_pairs_shortest_path(undirected, cutoff=999) # Determine overlapping entity precedence #levelByEntity, levelByInteraction = self.getPrecedenceLevels(sentenceGraph, paths) levelByEntity = self.getPrecedenceLevels(sentenceGraph, paths) entities = [] # There is one entity group for each token, for each type of entity for token in sentenceGraph.tokens: # per token entitiesByType = {} for entity in sentenceGraph.tokenIsEntityHead[token]: # per type if entity.get( "isName") == "True": # Names can never have duplicates entities.append((entity, 0, False)) continue eType = entity.get("type") if eType == "neg": continue if not 
entitiesByType.has_key(eType): entitiesByType[eType] = [] entitiesByType[eType].append(entity) # Create slot groups for tokens for which exists at least one entity eTypes = sorted(entitiesByType.keys()) if len(eTypes) == 0: continue # Create slot groups and insert GS data there for eType in eTypes: # Use first entity of a type as the dummy entity for unfilled slots dummyEntity = entitiesByType[eType][0] # Define entity slots entityGroup = [None, None, None, None] #entityGroup = [None, None] # Insert existing entities into slots for entity in entitiesByType[eType]: if levelByEntity.has_key(entity): level = levelByEntity[entity] if level < len(entityGroup): entityGroup[level] = (entity, level, False) # Create dummies for potential entities for i in range(len(entityGroup)): if entityGroup[i] == None: entityGroup[i] = (dummyEntity, i, True) # Put all slots into one potential entity list #print entityGroup for e in entityGroup: entities.append(e) # Generate examples based on interactions between entities for i in range(len(entities) - 1): for j in range(i + 1, len(entities)): eI = entities[i][0] eJ = entities[j][0] tI = sentenceGraph.entityHeadTokenByEntity[eI] tJ = sentenceGraph.entityHeadTokenByEntity[eJ] # define forward example categoryName = self.getCategoryName(sentenceGraph, entities[i], entities[j], True) if (not "genia_limits" in self.styles) or self.isPotentialGeniaInteraction( eI, eJ): examples.append( self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, entities[i], entities[j])) exampleIndex += 1 # define reverse categoryName = self.getCategoryName(sentenceGraph, entities[j], entities[i], True) if (not "genia_limits" in self.styles) or self.isPotentialGeniaInteraction( eJ, eI): examples.append( self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, entities[j], entities[i])) exampleIndex += 1 return examples def buildExample(self, token1, token2, paths, sentenceGraph, categoryName, exampleIndex, e1=None, e2=None): 
entity1 = e1[0] entity2 = e2[0] # define features features = {} features[self.featureSet.getId("gov_level")] = e1[1] features[self.featureSet.getId("gov_level_" + str(e1[1]))] = 1 features[self.featureSet.getId("dep_level")] = e2[1] features[self.featureSet.getId("dep_level_" + str(e2[1]))] = 1 features[self.featureSet.getId("level_pair_" + str(e1[1]) + "_" + str(e2[1]))] = 1 if True: #token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2): if token1 != token2 and paths.has_key( token1) and paths[token1].has_key(token2): path = paths[token1][token2] else: path = [token1, token2] assert (self.pathLengths == None) if self.pathLengths == None or len(path) - 1 in self.pathLengths: if not "no_dependency" in self.styles: if token1 != token2 and paths.has_key( token1) and paths[token1].has_key(token2): edges = self.multiEdgeFeatureBuilder.getEdges( sentenceGraph.dependencyGraph, path) else: edges = None if "entity_type" in self.styles: features[self.featureSet.getId("e1_" + entity1.attrib["type"])] = 1 features[self.featureSet.getId("e2_" + entity2.attrib["type"])] = 1 features[self.featureSet.getId("distance_" + str(len(path)))] = 1 if not "no_dependency" in self.styles: self.multiEdgeFeatureBuilder.setFeatureVector( features, entity1, entity2) #self.multiEdgeFeatureBuilder.buildStructureFeatures(sentenceGraph, paths) # remove for fast if not "disable_entity_features" in self.styles: self.multiEdgeFeatureBuilder.buildEntityFeatures( sentenceGraph) self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path) if not "disable_terminus_features" in self.styles: self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures( path, sentenceGraph) # remove for fast if not "disable_single_element_features" in self.styles: self.multiEdgeFeatureBuilder.buildSingleElementFeatures( path, edges, sentenceGraph) if not "disable_ngram_features" in self.styles: self.multiEdgeFeatureBuilder.buildPathGrams( 2, path, edges, sentenceGraph) # remove for fast 
self.multiEdgeFeatureBuilder.buildPathGrams( 3, path, edges, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams( 4, path, edges, sentenceGraph) # remove for fast #self.buildEdgeCombinations(path, edges, sentenceGraph, features) # remove for fast #if edges != None: # self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[0], edges[0][1]+edges[1][0], "t1", sentenceGraph) # remove for fast # self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[-1], edges[len(path)-1][len(path)-2]+edges[len(path)-2][len(path)-1], "t2", sentenceGraph) # remove for fast if not "disable_path_edge_features" in self.styles: self.multiEdgeFeatureBuilder.buildPathEdgeFeatures( path, edges, sentenceGraph) self.multiEdgeFeatureBuilder.buildSentenceFeatures( sentenceGraph) self.multiEdgeFeatureBuilder.setFeatureVector(None) if not "no_linear" in self.styles: self.tokenFeatureBuilder.setFeatureVector(features) for i in range(len(sentenceGraph.tokens)): if sentenceGraph.tokens[i] == token1: token1Index = i if sentenceGraph.tokens[i] == token2: token2Index = i linearPreTag = "linfw_" if token1Index > token2Index: token1Index, token2Index = token2Index, token1Index linearPreTag = "linrv_" self.tokenFeatureBuilder.buildLinearOrderFeatures( token1Index, sentenceGraph, 2, 2, preTag="linTok1") self.tokenFeatureBuilder.buildLinearOrderFeatures( token2Index, sentenceGraph, 2, 2, preTag="linTok2") # Before, middle, after # self.tokenFeatureBuilder.buildTokenGrams(0, token1Index-1, sentenceGraph, "bf") # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, "bw") # self.tokenFeatureBuilder.buildTokenGrams(token2Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, "af") # before-middle, middle, middle-after # self.tokenFeatureBuilder.buildTokenGrams(0, token2Index-1, sentenceGraph, linearPreTag+"bf", max=2) # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, linearPreTag+"bw", max=2) # 
self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, linearPreTag+"af", max=2) self.tokenFeatureBuilder.setFeatureVector(None) if "random" in self.styles: self.randomFeatureBuilder.setFeatureVector(features) self.randomFeatureBuilder.buildRandomFeatures(100, 0.01) self.randomFeatureBuilder.setFeatureVector(None) if "genia_limits" in self.styles: e1Type = entity1.get("type") e2Type = entity2.get("type") assert (entity1.get("isName") == "False") if entity2.get("isName") == "True": features[self.featureSet.getId( "GENIA_target_protein")] = 1 else: features[self.featureSet.getId( "GENIA_nested_event")] = 1 if e1Type.find( "egulation" ) != -1: # leave r out to avoid problems with capitalization if entity2.get("isName") == "True": features[self.featureSet.getId( "GENIA_regulation_of_protein")] = 1 else: features[self.featureSet.getId( "GENIA_regulation_of_event")] = 1 else: features[self.featureSet.getId("always_negative")] = 1 if "subset" in self.styles: features[self.featureSet.getId("out_of_scope")] = 1 else: features[self.featureSet.getId("always_negative")] = 1 if "subset" in self.styles: features[self.featureSet.getId("out_of_scope")] = 1 path = [token1, token2] # define extra attributes if int(path[0].attrib["id"].split("_")[-1]) < int( path[-1].attrib["id"].split("_")[-1]): #extra = {"xtype":"edge","type":"i","t1":path[0],"t2":path[-1]} extra = { "xtype": "ue", "type": "i", "t1": path[0].get("id"), "t2": path[-1].get("id") } extra["deprev"] = False else: #extra = {"xtype":"edge","type":"i","t1":path[-1],"t2":path[0]} extra = { "xtype": "ue", "type": "i", "t1": path[-1].get("id"), "t2": path[0].get("id") } extra["deprev"] = True if entity1 != None: extra["e1"] = entity1.get("id") extra["l1"] = str(e1[1]) extra["d1"] = str(e1[2])[ 0] # is a dummy node (an entity not in existing triggers) if entity2 != None: extra["e2"] = entity2.get("id") extra["l2"] = str(e2[1]) extra["d2"] = str(e2[2])[ 0] # is a dummy node (an 
entity not in existing triggers) extra["categoryName"] = categoryName sentenceOrigId = sentenceGraph.sentenceElement.get("origId") if sentenceOrigId != None: extra["SOID"] = sentenceOrigId # make example if "binary" in self.styles: if categoryName != "neg": category = 1 else: category = -1 categoryName = "i" else: category = self.classSet.getId(categoryName) return (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra)
class DirectEventExampleBuilder(ExampleBuilder): def __init__(self, style=["typed","directed","headsOnly"], length=None, types=[], featureSet=None, classSet=None, gazetteer=None, pathGazetteer=None, negFrac=None): if featureSet == None: featureSet = IdSet() if classSet == None: classSet = IdSet(1) else: classSet = classSet assert( classSet.getId("neg") == 1 ) if gazetteer != None: print >> sys.stderr, "Loading gazetteer from", gazetteer self.gazetteer=Gazetteer.loadGztr(gazetteer) else: print >> sys.stderr, "No gazetteer loaded" self.gazetteer=None self.pathGazetteer=None self.pathGazetteerDependencies = None self.pathGazetteerPairs = None if pathGazetteer != None: print >> sys.stderr, "Loading path gazetteer from", pathGazetteer self.pathGazetteer=PathGazetteer.load(pathGazetteer) self.pathGazetteerDependencies = PathGazetteer.getDependencies(self.pathGazetteer) self.pathGazetteerPairs = PathGazetteer.getPairs(self.pathGazetteer) else: print >> sys.stderr, "No path gazetteer loaded" ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet) self.styles = style self.negFrac = negFrac print >> sys.stderr, "Downsampling negatives to", negFrac self.negRand = random.Random() self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet) if True:#"noAnnType" in self.styles: self.multiEdgeFeatureBuilder.noAnnType = True if "noMasking" in self.styles: self.multiEdgeFeatureBuilder.maskNamedEntities = False if "maxFeatures" in self.styles: self.multiEdgeFeatureBuilder.maximum = True self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet) #self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet) #if "ontology" in self.styles: # self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet) self.pathLengths = length assert(self.pathLengths == None) self.types = types self.eventsByOrigId = {} self.headTokensByOrigId = {} self.interSentenceEvents = set() self.examplesByEventOrigId = {} 
self.skippedByType = {} self.skippedByTypeAndReason = {} self.builtByType = {} self.gazMatchCache = {} #self.outFile = open("exampleTempFile.txt","wt") @classmethod def run(cls, input, output, parse, tokenization, style, idFileTag=None, gazetteer=None, pathGazetteer=None, negFrac=None): classSet, featureSet = cls.getIdSets(idFileTag) if style != None: e = DirectEventExampleBuilder(style=style, classSet=classSet, featureSet=featureSet, gazetteer=gazetteer, pathGazetteer=pathGazetteer, negFrac=negFrac) else: e = DirectEventExampleBuilder(classSet=classSet, featureSet=featureSet, gazetteer=gazetteer, pathGazetteer=pathGazetteer, negFrac=negFrac) sentences = cls.getSentences(input, parse, tokenization) e.buildExamplesForSentences(sentences, output, idFileTag) e.printStats() def getGazetteerMatch(self, string): if string in self.gazMatchCache: return self.gazMatchCache[string] origString = string if "stem_gazetteer" in self.styles: string = PorterStemmer.stem(string) if string in self.gazetteer: self.gazMatchCache[origString] = string return string elif string.find("-") != -1: replaced = string.replace("-","") else: self.gazMatchCache[origString] = None return None if replaced in self.gazetteer: self.gazMatchCache[origString] = replaced return replaced else: splitted = string.rsplit("-",1)[-1] if splitted in self.gazetteer: self.gazMatchCache[origString] = splitted return splitted else: self.gazMatchCache[origString] = None return None def isInGazetteer(self, string): return self.getGazetteerMatch(string) != None def printStats(self): eventsByType = {} for event in self.eventsByOrigId.values(): eventsByType[event.get("type")] = eventsByType.get(event.get("type"),0) + 1 f = open("missed-events", "wt") missedEvents = {} for key in self.examplesByEventOrigId.keys(): if self.examplesByEventOrigId[key] == 0: if not missedEvents.has_key(self.eventsByOrigId[key].get("type")): missedEvents[self.eventsByOrigId[key].get("type")] = [] 
missedEvents[self.eventsByOrigId[key].get("type")].append(key) for key in sorted(missedEvents.keys()): f.write(key + "\n") for id in sorted(missedEvents[key]): f.write(" " + id + " ") if id in self.interSentenceEvents: f.write("intersentence ") text = self.headTokensByOrigId[id].get("text").lower() if not self.isInGazetteer(text): text = self.headTokensByOrigId[id].get("text").lower() if "stem_gazetteer" in self.styles: stemmed = PorterStemmer.stem(text) f.write("not-in-gazetteer (" + text + " / " + stemmed +")" ) f.write("\n") f.close() print >> sys.stderr, "Example selection missed events (other, intersentence, non-gazetteer)" for key in sorted(eventsByType.keys()): inter = 0 other = 0 nongaz = 0 if missedEvents.has_key(key): for id in missedEvents[key]: tokText = self.headTokensByOrigId[id].get("text").lower() if id in self.interSentenceEvents: inter += 1 elif not self.isInGazetteer(tokText): nongaz += 1 else: other += 1 if inter == other == nongaz == 0: print >> sys.stderr, " " + key + " (" + str(eventsByType[key]) + "): missed none" else: print >> sys.stderr, " " + key + " (" + str(eventsByType[key]) + "): " + str(other) + ", " + str(inter) + ", " + str(nongaz) print >> sys.stderr, "Example generation (total, built/skipped)" for key in sorted(list(set(self.skippedByType.keys() + self.builtByType.keys()))): string = " " + key + ": (" + str(self.builtByType.get(key,0)+self.skippedByType.get(key,0)) + ", " + str(self.builtByType.get(key,0)) + "/" + str(self.skippedByType.get(key,0)) + ") [" for key2 in sorted(self.skippedByTypeAndReason[key].keys()): string += key2 + ":" + str(self.skippedByTypeAndReason[key][key2]) + " " string += "]" print >> sys.stderr, string def definePredictedValueRange(self, sentences, elementName): self.multiEdgeFeatureBuilder.definePredictedValueRange(sentences, elementName) def getPredictedValueRange(self): return self.multiEdgeFeatureBuilder.predictedRange def preProcessExamples(self, allExamples): if "normalize" in self.styles: print 
>> sys.stderr, " Normalizing feature vectors" ExampleUtils.normalizeFeatureVectors(allExamples) return allExamples # def isPotentialGeniaInteraction(self, e1, e2): # if e1.get("isName") == "True" and e2.get("isName") == "True": # return False # elif e1.get("isName") == "True" and e2.get("isName") == "False": # return False # else: # return True def getArgumentEntities(self, sentenceGraph, entityNode): eId = entityNode.get("id") assert(eId != None) themeNodes = [] causeNodes = [] for edge in sentenceGraph.interactions: if edge.get("e1") == eId: edgeType = edge.get("type") assert(edgeType in ["Theme", "Cause"]), edgeType if edgeType == "Theme": themeNodes.append( sentenceGraph.entitiesById[edge.get("e2")] ) elif edgeType == "Cause": causeNodes.append( sentenceGraph.entitiesById[edge.get("e2")] ) return themeNodes, causeNodes def makeGSEvents(self, sentenceGraph): self.namedEntityHeadTokenIds = set() self.gsEvents = {} # [token]->[event-type]->[1-n argument sets] for token in sentenceGraph.tokens: self.gsEvents[token] = {} for entity in sentenceGraph.entities: if entity.get("type") == "neg": continue elif entity.get("isName") == "True": self.namedEntityHeadTokenIds.add(sentenceGraph.entityHeadTokenByEntity[entity].get("id")) continue eId = entity.get("id") eOrigId = entity.get("origId") assert not self.eventsByOrigId.has_key(eOrigId) self.eventsByOrigId[eOrigId] = entity if not self.examplesByEventOrigId.has_key(eOrigId): self.examplesByEventOrigId[eOrigId] = 0 if len(sentenceGraph.interSentenceInteractions) > 0: for interaction in sentenceGraph.interSentenceInteractions: if interaction.get("e1") == eId: self.interSentenceEvents.add(eOrigId) eType = entity.get("type") arguments = set() for interaction in sentenceGraph.interactions: if interaction.get("e1") == eId: e2 = sentenceGraph.entitiesById[interaction.get("e2")] e2TokenId = sentenceGraph.entityHeadTokenByEntity[e2].get("id") arguments.add( (interaction.get("type"), e2TokenId ) ) #arguments.add( 
(interaction.get("type"), interaction.get("e2") ) ) arguments = tuple(sorted(list(arguments))) eHeadToken = sentenceGraph.entityHeadTokenByEntity[entity] self.headTokensByOrigId[eOrigId] = eHeadToken if not self.gsEvents[eHeadToken].has_key(eType): self.gsEvents[eHeadToken][eType] = {} if len(arguments) > 0: if not self.gsEvents[eHeadToken][eType].has_key(arguments): self.gsEvents[eHeadToken][eType][arguments] = [] self.gsEvents[eHeadToken][eType][arguments].append(eOrigId) def getGSEventType(self, sentenceGraph, eHeadToken, themeTokens, causeTokens): #eHeadToken = sentenceGraph.entityHeadTokenByEntity[entity] #eType = entity.get("type") if len(self.gsEvents[eHeadToken]) == 0: return "neg", [] argumentSet = set() for themeNode in themeTokens: if themeNode != None: argumentSet.add( ("Theme", themeNode.get("id")) ) for causeNode in causeTokens: if causeNode != None: argumentSet.add( ("Cause", causeNode.get("id")) ) argumentSet = tuple(sorted(list(argumentSet))) gsTypes = set() eventIds = [] for eventType in sorted(self.gsEvents[eHeadToken].keys()): if argumentSet in self.gsEvents[eHeadToken][eventType].keys(): gsTypes.add(eventType) eventIds.extend(self.gsEvents[eHeadToken][eventType][argumentSet]) if len(gsTypes) == 0: return "neg", eventIds elif len(gsTypes) == 1: return list(gsTypes)[0], eventIds else: gsTypes = sorted(list(gsTypes)) string = gsTypes[0] for gsType in gsTypes[1:]: string += "---" + gsType return string, eventIds def nxMultiDiGraphToUndirected(self, graph): undirected = NX10.MultiGraph(name=graph.name) undirected.add_nodes_from(graph) undirected.add_edges_from(graph.edges_iter()) return undirected def buildExamples(self, sentenceGraph): self.makeGSEvents(sentenceGraph) self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True) self.triggerFeatureBuilder.initSentence(sentenceGraph) examples = [] exampleIndex = 0 #undirected = sentenceGraph.dependencyGraph.to_undirected() undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph) 
paths = NX10.all_pairs_shortest_path(undirected, cutoff=999) eventTokens = [] nameTokens = [] gazCategories = {None:{"neg":-1}} #stems = {} for token in sentenceGraph.tokens: gazText = self.getGazetteerMatch(token.get("text").lower()) if gazText != None: gazCategories[token] = self.gazetteer[gazText] else: gazCategories[token] = {"neg":-1} if token.get("id") in self.namedEntityHeadTokenIds: nameTokens.append(token) elif gazText != None: eventTokens.append(token) allTokens = eventTokens + nameTokens #if len(nameTokens) == 0: # there can be no events in this sentence # self.gsEvents = None # return [] for token in eventTokens: #gazCategories = self.gazetteer[token.get("text").lower()] #print token.get("text").lower(), gazCategories #multiargument = False potentialRegulation = False potentialBinding = False for key in gazCategories[token].keys(): if key in ["Regulation","Positive_regulation","Negative_regulation"]: #multiargument = True potentialRegulation = True break for key in gazCategories[token].keys(): if key in ["Binding"]: #multiargument = True potentialBinding = True break if potentialRegulation: combinations = combine.combine(allTokens, allTokens+[None]) else: combinations = [] for t2 in nameTokens: #allTokens: combinations.append( (t2, None) ) if potentialBinding: for i in range(len(nameTokens) - 1): for j in range(i+1, len(nameTokens)): combinations.append( ((nameTokens[i],nameTokens[j]), None) ) for combination in combinations: theme2Binding = False if type(combination[0]) == types.ListType or type(combination[0]) == types.TupleType: theme2Binding = True categoryName, eventIds = self.getGSEventType(sentenceGraph, token, combination[0], [combination[1]]) else: categoryName, eventIds = self.getGSEventType(sentenceGraph, token, [combination[0]], [combination[1]]) for id in eventIds: self.examplesByEventOrigId[id] += 1 skip = False s = self.skippedByTypeAndReason if not s.has_key(categoryName): s[categoryName] = {} if gazCategories[token].get("neg",-1) > 
0.99: pass if combination[0] == combination[1]: pass #skip = True if combination[0] == token or combination[1] == token: if theme2Binding or gazCategories[combination[0]].get("Positive_regulation",-1) < 0: skip = True s[categoryName]["duparg"] = s[categoryName].get("duparg", 0) + 1 if combination[0] == None and combination[1] == None: skip = True s[categoryName]["noncmb"] = s[categoryName].get("noncmb", 0) + 1 validCat = self.isValidEvent(paths, sentenceGraph, token, combination) if validCat != "OK": #not self.isValidEvent(paths, sentenceGraph, token, combination): skip = True #s[categoryName]["valid"] = s[categoryName].get("valid", 0) + 1 s[categoryName][validCat] = s[categoryName].get(validCat, 0) + 1 if len(nameTokens) == 0: skip = True s[categoryName]["non"] = s[categoryName].get("non", 0) + 1 if theme2Binding: if gazCategories[combination[0][0]].get("neg",-1) > 0.99 or gazCategories[combination[0][1]].get("neg",-1) > 0.99: skip = True s[categoryName]["gazarg"] = s[categoryName].get("gazarg", 0) + 1 else: if gazCategories[combination[0]].get("neg",-1) > 0.99 or gazCategories[combination[1]].get("neg",-1) > 0.99: skip = True s[categoryName]["gazarg"] = s[categoryName].get("gazarg", 0) + 1 if (skip and self.negFrac == None) or (skip and self.negFrac != None and categoryName == "neg"): self.skippedByType[categoryName] = self.skippedByType.get(categoryName, 0) + 1 else: if self.negFrac == None or categoryName != "neg" or (categoryName == "neg" and self.negRand.random() < self.negFrac): self.builtByType[categoryName] = self.builtByType.get(categoryName, 0) + 1 if theme2Binding: newExample = self.buildExample(exampleIndex, sentenceGraph, paths, token, combination[0], [combination[1]]) else: newExample = self.buildExample(exampleIndex, sentenceGraph, paths, token, [combination[0]], [combination[1]]) if len(eventIds) > 0: newExample[3]["numEv"] = str(len(eventIds)) examples.append( newExample ) exampleIndex += 1 self.gsEvents = None return examples def 
isValidEvent(self, paths, sentenceGraph, eventToken, argTokens): # This one lets through Positive_regulations that are # excluded from the duparg-rule oneTokenEvent = True for argToken in argTokens: if argToken != None and eventToken != argToken: oneTokenEvent = False break if oneTokenEvent: return "OK" #True if not paths.has_key(eventToken): return "nopaths" #False newArgTokens = [] for argToken in argTokens: if type(argToken) == types.ListType or type(argToken) == types.TupleType: newArgTokens.extend(argToken) else: newArgTokens.append(argToken) argTokens = newArgTokens oneArgValid = True if False: oneArgValid = False for argToken in argTokens: if argToken == None: continue if paths[eventToken].has_key(argToken): path = paths[eventToken][argToken] else: #print argToken, argToken.get("text") #return False continue depPaths = self.multiEdgeFeatureBuilder.getEdgeCombinations(sentenceGraph.dependencyGraph, path) validArg = False for p in depPaths: if p in self.pathGazetteer and self.pathGazetteer[p][0] > 0: validArg = True break if validArg: oneArgValid = True # The first and last dependency of a path if False: oneEdgeValid = False for argToken in argTokens: if argToken == None: continue if paths[eventToken].has_key(argToken): path = paths[eventToken][argToken] else: #print argToken, argToken.get("text") #return False continue depPaths = self.multiEdgeFeatureBuilder.getEdgeCombinations(sentenceGraph.dependencyGraph, path) validArg = False for p in depPaths: p = p.replace("<","") p = p.replace(">","") p = p.split(".") pair = (p[0], p[-1]) if pair in self.pathGazetteerPairs: validArg = True break if validArg: oneEdgeValid = True break if not oneEdgeValid: return "pair" # Event must not have unseen dependencies in any of its paths if False: for argToken in argTokens: if argToken == None: continue if paths[eventToken].has_key(argToken): path = paths[eventToken][argToken] else: continue deps = self.multiEdgeFeatureBuilder.getEdgeSet(sentenceGraph.dependencyGraph, path) 
for d in deps: if d[2].get("type") not in self.pathGazetteerDependencies: #print "Unk", d[2].get("type") return "unkdep" # validArg = True # for p in depPaths: # if p in self.pathGazetteer and self.pathGazetteer[p][0] == 0: # validArg = False # break # if not validArg: # return False if not oneArgValid: return "novalidarg" #False return "OK" #True def setGazetteerFeatures(self, token, tag): gazText = self.getGazetteerMatch(token.get("text").lower()) if gazText != None: gazCategories = self.gazetteer[gazText] for k,v in gazCategories.iteritems(): self.setFeature(tag+"gaz_event_value_"+k, v) self.setFeature(tag+"gaz_event_"+k, 1) if k.find("egulation") != -1: self.setFeature(tag+"potReg", 1) else: self.setFeature(tag+"notInGaz", 1) def buildExample(self, exampleIndex, sentenceGraph, paths, eventToken, themeTokens, causeTokens=None): features = {} self.features = features categoryName, eventIds = self.getGSEventType(sentenceGraph, eventToken, themeTokens, causeTokens) category = self.classSet.getId(categoryName) potentialRegulation = False eventTokenText = eventToken.get("text").lower() gazText = self.getGazetteerMatch(eventTokenText) gazCategories = self.gazetteer[gazText] for k,v in gazCategories.iteritems(): if k.find("egulation") != -1: potentialRegulation = True self.setGazetteerFeatures(eventToken,"") self.triggerFeatureBuilder.setFeatureVector(self.features) self.triggerFeatureBuilder.tag = "trg_" self.triggerFeatureBuilder.buildFeatures(eventToken) themeEntities = [] hasTheme = False if len(themeTokens) > 1: self.setFeature("multiTheme", 1) potentialRegulation = False for themeToken in themeTokens: if themeToken != None: hasTheme = True self.setGazetteerFeatures(themeToken,"theme_") self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, themeToken, "theme_") self.triggerFeatureBuilder.tag = "ttrg_" self.triggerFeatureBuilder.buildFeatures(themeToken) themeEntity = None if sentenceGraph.entitiesByToken.has_key(themeToken): for themeEntity in 
sentenceGraph.entitiesByToken[themeToken]: if themeEntity.get("isName") == "True": self.setFeature("themeProtein", 1) if potentialRegulation: self.setFeature("regulationThemeProtein", 1) themeEntities.append(themeEntity) break if not features.has_key("themeProtein"): self.setFeature("themeEvent", 1) self.setFeature("nestingEvent", 1) if potentialRegulation: self.setFeature("regulationThemeEvent", 1) if hasTheme: self.setFeature("noTheme", 1) causeEntities = [] hasCause = False for causeToken in causeTokens: if causeToken != None: hasCause = True self.setGazetteerFeatures(causeToken,"cause_") self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, causeToken, "cause_") self.triggerFeatureBuilder.tag = "ctrg_" self.triggerFeatureBuilder.buildFeatures(causeToken) causeEntity = None if sentenceGraph.entitiesByToken.has_key(causeToken): for causeEntity in sentenceGraph.entitiesByToken[causeToken]: if causeEntity.get("isName") == "True": self.setFeature("causeProtein", 1) if potentialRegulation: self.setFeature("regulationCauseProtein", 1) causeEntities.append(causeEntity) break if not features.has_key("causeProtein"): self.setFeature("causeEvent", 1) self.setFeature("nestingEvent", 1) if potentialRegulation: self.setFeature("regulationCauseEvent", 1) if not hasCause: self.setFeature("noCause", 1) self.triggerFeatureBuilder.tag = "" self.triggerFeatureBuilder.setFeatureVector(None) # Common features # if e1Type.find("egulation") != -1: # leave r out to avoid problems with capitalization # if entity2.get("isName") == "True": # features[self.featureSet.getId("GENIA_regulation_of_protein")] = 1 # else: # features[self.featureSet.getId("GENIA_regulation_of_event")] = 1 # define extra attributes extra = {"xtype":"event","type":categoryName} extra["et"] = eventToken.get("id") if len(eventIds) > 0: eventIds.sort() extra["eids"] = "" for eventId in eventIds: extra["eids"] += str(eventId) + "," extra["eids"] = extra["eids"][:-1] for themeToken in themeTokens: if 
themeToken != None: if extra.has_key("tt"): extra["tt"] = extra["tt"] + "," + themeToken.get("id") else: extra["tt"] = themeToken.get("id") for themeEntity in themeEntities: if extra.has_key("t"): extra["t"] = extra["t"] + "," + themeEntity.get("id") else: extra["t"] = themeEntity.get("id") for causeToken in causeTokens: if causeToken != None: extra["ct"] = causeTokens[0].get("id") if len(causeEntities) > 0: extra["c"] = causeEntities[0].get("id") sentenceOrigId = sentenceGraph.sentenceElement.get("origId") if sentenceOrigId != None: extra["SOID"] = sentenceOrigId # make example #assert (category == 1 or category == -1) self.features = None return (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) def buildArgumentFeatures(self, sentenceGraph, paths, features, eventToken, argToken, tag): #eventToken = sentenceGraph.entityHeadTokenByEntity[eventNode] #argToken = sentenceGraph.entityHeadTokenByEntity[argNode] self.multiEdgeFeatureBuilder.tag = tag self.multiEdgeFeatureBuilder.setFeatureVector(features, None, None, False) if eventToken != argToken and paths.has_key(eventToken) and paths[eventToken].has_key(argToken): path = paths[eventToken][argToken] edges = self.multiEdgeFeatureBuilder.getEdges(sentenceGraph.dependencyGraph, path) else: path = [eventToken, argToken] edges = None # if not "disable_entity_features" in self.styles: # # doesn't improve beyond 52.32 # self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph) # # buildPathLengthFeatures 52.32 -> 51-51 # self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path) # if not "disable_terminus_features" in self.styles: # # didn't improve from 52.32 # self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast if not "disable_single_element_features" in self.styles: # 50.74 -> 52.32 self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, edges, sentenceGraph) if not "disable_ngram_features" in self.styles: # ngrams alone - 50.74 
self.multiEdgeFeatureBuilder.buildPathGrams(2, path, edges, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams(3, path, edges, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams(4, path, edges, sentenceGraph) # remove for fast # disabling length 4 drops performance # if not "disable_path_edge_features" in self.styles: # self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, edges, sentenceGraph) # self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph) # buildSentenceFeatures seems to decrease performance by 8 %-points self.multiEdgeFeatureBuilder.setFeatureVector(None, None, None, False) self.multiEdgeFeatureBuilder.tag = ""
class AsymmetricEventExampleBuilder(ExampleBuilder):
    """
    Builds directed (asymmetric) token- or entity-pair examples for event
    edge classification. Each ordered pair produces its own example, so the
    forward and reverse directions are classified independently.
    """
    def __init__(self, style=["typed","directed"], length=None, types=[], featureSet=None, classSet=None):
        # NOTE(review): mutable default arguments (style, types) are shared
        # across calls; also, the default style is a LIST, but style.find(",")
        # below is a STRING method — constructing without an explicit string
        # style raises AttributeError. Confirm all callers pass a string.
        if featureSet == None:
            featureSet = IdSet()
        if classSet == None:
            classSet = IdSet(1)
        else:
            classSet = classSet
        # class id 1 must always be the negative class
        assert( classSet.getId("neg") == 1 )

        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        # A comma-separated style string is split into a list of style flags
        if style.find(",") != -1:
            style = style.split(",")
        self.styles = style
        self.negFrac = None            # optional downsampling fraction for negatives
        self.posPairGaz = POSPairGazetteer()
        for s in style:
            if s.find("negFrac") != -1:
                # e.g. "negFrac_0.5" -> keep negatives with probability 0.5
                self.negFrac = float(s.split("_")[-1])
                print >> sys.stderr, "Downsampling negatives to", self.negFrac
                # fixed seed so example generation is reproducible
                self.negRand = random.Random(15)
            elif s.find("posPairGaz") != -1:
                # e.g. "posPairGaz_<filename>" -> load a POS-pair gazetteer
                self.posPairGaz = POSPairGazetteer(loadFrom=s.split("_", 1)[-1])
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
        if "graph_kernel" in self.styles:
            from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
            self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
        if "noAnnType" in self.styles:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True
        self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        if "ontology" in self.styles:
            self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
        if "nodalida" in self.styles:
            self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(self.featureSet)
        #IF LOCAL
        if "bioinfer_limits" in self.styles:
            self.bioinferOntologies = OntologyUtils.getBioInferTempOntology()
            #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName)
        #ENDIF
        self.pathLengths = length
        # path length filtering is not supported by this builder
        assert(self.pathLengths == None)
        self.types = types
        if "random" in self.styles:
            from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
            self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)
        #self.outFile = open("exampleTempFile.txt","wt")

    @classmethod
    def run(cls, input, output, parse, tokenization, style, idFileTag=None):
        """Build examples for all sentences of 'input' and write them to 'output'."""
        classSet, featureSet = cls.getIdSets(idFileTag)
        if style != None:
            e = cls(style=style, classSet=classSet, featureSet=featureSet)
        else:
            e = cls(classSet=classSet, featureSet=featureSet)
        sentences = cls.getSentences(input, parse, tokenization)
        e.buildExamplesForSentences(sentences, output, idFileTag)
        if "printClassIds" in e.styles:
            print >> sys.stderr, e.classSet.Ids

    def definePredictedValueRange(self, sentences, elementName):
        # Delegated to the multi-edge feature builder
        self.multiEdgeFeatureBuilder.definePredictedValueRange(sentences, elementName)

    def getPredictedValueRange(self):
        return self.multiEdgeFeatureBuilder.predictedRange

    def filterEdgesByType(self, edges, typesToInclude):
        """Return only the edges whose "type" attribute is in typesToInclude
        (or all edges when typesToInclude is empty)."""
        if len(typesToInclude) == 0:
            return edges
        edgesToKeep = []
        for edge in edges:
            if edge.get("type") in typesToInclude:
                edgesToKeep.append(edge)
        return edgesToKeep

    def getCategoryNameFromTokens(self, sentenceGraph, t1, t2, directed=True):
        """
        Derive the example class name for a token pair from the gold
        interactions between the tokens. For Theme interactions the class is
        named after the e1 entity type(s); otherwise the interaction type(s),
        joined with '---' in alphabetical order; "neg" when there are none.
        """
        types = set()
        themeE1Types = set()
        intEdges = []
        if sentenceGraph.interactionGraph.has_edge(t1, t2):
            intEdges = sentenceGraph.interactionGraph.get_edge_data(t1, t2, default={})
            # NOTE: Only works if keys are ordered integers
            for i in range(len(intEdges)):
                types.add(intEdges[i]["element"].get("type"))
#        if (not directed) and sentenceGraph.interactionGraph.has_edge(t2, t1):
#            intEdgesReverse = sentenceGraph.interactionGraph.get_edge(t2, t1, default={})
#            # NOTE: Only works if keys are ordered integers
#            for i in range(len(intEdgesReverse)):
#                intElement = intEdgesReverse[i]["element"]
#                intType = intElement.get("type")
#                types.add(intType)
#            intEdges.extend(intEdgesReverse)
        for i in range(len(intEdges)):
            intElement = intEdges[i]["element"]
            intType = intElement.get("type")
            if intType == "Theme":
                e1Entity = sentenceGraph.entitiesById[intElement.get("e1")]
                themeE1Types.add(e1Entity.get("type"))
            #types.add(intType)
        if len(themeE1Types) != 0:
            themeE1Types = list(themeE1Types)
            themeE1Types.sort()
            categoryName = ""
            for name in themeE1Types:
                if categoryName != "":
                    categoryName += "---"
                categoryName += name
            return categoryName
        else:
            types = list(types)
            types.sort()
            categoryName = ""
            for name in types:
                if categoryName != "":
                    categoryName += "---"
                categoryName += name
            if categoryName != "":
                return categoryName
            else:
                return "neg"

    def getCategoryName(self, sentenceGraph, e1, e2, directed=True):
        """
        Derive the example class name for an entity pair from the gold
        interactions between the entities: the alphabetically sorted
        interaction types joined with '---', or "neg" when there are none.
        """
        interactions = sentenceGraph.getInteractions(e1, e2)
        if not directed:
            interactions.extend(sentenceGraph.getInteractions(e2, e1))

        types = set()
        for interaction in interactions:
            types.add(interaction.attrib["type"])
        types = list(types)
        types.sort()
        categoryName = ""
        for name in types:
            if categoryName != "":
                categoryName += "---"
            categoryName += name
        if categoryName != "":
            return categoryName
        else:
            return "neg"

    def preProcessExamples(self, allExamples):
        # Duplicates cannot be removed here, as they should only be removed from the training set. This is done
        # in the classifier.
#        if "no_duplicates" in self.styles:
#            count = len(allExamples)
#            print >> sys.stderr, " Removing duplicates,",
#            allExamples = ExampleUtils.removeDuplicates(allExamples)
#            print >> sys.stderr, "removed", count - len(allExamples)
        if "normalize" in self.styles:
            print >> sys.stderr, " Normalizing feature vectors"
            ExampleUtils.normalizeFeatureVectors(allExamples)
        return allExamples

    def isPotentialGeniaInteraction(self, e1, e2):
        # In GENIA, a named entity (protein) can never be the event (e1) side
        if e1.get("isName") == "True":
            return False
        else:
            return True

    #IF LOCAL
    def getBioInferParentType(self, eType):
        """Map a BioInfer entity type to its coarse parent category
        ("Physical", "Property" or "Process") via the ontology."""
        if eType == "Physical_entity" or OntologyUtils.hasParent(eType, "Physical_entity", self.bioinferOntologies):
            return "Physical"
        elif eType == "Property_entity" or OntologyUtils.hasParent(eType, "Property_entity", self.bioinferOntologies):
            return "Property"
        elif OntologyUtils.hasParent(eType, "Relationship", self.bioinferOntologies):
            return "Process"
        else:
            assert False, eType
#        if self.bioinferOntologies["Entity"].has_key(eType):
#            if OntologyUtils.hasParent(eType, "Physical_entity", self.bioinferOntologies):
#                assert not OntologyUtils.hasParent(eType, "Property_entity", self.bioinferOntologies), eType
#                return "Physical"
#            else:
#                assert OntologyUtils.hasParent(eType, "Property_entity", self.bioinferOntologies), eType
#                return "Property"
#
#        else:
#            assert self.bioinferOntologies.has_key(eType), eType
#            #assert OntologyUtils.hasParent(eType, "Process_entity", self.bioinferOntologies["Relationship"]), eType
#            return "Process"

    def isPotentialBioInferInteraction(self, e1, e2, categoryName):
        """Return whether the (e1, e2) pair can form a BioInfer interaction,
        based on the coarse parent categories of the entity types."""
        e1Type = self.getBioInferParentType(e1.get("type"))
        e2Type = self.getBioInferParentType(e2.get("type"))
        if e1Type == "Process" or e1Type == "Property":
            return True
        elif e1Type == "Physical" and e2Type == "Physical":
            return True
        elif e1Type == "Physical" and e2Type == "Process": # hack
            return True
        else:
            # all remaining combinations must be negatives in the gold data
            assert(categoryName == "neg"), categoryName + " category for " + e1Type + " and " + e2Type
            return False
    #ENDIF

    def nxMultiDiGraphToUndirected(self, graph):
        """Return an undirected MultiGraph copy of a directed multigraph
        (used so shortest paths can ignore dependency direction)."""
        undirected = NX10.MultiGraph(name=graph.name)
        undirected.add_nodes_from(graph)
        undirected.add_edges_from(graph.edges_iter())
        return undirected

    def buildExamples(self, sentenceGraph):
        """
        Build one directed example for each ordered pair of entities (style
        "entities") or tokens of the sentence, subject to the optional
        negative-downsampling, GENIA/BioInfer and POS-pair filters.
        Returns the list of example tuples.
        """
        examples = []
        exampleIndex = 0

        clearGraph = sentenceGraph.getCleared()

        #undirected = sentenceGraph.getUndirectedDependencyGraph()
        undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
        ##undirected = sentenceGraph.dependencyGraph.to_undirected()
        ###undirected = NX10.MultiGraph(sentenceGraph.dependencyGraph) This didn't work
        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)

        self.triggerFeatureBuilder.initSentence(clearGraph)

        # Generate examples based on interactions between entities or interactions between tokens
        if "entities" in self.styles:
            loopRange = len(sentenceGraph.entities)
        else:
            loopRange = len(sentenceGraph.tokens)
        #for i in range(loopRange-1):
        for i in range(loopRange): # allow self-interactions
            #for j in range(i+1,loopRange):
            for j in range(i,loopRange): # allow self-interactions
                eI = None
                eJ = None
                if "entities" in self.styles:
                    eI = sentenceGraph.entities[i]
                    eJ = sentenceGraph.entities[j]
                    tI = sentenceGraph.entityHeadTokenByEntity[eI]
                    tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                    #if "no_ne_interactions" in self.styles and eI.get("isName") == "True" and eJ.get("isName") == "True":
                    #    continue
                    if eI.get("type") == "neg" or eJ.get("type") == "neg":
                        continue
                else:
                    tI = sentenceGraph.tokens[i]
                    tJ = sentenceGraph.tokens[j]
#                # only consider paths between entities (NOTE! entities, not only named entities)
#                if "headsOnly" in self.styles:
#                    if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0):
#                        continue
                if "directed" in self.styles:
                    # define forward
                    # NOTE(review): unlike the reverse direction below, the
                    # forward direction has no "bioinfer_limits" filter —
                    # confirm whether this asymmetry is intentional.
                    if "entities" in self.styles:
                        categoryName = self.getCategoryName(sentenceGraph, eI, eJ, True)
                    else:
                        categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, True)
                    self.exampleStats.beginExample(categoryName)
                    if self.negFrac == None or categoryName != "neg" or (categoryName == "neg" and self.negRand.random() < self.negFrac):
                        makeExample = True
                        if ("genia_limits" in self.styles) and not self.isPotentialGeniaInteraction(eI, eJ):
                            makeExample = False
                            self.exampleStats.filter("genia_limits")
                        if self.posPairGaz.getNegFrac((tI.get("POS"), tJ.get("POS"))) == 1.0:
                            makeExample = False
                            self.exampleStats.filter("pos_pair")
                        if makeExample:
                            if not sentenceGraph.tokenIsName[tI]:
                                examples.append( self.buildExample(tI, tJ, paths, clearGraph, categoryName, exampleIndex, eI, eJ) )
                                exampleIndex += 1
                            else:
                                self.exampleStats.filter("genia_token_limits")
                    else:
                        self.exampleStats.filter("neg_frac")
                    self.exampleStats.endExample()
                    # define reverse
                    if "entities" in self.styles:
                        categoryName = self.getCategoryName(sentenceGraph, eJ, eI, True)
                    else:
                        categoryName = self.getCategoryNameFromTokens(sentenceGraph, tJ, tI, True)
                    self.exampleStats.beginExample(categoryName)
                    if self.negFrac == None or categoryName != "neg" or (categoryName == "neg" and self.negRand.random() < self.negFrac):
                        makeExample = True
                        if ("genia_limits" in self.styles) and not self.isPotentialGeniaInteraction(eJ, eI):
                            makeExample = False
                            self.exampleStats.filter("genia_limits")
                        if ("bioinfer_limits" in self.styles) and not self.isPotentialBioInferInteraction(eJ, eI, categoryName):
                            makeExample = False
                            self.exampleStats.filter("bioinfer_limits")
                        if self.posPairGaz.getNegFrac((tJ.get("POS"), tI.get("POS"))) == 1.0:
                            makeExample = False
                            self.exampleStats.filter("pos_pair")
                        if makeExample:
                            if not sentenceGraph.tokenIsName[tJ]:
                                examples.append( self.buildExample(tJ, tI, paths, clearGraph, categoryName, exampleIndex, eJ, eI) )
                                exampleIndex += 1
                            else:
                                self.exampleStats.filter("genia_token_limits")
                    else:
                        self.exampleStats.filter("neg_frac")
                    self.exampleStats.endExample()
#                else:
#                    if "entities" in self.styles:
#                        categoryName = self.getCategoryName(sentenceGraph, eI, eJ, False)
#                    else:
#                        categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, False)
#                    forwardExample = self.buildExample(tI, tJ, paths, clearGraph, categoryName, exampleIndex, eI, eJ)
#                    if not "graph_kernel" in self.styles:
#                        reverseExample = self.buildExample(tJ, tI, paths, clearGraph, categoryName, exampleIndex, eJ, eI)
#                        forwardExample[2].update(reverseExample[2])
#                    examples.append(forwardExample)
#                    exampleIndex += 1
        return examples

    def buildExample(self, token1, token2, paths, sentenceGraph, categoryName, exampleIndex, entity1=None, entity2=None):
        """
        Build one directed example for an ordered token (and optionally
        entity) pair. Returns an (exampleId, classId, featureDict, extraDict)
        tuple.
        """
        # define features
        features = {}
        if True: #token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2):
            if token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2):
                path = paths[token1][token2]
            else:
                # no dependency path: use the token pair itself as a pseudo-path
                path = [token1, token2]
            assert(self.pathLengths == None)
            if self.pathLengths == None or len(path)-1 in self.pathLengths:
                # NOTE(review): 'if not "no_trigger":' tests a non-empty string
                # literal, which is always truthy, so this block is always
                # skipped — probably meant 'if not "no_trigger" in self.styles:'.
                # Note that the block body references 'eventToken' and
                # 'self.features', neither of which is defined in this method,
                # so enabling it as-is would raise NameError/AttributeError.
                if not "no_trigger":
                    self.triggerFeatureBuilder.setFeatureVector(self.features)
                    self.triggerFeatureBuilder.tag = "trg_t1_"
                    self.triggerFeatureBuilder.buildFeatures(eventToken)
                    self.triggerFeatureBuilder.tag = "trg_t2_"
                    self.triggerFeatureBuilder.buildFeatures(eventToken)
#                if not "no_ontology" in self.styles:
#                    self.ontologyFeatureBuilder.setFeatureVector(features)
#                    self.ontologyFeatureBuilder.buildOntologyFeaturesForPath(sentenceGraph, path)
#                    self.ontologyFeatureBuilder.setFeatureVector(None)
                if "graph_kernel" in self.styles or not "no_dependency" in self.styles:
                    if token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2):
                        edges = self.multiEdgeFeatureBuilder.getEdges(sentenceGraph.dependencyGraph, path)
                    else:
                        edges = None
                if "graph_kernel" in self.styles:
                    self.graphKernelFeatureBuilder.setFeatureVector(features, entity1, entity2)
                    self.graphKernelFeatureBuilder.buildGraphKernelFeatures(sentenceGraph, path, edges)
                    self.graphKernelFeatureBuilder.setFeatureVector(None)
                if "entity_type" in self.styles:
                    features[self.featureSet.getId("e1_"+entity1.attrib["type"])] = 1
                    features[self.featureSet.getId("e2_"+entity2.attrib["type"])] = 1
                features[self.featureSet.getId("distance_"+str(len(path)))] = 1
                if not "no_dependency" in self.styles:
                    if token1 == token2:
                        features[self.featureSet.getId("tokenSelfLoop")] = 1
                    self.multiEdgeFeatureBuilder.setFeatureVector(features, entity1, entity2)
                    #self.multiEdgeFeatureBuilder.buildStructureFeatures(sentenceGraph, paths) # remove for fast
                    if not "disable_entity_features" in self.styles:
                        self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph)
                    self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path)
                    if not "disable_terminus_features" in self.styles:
                        self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast
                    if not "disable_single_element_features" in self.styles:
                        self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, edges, sentenceGraph)
                    if not "disable_ngram_features" in self.styles:
                        self.multiEdgeFeatureBuilder.buildPathGrams(2, path, edges, sentenceGraph) # remove for fast
                        self.multiEdgeFeatureBuilder.buildPathGrams(3, path, edges, sentenceGraph) # remove for fast
                        self.multiEdgeFeatureBuilder.buildPathGrams(4, path, edges, sentenceGraph) # remove for fast
                    #self.buildEdgeCombinations(path, edges, sentenceGraph, features) # remove for fast
                    #if edges != None:
                    #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[0], edges[0][1]+edges[1][0], "t1", sentenceGraph) # remove for fast
                    #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[-1], edges[len(path)-1][len(path)-2]+edges[len(path)-2][len(path)-1], "t2", sentenceGraph) # remove for fast
                    if not "disable_path_edge_features" in self.styles:
                        self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, edges, sentenceGraph)
                    self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph)
                    self.multiEdgeFeatureBuilder.setFeatureVector(None)
                if "nodalida" in self.styles:
                    self.nodalidaFeatureBuilder.setFeatureVector(features, entity1, entity2)
                    shortestPaths = self.nodalidaFeatureBuilder.buildShortestPaths(sentenceGraph.dependencyGraph, path)
                    # NOTE(review): debug print left in; consider removing
                    print shortestPaths
                    if len(shortestPaths) > 0:
                        self.nodalidaFeatureBuilder.buildNGrams(shortestPaths, sentenceGraph)
                    self.nodalidaFeatureBuilder.setFeatureVector(None)
                if not "no_linear" in self.styles:
                    self.tokenFeatureBuilder.setFeatureVector(features)
                    # locate both tokens in the sentence's linear order
                    for i in range(len(sentenceGraph.tokens)):
                        if sentenceGraph.tokens[i] == token1:
                            token1Index = i
                        if sentenceGraph.tokens[i] == token2:
                            token2Index = i
                    linearPreTag = "linfw_"
                    if token1Index > token2Index:
                        token1Index, token2Index = token2Index, token1Index
                        linearPreTag = "linrv_"
                    self.tokenFeatureBuilder.buildLinearOrderFeatures(token1Index, sentenceGraph, 2, 2, preTag="linTok1")
                    self.tokenFeatureBuilder.buildLinearOrderFeatures(token2Index, sentenceGraph, 2, 2, preTag="linTok2")
                    # Before, middle, after
#                    self.tokenFeatureBuilder.buildTokenGrams(0, token1Index-1, sentenceGraph, "bf")
#                    self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, "bw")
#                    self.tokenFeatureBuilder.buildTokenGrams(token2Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, "af")
                    # before-middle, middle, middle-after
#                    self.tokenFeatureBuilder.buildTokenGrams(0, token2Index-1, sentenceGraph, linearPreTag+"bf", max=2)
#                    self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, linearPreTag+"bw", max=2)
#                    self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, linearPreTag+"af", max=2)
                    self.tokenFeatureBuilder.setFeatureVector(None)
                if "random" in self.styles:
                    self.randomFeatureBuilder.setFeatureVector(features)
                    self.randomFeatureBuilder.buildRandomFeatures(100, 0.01)
                    self.randomFeatureBuilder.setFeatureVector(None)
                if "genia_limits" in self.styles:
                    e1Type = entity1.get("type")
                    e2Type = entity2.get("type")
                    assert(entity1.get("isName") == "False")
                    if entity2.get("isName") == "True":
                        features[self.featureSet.getId("GENIA_target_protein")] = 1
                    else:
                        features[self.featureSet.getId("GENIA_nested_event")] = 1
                    if e1Type.find("egulation") != -1: # leave r out to avoid problems with capitalization
                        if entity2.get("isName") == "True":
                            features[self.featureSet.getId("GENIA_regulation_of_protein")] = 1
                        else:
                            features[self.featureSet.getId("GENIA_regulation_of_event")] = 1
            else:
                # path length filtered out (unreachable while pathLengths is None)
                features[self.featureSet.getId("always_negative")] = 1
                if "subset" in self.styles:
                    features[self.featureSet.getId("out_of_scope")] = 1
        else:
            # dead branch of 'if True:' above, kept from an older filtering scheme
            features[self.featureSet.getId("always_negative")] = 1
            if "subset" in self.styles:
                features[self.featureSet.getId("out_of_scope")] = 1
            path = [token1, token2]
        self.triggerFeatureBuilder.tag = ""
        self.triggerFeatureBuilder.setFeatureVector(None)

        # define extra attributes
#        if int(path[0].attrib["id"].split("_")[-1]) < int(path[-1].attrib["id"].split("_")[-1]):
#            #extra = {"xtype":"edge","type":"i","t1":path[0],"t2":path[-1]}
#            extra = {"xtype":"asym","type":"i","t1":path[0].get("id"),"t2":path[-1].get("id")}
#            extra["deprev"] = False
#        else:
#            #extra = {"xtype":"edge","type":"i","t1":path[-1],"t2":path[0]}
#            extra = {"xtype":"asym","type":"i","t1":path[-1].get("id"),"t2":path[0].get("id")}
#            extra["deprev"] = True
        extra = {"xtype":"asym","type":"i","t1":token1.get("id"),"t2":token2.get("id")}
        if entity1 != None:
            #extra["e1"] = entity1
            extra["e1"] = entity1.get("id")
        if entity2 != None:
            #extra["e2"] = entity2
            extra["e2"] = entity2.get("id")
        extra["categoryName"] = categoryName
        sentenceOrigId = sentenceGraph.sentenceElement.get("origId")
        if sentenceOrigId != None:
            extra["SOID"] = sentenceOrigId
        # make example
        if "binary" in self.styles:
            # binary mode collapses all positive classes into +1 vs -1
            if categoryName != "neg":
                category = 1
            else:
                category = -1
            categoryName = "i"
        else:
            category = self.classSet.getId(categoryName)
        return (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)
class Round2TriggerExampleBuilder(ExampleBuilder): def nxMultiDiGraphToUndirected(self, graph): undirected = NX10.MultiGraph(name=graph.name) undirected.add_nodes_from(graph) undirected.add_edges_from(graph.edges_iter()) return undirected def getPredictionStrength(self, element): eType = element.get("type") predictions = element.get("predictions") if predictions == None: return 0 predictions = predictions.split(",") for prediction in predictions: predClass, predStrength = prediction.split(":") if predClass == eType: predStrength = float(predStrength) return predStrength return 0 def getInteractionEdgeLengths(self, sentenceGraph, paths): """ Return dependency and linear length of all interaction edges (measured between the two tokens). """ interactionLengths = {} for interaction in sentenceGraph.interactions: # Calculated interaction edge dep and lin length e1 = sentenceGraph.entitiesById[interaction.get("e1")] e2 = sentenceGraph.entitiesById[interaction.get("e2")] t1 = sentenceGraph.entityHeadTokenByEntity[e1] t2 = sentenceGraph.entityHeadTokenByEntity[e2] # Get dep path length if t1 != t2 and paths.has_key(t1) and paths[t1].has_key(t2): pathLength = len(paths[t1][t2]) else: # no dependencyPath pathLength = 999999 # more than any real path # Linear distance t1Pos = -1 t2Pos = -1 for i in range(len(sentenceGraph.tokens)): if sentenceGraph.tokens[i] == t1: t1Pos = i if t2Pos != -1: break if sentenceGraph.tokens[i] == t2: t2Pos = i if t1Pos != -1: break linLength = abs(t1Pos - t2Pos) interactionLengths[interaction] = (interaction, pathLength, linLength, t2Pos) return interactionLengths def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None, skiplist=None): if classSet == None: classSet = IdSet(1) assert classSet.getId("neg") == 1 if featureSet == None: featureSet = IdSet() ExampleBuilder.__init__(self, classSet, featureSet) # gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train" if gazetteerFileName 
!= None: self.gazetteer = Gazetteer.loadGztr(gazetteerFileName) print >>sys.stderr, "Loaded gazetteer from", gazetteerFileName else: print >>sys.stderr, "No gazetteer loaded" self.gazetteer = None self.styles = style self.skiplist = set() if skiplist != None: f = open(skiplist, "rt") for line in f.readlines(): self.skiplist.add(line.strip()) f.close() self.styles = [ "trigger_features", "typed", "directed", "no_linear", "entities", "genia_limits", "noMasking", "maxFeatures", ] self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet) if "graph_kernel" in self.styles: from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet) if "noAnnType" in self.styles: self.multiEdgeFeatureBuilder.noAnnType = True if "noMasking" in self.styles: self.multiEdgeFeatureBuilder.maskNamedEntities = False if "maxFeatures" in self.styles: self.multiEdgeFeatureBuilder.maximum = True self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet) @classmethod def run(cls, input, gold, output, parse, tokenization, style, idFileTag=None, append=False): """ An interface for running the example builder without needing to create a class """ classSet, featureSet = cls.getIdSets(idFileTag) if style != None: e = Round2TriggerExampleBuilder(style=style, classSet=classSet, featureSet=featureSet) else: e = Round2TriggerExampleBuilder(classSet=classSet, featureSet=featureSet) sentences = cls.getSentences(input, parse, tokenization) if gold != None: goldSentences = cls.getSentences(gold, parse, tokenization) else: goldSentences = None e.buildExamplesForSentences(sentences, goldSentences, output, idFileTag, append=append) def buildExamplesForSentences(self, sentences, goldSentences, output, idFileTag=None, append=False): examples = [] counter = ProgressCounter(len(sentences), "Build examples") if append: outfile = open(output, "at") else: outfile = open(output, "wt") exampleCount = 0 for i 
in range(len(sentences)): sentence = sentences[i] goldSentence = [None] if goldSentences != None: goldSentence = goldSentences[i] counter.update(1, "Building examples (" + sentence[0].getSentenceId() + "): ") examples = self.buildExamples(sentence[0], goldSentence[0], append=append) exampleCount += len(examples) examples = self.preProcessExamples(examples) ExampleUtils.appendExamples(examples, outfile) outfile.close() print >>sys.stderr, "Examples built:", exampleCount print >>sys.stderr, "Features:", len(self.featureSet.getNames()) # IF LOCAL if self.exampleStats.getExampleCount() > 0: self.exampleStats.printStats() # ENDIF # Save Ids if idFileTag != None: print >>sys.stderr, "Saving class names to", idFileTag + ".class_names" self.classSet.write(idFileTag + ".class_names") print >>sys.stderr, "Saving feature names to", idFileTag + ".feature_names" self.featureSet.write(idFileTag + ".feature_names") def preProcessExamples(self, allExamples): if "normalize" in self.styles: print >>sys.stderr, " Normalizing feature vectors" ExampleUtils.normalizeFeatureVectors(allExamples) return allExamples def getMergedEntityType(self, entities): """ If a single token belongs to multiple entities of different types, a new, composite type is defined. This type is the alphabetically ordered types of these entities joined with '---'. """ types = set() for entity in entities: types.add(entity.get("type")) types = list(types) types.sort() typeString = "" for type in types: if type == "Protein" and "all_tokens" in self.styles: continue if typeString != "": typeString += "---" typeString += type if typeString == "": return "neg" if "limit_merged_types" in self.styles: if typeString.find("---") != -1: if typeString == "Gene_expression---Positive_regulation": return typeString else: return typeString.split("---")[0] else: return typeString return typeString def getTokenFeatures(self, token, sentenceGraph): """ Returns a list of features based on the attributes of a token. 
These can be used to define more complex features. """ # These features are cached when this method is first called # for a token. if self.tokenFeatures.has_key(token): return self.tokenFeatures[token] tokTxt = sentenceGraph.getTokenText(token) features = {} features["_txt_" + tokTxt] = 1 # F 69.35 -> 68.22 # normalizedText = tokTxt.replace("-","").replace("/","").replace(",","").replace("\\","").replace(" ","").lower() # features["_norTxt_"+normalizedText]=1 # features["_norStem_" + PorterStemmer.stem(normalizedText)]=1 features["_POS_" + token.get("POS")] = 1 if sentenceGraph.tokenIsName[token]: features["_isName"] = 1 for entity in sentenceGraph.tokenIsEntityHead[token]: if entity.get("isName") == "True": features["_annType_" + entity.get("type")] = 1 # Filip's gazetteer based features (can be used separately from exclude_gazetteer) if "gazetteer_features" in self.styles: tokTxtLower = tokTxt.lower() if "stem_gazetteer" in self.styles: tokTxtLower = PorterStemmer.stem(tokTxtLower) if self.gazetteer and tokTxtLower in self.gazetteer: for label, weight in self.gazetteer[tokTxtLower].items(): features["_knownLabel_" + label] = weight # 1 performs slightly worse self.tokenFeatures[token] = features return features def buildLinearOrderFeatures(self, sentenceGraph, index, tag, features): """ Linear features are built by marking token features with a tag that defines their relative position in the linear order. 
""" tag = "linear_" + tag for tokenFeature, w in self.getTokenFeatures(sentenceGraph.tokens[index], sentenceGraph).iteritems(): features[self.featureSet.getId(tag + tokenFeature)] = w def buildExamples(self, sentenceGraph, goldGraph, append=False): examples = self.buildExamplesInner(sentenceGraph, goldGraph) entityCounts = {} exampleCounts = {} for entity in sentenceGraph.entities: eType = entity.get("type") if eType == "Protein": continue if not entityCounts.has_key(eType): entityCounts[eType] = 0 exampleCounts[eType] = 0 entityCounts[eType] += 1 for example in examples: eTypes = self.classSet.getName(example[1]).split("---") for eType in eTypes: if not exampleCounts.has_key(eType): exampleCounts[eType] = 0 exampleCounts[eType] += 1 # for key in sorted(entityCounts.keys()): # if entityCounts[key] != exampleCounts[key]: # print >> sys.stderr, "Warning, sentence", sentenceGraph.getSentenceId(), "example", key, "diff", entityCounts[key] - exampleCounts[key] return examples def buildExamplesInner(self, sentenceGraph, goldGraph): """ Build one example for each token of the sentence """ if sentenceGraph.sentenceElement.get("origId") in self.skiplist: print >>sys.stderr, "Skipping sentence", sentenceGraph.sentenceElement.get("origId") return [] self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True) self.triggerFeatureBuilder.initSentence(sentenceGraph) undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph) paths = NX10.all_pairs_shortest_path(undirected, cutoff=999) # Get argument order self.interactionLengths = self.getInteractionEdgeLengths(sentenceGraph, paths) self.interactionLengths = self.interactionLengths.values() self.interactionLengths.sort(compareInteractionPrecedence) # Map tokens to entities tokenByOffset = {} for i in range(len(sentenceGraph.tokens)): token = sentenceGraph.tokens[i] if goldGraph != None: goldToken = goldGraph.tokens[i] assert token.get("id") == goldToken.get("id") and token.get("charOffset") == 
goldToken.get("charOffset") tokenByOffset[token.get("charOffset")] = token.get("id") # Map gold entities to their head offsets goldEntitiesByOffset = {} for token in sentenceGraph.tokens: goldEntitiesByOffset[token.get("charOffset")] = [] entityToGold = {} for entity in sentenceGraph.entities: entityToGold[entity] = [] if goldGraph != None: for entity in goldGraph.entities: offset = entity.get("headOffset") assert offset != None goldEntitiesByOffset[offset].append(entity) # Map predicted entities to gold entities for entity in sentenceGraph.entities: eType = entity.get("type") eOffset = entity.get("headOffset") for goldEntity in goldEntitiesByOffset[eOffset]: if goldEntity.get("type") == eType: entityToGold[entity].append(goldEntity) # Map entities to interactions # interactionsByEntityId = {} # for entity in sentenceGraph.entities: # interactionsByEntityId[entity.get("id")] = [] # Map tokens to interactions interactionsByToken = {} for token in sentenceGraph.tokens: interactionsByToken[token] = [] for interactionTuple in self.interactionLengths: interaction = interactionTuple[0] if interaction.get("type") == "neg": continue e1Id = interaction.get("e1") token = sentenceGraph.entityHeadTokenByEntity[sentenceGraph.entitiesById[e1Id]] interactionsByToken[token].append(interaction) examples = [] exampleIndex = 0 self.tokenFeatures = {} # namedEntityNorStrings = set() namedEntityHeadTokens = [] if not "names" in self.styles: namedEntityCount = 0 for entity in sentenceGraph.entities: if entity.get("isName") == "True": # known data which can be used for features namedEntityCount += 1 # namedEntityNorStrings.add( entity.get("text").replace("-","").replace("/","").replace(",","").replace("\\","").replace(" ","").lower() ) namedEntityCountFeature = "nameCount_" + str(namedEntityCount) # if namedEntityCount == 0: # no names, no need for triggers # return [] if "pos_pairs" in self.styles: namedEntityHeadTokens = self.getNamedEntityHeadTokens(sentenceGraph) # neFeatures = {} # 
F: 69.35 -> 69.14 # for norString in namedEntityNorStrings: # neFeatures[self.featureSet.getId("norNE_" + norString)] = 1 bagOfWords = {} for token in sentenceGraph.tokens: text = "bow_" + token.get("text") if not bagOfWords.has_key(text): bagOfWords[text] = 0 bagOfWords[text] += 1 if sentenceGraph.tokenIsName[token]: text = "ne_" + text if not bagOfWords.has_key(text): bagOfWords[text] = 0 bagOfWords[text] += 1 bowFeatures = {} for k, v in bagOfWords.iteritems(): bowFeatures[self.featureSet.getId(k)] = v self.inEdgesByToken = {} self.outEdgesByToken = {} self.edgeSetByToken = {} for token in sentenceGraph.tokens: inEdges = sentenceGraph.dependencyGraph.in_edges(token, data=True) fixedInEdges = [] for edge in inEdges: fixedInEdges.append((edge[0], edge[1], edge[2]["element"])) inEdges = fixedInEdges inEdges.sort(compareDependencyEdgesById) self.inEdgesByToken[token] = inEdges outEdges = sentenceGraph.dependencyGraph.out_edges(token, data=True) fixedOutEdges = [] for edge in outEdges: fixedOutEdges.append((edge[0], edge[1], edge[2]["element"])) outEdges = fixedOutEdges outEdges.sort(compareDependencyEdgesById) self.outEdgesByToken[token] = outEdges self.edgeSetByToken[token] = set(inEdges + outEdges) for i in range(len(sentenceGraph.tokens)): token = sentenceGraph.tokens[i] # Recognize only non-named entities (i.e. 
interaction words) if sentenceGraph.tokenIsName[token] and not "names" in self.styles and not "all_tokens" in self.styles: continue # CLASS # if len(sentenceGraph.tokenIsEntityHead[token]) > 0: # category = self.classSet.getId(self.getMergedEntityType(sentenceGraph.tokenIsEntityHead[token])) # else: # category = 1 offset = token.get("charOffset") if len(goldEntitiesByOffset[offset]) > 0: category = self.classSet.getId(self.getMergedEntityType(goldEntitiesByOffset[offset])) else: category = 1 tokenText = token.get("text").lower() if "stem_gazetteer" in self.styles: tokenText = PorterStemmer.stem(tokenText) if ("exclude_gazetteer" in self.styles) and self.gazetteer and tokenText not in self.gazetteer: features = {} features[self.featureSet.getId("exclude_gazetteer")] = 1 extra = {"xtype": "token", "t": token.get("id"), "excluded": "True"} examples.append((sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra)) exampleIndex += 1 continue # FEATURES features = {} self.features = features if not "names" in self.styles: features[self.featureSet.getId(namedEntityCountFeature)] = 1 # for k,v in bagOfWords.iteritems(): # features[self.featureSet.getId(k)] = v # pre-calculate bow _features_ features.update(bowFeatures) # features.update(neFeatures) # for j in range(len(sentenceGraph.tokens)): # text = "bow_" + sentenceGraph.tokens[j].get("text") # if j < i: # features[self.featureSet.getId("bf_" + text)] = 1 # elif j > i: # features[self.featureSet.getId("af_" + text)] = 1 # Main features text = token.get("text") features[self.featureSet.getId("txt_" + text)] = 1 features[self.featureSet.getId("POS_" + token.get("POS"))] = 1 stem = PorterStemmer.stem(text) features[self.featureSet.getId("stem_" + stem)] = 1 features[self.featureSet.getId("nonstem_" + text[len(stem) :])] = 1 # Normalized versions of the string (if same as non-normalized, overlap without effect) normalizedText = ( text.replace("-", "").replace("/", "").replace(",", 
"").replace("\\", "").replace(" ", "").lower() ) if normalizedText == "bound": # should be for all irregular verbs normalizedText = "bind" features[self.featureSet.getId("txt_" + normalizedText)] = 1 norStem = PorterStemmer.stem(normalizedText) features[self.featureSet.getId("stem_" + norStem)] = 1 features[self.featureSet.getId("nonstem_" + normalizedText[len(norStem) :])] = 1 if "gazetteer_features_maintoken" in self.styles: tokTxtLower = text.lower() if "stem_gazetteer" in self.styles: tokTxtLower = PorterStemmer.stem(tokTxtLower) if self.gazetteer and tokTxtLower in self.gazetteer: for label, weight in self.gazetteer[tokTxtLower].items(): features[self.featureSet.getId("gaz_knownLabel_" + label)] = weight # 1 performs slightly worse # Linear order features # for index in [-3,-2,-1,1,2,3,4,5]: # 69.35 -> 68.97 for index in [-3, -2, -1, 1, 2, 3]: if i + index > 0 and i + index < len(sentenceGraph.tokens): self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features) # Content if i > 0 and text[0].isalpha() and text[0].isupper(): features[self.featureSet.getId("upper_case_start")] = 1 for j in range(len(text)): if j > 0 and text[j].isalpha() and text[j].isupper(): features[self.featureSet.getId("upper_case_middle")] = 1 # numbers and special characters if text[j].isdigit(): features[self.featureSet.getId("has_digits")] = 1 if j > 0 and text[j - 1] == "-": features[self.featureSet.getId("has_hyphenated_digit")] = 1 elif text[j] == "-": features[self.featureSet.getId("has_hyphen")] = 1 elif text[j] == "/": features[self.featureSet.getId("has_fslash")] = 1 elif text[j] == "\\": features[self.featureSet.getId("has_bslash")] = 1 # duplets if j > 0: features[self.featureSet.getId("dt_" + text[j - 1 : j + 1].lower())] = 1 # triplets if j > 1: features[self.featureSet.getId("tt_" + text[j - 2 : j + 1].lower())] = 1 # Attached edges (Hanging in and out edges) t1InEdges = self.inEdgesByToken[token] for edge in t1InEdges: edgeType = edge[2].get("type") 
def buildChains(self, token, sentenceGraph, features, depthLeft=3, chain="", visited=None):
    """
    Recursively build dependency-chain features up to depthLeft hops from
    token, tagging each hop with its direction (-frw_ for in-edges, -rev_
    for out-edges) and the remaining depth. 'visited' prevents re-walking
    edges already on the current chain.
    """
    if depthLeft == 0:
        return
    strDepthLeft = "dist_" + str(depthLeft)
    if visited == None:
        visited = set()
    inEdges = self.inEdgesByToken[token]
    outEdges = self.outEdgesByToken[token]
    # Edges reachable from this token are excluded from deeper recursion.
    edgeSet = visited.union(self.edgeSetByToken[token])
    for edge in inEdges:
        if not edge in visited:
            edgeType = edge[2].get("type")
            # NOTE(review): this branch uses prefix "dep_" while the out-edge
            # branch below uses "dep_dist_" (yielding "dep_dist_dist_N...").
            # Looks like an inconsistency, but feature names are model-visible,
            # so it is left unchanged -- confirm before altering.
            features[self.featureSet.getId("dep_" + strDepthLeft + edgeType)] = 1
            nextToken = edge[0]
            for tokenFeature, w in self.getTokenFeatures(nextToken, sentenceGraph).iteritems():
                features[self.featureSet.getId(strDepthLeft + tokenFeature)] = w
            # for entity in sentenceGraph.tokenIsEntityHead[nextToken]:
            #     if entity.get("isName") == "True":
            #         features[self.featureSet.getId("name_dist_"+strDepthLeft)] = 1
            #         features[self.featureSet.getId("name_dist_"+strDepthLeft+entity.get("type"))] = 1
            # features[self.featureSet.getId("POS_dist_"+strDepthLeft+nextToken.get("POS"))] = 1
            # tokenText = sentenceGraph.getTokenText(nextToken)
            # features[self.featureSet.getId("text_dist_"+strDepthLeft+tokenText)] = 1
            if sentenceGraph.tokenIsName[nextToken]:
                features[self.featureSet.getId("name_chain_dist_" + strDepthLeft + chain + "-frw_" + edgeType)] = 1
            features[self.featureSet.getId("chain_dist_" + strDepthLeft + chain + "-frw_" + edgeType)] = 1
            self.buildChains(nextToken, sentenceGraph, features, depthLeft - 1, chain + "-frw_" + edgeType, edgeSet)
    for edge in outEdges:
        if not edge in visited:
            edgeType = edge[2].get("type")
            features[self.featureSet.getId("dep_dist_" + strDepthLeft + edgeType)] = 1
            nextToken = edge[1]
            for tokenFeature, w in self.getTokenFeatures(nextToken, sentenceGraph).iteritems():
                features[self.featureSet.getId(strDepthLeft + tokenFeature)] = w
            # for entity in sentenceGraph.tokenIsEntityHead[nextToken]:
            #     if entity.get("isName") == "True":
            #         features[self.featureSet.getId("name_dist_"+strDepthLeft)] = 1
            #         features[self.featureSet.getId("name_dist_"+strDepthLeft+entity.get("type"))] = 1
            # features[self.featureSet.getId("POS_dist_"+strDepthLeft+nextToken.get("POS"))] = 1
            # tokenText = sentenceGraph.getTokenText(nextToken)
            # features[self.featureSet.getId("text_dist_"+strDepthLeft+tokenText)] = 1
            if sentenceGraph.tokenIsName[nextToken]:
                features[self.featureSet.getId("name_chain_dist_" + strDepthLeft + chain + "-rev_" + edgeType)] = 1
            features[self.featureSet.getId("chain_dist_" + strDepthLeft + chain + "-rev_" + edgeType)] = 1
            self.buildChains(nextToken, sentenceGraph, features, depthLeft - 1, chain + "-rev_" + edgeType, edgeSet)
if entity.get("isName") == "True": # features[self.featureSet.getId("name_dist_"+strDepthLeft)] = 1 # features[self.featureSet.getId("name_dist_"+strDepthLeft+entity.get("type"))] = 1 # features[self.featureSet.getId("POS_dist_"+strDepthLeft+nextToken.get("POS"))] = 1 # tokenText = sentenceGraph.getTokenText(nextToken) # features[self.featureSet.getId("text_dist_"+strDepthLeft+tokenText)] = 1 if sentenceGraph.tokenIsName[nextToken]: features[self.featureSet.getId("name_chain_dist_" + strDepthLeft + chain + "-frw_" + edgeType)] = 1 features[self.featureSet.getId("chain_dist_" + strDepthLeft + chain + "-frw_" + edgeType)] = 1 self.buildChains(nextToken, sentenceGraph, features, depthLeft - 1, chain + "-frw_" + edgeType, edgeSet) for edge in outEdges: if not edge in visited: edgeType = edge[2].get("type") features[self.featureSet.getId("dep_dist_" + strDepthLeft + edgeType)] = 1 nextToken = edge[1] for tokenFeature, w in self.getTokenFeatures(nextToken, sentenceGraph).iteritems(): features[self.featureSet.getId(strDepthLeft + tokenFeature)] = w # for entity in sentenceGraph.tokenIsEntityHead[nextToken]: # if entity.get("isName") == "True": # features[self.featureSet.getId("name_dist_"+strDepthLeft)] = 1 # features[self.featureSet.getId("name_dist_"+strDepthLeft+entity.get("type"))] = 1 # features[self.featureSet.getId("POS_dist_"+strDepthLeft+nextToken.get("POS"))] = 1 # tokenText = sentenceGraph.getTokenText(nextToken) # features[self.featureSet.getId("text_dist_"+strDepthLeft+tokenText)] = 1 if sentenceGraph.tokenIsName[nextToken]: features[self.featureSet.getId("name_chain_dist_" + strDepthLeft + chain + "-rev_" + edgeType)] = 1 features[self.featureSet.getId("chain_dist_" + strDepthLeft + chain + "-rev_" + edgeType)] = 1 self.buildChains(nextToken, sentenceGraph, features, depthLeft - 1, chain + "-rev_" + edgeType, edgeSet) def getNamedEntityHeadTokens(self, sentenceGraph): headTokens = [] for entity in sentenceGraph.entities: if entity.get("isName") == "True": # 
def buildPOSPairs(self, token, namedEntityHeadTokens, features):
    """
    Pair the candidate token's POS tag with the POS tag of every named
    entity head token in the sentence.
    """
    tokenPOS = token.get("POS")
    assert tokenPOS != None
    for headToken in namedEntityHeadTokens:
        headPOS = headToken.get("POS")
        features[self.featureSet.getId("POS_pair_NE_" + tokenPOS + "-" + headPOS)] = 1

######################################################
# Unmerging-style features
######################################################
def buildPredictionFeatures(self, sentenceGraph, paths, token, interactions):
    """
    Build unmerging-style features for a candidate event rooted at 'token'
    with the given outgoing 'interactions' as its arguments. Writes into
    self.features via setFeature and the trigger/argument feature builders.
    """
    # themeEntities, causeEntities=None):
    # NOTE!!!! TODO
    # add also features for arguments present, but not in this combination
    self.buildInterArgumentBagOfWords(interactions, sentenceGraph)
    if sentenceGraph.entitiesByToken.has_key(token):
        for eventEntity in sentenceGraph.entitiesByToken[token]:
            eventEntityType = eventEntity.get("type")
            self.setFeature("rootType_" + eventEntity.get("type"), 1)
            self.setFeature("predStrength" + eventEntityType, self.getPredictionStrength(eventEntity))
            # Trigger features are tagged with the event type; the tag is
            # reset after each entity so later builds are not mislabeled.
            self.triggerFeatureBuilder.setFeatureVector(self.features)
            self.triggerFeatureBuilder.tag = "trg" + eventEntityType + "_"
            self.triggerFeatureBuilder.buildFeatures(token)
            self.triggerFeatureBuilder.tag = None
    argThemeCount = 0
    argCauseCount = 0
    # Current example's edge combination
    for i in range(len(interactions)):
        arg = interactions[i]
        if arg.get("type") == "Theme":
            argThemeCount += 1
            # Features are built both untagged-by-position and per-position.
            self.buildArgumentFeatures(sentenceGraph, paths, self.features, token, arg, "argTheme")
            self.buildArgumentFeatures(sentenceGraph, paths, self.features, token, arg, "argTheme" + str(i))
        else:  # Cause
            argCauseCount += 1
            self.buildArgumentFeatures(sentenceGraph, paths, self.features, token, arg, "argCause")
            self.buildArgumentFeatures(sentenceGraph, paths, self.features, token, arg, "argCause" + str(i))
    self.setFeature("argCount", len(interactions))
    self.setFeature("argCount_" + str(len(interactions)), 1)
    self.setFeature("argThemeCount", argThemeCount)
    self.setFeature("argThemeCount_" + str(argThemeCount), 1)
    self.setFeature("argCauseCount", argCauseCount)
    self.setFeature("argCauseCount_" + str(argCauseCount), 1)
    # Leave the trigger builder in a clean state for the next example.
    self.triggerFeatureBuilder.tag = ""
    self.triggerFeatureBuilder.setFeatureVector(None)
self.setFeature("argThemeCount", argThemeCount) self.setFeature("argThemeCount_" + str(argThemeCount), 1) self.setFeature("argCauseCount", argCauseCount) self.setFeature("argCauseCount_" + str(argCauseCount), 1) self.triggerFeatureBuilder.tag = "" self.triggerFeatureBuilder.setFeatureVector(None) def buildArgumentFeatures(self, sentenceGraph, paths, features, eventToken, arg, tag): argEntity = sentenceGraph.entitiesById[arg.get("e2")] argToken = sentenceGraph.entityHeadTokenByEntity[argEntity] self.buildEdgeFeatures(sentenceGraph, paths, features, eventToken, argToken, tag) self.triggerFeatureBuilder.tag = tag + "trg_" self.triggerFeatureBuilder.buildFeatures(argToken) if argEntity.get("isName") == "True": self.setFeature(tag + "Protein", 1) else: self.setFeature(tag + "Event", 1) self.setFeature("nestingEvent", 1) self.setFeature(tag + "_" + argEntity.get("type"), 1) def buildEdgeFeatures(self, sentenceGraph, paths, features, eventToken, argToken, tag): # eventToken = sentenceGraph.entityHeadTokenByEntity[eventNode] # argToken = sentenceGraph.entityHeadTokenByEntity[argNode] self.multiEdgeFeatureBuilder.tag = tag + "_" self.multiEdgeFeatureBuilder.setFeatureVector(features, None, None, False) self.setFeature(tag + "_present", 1) if eventToken != argToken and paths.has_key(eventToken) and paths[eventToken].has_key(argToken): path = paths[eventToken][argToken] edges = self.multiEdgeFeatureBuilder.getEdges(sentenceGraph.dependencyGraph, path) else: path = [eventToken, argToken] edges = None if not "disable_entity_features" in self.styles: self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph) self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path) # if not "disable_terminus_features" in self.styles: # self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast if not "disable_single_element_features" in self.styles: self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, edges, sentenceGraph) if not 
"disable_ngram_features" in self.styles: self.multiEdgeFeatureBuilder.buildPathGrams(2, path, edges, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams(3, path, edges, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams(4, path, edges, sentenceGraph) # remove for fast if not "disable_path_edge_features" in self.styles: self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, edges, sentenceGraph) # self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph) self.multiEdgeFeatureBuilder.setFeatureVector(None, None, None, False) self.multiEdgeFeatureBuilder.tag = "" def buildInterArgumentBagOfWords(self, arguments, sentenceGraph): if len(arguments) < 2: return indexByToken = {} for i in range(len(sentenceGraph.tokens)): indexByToken[sentenceGraph.tokens[i]] = i argTokenIndices = set() for arg in arguments: argEntity = sentenceGraph.entitiesById[arg.get("e2")] argToken = sentenceGraph.entityHeadTokenByEntity[argEntity] argTokenIndices.add(indexByToken[argToken]) minIndex = min(argTokenIndices) maxIndex = max(argTokenIndices) self.setFeature("argBoWRange", (maxIndex - minIndex)) self.setFeature("argBoWRange_" + str(maxIndex - minIndex), 1) bow = set() for i in range(minIndex + 1, maxIndex): token = sentenceGraph.tokens[i] if len(sentenceGraph.tokenIsEntityHead[token]) == 0 and not sentenceGraph.tokenIsName[token]: bow.add(token.get("text")) bow = sorted(list(bow)) for word in bow: self.setFeature("argBoW_" + word, 1) if word in ["/", "-"]: self.setFeature("argBoW_slashOrHyphen", 1) if len(bow) == 1: self.setFeature("argBoWonly_" + bow[0], 1) if bow[0] in ["/", "-"]: self.setFeature("argBoWonly_slashOrHyphen", 1)