class EventExampleBuilder(ExampleBuilder):
    def __init__(self, style=["typed", "directed", "headsOnly"], length=None, types=[], featureSet=None, classSet=None):
        if featureSet == None:
            featureSet = IdSet()
        if classSet == None:
            classSet = IdSet(1)
        else:
            classSet = classSet
        assert classSet.getId("neg") == 1
        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)

        self.styles = style
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        if "noAnnType" in self.styles:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True
        #self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        #if "ontology" in self.styles:
        #    self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
        self.pathLengths = length
        assert self.pathLengths == None
        self.types = types
        #self.outFile = open("exampleTempFile.txt","wt")

    @classmethod
    def run(cls, input, output, parse, tokenization, style, idFileTag=None):
        classSet, featureSet = cls.getIdSets(idFileTag)
        e = EventExampleBuilder(style=style, classSet=classSet, featureSet=featureSet)
        sentences = cls.getSentences(input, parse, tokenization)
        e.buildExamplesForSentences(sentences, output, idFileTag)

    def definePredictedValueRange(self, sentences, elementName):
        self.multiEdgeFeatureBuilder.definePredictedValueRange(sentences, elementName)

    def getPredictedValueRange(self):
        return self.multiEdgeFeatureBuilder.predictedRange

    def preProcessExamples(self, allExamples):
        if "normalize" in self.styles:
            print >> sys.stderr, " Normalizing feature vectors"
            ExampleUtils.normalizeFeatureVectors(allExamples)
        return allExamples

    def isPotentialGeniaInteraction(self, e1, e2):
        if e1.get("isName") == "True" and e2.get("isName") == "True":
            return False
        elif e1.get("isName") == "True" and e2.get("isName") == "False":
            return False
        else:
            return True

    def getArgumentEntities(self, sentenceGraph, entityNode):
        eId = entityNode.get("id")
        assert eId != None
        themeNodes = []
        causeNodes = []
        for edge in sentenceGraph.interactions:
            if edge.get("e1") == eId:
                edgeType = edge.get("type")
                assert edgeType in ["Theme", "Cause"], edgeType
                if edgeType == "Theme":
                    themeNodes.append(sentenceGraph.entitiesById[edge.get("e2")])
                elif edgeType == "Cause":
                    causeNodes.append(sentenceGraph.entitiesById[edge.get("e2")])
        return themeNodes, causeNodes

    def makeGSEvents(self, sentenceGraph):
        self.gsEvents = {} # [token]->[event-type]->[1-n argument sets]
        for token in sentenceGraph.tokens:
            self.gsEvents[token] = {}
        for entity in sentenceGraph.entities:
            if entity.get("type") == "neg":
                continue
            eId = entity.get("id")
            eType = entity.get("type")
            arguments = set()
            for interaction in sentenceGraph.interactions:
                if interaction.get("e1") == eId:
                    arguments.add((interaction.get("type"), interaction.get("e2")))
            eHeadToken = sentenceGraph.entityHeadTokenByEntity[entity]
            if not self.gsEvents[eHeadToken].has_key(eType):
                self.gsEvents[eHeadToken][eType] = []
            self.gsEvents[eHeadToken][eType].append(arguments)

    def isGSEvent(self, sentenceGraph, entity, themeNodes, causeNodes):
        eHeadToken = sentenceGraph.entityHeadTokenByEntity[entity]
        eType = entity.get("type")
        if not self.gsEvents[eHeadToken].has_key(eType):
            return False
        argumentSet = set()
        for themeNode in themeNodes:
            if themeNode != None:
                argumentSet.add(("Theme", themeNode.get("id")))
        for causeNode in causeNodes:
            if causeNode != None:
                argumentSet.add(("Cause", causeNode.get("id")))
        if argumentSet in self.gsEvents[eHeadToken][eType]:
            return True
        else:
            return False

#    def isEvent(self, sentenceGraph, eventNode, themeNodes, causeNodes):
#        goldThemeNodes, goldCauseNodes = self.getArgumentEntities(sentenceGraph, eventNode)
#        for node in themeNodes:
#            if node != None and node not in goldThemeNodes:
#                return False
#        for node in causeNodes:
#            if node != None and node not in goldCauseNodes:
#                return False
#        return True

    def buildExamples(self, sentenceGraph):
        self.makeGSEvents(sentenceGraph)

        eventNodes = []
        nameNodes = []
        for entity in sentenceGraph.entities:
            if entity.get("type") == "neg":
                continue
            if entity.get("isName") == "True":
                nameNodes.append(entity)
            else:
                eventNodes.append(entity)
        allNodes = eventNodes + nameNodes

        examples = []
        exampleIndex = 0

        undirected = sentenceGraph.dependencyGraph.to_undirected()
        paths = NX.all_pairs_shortest_path(undirected, cutoff=999)

        for eventNode in eventNodes:
            eventType = eventNode.get("type")
            if eventType in ["Gene_expression", "Transcription", "Protein_catabolism", "Localization", "Phosphorylation"]:
                for nameNode in nameNodes:
                    if self.isPotentialGeniaInteraction(eventNode, nameNode):
                        examples.append(self.buildExample(exampleIndex, sentenceGraph, paths, eventNode, nameNode))
                        exampleIndex += 1
            elif eventType in ["Regulation", "Positive_regulation", "Negative_regulation"]:
                combinations = combine.combine(allNodes + [None], allNodes + [None])
                for combination in combinations:
                    if combination[0] == combination[1]:
                        continue
                    if combination[0] == eventNode or combination[1] == eventNode:
                        continue
                    if combination[0] != None and not self.isPotentialGeniaInteraction(eventNode, combination[0]):
                        continue
                    if combination[1] != None and not self.isPotentialGeniaInteraction(eventNode, combination[1]):
                        continue
                    examples.append(self.buildExample(exampleIndex, sentenceGraph, paths, eventNode, combination[0], combination[1]))
                    exampleIndex += 1
            elif eventType in ["Binding"]:
                continue
            else:
                assert False, eventType

        self.gsEvents = None
        return examples

    def buildExample(self, exampleIndex, sentenceGraph, paths, eventNode, themeNode, causeNode=None):
        features = {}
        if self.isGSEvent(sentenceGraph, eventNode, [themeNode], [causeNode]):
            category = self.classSet.getId("pos")
        else:
            category = self.classSet.getId("neg")

        if themeNode != None:
            self.buildArgumentFeatures(sentenceGraph, paths, features, eventNode, themeNode, "theme_")
        if causeNode != None:
            self.buildArgumentFeatures(sentenceGraph, paths, features, eventNode, causeNode, "cause_")

        # Common features
        #eventType = eventNode.get("type")
        #e2Type = entity2.get("type")
        #assert(entity1.get("isName") == "False")
        #if entity2.get("isName") == "True":
        #    features[self.featureSet.getId("GENIA_target_protein")] = 1
        #else:
        #    features[self.featureSet.getId("GENIA_nested_event")] = 1
        #if e1Type.find("egulation") != -1: # leave r out to avoid problems with capitalization
        #    if entity2.get("isName") == "True":
        #        features[self.featureSet.getId("GENIA_regulation_of_protein")] = 1
        #    else:
        #        features[self.featureSet.getId("GENIA_regulation_of_event")] = 1

        # define extra attributes
        extra = {"xtype": "trigger-event", "type": eventNode.get("type")}
        extra["e"] = eventNode.get("id")
        eventToken = sentenceGraph.entityHeadTokenByEntity[eventNode]
        extra["et"] = eventToken.get("id")
        if themeNode != None:
            extra["t"] = themeNode.get("id")
            themeToken = sentenceGraph.entityHeadTokenByEntity[themeNode]
            extra["tt"] = themeToken.get("id")
        if causeNode != None:
            extra["c"] = causeNode.get("id")
            causeToken = sentenceGraph.entityHeadTokenByEntity[causeNode]
            extra["ct"] = causeToken.get("id")
        sentenceOrigId = sentenceGraph.sentenceElement.get("origId")
        if sentenceOrigId != None:
            extra["SOID"] = sentenceOrigId

        # make example
        #assert (category == 1 or category == -1)
        return (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra)

    def buildArgumentFeatures(self, sentenceGraph, paths, features, eventNode, argNode, tag):
        eventToken = sentenceGraph.entityHeadTokenByEntity[eventNode]
        argToken = sentenceGraph.entityHeadTokenByEntity[argNode]
        if eventToken != argToken and paths.has_key(eventToken) and paths[eventToken].has_key(argToken):
            path = paths[eventToken][argToken]
            edges = self.multiEdgeFeatureBuilder.getEdges(sentenceGraph.dependencyGraph, path)
        else:
            path = [eventToken, argToken]
            edges = None

        self.multiEdgeFeatureBuilder.tag = tag
        self.multiEdgeFeatureBuilder.setFeatureVector(features, eventNode, argNode)
        if not "disable_entity_features" in self.styles:
            self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph)
        self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path)
        if not "disable_terminus_features" in self.styles:
            self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast
        if not "disable_single_element_features" in self.styles:
            self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, edges, sentenceGraph)
        if not "disable_ngram_features" in self.styles:
            self.multiEdgeFeatureBuilder.buildPathGrams(2, path, edges, sentenceGraph) # remove for fast
            self.multiEdgeFeatureBuilder.buildPathGrams(3, path, edges, sentenceGraph) # remove for fast
            self.multiEdgeFeatureBuilder.buildPathGrams(4, path, edges, sentenceGraph) # remove for fast
        if not "disable_path_edge_features" in self.styles:
            self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, edges, sentenceGraph)
        self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph)
        self.multiEdgeFeatureBuilder.setFeatureVector(None)
        self.multiEdgeFeatureBuilder.tag = ""
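# --- Illustrative sketch, not part of the original builder code ----------------
# makeGSEvents() above builds a gold-standard index of the form
# {head token: {event type: [argument sets]}}, and isGSEvent() labels a candidate
# event positive only when its (role, entity id) pairs equal one of the stored
# sets. The helper below mirrors that lookup with plain dicts and sets; the name
# and the data in the usage comments are hypothetical and purely for illustration.
def _sketch_is_gold_event(gsIndex, headToken, eventType, argumentPairs):
    """Return True if the (role, entityId) pairs match a stored gold argument set."""
    if headToken not in gsIndex or eventType not in gsIndex[headToken]:
        return False
    return set(argumentPairs) in gsIndex[headToken][eventType]

# Example: one gold Phosphorylation event triggered at token "t5" with one Theme.
#gsIndex = {"t5": {"Phosphorylation": [set([("Theme", "T2")])]}}
#_sketch_is_gold_event(gsIndex, "t5", "Phosphorylation", [("Theme", "T2")]) # True
#_sketch_is_gold_event(gsIndex, "t5", "Phosphorylation", [("Theme", "T3")]) # False
# --------------------------------------------------------------------------------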
class MultiEdgeExampleBuilder(ExampleBuilder): """ This example builder makes edge examples, i.e. examples describing the event arguments. """ def __init__(self, style=None, length=None, types=[], featureSet=None, classSet=None): if featureSet == None: featureSet = IdSet() if classSet == None: classSet = IdSet(1) else: classSet = classSet assert( classSet.getId("neg") == 1 or (len(classSet.Ids)== 2 and classSet.getId("neg") == -1) ) ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet) self.styles = self.getParameters(style, [ "typed", "directed", "headsOnly", "graph_kernel", "noAnnType", "noMasking", "maxFeatures", "genia_limits", "epi_limits", "id_limits", "rel_limits", "bb_limits", "bi_limits", "co_limits", "genia_task1", "ontology", "nodalida", "bacteria_renaming", "trigger_features", "rel_features", "ddi_features", "evex", "giuliano", "random", "themeOnly", "causeOnly", "no_path", "entities", "skip_extra_triggers", "headsOnly", "graph_kernel", "trigger_features", "no_task", "no_dependency", "disable_entity_features", "disable_terminus_features", "disable_single_element_features", "disable_ngram_features", "disable_path_edge_features", "no_linear", "subset", "binary", "pos_only", "entity_type" ]) if style == None: # no parameters given style["typed"] = style["directed"] = style["headsOnly"] = True # self.styles = style # if "selftrain_group" in self.styles: # self.selfTrainGroups = set() # if "selftrain_group-1" in self.styles: # self.selfTrainGroups.add("-1") # if "selftrain_group0" in self.styles: # self.selfTrainGroups.add("0") # if "selftrain_group1" in self.styles: # self.selfTrainGroups.add("1") # if "selftrain_group2" in self.styles: # self.selfTrainGroups.add("2") # if "selftrain_group3" in self.styles: # self.selfTrainGroups.add("3") # print >> sys.stderr, "Self-train-groups:", self.selfTrainGroups self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet) # NOTE Temporarily re-enabling predicted range #self.multiEdgeFeatureBuilder.definePredictedValueRange([], None) if self.styles["graph_kernel"]: from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet) if self.styles["noAnnType"]: self.multiEdgeFeatureBuilder.noAnnType = True if self.styles["noMasking"]: self.multiEdgeFeatureBuilder.maskNamedEntities = False if self.styles["maxFeatures"]: self.multiEdgeFeatureBuilder.maximum = True if self.styles["genia_task1"]: self.multiEdgeFeatureBuilder.filterAnnTypes.add("Entity") self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet) if self.styles["ontology"]: self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet) if self.styles["nodalida"]: self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(self.featureSet) if self.styles["bacteria_renaming"]: self.bacteriaRenamingFeatureBuilder = BacteriaRenamingFeatureBuilder(self.featureSet) if self.styles["trigger_features"]: self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet) self.triggerFeatureBuilder.useNonNameEntities = True if self.styles["genia_task1"]: self.triggerFeatureBuilder.filterAnnTypes.add("Entity") #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName) if self.styles["rel_features"]: self.relFeatureBuilder = RELFeatureBuilder(featureSet) if self.styles["ddi_features"]: self.drugFeatureBuilder = DrugFeatureBuilder(featureSet) if self.styles["evex"]: self.evexFeatureBuilder = 
EVEXFeatureBuilder(featureSet) if self.styles["giuliano"]: self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet) self.pathLengths = length assert(self.pathLengths == None) self.types = types if self.styles["random"]: from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet) def definePredictedValueRange(self, sentences, elementName): self.multiEdgeFeatureBuilder.definePredictedValueRange(sentences, elementName) def getPredictedValueRange(self): return self.multiEdgeFeatureBuilder.predictedRange def filterEdgesByType(self, edges, typesToInclude): if len(typesToInclude) == 0: return edges edgesToKeep = [] for edge in edges: if edge.get("type") in typesToInclude: edgesToKeep.append(edge) return edgesToKeep def getCategoryNameFromTokens(self, sentenceGraph, t1, t2, directed=True): """ Example class. Multiple overlapping edges create a merged type. """ types = set() # if sentenceGraph.interactionGraph.has_edge(t1, t2): # intEdges = sentenceGraph.interactionGraph.get_edge_data(t1, t2, default={}) # # NOTE: Only works if keys are ordered integers # for i in range(len(intEdges)): # types.add(intEdges[i]["element"].get("type")) # if (not directed) and sentenceGraph.interactionGraph.has_edge(t2, t1): # intEdges = sentenceGraph.interactionGraph.get_edge(t2, t1, default={}) # # NOTE: Only works if keys are ordered integers # for i in range(len(intEdges)): # types.add(intEdges[i]["element"].get("type")) intEdges = sentenceGraph.interactionGraph.getEdges(t1, t2) if (not directed): intEdges = intEdges + sentenceGraph.interactionGraph.getEdges(t2, t1) for intEdge in intEdges: types.add(intEdge[2].get("type")) types = list(types) types.sort() categoryName = "" for name in types: if categoryName != "": categoryName += "---" categoryName += name if categoryName != "": return categoryName else: return "neg" def getCategoryName(self, sentenceGraph, e1, e2, directed=True, duplicateEntities=None): """ Example class. Multiple overlapping edges create a merged type. 
""" # interactions = [] # e1s = [e1] # if duplicateEntities != None and e1 in duplicateEntities: # e1s += duplicateEntities[e1] # e2s = [e2] # if duplicateEntities != None and e2 in duplicateEntities: # e2s += duplicateEntities[e2] # for entity1 in e1s: # for entity2 in e2s: # interactions = interactions + sentenceGraph.getInteractions(entity1, entity2) # if not directed: # interactions = interactions + sentenceGraph.getInteractions(entity2, entity1) interactions = sentenceGraph.getInteractions(e1, e2, True) #print interactions types = set() for interaction in interactions: types.add(interaction[2].get("type")) types = list(types) types.sort() categoryName = "" for name in types: if self.styles["causeOnly"] and name != "Cause": continue if self.styles["themeOnly"] and name != "Theme": continue if categoryName != "": categoryName += "---" categoryName += name if categoryName != "": return categoryName else: return "neg" def isPotentialRELInteraction(self, e1, e2): if e1.get("type") == "Protein" and e2.get("type") == "Entity": return True else: return False def isPotentialBBInteraction(self, e1, e2, sentenceGraph): #if e1.get("type") == "Bacterium" and e2.get("type") in ["Host", "HostPart", "Geographical", "Environmental", "Food", "Medical", "Soil", "Water"]: # Note: "Environment" type is misspelled as "Environmental" in the BB-task documentation if e1.get("type") == "Bacterium" and e2.get("type") in ["Host", "HostPart", "Geographical", "Environment", "Food", "Medical", "Soil", "Water"]: return True elif e1.get("type") == "Host" and e2.get("type") == "HostPart": return True else: return False def getBISuperType(self, eType): if eType in ["GeneProduct", "Protein", "ProteinFamily", "PolymeraseComplex"]: return "ProteinEntity" elif eType in ["Gene", "GeneFamily", "GeneComplex", "Regulon", "Site", "Promoter"]: return "GeneEntity" else: return None def isPotentialBIInteraction(self, e1, e2, sentenceGraph, stats): e1Type = e1.get("type") e1SuperType = self.getBISuperType(e1Type) e2Type = e2.get("type") e2SuperType = self.getBISuperType(e2Type) tag = "(" + e1Type + "/" + e2Type + ")" if e1Type == "Regulon": if e2SuperType in ["GeneEntity", "ProteinEntity"]: return True if e1SuperType == "ProteinEntity": if e2Type in ["Site", "Promoter", "Gene", "GeneComplex"]: return True if e1Type in ["Action", "Transcription", "Expression"]: return True if e1Type == "Site": if e2SuperType == "GeneEntity": return True if e1Type == "Promoter": if e2SuperType in ["GeneEntity", "ProteinEntity"]: return True if e1SuperType in ["GeneEntity", "ProteinEntity"]: if e2SuperType in ["GeneEntity", "ProteinEntity"]: return True stats.filter("bi_limits") #+tag) return False def isPotentialEPIInteraction(self, e1, e2, sentenceGraph): if e1.get("type") != "Catalysis": if e1.get("type") in ["Protein", "Entity"]: return False elif e2.get("type") in ["Protein", "Entity"]: return True else: return False else: # Catalysis if e2.get("type") != "Entity": return True else: return False assert False, (e1.get("type"), e2.get("type")) def isPotentialIDInteraction(self, e1, e2, sentenceGraph): e1Type = e1.get("type") e2Type = e2.get("type") e1IsCore = e1Type in ["Protein", "Regulon-operon", "Two-component-system", "Chemical", "Organism"] e2IsCore = e2Type in ["Protein", "Regulon-operon", "Two-component-system", "Chemical", "Organism"] if e1IsCore: return False elif e1Type in ["Gene_expression", "Transcription"]: if e2Type in ["Protein", "Regulon-operon"]: return True else: return False elif e1Type in ["Protein_catabolism", 
"Phosphorylation"]: if e2Type == "Protein": return True else: return False elif e1Type == "Localization": if e2IsCore or e2Type == "Entity": return True else: return False elif e1Type in ["Binding", "Process"]: if e2IsCore: return True else: return False elif "egulation" in e1Type: if e2Type != "Entity": return True else: return False elif e1Type == "Entity": if e2IsCore: return True else: return False assert False, (e1Type, e2Type) def isPotentialCOInteraction(self, e1, e2, sentenceGraph): if e1.get("type") == "Exp" and e2.get("type") == "Exp": anaphoraTok = sentenceGraph.entityHeadTokenByEntity[e1] antecedentTok = sentenceGraph.entityHeadTokenByEntity[e2] antecedentTokenFound = False for token in sentenceGraph.tokens: if token == antecedentTok: antecedentTokenFound = True if token == anaphoraTok: # if, not elif, to take into accoutn cases where e1Tok == e2Tok if antecedentTokenFound: return True else: return False assert False elif e1.get("type") == "Exp" and e2.get("type") == "Protein": return True else: return False def isPotentialGeniaInteraction(self, e1, e2): e1Type = e1.get("type") e2Type = e2.get("type") if e1Type == "Protein": return False elif e1Type in ["Entity", "Gene_expression", "Transcription", "Protein_catabolism", "Phosphorylation", "Binding"]: if e2Type == "Protein": return True else: return False elif e1Type == "Localization": if e2Type in ["Protein", "Entity"]: return True else: return False elif "egulation" in e1Type: if e2Type != "Entity": return True else: return False assert False, (e1Type, e2Type) def getGoldCategoryName(self, goldGraph, entityToGold, e1, e2, directed=True): if len(entityToGold[e1]) > 0 and len(entityToGold[e2]) > 0: return self.getCategoryName(goldGraph, entityToGold[e1][0], entityToGold[e2][0], directed=directed) else: return "neg" def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph = None): """ Build examples for a single sentence. Returns a list of examples. See Core/ExampleUtils for example format. 
""" #examples = [] exampleIndex = 0 if self.styles["trigger_features"]: self.triggerFeatureBuilder.initSentence(sentenceGraph) if self.styles["evex"]: self.evexFeatureBuilder.initSentence(sentenceGraph) # Filter entities, if needed #mergedIds = None #duplicateEntities = None #entities = sentenceGraph.entities #entities, mergedIds, duplicateEntities = self.mergeEntities(sentenceGraph, False) # "no_duplicates" in self.styles) sentenceGraph.mergeInteractionGraph(True) entities = sentenceGraph.mergedEntities entityToDuplicates = sentenceGraph.mergedEntityToDuplicates self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities)) # Connect to optional gold graph if goldGraph != None: entityToGold = EvaluateInteractionXML.mapEntities(entities, goldGraph.entities) paths = None if not self.styles["no_path"]: ##undirected = sentenceGraph.getUndirectedDependencyGraph() #undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph) ###undirected = sentenceGraph.dependencyGraph.to_undirected() ####undirected = NX10.MultiGraph(sentenceGraph.dependencyGraph) This didn't work undirected = sentenceGraph.dependencyGraph.toUndirected() #paths = NX10.all_pairs_shortest_path(undirected, cutoff=999) paths = undirected #for edge in sentenceGraph.dependencyGraph.edges: # assert edge[2] != None #for edge in undirected.edges: # assert edge[2] != None #if sentenceGraph.sentenceElement.get("id") == "GENIA.d70.s5": # print [(x[0].get("id"), x[1].get("id"), x[2].get("id")) for x in sentenceGraph.dependencyGraph.edges] # Generate examples based on interactions between entities or interactions between tokens if self.styles["entities"]: loopRange = len(entities) else: loopRange = len(sentenceGraph.tokens) for i in range(loopRange-1): for j in range(i+1,loopRange): eI = None eJ = None if self.styles["entities"]: eI = entities[i] eJ = entities[j] tI = sentenceGraph.entityHeadTokenByEntity[eI] tJ = sentenceGraph.entityHeadTokenByEntity[eJ] #if "no_ne_interactions" in self.styles and eI.get("isName") == "True" and eJ.get("isName") == "True": # continue if eI.get("type") == "neg" or eJ.get("type") == "neg": continue if self.styles["skip_extra_triggers"]: if eI.get("source") != None or eJ.get("source") != None: continue else: tI = sentenceGraph.tokens[i] tJ = sentenceGraph.tokens[j] # only consider paths between entities (NOTE! 
entities, not only named entities) if self.styles["headsOnly"]: if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0): continue if self.styles["directed"]: # define forward if self.styles["entities"]: categoryName = self.getCategoryName(sentenceGraph, eI, eJ, True) if goldGraph != None: categoryName = self.getGoldCategoryName(goldGraph, entityToGold, eI, eJ, True) else: categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, True) # make forward self.exampleStats.beginExample(categoryName) makeExample = True if self.styles["genia_limits"] and not self.isPotentialGeniaInteraction(eI, eJ): makeExample = False self.exampleStats.filter("genia_limits") if self.styles["genia_task1"] and (eI.get("type") == "Entity" or eJ.get("type") == "Entity"): makeExample = False self.exampleStats.filter("genia_task1") if self.styles["rel_limits"] and not self.isPotentialRELInteraction(eI, eJ): makeExample = False self.exampleStats.filter("rel_limits") if self.styles["co_limits"] and not self.isPotentialCOInteraction(eI, eJ, sentenceGraph): makeExample = False self.exampleStats.filter("co_limits") if self.styles["bb_limits"] and not self.isPotentialBBInteraction(eI, eJ, sentenceGraph): makeExample = False self.exampleStats.filter("bb_limits") if categoryName != "neg": self.exampleStats.filter("bb_limits(" + categoryName + ":" + eI.get("type") + "/" + eJ.get("type") + ")") if self.styles["bi_limits"] and not self.isPotentialBIInteraction(eI, eJ, sentenceGraph, self.exampleStats): makeExample = False #self.exampleStats.filter("bi_limits") if self.styles["epi_limits"] and not self.isPotentialEPIInteraction(eI, eJ, sentenceGraph): makeExample = False self.exampleStats.filter("epi_limits") if self.styles["id_limits"] and not self.isPotentialIDInteraction(eI, eJ, sentenceGraph): makeExample = False self.exampleStats.filter("id_limits") # if self.styles["selftrain_limits"] and (eI.get("selftrain") == "False" or eJ.get("selftrain") == "False"): # makeExample = False # self.exampleStats.filter("selftrain_limits") # if self.styles["selftrain_group"] and (eI.get("selftraingroup") not in self.selfTrainGroups or eJ.get("selftraingroup") not in self.selfTrainGroups): # makeExample = False # self.exampleStats.filter("selftrain_group") if self.styles["pos_only"] and categoryName == "neg": makeExample = False self.exampleStats.filter("pos_only") if makeExample: #examples.append( self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ) ) ExampleUtils.appendExamples([self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ)], outfile) exampleIndex += 1 self.exampleStats.endExample() # define reverse if self.styles["entities"]: categoryName = self.getCategoryName(sentenceGraph, eJ, eI, True) if goldGraph != None: categoryName = self.getGoldCategoryName(goldGraph, entityToGold, eJ, eI, True) else: categoryName = self.getCategoryNameFromTokens(sentenceGraph, tJ, tI, True) # make reverse self.exampleStats.beginExample(categoryName) makeExample = True if self.styles["genia_limits"] and not self.isPotentialGeniaInteraction(eJ, eI): makeExample = False self.exampleStats.filter("genia_limits") if self.styles["genia_task1"] and (eI.get("type") == "Entity" or eJ.get("type") == "Entity"): makeExample = False self.exampleStats.filter("genia_task1") if self.styles["rel_limits"] and not self.isPotentialRELInteraction(eJ, eI): makeExample = False self.exampleStats.filter("rel_limits") if self.styles["co_limits"] and not 
self.isPotentialCOInteraction(eJ, eI, sentenceGraph): makeExample = False self.exampleStats.filter("co_limits") if self.styles["bb_limits"] and not self.isPotentialBBInteraction(eJ, eI, sentenceGraph): makeExample = False self.exampleStats.filter("bb_limits") if categoryName != "neg": self.exampleStats.filter("bb_limits(" + categoryName + ":" + eJ.get("type") + "/" + eI.get("type") + ")") if self.styles["bi_limits"] and not self.isPotentialBIInteraction(eJ, eI, sentenceGraph, self.exampleStats): makeExample = False #self.exampleStats.filter("bi_limits") if self.styles["epi_limits"] and not self.isPotentialEPIInteraction(eJ, eI, sentenceGraph): makeExample = False self.exampleStats.filter("epi_limits") if self.styles["id_limits"] and not self.isPotentialIDInteraction(eJ, eI, sentenceGraph): makeExample = False self.exampleStats.filter("id_limits") # if self.styles["selftrain_limits"] and (eI.get("selftrain") == "False" or eJ.get("selftrain") == "False"): # makeExample = False # self.exampleStats.filter("selftrain_limits") # if self.styles["selftrain_group"] and (eI.get("selftraingroup") not in self.selfTrainGroups or eJ.get("selftraingroup") not in self.selfTrainGroups): # makeExample = False # self.exampleStats.filter("selftrain_group") if self.styles["pos_only"] and categoryName == "neg": makeExample = False self.exampleStats.filter("pos_only") if makeExample: #examples.append( self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI) ) ExampleUtils.appendExamples([self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI)], outfile) exampleIndex += 1 self.exampleStats.endExample() else: if self.styles["entities"]: categoryName = self.getCategoryName(sentenceGraph, eI, eJ, False) else: categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, False) self.exampleStats.beginExample(categoryName) forwardExample = self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ) if not self.styles["graph_kernel"]: reverseExample = self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI) forwardExample[2].update(reverseExample[2]) #examples.append(forwardExample) ExampleUtils.appendExamples([forwardExample], outfile) exampleIndex += 1 self.exampleStats.endExample() #return examples return exampleIndex def buildExample(self, token1, token2, paths, sentenceGraph, categoryName, exampleIndex, entity1=None, entity2=None): """ Build a single directed example for the potential edge between token1 and token2 """ # dummy return for speed testing #return (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),1,{},{}) # define features features = {} if True: #token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2): #if token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2): # path = paths[token1][token2] #else: # path = [token1, token2] if not self.styles["no_path"]: # directedPath reduces performance by 0.01 pp #directedPath = sentenceGraph.dependencyGraph.getPaths(token1, token2) #if len(directedPath) == 0: # directedPath = sentenceGraph.dependencyGraph.getPaths(token2, token1) # for dp in directedPath: # dp.reverse() #if len(directedPath) == 0: # path = paths.getPaths(token1, token2) #else: # path = directedPath path = paths.getPaths(token1, token2) if len(path) > 0: #if len(path) > 1: # print len(path) path = path[0] pathExists = True else: path = [token1, token2] pathExists = False else: path = [token1, token2] pathExists = False #print 
token1.get("id"), token2.get("id") assert(self.pathLengths == None) if self.pathLengths == None or len(path)-1 in self.pathLengths: # if not "no_ontology" in self.styles: # self.ontologyFeatureBuilder.setFeatureVector(features) # self.ontologyFeatureBuilder.buildOntologyFeaturesForPath(sentenceGraph, path) # self.ontologyFeatureBuilder.setFeatureVector(None) if self.styles["trigger_features"]: # F 85.52 -> 85.55 self.triggerFeatureBuilder.setFeatureVector(features) self.triggerFeatureBuilder.tag = "trg1_" self.triggerFeatureBuilder.buildFeatures(token1) self.triggerFeatureBuilder.tag = "trg2_" self.triggerFeatureBuilder.buildFeatures(token2) self.triggerFeatureBuilder.setFeatureVector(None) # REL features if self.styles["rel_features"] and not self.styles["no_task"]: self.relFeatureBuilder.setFeatureVector(features) self.relFeatureBuilder.tag = "rel1_" self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token1)) self.relFeatureBuilder.tag = "rel2_" self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token2)) self.relFeatureBuilder.setFeatureVector(None) if self.styles["bacteria_renaming"] and not self.styles["no_task"]: self.bacteriaRenamingFeatureBuilder.setFeatureVector(features) self.bacteriaRenamingFeatureBuilder.buildPairFeatures(entity1, entity2) #self.bacteriaRenamingFeatureBuilder.buildSubstringFeatures(entity1, entity2) # decreases perf. 74.76 -> 72.41 self.bacteriaRenamingFeatureBuilder.setFeatureVector(None) if self.styles["co_limits"] and not self.styles["no_task"]: e1Offset = Range.charOffsetToSingleTuple(entity1.get("charOffset")) e2Offset = Range.charOffsetToSingleTuple(entity2.get("charOffset")) if Range.contains(e1Offset, e2Offset): features[self.featureSet.getId("e1_contains_e2")] = 1 if entity2.get("isName") == "True": features[self.featureSet.getId("e1_contains_e2name")] = 1 if Range.contains(e2Offset, e1Offset): features[self.featureSet.getId("e2_contains_e1")] = 1 if entity1.get("isName") == "True": features[self.featureSet.getId("e2_contains_e1name")] = 1 if self.styles["ddi_features"]: self.drugFeatureBuilder.setFeatureVector(features) self.drugFeatureBuilder.tag = "ddi_" self.drugFeatureBuilder.buildPairFeatures(entity1, entity2) if self.styles["ddi_mtmx"]: self.drugFeatureBuilder.buildMTMXFeatures(entity1, entity2) self.drugFeatureBuilder.setFeatureVector(None) #if "graph_kernel" in self.styles or not "no_dependency" in self.styles: # #print "Getting edges" # if token1 != token2 and pathExists: # #print "g1" # edges = self.multiEdgeFeatureBuilder.getEdges(sentenceGraph.dependencyGraph, path) # #print "g2" # else: # edges = None if self.styles["graph_kernel"]: self.graphKernelFeatureBuilder.setFeatureVector(features, entity1, entity2) self.graphKernelFeatureBuilder.buildGraphKernelFeatures(sentenceGraph, path) self.graphKernelFeatureBuilder.setFeatureVector(None) if self.styles["entity_type"]: features[self.featureSet.getId("e1_"+entity1.get("type"))] = 1 features[self.featureSet.getId("e2_"+entity2.get("type"))] = 1 features[self.featureSet.getId("distance_"+str(len(path)))] = 1 if not self.styles["no_dependency"]: #print "Dep features" self.multiEdgeFeatureBuilder.setFeatureVector(features, entity1, entity2) #self.multiEdgeFeatureBuilder.buildStructureFeatures(sentenceGraph, paths) # remove for fast if not self.styles["disable_entity_features"]: self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph) self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path) if not 
self.styles["disable_terminus_features"]: self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast if not self.styles["disable_single_element_features"]: self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, sentenceGraph) if not self.styles["disable_ngram_features"]: #print "NGrams" self.multiEdgeFeatureBuilder.buildPathGrams(2, path, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams(3, path, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams(4, path, sentenceGraph) # remove for fast #self.buildEdgeCombinations(path, edges, sentenceGraph, features) # remove for fast #if edges != None: # self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[0], edges[0][1]+edges[1][0], "t1", sentenceGraph) # remove for fast # self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[-1], edges[len(path)-1][len(path)-2]+edges[len(path)-2][len(path)-1], "t2", sentenceGraph) # remove for fast if not self.styles["disable_path_edge_features"]: self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, sentenceGraph) self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph) self.multiEdgeFeatureBuilder.setFeatureVector(None) if self.styles["nodalida"]: self.nodalidaFeatureBuilder.setFeatureVector(features, entity1, entity2) shortestPaths = self.nodalidaFeatureBuilder.buildShortestPaths(sentenceGraph.dependencyGraph, path) print shortestPaths if len(shortestPaths) > 0: self.nodalidaFeatureBuilder.buildNGrams(shortestPaths, sentenceGraph) self.nodalidaFeatureBuilder.setFeatureVector(None) if not self.styles["no_linear"]: self.tokenFeatureBuilder.setFeatureVector(features) for i in range(len(sentenceGraph.tokens)): if sentenceGraph.tokens[i] == token1: token1Index = i if sentenceGraph.tokens[i] == token2: token2Index = i linearPreTag = "linfw_" if token1Index > token2Index: token1Index, token2Index = token2Index, token1Index linearPreTag = "linrv_" self.tokenFeatureBuilder.buildLinearOrderFeatures(token1Index, sentenceGraph, 2, 2, preTag="linTok1") self.tokenFeatureBuilder.buildLinearOrderFeatures(token2Index, sentenceGraph, 2, 2, preTag="linTok2") # Before, middle, after # self.tokenFeatureBuilder.buildTokenGrams(0, token1Index-1, sentenceGraph, "bf") # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, "bw") # self.tokenFeatureBuilder.buildTokenGrams(token2Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, "af") # before-middle, middle, middle-after # self.tokenFeatureBuilder.buildTokenGrams(0, token2Index-1, sentenceGraph, linearPreTag+"bf", max=2) # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, linearPreTag+"bw", max=2) # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, linearPreTag+"af", max=2) self.tokenFeatureBuilder.setFeatureVector(None) if self.styles["random"]: self.randomFeatureBuilder.setFeatureVector(features) self.randomFeatureBuilder.buildRandomFeatures(100, 0.01) self.randomFeatureBuilder.setFeatureVector(None) if self.styles["genia_limits"] and not self.styles["no_task"]: e1Type = entity1.get("type") e2Type = entity2.get("type") assert(entity1.get("isName") == "False") if entity2.get("isName") == "True": features[self.featureSet.getId("GENIA_target_protein")] = 1 else: features[self.featureSet.getId("GENIA_nested_event")] = 1 if e1Type.find("egulation") != -1: # leave r out to avoid problems with capitalization if entity2.get("isName") == "True": 
features[self.featureSet.getId("GENIA_regulation_of_protein")] = 1 else: features[self.featureSet.getId("GENIA_regulation_of_event")] = 1 if self.styles["bi_limits"]: # Make features based on entity types e1Type = entity1.get("type") e2Type = entity2.get("type") e1SuperType = str(self.getBISuperType(e1Type)) e2SuperType = str(self.getBISuperType(e2Type)) features[self.featureSet.getId("BI_e1_"+e1Type)] = 1 features[self.featureSet.getId("BI_e2_"+e2Type)] = 1 features[self.featureSet.getId("BI_e1sup_"+e1SuperType)] = 1 features[self.featureSet.getId("BI_e2sup_"+e2SuperType)] = 1 features[self.featureSet.getId("BI_e1e2_"+e1Type+"_"+e2Type)] = 1 features[self.featureSet.getId("BI_e1e2sup_"+e1SuperType+"_"+e2SuperType)] = 1 if self.styles["evex"]: self.evexFeatureBuilder.setFeatureVector(features, entity1, entity2) self.evexFeatureBuilder.buildEdgeFeatures(entity1, entity2, token1, token2, path, sentenceGraph) self.evexFeatureBuilder.setFeatureVector(None) if self.styles["giuliano"]: self.giulianoFeatureBuilder.setFeatureVector(features, entity1, entity2) self.giulianoFeatureBuilder.buildEdgeFeatures(entity1, entity2, token1, token2, path, sentenceGraph) self.giulianoFeatureBuilder.setFeatureVector(None) else: features[self.featureSet.getId("always_negative")] = 1 if self.styles["subset"]: features[self.featureSet.getId("out_of_scope")] = 1 else: features[self.featureSet.getId("always_negative")] = 1 if self.styles["subset"]: features[self.featureSet.getId("out_of_scope")] = 1 path = [token1, token2] # define extra attributes #if int(path[0].get("id").split("_")[-1]) < int(path[-1].get("id").split("_")[-1]): if int(path[0].get("charOffset").split("-")[0]) < int(path[-1].get("charOffset").split("-")[0]): #extra = {"xtype":"edge","type":"i","t1":path[0],"t2":path[-1]} extra = {"xtype":"edge","type":"i","t1":path[0].get("id"),"t2":path[-1].get("id")} extra["deprev"] = False else: #extra = {"xtype":"edge","type":"i","t1":path[-1],"t2":path[0]} extra = {"xtype":"edge","type":"i","t1":path[-1].get("id"),"t2":path[0].get("id")} extra["deprev"] = True if entity1 != None: #extra["e1"] = entity1 extra["e1"] = entity1.get("id") if sentenceGraph.mergedEntityToDuplicates != None: #extra["e1GoldIds"] = mergedEntityIds[entity1] extra["e1DuplicateIds"] = ",".join([x.get("id") for x in sentenceGraph.mergedEntityToDuplicates[entity1]]) if entity2 != None: #extra["e2"] = entity2 extra["e2"] = entity2.get("id") if sentenceGraph.mergedEntityToDuplicates != None: extra["e2DuplicateIds"] = ",".join([x.get("id") for x in sentenceGraph.mergedEntityToDuplicates[entity2]]) #extra["e2GoldIds"] = mergedEntityIds[entity2] extra["categoryName"] = categoryName if self.styles["bacteria_renaming"]: if entity1.get("text") != None and entity1.get("text") != "": extra["e1t"] = entity1.get("text").replace(" ", "---").replace(":","-COL-") if entity2.get("text") != None and entity2.get("text") != "": extra["e2t"] = entity2.get("text").replace(" ", "---").replace(":","-COL-") sentenceOrigId = sentenceGraph.sentenceElement.get("origId") if sentenceOrigId != None: extra["SOID"] = sentenceOrigId # make example if self.styles["binary"]: if categoryName != "neg": category = 1 else: category = -1 categoryName = "i" else: category = self.classSet.getId(categoryName) # NOTE: temporarily disable for replicating 110310 experiment #features[self.featureSet.getId("extra_constant")] = 1 return (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)
class EdgeExampleBuilder(ExampleBuilder): """ This example builder makes edge examples, i.e. examples describing the event arguments. """ def __init__(self, style=None, types=[], featureSet=None, classSet=None): if featureSet == None: featureSet = IdSet() if classSet == None: classSet = IdSet(1) else: classSet = classSet ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet) assert( classSet.getId("neg") == 1 or (len(classSet.Ids)== 2 and classSet.getId("neg") == -1) ) # Basic style = trigger_features:typed:directed:no_linear:entities:auto_limits:noMasking:maxFeatures self._setDefaultParameters([ "directed", "undirected", "headsOnly", "graph_kernel", "noAnnType", "mask_nodes", "limit_features", "no_auto_limits", "co_features", "genia_features", "bi_features", #"genia_limits", "epi_limits", "id_limits", "rel_limits", "bb_limits", "bi_limits", "co_limits", "genia_task1", "ontology", "nodalida", "bacteria_renaming", "no_trigger_features", "rel_features", "drugbank_features", "ddi_mtmx", "evex", "giuliano", "random", "themeOnly", "causeOnly", "no_path", "token_nodes", "skip_extra_triggers", "headsOnly", "graph_kernel", "no_task", "no_dependency", "disable_entity_features", "disable_terminus_features", "disable_single_element_features", "disable_ngram_features", "disable_path_edge_features", "linear_features", "subset", "binary", "pos_only", "entity_type", "filter_shortest_path", "maskTypeAsProtein", "keep_neg", "metamap", "sdb_merge", "sdb_features", "ontobiotope_features", "no_self_loops", "full_entities", "no_features", "wordnet", "wordvector", "se10t8_undirected", "filter_types", "doc_extra", "entity_extra"]) self.styles = self.getParameters(style) #if style == None: # no parameters given # style["typed"] = style["directed"] = style["headsOnly"] = True self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet, self.styles) # NOTE Temporarily re-enabling predicted range #self.multiEdgeFeatureBuilder.definePredictedValueRange([], None) if self.styles["graph_kernel"]: from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet) if self.styles["noAnnType"]: self.multiEdgeFeatureBuilder.noAnnType = True if self.styles["mask_nodes"]: self.multiEdgeFeatureBuilder.maskNamedEntities = True else: self.multiEdgeFeatureBuilder.maskNamedEntities = False if not self.styles["limit_features"]: self.multiEdgeFeatureBuilder.maximum = True if self.styles["genia_task1"]: self.multiEdgeFeatureBuilder.filterAnnTypes.add("Entity") self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet) if self.styles["ontology"]: self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet) if self.styles["ontobiotope_features"]: self.ontobiotopeFeatureBuilder = OntoBiotopeFeatureBuilder(self.featureSet) if self.styles["nodalida"]: self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(self.featureSet) if self.styles["bacteria_renaming"]: self.bacteriaRenamingFeatureBuilder = BacteriaRenamingFeatureBuilder(self.featureSet) if not self.styles["no_trigger_features"]: self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet, self.styles) self.triggerFeatureBuilder.useNonNameEntities = True if self.styles["noAnnType"]: self.triggerFeatureBuilder.noAnnType = True if self.styles["genia_task1"]: self.triggerFeatureBuilder.filterAnnTypes.add("Entity") #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName) if 
self.styles["rel_features"]: self.relFeatureBuilder = RELFeatureBuilder(featureSet) if self.styles["drugbank_features"]: self.drugFeatureBuilder = DrugFeatureBuilder(featureSet) if self.styles["evex"]: self.evexFeatureBuilder = EVEXFeatureBuilder(featureSet) if self.styles["wordnet"]: self.wordNetFeatureBuilder = WordNetFeatureBuilder(featureSet) if self.styles["wordvector"]: self.wordVectorFeatureBuilder = WordVectorFeatureBuilder(featureSet, self.styles) if self.styles["giuliano"]: self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet) self.types = types if self.styles["random"]: from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet) def definePredictedValueRange(self, sentences, elementName): self.multiEdgeFeatureBuilder.definePredictedValueRange(sentences, elementName) def getPredictedValueRange(self): return self.multiEdgeFeatureBuilder.predictedRange def filterEdgesByType(self, edges, typesToInclude): if len(typesToInclude) == 0: return edges edgesToKeep = [] for edge in edges: if edge.get("type") in typesToInclude: edgesToKeep.append(edge) return edgesToKeep def getCategoryNameFromTokens(self, sentenceGraph, t1, t2, directed=True): """ Example class. Multiple overlapping edges create a merged type. """ types = set() intEdges = sentenceGraph.interactionGraph.getEdges(t1, t2) if not directed: intEdges = intEdges + sentenceGraph.interactionGraph.getEdges(t2, t1) for intEdge in intEdges: types.add(intEdge[2].get("type")) types = list(types) types.sort() categoryName = "" for name in types: if categoryName != "": categoryName += "---" categoryName += name if categoryName != "": return categoryName else: return "neg" def getCategoryName(self, sentenceGraph, e1, e2, directed=True): """ Example class. Multiple overlapping edges create a merged type. 
""" interactions = sentenceGraph.getInteractions(e1, e2, True) if not directed and not self.styles["se10t8_undirected"]: interactions = interactions + sentenceGraph.getInteractions(e2, e1, True) types = set() for interaction in interactions: types.add(interaction[2].get("type")) types = list(types) types.sort() categoryName = "" for name in types: if self.styles["causeOnly"] and name != "Cause": continue if self.styles["themeOnly"] and name != "Theme": continue if categoryName != "": categoryName += "---" if self.styles["sdb_merge"]: name = self.mergeForSeeDev(name, self.structureAnalyzer) categoryName += name if categoryName != "": return categoryName else: return "neg" def getBISuperType(self, eType): if eType in ["GeneProduct", "Protein", "ProteinFamily", "PolymeraseComplex"]: return "ProteinEntity" elif eType in ["Gene", "GeneFamily", "GeneComplex", "Regulon", "Site", "Promoter"]: return "GeneEntity" else: return None def getSeeDevSuperTypes(self, eType): if eType in ("Gene", "Gene_Family", "Box", "Promoter"): return ("DNA", "Molecule") elif eType == "RNA": return ("RNA", "DNA_Product", "Molecule") elif eType in ("Protein", "Protein_Family", "Protein_Complex", "Protein_Domain"): return ("Amino_acid_sequence", "DNA_Product", "Molecule") elif eType == "Hormone": return ("Molecule",) elif eType in ("Regulatory_Network", "Pathway"): return ("Dynamic_process",) elif eType in ("Genotype", "Tissue", "Development_Phase"): return ("Biological_context", "Context") elif eType == "Environmental_Factor": return ("Context",) else: raise Exception("Unknown SeeDev type '" + str(eType) + "'") def mergeForSeeDev(self, categoryName, structureAnalyzer): if categoryName in structureAnalyzer.typeMap["forward"]: return structureAnalyzer.typeMap["forward"][categoryName] return categoryName # for tag in ("Regulates", "Exists", "Interacts", "Is", "Occurs"): # if categoryName.startswith(tag): # categoryName = tag # break # return categoryName def processCorpus(self, input, output, gold=None, append=False, allowNewIds=True, structureAnalyzer=None): if self.styles["sdb_merge"]: structureAnalyzer.determineNonOverlappingTypes() self.structureAnalyzer = structureAnalyzer ExampleBuilder.processCorpus(self, input, output, gold, append, allowNewIds, structureAnalyzer) def isValidInteraction(self, e1, e2, structureAnalyzer,forceUndirected=False): return len(structureAnalyzer.getValidEdgeTypes(e1.get("type"), e2.get("type"), forceUndirected=forceUndirected)) > 0 def getGoldCategoryName(self, goldGraph, entityToGold, e1, e2, directed=True): if len(entityToGold[e1]) > 0 and len(entityToGold[e2]) > 0: return self.getCategoryName(goldGraph, entityToGold[e1][0], entityToGold[e2][0], directed=directed) else: return "neg" def filterEdge(self, edge, edgeTypes): import types assert edgeTypes != None if type(edgeTypes) not in [types.ListType, types.TupleType]: edgeTypes = [edgeTypes] if edge[2].get("type") in edgeTypes: return True else: return False def keepExample(self, e1, e2, categoryName, isDirected, structureAnalyzer): makeExample = True if (not self.styles["no_auto_limits"]) and not self.isValidInteraction(e1, e2, structureAnalyzer, forceUndirected=not isDirected): makeExample = False self.exampleStats.filter("auto_limits") if self.styles["genia_task1"] and (e1.get("type") == "Entity" or e2.get("type") == "Entity"): makeExample = False self.exampleStats.filter("genia_task1") if self.styles["pos_only"] and categoryName == "neg": makeExample = False self.exampleStats.filter("pos_only") if self.styles["no_self_loops"] and ((e1 
== e2) or (e1.get("headOffset") == e2.get("headOffset"))): makeExample = False self.exampleStats.filter("no_self_loops") return makeExample def getExampleCategoryName(self, e1=None, e2=None, t1=None, t2=None, sentenceGraph=None, goldGraph=None, entityToGold=None, isDirected=True, structureAnalyzer=None): if self.styles["token_nodes"]: categoryName = self.getCategoryNameFromTokens(sentenceGraph, t1, t2, isDirected) else: categoryName = self.getCategoryName(sentenceGraph, e1, e2, isDirected) if goldGraph != None: categoryName = self.getGoldCategoryName(goldGraph, entityToGold, e1, e2, isDirected) if self.styles["filter_types"] != None and categoryName in self.styles["filter_types"]: categoryName = "neg" if self.styles["se10t8_undirected"]: assert e1.get("id").endswith(".e1") assert e2.get("id").endswith(".e2") #if self.styles["sdb_merge"]: # categoryName = self.mergeForSeeDev(categoryName, structureAnalyzer) return categoryName def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph = None, structureAnalyzer=None): """ Build examples for a single sentence. Returns a list of examples. See Core/ExampleUtils for example format. """ #examples = [] exampleIndex = 0 # example directionality if self.styles["directed"] == None and self.styles["undirected"] == None: # determine directedness from corpus examplesAreDirected = structureAnalyzer.hasDirectedTargets() if structureAnalyzer != None else True elif self.styles["directed"]: assert self.styles["undirected"] in [None, False] examplesAreDirected = True elif self.styles["undirected"]: assert self.styles["directed"] in [None, False] examplesAreDirected = False if not self.styles["no_trigger_features"]: self.triggerFeatureBuilder.initSentence(sentenceGraph) if self.styles["evex"]: self.evexFeatureBuilder.initSentence(sentenceGraph) # if self.styles["sdb_merge"]: # self.determineNonOverlappingTypes(structureAnalyzer) # Filter entities, if needed sentenceGraph.mergeInteractionGraph(True) entities = sentenceGraph.mergedEntities entityToDuplicates = sentenceGraph.mergedEntityToDuplicates self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities)) # Connect to optional gold graph entityToGold = None if goldGraph != None: entityToGold = EvaluateInteractionXML.mapEntities(entities, goldGraph.entities) paths = None if not self.styles["no_path"]: undirected = sentenceGraph.dependencyGraph.toUndirected() paths = undirected if self.styles["filter_shortest_path"] != None: # For DDI use filter_shortest_path=conj_and paths.resetAnalyses() # just in case paths.FloydWarshall(self.filterEdge, {"edgeTypes":self.styles["filter_shortest_path"]}) # Generate examples based on interactions between entities or interactions between tokens if self.styles["token_nodes"]: loopRange = len(sentenceGraph.tokens) else: loopRange = len(entities) for i in range(loopRange-1): for j in range(i+1,loopRange): eI = None eJ = None if self.styles["token_nodes"]: tI = sentenceGraph.tokens[i] tJ = sentenceGraph.tokens[j] else: eI = entities[i] eJ = entities[j] tI = sentenceGraph.entityHeadTokenByEntity[eI] tJ = sentenceGraph.entityHeadTokenByEntity[eJ] if eI.get("type") == "neg" or eJ.get("type") == "neg": continue if self.styles["skip_extra_triggers"]: if eI.get("source") != None or eJ.get("source") != None: continue # only consider paths between entities (NOTE! 
entities, not only named entities) if self.styles["headsOnly"]: if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0): continue examples = self.buildExamplesForPair(tI, tJ, paths, sentenceGraph, goldGraph, entityToGold, eI, eJ, structureAnalyzer, examplesAreDirected) for categoryName, features, extra in examples: # make example if self.styles["binary"]: if categoryName != "neg": category = 1 else: category = -1 extra["categoryName"] = "i" else: category = self.classSet.getId(categoryName) example = [sentenceGraph.getSentenceId()+".x"+str(exampleIndex), category, features, extra] ExampleUtils.appendExamples([example], outfile) exampleIndex += 1 return exampleIndex def buildExamplesForPair(self, token1, token2, paths, sentenceGraph, goldGraph, entityToGold, entity1=None, entity2=None, structureAnalyzer=None, isDirected=True): # define forward categoryName = self.getExampleCategoryName(entity1, entity2, token1, token2, sentenceGraph, goldGraph, entityToGold, isDirected, structureAnalyzer=structureAnalyzer) # make forward forwardExample = None self.exampleStats.beginExample(categoryName) if self.keepExample(entity1, entity2, categoryName, isDirected, structureAnalyzer): forwardExample = self.buildExample(token1, token2, paths, sentenceGraph, categoryName, entity1, entity2, structureAnalyzer, isDirected) if isDirected: # build a separate reverse example (if that is valid) self.exampleStats.endExample() # end forward example # define reverse categoryName = self.getExampleCategoryName(entity2, entity1, token2, token1, sentenceGraph, goldGraph, entityToGold, True, structureAnalyzer=structureAnalyzer) # make reverse self.exampleStats.beginExample(categoryName) reverseExample = None if self.keepExample(entity2, entity1, categoryName, True, structureAnalyzer): reverseExample = self.buildExample(token2, token1, paths, sentenceGraph, categoryName, entity2, entity1, structureAnalyzer, isDirected) self.exampleStats.endExample() return filter(None, [forwardExample, reverseExample]) elif self.styles["se10t8_undirected"]: # undirected example with a directed type self.exampleStats.endExample() return [forwardExample] elif forwardExample != None: # merge features from the reverse example to the forward one reverseExample = self.buildExample(token2, token1, paths, sentenceGraph, categoryName, entity2, entity1, structureAnalyzer, isDirected) forwardExample[1].update(reverseExample[1]) self.exampleStats.endExample() # end merged example return [forwardExample] else: # undirected example that was filtered self.exampleStats.endExample() # end merged example return [] def buildExample(self, token1, token2, paths, sentenceGraph, categoryName, entity1=None, entity2=None, structureAnalyzer=None, isDirected=True): """ Build a single directed example for the potential edge between token1 and token2 """ # define features if not self.styles["no_path"]: path = paths.getPaths(token1, token2) if len(path) > 0: path = path[0] #pathExists = True else: path = [token1, token2] #pathExists = False else: path = [token1, token2] #pathExists = False features = {} if not self.styles["no_features"]: features = self.buildFeatures(sentenceGraph, entity1, entity2, token1, token2, path) # define extra attributes if int(path[0].get("charOffset").split("-")[0]) < int(path[-1].get("charOffset").split("-")[0]): extra = {"xtype":"edge","type":"i","t1":path[0].get("id"),"t2":path[-1].get("id")} extra["deprev"] = False else: extra = 
{"xtype":"edge","type":"i","t1":path[-1].get("id"),"t2":path[0].get("id")} extra["deprev"] = True if entity1 != None: extra["e1"] = entity1.get("id") if sentenceGraph.mergedEntityToDuplicates != None: extra["e1DuplicateIds"] = ",".join([x.get("id") for x in sentenceGraph.mergedEntityToDuplicates[entity1]]) if entity2 != None: extra["e2"] = entity2.get("id") if sentenceGraph.mergedEntityToDuplicates != None: extra["e2DuplicateIds"] = ",".join([x.get("id") for x in sentenceGraph.mergedEntityToDuplicates[entity2]]) extra["categoryName"] = categoryName if self.styles["bacteria_renaming"]: if entity1.get("text") != None and entity1.get("text") != "": extra["e1t"] = entity1.get("text").replace(" ", "---").replace(":","-COL-") if entity2.get("text") != None and entity2.get("text") != "": extra["e2t"] = entity2.get("text").replace(" ", "---").replace(":","-COL-") if self.styles["doc_extra"]: if hasattr(sentenceGraph, "documentElement") and sentenceGraph.documentElement.get("origId") != None: extra["DOID"] = sentenceGraph.documentElement.get("origId") if self.styles["entity_extra"]: if entity1.get("origId") != None: extra["e1OID"] = entity1.get("origId") if entity2.get("origId") != None: extra["e2OID"] = entity2.get("origId") sentenceOrigId = sentenceGraph.sentenceElement.get("origId") if sentenceOrigId != None: extra["SOID"] = sentenceOrigId extra["directed"] = str(isDirected) if self.styles["sdb_merge"]: extra["sdb_merge"] = "True" #print extra return (categoryName, features, extra) def buildFeatures(self, sentenceGraph, entity1, entity2, token1, token2, path): features = {} if not self.styles["no_trigger_features"]: # F 85.52 -> 85.55 self.triggerFeatureBuilder.setFeatureVector(features) self.triggerFeatureBuilder.tag = "trg1_" self.triggerFeatureBuilder.buildFeatures(token1) self.triggerFeatureBuilder.tag = "trg2_" self.triggerFeatureBuilder.buildFeatures(token2) self.triggerFeatureBuilder.setFeatureVector(None) # REL features if self.styles["rel_features"] and not self.styles["no_task"]: self.relFeatureBuilder.setFeatureVector(features) self.relFeatureBuilder.tag = "rel1_" self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token1)) self.relFeatureBuilder.tag = "rel2_" self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token2)) self.relFeatureBuilder.setFeatureVector(None) if self.styles["bacteria_renaming"] and not self.styles["no_task"]: self.bacteriaRenamingFeatureBuilder.setFeatureVector(features) self.bacteriaRenamingFeatureBuilder.buildPairFeatures(entity1, entity2) #self.bacteriaRenamingFeatureBuilder.buildSubstringFeatures(entity1, entity2) # decreases perf. 
74.76 -> 72.41 self.bacteriaRenamingFeatureBuilder.setFeatureVector(None) if self.styles["co_features"] and not self.styles["no_task"]: e1Offset = Range.charOffsetToSingleTuple(entity1.get("charOffset")) e2Offset = Range.charOffsetToSingleTuple(entity2.get("charOffset")) if Range.contains(e1Offset, e2Offset): features[self.featureSet.getId("e1_contains_e2")] = 1 if entity2.get("given") == "True": features[self.featureSet.getId("e1_contains_e2name")] = 1 if Range.contains(e2Offset, e1Offset): features[self.featureSet.getId("e2_contains_e1")] = 1 if entity1.get("given") == "True": features[self.featureSet.getId("e2_contains_e1name")] = 1 if self.styles["drugbank_features"]: self.drugFeatureBuilder.setFeatureVector(features) self.drugFeatureBuilder.tag = "ddi_" self.drugFeatureBuilder.buildPairFeatures(entity1, entity2) if self.styles["ddi_mtmx"]: self.drugFeatureBuilder.buildMTMXFeatures(entity1, entity2) self.drugFeatureBuilder.setFeatureVector(None) if self.styles["graph_kernel"]: self.graphKernelFeatureBuilder.setFeatureVector(features, entity1, entity2) self.graphKernelFeatureBuilder.buildGraphKernelFeatures(sentenceGraph, path) self.graphKernelFeatureBuilder.setFeatureVector(None) if self.styles["entity_type"]: e1Type = self.multiEdgeFeatureBuilder.getEntityType(entity1) e2Type = self.multiEdgeFeatureBuilder.getEntityType(entity2) features[self.featureSet.getId("e1_"+e1Type)] = 1 features[self.featureSet.getId("e2_"+e2Type)] = 1 features[self.featureSet.getId("distance_"+str(len(path)))] = 1 if not self.styles["no_dependency"]: #print "Dep features" self.multiEdgeFeatureBuilder.setFeatureVector(features, entity1, entity2) #self.multiEdgeFeatureBuilder.buildStructureFeatures(sentenceGraph, paths) # remove for fast if not self.styles["disable_entity_features"]: self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph) self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path) if not self.styles["disable_terminus_features"]: self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast if not self.styles["disable_single_element_features"]: self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, sentenceGraph) if not self.styles["disable_ngram_features"]: #print "NGrams" self.multiEdgeFeatureBuilder.buildPathGrams(2, path, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams(3, path, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams(4, path, sentenceGraph) # remove for fast #self.buildEdgeCombinations(path, edges, sentenceGraph, features) # remove for fast #if edges != None: # self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[0], edges[0][1]+edges[1][0], "t1", sentenceGraph) # remove for fast # self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[-1], edges[len(path)-1][len(path)-2]+edges[len(path)-2][len(path)-1], "t2", sentenceGraph) # remove for fast if not self.styles["disable_path_edge_features"]: self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, sentenceGraph) self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph) self.multiEdgeFeatureBuilder.setFeatureVector(None) if self.styles["nodalida"]: self.nodalidaFeatureBuilder.setFeatureVector(features, entity1, entity2) shortestPaths = self.nodalidaFeatureBuilder.buildShortestPaths(sentenceGraph.dependencyGraph, path) print shortestPaths if len(shortestPaths) > 0: self.nodalidaFeatureBuilder.buildNGrams(shortestPaths, sentenceGraph) self.nodalidaFeatureBuilder.setFeatureVector(None) if self.styles["linear_features"]: 
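# The linear_features block below first locates the sentence positions of token1 and
# token2, orders the two indices so that "linTok1" refers to the leftmost of the two, and
# then builds a +/-2 token window of context features around each position. The
# linfw_/linrv_ prefix derived from the original token order is only consumed by the
# commented-out before/middle/after token-gram features.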
self.tokenFeatureBuilder.setFeatureVector(features) for i in range(len(sentenceGraph.tokens)): if sentenceGraph.tokens[i] == token1: token1Index = i if sentenceGraph.tokens[i] == token2: token2Index = i linearPreTag = "linfw_" if token1Index > token2Index: token1Index, token2Index = token2Index, token1Index linearPreTag = "linrv_" self.tokenFeatureBuilder.buildLinearOrderFeatures(token1Index, sentenceGraph, 2, 2, preTag="linTok1") self.tokenFeatureBuilder.buildLinearOrderFeatures(token2Index, sentenceGraph, 2, 2, preTag="linTok2") # Before, middle, after # self.tokenFeatureBuilder.buildTokenGrams(0, token1Index-1, sentenceGraph, "bf") # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, "bw") # self.tokenFeatureBuilder.buildTokenGrams(token2Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, "af") # before-middle, middle, middle-after # self.tokenFeatureBuilder.buildTokenGrams(0, token2Index-1, sentenceGraph, linearPreTag+"bf", max=2) # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, linearPreTag+"bw", max=2) # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, linearPreTag+"af", max=2) self.tokenFeatureBuilder.setFeatureVector(None) if self.styles["random"]: self.randomFeatureBuilder.setFeatureVector(features) self.randomFeatureBuilder.buildRandomFeatures(100, 0.01) self.randomFeatureBuilder.setFeatureVector(None) if self.styles["genia_features"] and not self.styles["no_task"]: e1Type = entity1.get("type") e2Type = entity2.get("type") assert(entity1.get("given") in (None, "False")) if entity2.get("given") == "True": features[self.featureSet.getId("GENIA_target_protein")] = 1 else: features[self.featureSet.getId("GENIA_nested_event")] = 1 if e1Type.find("egulation") != -1: # leave r out to avoid problems with capitalization if entity2.get("given") == "True": features[self.featureSet.getId("GENIA_regulation_of_protein")] = 1 else: features[self.featureSet.getId("GENIA_regulation_of_event")] = 1 if self.styles["bi_features"]: # Make features based on entity types e1Type = entity1.get("type") e2Type = entity2.get("type") e1SuperType = str(self.getBISuperType(e1Type)) e2SuperType = str(self.getBISuperType(e2Type)) features[self.featureSet.getId("BI_e1_"+e1Type)] = 1 features[self.featureSet.getId("BI_e2_"+e2Type)] = 1 features[self.featureSet.getId("BI_e1sup_"+e1SuperType)] = 1 features[self.featureSet.getId("BI_e2sup_"+e2SuperType)] = 1 features[self.featureSet.getId("BI_e1e2_"+e1Type+"_"+e2Type)] = 1 features[self.featureSet.getId("BI_e1e2sup_"+e1SuperType+"_"+e2SuperType)] = 1 if self.styles["sdb_features"]: e1Type = entity1.get("type") e2Type = entity2.get("type") features[self.featureSet.getId("SDB_e1_"+e1Type)] = 1 features[self.featureSet.getId("SDB_e2_"+e2Type)] = 1 features[self.featureSet.getId("SDB_e1e2_"+e1Type+"_"+e2Type)] = 1 if e1Type == e2Type: features[self.featureSet.getId("SDB_e1e2_equal")] = 1 features[self.featureSet.getId("SDB_e1e2_equal_" + e1Type)] = 1 e1SuperTypes = str(self.getSeeDevSuperTypes(e1Type)) e2SuperTypes = str(self.getSeeDevSuperTypes(e2Type)) for e1SuperType in e1SuperTypes: for e2SuperType in e2SuperTypes: features[self.featureSet.getId("SDB_e1sup_"+e1SuperType)] = 1 features[self.featureSet.getId("SDB_e2sup_"+e2SuperType)] = 1 features[self.featureSet.getId("SDB_e1e2sup_"+e1SuperType+"_"+e2SuperType)] = 1 if e1SuperType == e2SuperType: features[self.featureSet.getId("SDB_e1e2sup_equal")] = 1 
features[self.featureSet.getId("SDB_e1e2sup_equal_" + e1SuperType)] = 1 if self.styles["ontobiotope_features"]: self.ontobiotopeFeatureBuilder.setFeatureVector(features) self.ontobiotopeFeatureBuilder.buildOBOFeaturesForEntityPair(entity1, entity2) self.ontobiotopeFeatureBuilder.setFeatureVector(None) if self.styles["full_entities"]: e1Text = entity1.get("text").lower() e2Text = entity2.get("text").lower() features[self.featureSet.getId("FULL_e1_"+e1Text)] = 1 features[self.featureSet.getId("FULL_e2_"+e2Text)] = 1 for ep1 in e1Text.split(): for ep2 in e2Text.split(): features[self.featureSet.getId("FULL_e1_"+ep1)] = 1 features[self.featureSet.getId("FULL_e2_"+ep2)] = 1 features[self.featureSet.getId("FULL_e1e2_"+ep1+"_"+ep2)] = 1 if self.styles["evex"]: self.evexFeatureBuilder.setFeatureVector(features, entity1, entity2) self.evexFeatureBuilder.buildEdgeFeatures(entity1, entity2, token1, token2, path, sentenceGraph) self.evexFeatureBuilder.setFeatureVector(None) if self.styles["wordnet"]: self.wordNetFeatureBuilder.setFeatureVector(features, entity1, entity2) self.wordNetFeatureBuilder.buildFeaturesForEntityPair(token1, token2) self.wordNetFeatureBuilder.buildLinearFeatures(token1, sentenceGraph.tokens, tag="t1_") self.wordNetFeatureBuilder.buildLinearFeatures(token2, sentenceGraph.tokens, tag="t2_") self.wordNetFeatureBuilder.buildPathFeatures(path) self.wordNetFeatureBuilder.setFeatureVector(None) if self.styles["wordvector"]: self.wordVectorFeatureBuilder.setFeatureVector(features, entity1, entity2) self.wordVectorFeatureBuilder.buildFeatures(token1, "t1_") self.wordVectorFeatureBuilder.buildFeatures(token2, "t2_") self.wordVectorFeatureBuilder.buildLinearFeatures(token1, sentenceGraph.tokens, tag="t1_") self.wordVectorFeatureBuilder.buildLinearFeatures(token2, sentenceGraph.tokens, tag="t2_") self.wordVectorFeatureBuilder.buildPathFeatures(path) self.wordVectorFeatureBuilder.buildFBAFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token1), sentenceGraph.tokens.index(token2)) self.wordVectorFeatureBuilder.setFeatureVector(None) if self.styles["giuliano"]: self.giulianoFeatureBuilder.setFeatureVector(features, entity1, entity2) self.giulianoFeatureBuilder.buildEdgeFeatures(entity1, entity2, token1, token2, path, sentenceGraph) self.giulianoFeatureBuilder.setFeatureVector(None) return features
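# ---------------------------------------------------------------------------
# Minimal sketch (not part of the original pipeline) of how the "---"-merged category
# names produced by getCategoryName() in the class above behave. The helper name
# _sketchMergedCategoryName is hypothetical, and the sketch ignores the causeOnly,
# themeOnly and sdb_merge styles that the real method also honours.
def _sketchMergedCategoryName(interactionTypes):
    # Deduplicate and alphabetically sort the interaction types, then join them with
    # "---"; an entity pair with no interactions between its members maps to "neg".
    types = sorted(set(interactionTypes))
    if len(types) == 0:
        return "neg"
    return "---".join(types)

# Example: _sketchMergedCategoryName(["Theme", "Cause", "Theme"]) == "Cause---Theme"
#          _sketchMergedCategoryName([]) == "neg"
# ---------------------------------------------------------------------------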
class Round2TriggerExampleBuilder(ExampleBuilder): def nxMultiDiGraphToUndirected(self, graph): undirected = NX10.MultiGraph(name=graph.name) undirected.add_nodes_from(graph) undirected.add_edges_from(graph.edges_iter()) return undirected def getPredictionStrength(self, element): eType = element.get("type") predictions = element.get("predictions") if predictions == None: return 0 predictions = predictions.split(",") for prediction in predictions: predClass, predStrength = prediction.split(":") if predClass == eType: predStrength = float(predStrength) return predStrength return 0 def getInteractionEdgeLengths(self, sentenceGraph, paths): """ Return dependency and linear length of all interaction edges (measured between the two tokens). """ interactionLengths = {} for interaction in sentenceGraph.interactions: # Calculated interaction edge dep and lin length e1 = sentenceGraph.entitiesById[interaction.get("e1")] e2 = sentenceGraph.entitiesById[interaction.get("e2")] t1 = sentenceGraph.entityHeadTokenByEntity[e1] t2 = sentenceGraph.entityHeadTokenByEntity[e2] # Get dep path length if t1 != t2 and paths.has_key(t1) and paths[t1].has_key(t2): pathLength = len(paths[t1][t2]) else: # no dependencyPath pathLength = 999999 # more than any real path # Linear distance t1Pos = -1 t2Pos = -1 for i in range(len(sentenceGraph.tokens)): if sentenceGraph.tokens[i] == t1: t1Pos = i if t2Pos != -1: break if sentenceGraph.tokens[i] == t2: t2Pos = i if t1Pos != -1: break linLength = abs(t1Pos - t2Pos) interactionLengths[interaction] = (interaction, pathLength, linLength, t2Pos) return interactionLengths def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None, skiplist=None): if classSet == None: classSet = IdSet(1) assert (classSet.getId("neg") == 1) if featureSet == None: featureSet = IdSet() ExampleBuilder.__init__(self, classSet, featureSet) #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train" if gazetteerFileName != None: self.gazetteer = Gazetteer.loadGztr(gazetteerFileName) print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName else: print >> sys.stderr, "No gazetteer loaded" self.gazetteer = None self.styles = style self.skiplist = set() if skiplist != None: f = open(skiplist, "rt") for line in f.readlines(): self.skiplist.add(line.strip()) f.close() self.styles = [ "trigger_features", "typed", "directed", "no_linear", "entities", "genia_limits", "noMasking", "maxFeatures" ] self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet) if "graph_kernel" in self.styles: from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder( self.featureSet) if "noAnnType" in self.styles: self.multiEdgeFeatureBuilder.noAnnType = True if "noMasking" in self.styles: self.multiEdgeFeatureBuilder.maskNamedEntities = False if "maxFeatures" in self.styles: self.multiEdgeFeatureBuilder.maximum = True self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet) @classmethod def run(cls, input, gold, output, parse, tokenization, style, idFileTag=None, append=False): """ An interface for running the example builder without needing to create a class """ classSet, featureSet = cls.getIdSets(idFileTag) if style != None: e = Round2TriggerExampleBuilder(style=style, classSet=classSet, featureSet=featureSet) else: e = Round2TriggerExampleBuilder(classSet=classSet, featureSet=featureSet) sentences = cls.getSentences(input, parse, tokenization) if gold != None: 
goldSentences = cls.getSentences(gold, parse, tokenization) else: goldSentences = None e.buildExamplesForSentences(sentences, goldSentences, output, idFileTag, append=append) def buildExamplesForSentences(self, sentences, goldSentences, output, idFileTag=None, append=False): examples = [] counter = ProgressCounter(len(sentences), "Build examples") if append: outfile = open(output, "at") else: outfile = open(output, "wt") exampleCount = 0 for i in range(len(sentences)): sentence = sentences[i] goldSentence = [None] if goldSentences != None: goldSentence = goldSentences[i] counter.update( 1, "Building examples (" + sentence[0].getSentenceId() + "): ") examples = self.buildExamples(sentence[0], goldSentence[0], append=append) exampleCount += len(examples) examples = self.preProcessExamples(examples) ExampleUtils.appendExamples(examples, outfile) outfile.close() print >> sys.stderr, "Examples built:", exampleCount print >> sys.stderr, "Features:", len(self.featureSet.getNames()) #IF LOCAL if self.exampleStats.getExampleCount() > 0: self.exampleStats.printStats() #ENDIF # Save Ids if idFileTag != None: print >> sys.stderr, "Saving class names to", idFileTag + ".class_names" self.classSet.write(idFileTag + ".class_names") print >> sys.stderr, "Saving feature names to", idFileTag + ".feature_names" self.featureSet.write(idFileTag + ".feature_names") def preProcessExamples(self, allExamples): if "normalize" in self.styles: print >> sys.stderr, " Normalizing feature vectors" ExampleUtils.normalizeFeatureVectors(allExamples) return allExamples def getMergedEntityType(self, entities): """ If a single token belongs to multiple entities of different types, a new, composite type is defined. This type is the alphabetically ordered types of these entities joined with '---'. """ types = set() for entity in entities: types.add(entity.get("type")) types = list(types) types.sort() typeString = "" for type in types: if type == "Protein" and "all_tokens" in self.styles: continue if typeString != "": typeString += "---" typeString += type if typeString == "": return "neg" if "limit_merged_types" in self.styles: if typeString.find("---") != -1: if typeString == "Gene_expression---Positive_regulation": return typeString else: return typeString.split("---")[0] else: return typeString return typeString def getTokenFeatures(self, token, sentenceGraph): """ Returns a list of features based on the attributes of a token. These can be used to define more complex features. """ # These features are cached when this method is first called # for a token. 
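# Illustrative example of the cached dictionary built below (token values are hypothetical):
# a non-name token with text "binds" and POS "VBZ" would be cached roughly as
#     {"_txt_binds": 1, "_POS_VBZ": 1}
# with "_isName" and "_annType_<type>" entries added for entity head tokens, and weighted
# "_knownLabel_<label>" entries added when the gazetteer_features style is active.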
if self.tokenFeatures.has_key(token): return self.tokenFeatures[token] tokTxt = sentenceGraph.getTokenText(token) features = {} features["_txt_" + tokTxt] = 1 # F 69.35 -> 68.22 #normalizedText = tokTxt.replace("-","").replace("/","").replace(",","").replace("\\","").replace(" ","").lower() #features["_norTxt_"+normalizedText]=1 #features["_norStem_" + PorterStemmer.stem(normalizedText)]=1 features["_POS_" + token.get("POS")] = 1 if sentenceGraph.tokenIsName[token]: features["_isName"] = 1 for entity in sentenceGraph.tokenIsEntityHead[token]: if entity.get("isName") == "True": features["_annType_" + entity.get("type")] = 1 # Filip's gazetteer based features (can be used separately from exclude_gazetteer) if "gazetteer_features" in self.styles: tokTxtLower = tokTxt.lower() if "stem_gazetteer" in self.styles: tokTxtLower = PorterStemmer.stem(tokTxtLower) if self.gazetteer and tokTxtLower in self.gazetteer: for label, weight in self.gazetteer[tokTxtLower].items(): features["_knownLabel_" + label] = weight # 1 performs slightly worse self.tokenFeatures[token] = features return features def buildLinearOrderFeatures(self, sentenceGraph, index, tag, features): """ Linear features are built by marking token features with a tag that defines their relative position in the linear order. """ tag = "linear_" + tag for tokenFeature, w in self.getTokenFeatures( sentenceGraph.tokens[index], sentenceGraph).iteritems(): features[self.featureSet.getId(tag + tokenFeature)] = w def buildExamples(self, sentenceGraph, goldGraph, append=False): examples = self.buildExamplesInner(sentenceGraph, goldGraph) entityCounts = {} exampleCounts = {} for entity in sentenceGraph.entities: eType = entity.get("type") if eType == "Protein": continue if not entityCounts.has_key(eType): entityCounts[eType] = 0 exampleCounts[eType] = 0 entityCounts[eType] += 1 for example in examples: eTypes = self.classSet.getName(example[1]).split("---") for eType in eTypes: if not exampleCounts.has_key(eType): exampleCounts[eType] = 0 exampleCounts[eType] += 1 #for key in sorted(entityCounts.keys()): # if entityCounts[key] != exampleCounts[key]: # print >> sys.stderr, "Warning, sentence", sentenceGraph.getSentenceId(), "example", key, "diff", entityCounts[key] - exampleCounts[key] return examples def buildExamplesInner(self, sentenceGraph, goldGraph): """ Build one example for each token of the sentence """ if sentenceGraph.sentenceElement.get("origId") in self.skiplist: print >> sys.stderr, "Skipping sentence", sentenceGraph.sentenceElement.get( "origId") return [] self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True) self.triggerFeatureBuilder.initSentence(sentenceGraph) undirected = self.nxMultiDiGraphToUndirected( sentenceGraph.dependencyGraph) paths = NX10.all_pairs_shortest_path(undirected, cutoff=999) # Get argument order self.interactionLengths = self.getInteractionEdgeLengths( sentenceGraph, paths) self.interactionLengths = self.interactionLengths.values() self.interactionLengths.sort(compareInteractionPrecedence) # Map tokens to entities tokenByOffset = {} for i in range(len(sentenceGraph.tokens)): token = sentenceGraph.tokens[i] if goldGraph != None: goldToken = goldGraph.tokens[i] assert token.get("id") == goldToken.get("id") and token.get( "charOffset") == goldToken.get("charOffset") tokenByOffset[token.get("charOffset")] = token.get("id") # Map gold entities to their head offsets goldEntitiesByOffset = {} for token in sentenceGraph.tokens: goldEntitiesByOffset[token.get("charOffset")] = [] entityToGold = {} for entity in 
sentenceGraph.entities: entityToGold[entity] = [] if goldGraph != None: for entity in goldGraph.entities: offset = entity.get("headOffset") assert offset != None goldEntitiesByOffset[offset].append(entity) # Map predicted entities to gold entities for entity in sentenceGraph.entities: eType = entity.get("type") eOffset = entity.get("headOffset") for goldEntity in goldEntitiesByOffset[eOffset]: if goldEntity.get("type") == eType: entityToGold[entity].append(goldEntity) # Map entities to interactions #interactionsByEntityId = {} #for entity in sentenceGraph.entities: # interactionsByEntityId[entity.get("id")] = [] # Map tokens to interactions interactionsByToken = {} for token in sentenceGraph.tokens: interactionsByToken[token] = [] for interactionTuple in self.interactionLengths: interaction = interactionTuple[0] if interaction.get("type") == "neg": continue e1Id = interaction.get("e1") token = sentenceGraph.entityHeadTokenByEntity[ sentenceGraph.entitiesById[e1Id]] interactionsByToken[token].append(interaction) examples = [] exampleIndex = 0 self.tokenFeatures = {} #namedEntityNorStrings = set() namedEntityHeadTokens = [] if not "names" in self.styles: namedEntityCount = 0 for entity in sentenceGraph.entities: if entity.get( "isName" ) == "True": # known data which can be used for features namedEntityCount += 1 #namedEntityNorStrings.add( entity.get("text").replace("-","").replace("/","").replace(",","").replace("\\","").replace(" ","").lower() ) namedEntityCountFeature = "nameCount_" + str(namedEntityCount) #if namedEntityCount == 0: # no names, no need for triggers # return [] if "pos_pairs" in self.styles: namedEntityHeadTokens = self.getNamedEntityHeadTokens( sentenceGraph) #neFeatures = {} # F: 69.35 -> 69.14 #for norString in namedEntityNorStrings: # neFeatures[self.featureSet.getId("norNE_" + norString)] = 1 bagOfWords = {} for token in sentenceGraph.tokens: text = "bow_" + token.get("text") if not bagOfWords.has_key(text): bagOfWords[text] = 0 bagOfWords[text] += 1 if sentenceGraph.tokenIsName[token]: text = "ne_" + text if not bagOfWords.has_key(text): bagOfWords[text] = 0 bagOfWords[text] += 1 bowFeatures = {} for k, v in bagOfWords.iteritems(): bowFeatures[self.featureSet.getId(k)] = v self.inEdgesByToken = {} self.outEdgesByToken = {} self.edgeSetByToken = {} for token in sentenceGraph.tokens: inEdges = sentenceGraph.dependencyGraph.in_edges(token, data=True) fixedInEdges = [] for edge in inEdges: fixedInEdges.append((edge[0], edge[1], edge[2]["element"])) inEdges = fixedInEdges inEdges.sort(compareDependencyEdgesById) self.inEdgesByToken[token] = inEdges outEdges = sentenceGraph.dependencyGraph.out_edges(token, data=True) fixedOutEdges = [] for edge in outEdges: fixedOutEdges.append((edge[0], edge[1], edge[2]["element"])) outEdges = fixedOutEdges outEdges.sort(compareDependencyEdgesById) self.outEdgesByToken[token] = outEdges self.edgeSetByToken[token] = set(inEdges + outEdges) for i in range(len(sentenceGraph.tokens)): token = sentenceGraph.tokens[i] # Recognize only non-named entities (i.e. 
interaction words) if sentenceGraph.tokenIsName[ token] and not "names" in self.styles and not "all_tokens" in self.styles: continue # CLASS #if len(sentenceGraph.tokenIsEntityHead[token]) > 0: # category = self.classSet.getId(self.getMergedEntityType(sentenceGraph.tokenIsEntityHead[token])) #else: # category = 1 offset = token.get("charOffset") if len(goldEntitiesByOffset[offset]) > 0: category = self.classSet.getId( self.getMergedEntityType(goldEntitiesByOffset[offset])) else: category = 1 tokenText = token.get("text").lower() if "stem_gazetteer" in self.styles: tokenText = PorterStemmer.stem(tokenText) if ("exclude_gazetteer" in self.styles ) and self.gazetteer and tokenText not in self.gazetteer: features = {} features[self.featureSet.getId("exclude_gazetteer")] = 1 extra = { "xtype": "token", "t": token.get("id"), "excluded": "True" } examples.append( (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra)) exampleIndex += 1 continue # FEATURES features = {} self.features = features if not "names" in self.styles: features[self.featureSet.getId(namedEntityCountFeature)] = 1 #for k,v in bagOfWords.iteritems(): # features[self.featureSet.getId(k)] = v # pre-calculate bow _features_ features.update(bowFeatures) #features.update(neFeatures) # for j in range(len(sentenceGraph.tokens)): # text = "bow_" + sentenceGraph.tokens[j].get("text") # if j < i: # features[self.featureSet.getId("bf_" + text)] = 1 # elif j > i: # features[self.featureSet.getId("af_" + text)] = 1 # Main features text = token.get("text") features[self.featureSet.getId("txt_" + text)] = 1 features[self.featureSet.getId("POS_" + token.get("POS"))] = 1 stem = PorterStemmer.stem(text) features[self.featureSet.getId("stem_" + stem)] = 1 features[self.featureSet.getId("nonstem_" + text[len(stem):])] = 1 # Normalized versions of the string (if same as non-normalized, overlap without effect) normalizedText = text.replace("-", "").replace("/", "").replace( ",", "").replace("\\", "").replace(" ", "").lower() if normalizedText == "bound": # should be for all irregular verbs normalizedText = "bind" features[self.featureSet.getId("txt_" + normalizedText)] = 1 norStem = PorterStemmer.stem(normalizedText) features[self.featureSet.getId("stem_" + norStem)] = 1 features[self.featureSet.getId("nonstem_" + normalizedText[len(norStem):])] = 1 if "gazetteer_features_maintoken" in self.styles: tokTxtLower = text.lower() if "stem_gazetteer" in self.styles: tokTxtLower = PorterStemmer.stem(tokTxtLower) if self.gazetteer and tokTxtLower in self.gazetteer: for label, weight in self.gazetteer[tokTxtLower].items(): features[self.featureSet.getId( "gaz_knownLabel_" + label)] = weight # 1 performs slightly worse # Linear order features #for index in [-3,-2,-1,1,2,3,4,5]: # 69.35 -> 68.97 for index in [-3, -2, -1, 1, 2, 3]: if i + index > 0 and i + index < len(sentenceGraph.tokens): self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features) # Content if i > 0 and text[0].isalpha() and text[0].isupper(): features[self.featureSet.getId("upper_case_start")] = 1 for j in range(len(text)): if j > 0 and text[j].isalpha() and text[j].isupper(): features[self.featureSet.getId("upper_case_middle")] = 1 # numbers and special characters if text[j].isdigit(): features[self.featureSet.getId("has_digits")] = 1 if j > 0 and text[j - 1] == "-": features[self.featureSet.getId( "has_hyphenated_digit")] = 1 elif text[j] == "-": features[self.featureSet.getId("has_hyphen")] = 1 elif text[j] == "/": 
features[self.featureSet.getId("has_fslash")] = 1 elif text[j] == "\\": features[self.featureSet.getId("has_bslash")] = 1 # duplets if j > 0: features[self.featureSet.getId("dt_" + text[j - 1:j + 1].lower())] = 1 # triplets if j > 1: features[self.featureSet.getId("tt_" + text[j - 2:j + 1].lower())] = 1 # Attached edges (Hanging in and out edges) t1InEdges = self.inEdgesByToken[token] for edge in t1InEdges: edgeType = edge[2].get("type") features[self.featureSet.getId("t1HIn_" + edgeType)] = 1 features[self.featureSet.getId("t1HIn_" + edge[0].get("POS"))] = 1 features[self.featureSet.getId("t1HIn_" + edgeType + "_" + edge[0].get("POS"))] = 1 tokenText = sentenceGraph.getTokenText(edge[0]) features[self.featureSet.getId("t1HIn_" + tokenText)] = 1 features[self.featureSet.getId("t1HIn_" + edgeType + "_" + tokenText)] = 1 t1OutEdges = self.outEdgesByToken[token] for edge in t1OutEdges: edgeType = edge[2].get("type") features[self.featureSet.getId("t1HOut_" + edgeType)] = 1 features[self.featureSet.getId("t1HOut_" + edge[1].get("POS"))] = 1 features[self.featureSet.getId("t1HOut_" + edgeType + "_" + edge[1].get("POS"))] = 1 tokenText = sentenceGraph.getTokenText(edge[1]) features[self.featureSet.getId("t1HOut_" + tokenText)] = 1 features[self.featureSet.getId("t1HOut_" + edgeType + "_" + tokenText)] = 1 extra = {"xtype": "token", "t": token.get("id")} examples.append( (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra)) exampleIndex += 1 # chains self.buildChains(token, sentenceGraph, features) if "pos_pairs" in self.styles: self.buildPOSPairs(token, namedEntityHeadTokens, features) self.buildPredictionFeatures(sentenceGraph, paths, token, interactionsByToken[token]) return examples def buildChains(self, token, sentenceGraph, features, depthLeft=3, chain="", visited=None): if depthLeft == 0: return strDepthLeft = "dist_" + str(depthLeft) if visited == None: visited = set() inEdges = self.inEdgesByToken[token] outEdges = self.outEdgesByToken[token] edgeSet = visited.union(self.edgeSetByToken[token]) for edge in inEdges: if not edge in visited: edgeType = edge[2].get("type") features[self.featureSet.getId("dep_" + strDepthLeft + edgeType)] = 1 nextToken = edge[0] for tokenFeature, w in self.getTokenFeatures( nextToken, sentenceGraph).iteritems(): features[self.featureSet.getId(strDepthLeft + tokenFeature)] = w # for entity in sentenceGraph.tokenIsEntityHead[nextToken]: # if entity.get("isName") == "True": # features[self.featureSet.getId("name_dist_"+strDepthLeft)] = 1 # features[self.featureSet.getId("name_dist_"+strDepthLeft+entity.get("type"))] = 1 # features[self.featureSet.getId("POS_dist_"+strDepthLeft+nextToken.get("POS"))] = 1 # tokenText = sentenceGraph.getTokenText(nextToken) # features[self.featureSet.getId("text_dist_"+strDepthLeft+tokenText)] = 1 if sentenceGraph.tokenIsName[nextToken]: features[self.featureSet.getId("name_chain_dist_" + strDepthLeft + chain + "-frw_" + edgeType)] = 1 features[self.featureSet.getId("chain_dist_" + strDepthLeft + chain + "-frw_" + edgeType)] = 1 self.buildChains(nextToken, sentenceGraph, features, depthLeft - 1, chain + "-frw_" + edgeType, edgeSet) for edge in outEdges: if not edge in visited: edgeType = edge[2].get("type") features[self.featureSet.getId("dep_dist_" + strDepthLeft + edgeType)] = 1 nextToken = edge[1] for tokenFeature, w in self.getTokenFeatures( nextToken, sentenceGraph).iteritems(): features[self.featureSet.getId(strDepthLeft + tokenFeature)] = w # for entity in 
sentenceGraph.tokenIsEntityHead[nextToken]: # if entity.get("isName") == "True": # features[self.featureSet.getId("name_dist_"+strDepthLeft)] = 1 # features[self.featureSet.getId("name_dist_"+strDepthLeft+entity.get("type"))] = 1 # features[self.featureSet.getId("POS_dist_"+strDepthLeft+nextToken.get("POS"))] = 1 # tokenText = sentenceGraph.getTokenText(nextToken) # features[self.featureSet.getId("text_dist_"+strDepthLeft+tokenText)] = 1 if sentenceGraph.tokenIsName[nextToken]: features[self.featureSet.getId("name_chain_dist_" + strDepthLeft + chain + "-rev_" + edgeType)] = 1 features[self.featureSet.getId("chain_dist_" + strDepthLeft + chain + "-rev_" + edgeType)] = 1 self.buildChains(nextToken, sentenceGraph, features, depthLeft - 1, chain + "-rev_" + edgeType, edgeSet) def getNamedEntityHeadTokens(self, sentenceGraph): headTokens = [] for entity in sentenceGraph.entities: if entity.get( "isName" ) == "True": # known data which can be used for features headTokens.append( sentenceGraph.entityHeadTokenByEntity[entity]) return headTokens def buildPOSPairs(self, token, namedEntityHeadTokens, features): tokenPOS = token.get("POS") assert tokenPOS != None for headToken in namedEntityHeadTokens: headPOS = headToken.get("POS") features[self.featureSet.getId("POS_pair_NE_" + tokenPOS + "-" + headPOS)] = 1 ###################################################### # Unmerging-style features ###################################################### def buildPredictionFeatures( self, sentenceGraph, paths, token, interactions): #themeEntities, causeEntities=None): # NOTE!!!! TODO # add also features for arguments present, but not in this combination self.buildInterArgumentBagOfWords(interactions, sentenceGraph) if sentenceGraph.entitiesByToken.has_key(token): for eventEntity in sentenceGraph.entitiesByToken[token]: eventEntityType = eventEntity.get("type") self.setFeature("rootType_" + eventEntity.get("type"), 1) self.setFeature("predStrength" + eventEntityType, self.getPredictionStrength(eventEntity)) self.triggerFeatureBuilder.setFeatureVector(self.features) self.triggerFeatureBuilder.tag = "trg" + eventEntityType + "_" self.triggerFeatureBuilder.buildFeatures(token) self.triggerFeatureBuilder.tag = None argThemeCount = 0 argCauseCount = 0 # Current example's edge combination for i in range(len(interactions)): arg = interactions[i] if arg.get("type") == "Theme": argThemeCount += 1 self.buildArgumentFeatures(sentenceGraph, paths, self.features, token, arg, "argTheme") self.buildArgumentFeatures(sentenceGraph, paths, self.features, token, arg, "argTheme" + str(i)) else: # Cause argCauseCount += 1 self.buildArgumentFeatures(sentenceGraph, paths, self.features, token, arg, "argCause") self.buildArgumentFeatures(sentenceGraph, paths, self.features, token, arg, "argCause" + str(i)) self.setFeature("argCount", len(interactions)) self.setFeature("argCount_" + str(len(interactions)), 1) self.setFeature("argThemeCount", argThemeCount) self.setFeature("argThemeCount_" + str(argThemeCount), 1) self.setFeature("argCauseCount", argCauseCount) self.setFeature("argCauseCount_" + str(argCauseCount), 1) self.triggerFeatureBuilder.tag = "" self.triggerFeatureBuilder.setFeatureVector(None) def buildArgumentFeatures(self, sentenceGraph, paths, features, eventToken, arg, tag): argEntity = sentenceGraph.entitiesById[arg.get("e2")] argToken = sentenceGraph.entityHeadTokenByEntity[argEntity] self.buildEdgeFeatures(sentenceGraph, paths, features, eventToken, argToken, tag) self.triggerFeatureBuilder.tag = tag + "trg_" 
self.triggerFeatureBuilder.buildFeatures(argToken) if argEntity.get("isName") == "True": self.setFeature(tag + "Protein", 1) else: self.setFeature(tag + "Event", 1) self.setFeature("nestingEvent", 1) self.setFeature(tag + "_" + argEntity.get("type"), 1) def buildEdgeFeatures(self, sentenceGraph, paths, features, eventToken, argToken, tag): #eventToken = sentenceGraph.entityHeadTokenByEntity[eventNode] #argToken = sentenceGraph.entityHeadTokenByEntity[argNode] self.multiEdgeFeatureBuilder.tag = tag + "_" self.multiEdgeFeatureBuilder.setFeatureVector(features, None, None, False) self.setFeature(tag + "_present", 1) if eventToken != argToken and paths.has_key( eventToken) and paths[eventToken].has_key(argToken): path = paths[eventToken][argToken] edges = self.multiEdgeFeatureBuilder.getEdges( sentenceGraph.dependencyGraph, path) else: path = [eventToken, argToken] edges = None if not "disable_entity_features" in self.styles: self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph) self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path) #if not "disable_terminus_features" in self.styles: # self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast if not "disable_single_element_features" in self.styles: self.multiEdgeFeatureBuilder.buildSingleElementFeatures( path, edges, sentenceGraph) if not "disable_ngram_features" in self.styles: self.multiEdgeFeatureBuilder.buildPathGrams( 2, path, edges, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams( 3, path, edges, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams( 4, path, edges, sentenceGraph) # remove for fast if not "disable_path_edge_features" in self.styles: self.multiEdgeFeatureBuilder.buildPathEdgeFeatures( path, edges, sentenceGraph) #self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph) self.multiEdgeFeatureBuilder.setFeatureVector(None, None, None, False) self.multiEdgeFeatureBuilder.tag = "" def buildInterArgumentBagOfWords(self, arguments, sentenceGraph): if len(arguments) < 2: return indexByToken = {} for i in range(len(sentenceGraph.tokens)): indexByToken[sentenceGraph.tokens[i]] = i argTokenIndices = set() for arg in arguments: argEntity = sentenceGraph.entitiesById[arg.get("e2")] argToken = sentenceGraph.entityHeadTokenByEntity[argEntity] argTokenIndices.add(indexByToken[argToken]) minIndex = min(argTokenIndices) maxIndex = max(argTokenIndices) self.setFeature("argBoWRange", (maxIndex - minIndex)) self.setFeature("argBoWRange_" + str(maxIndex - minIndex), 1) bow = set() for i in range(minIndex + 1, maxIndex): token = sentenceGraph.tokens[i] if len(sentenceGraph.tokenIsEntityHead[token] ) == 0 and not sentenceGraph.tokenIsName[token]: bow.add(token.get("text")) bow = sorted(list(bow)) for word in bow: self.setFeature("argBoW_" + word, 1) if word in ["/", "-"]: self.setFeature("argBoW_slashOrHyphen", 1) if len(bow) == 1: self.setFeature("argBoWonly_" + bow[0], 1) if bow[0] in ["/", "-"]: self.setFeature("argBoWonly_slashOrHyphen", 1)
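# ---------------------------------------------------------------------------
# Minimal sketch (hypothetical helper, not used by the pipeline) mirroring
# getPredictionStrength() in Round2TriggerExampleBuilder above: the "predictions"
# attribute is assumed to be a comma-separated list of "class:strength" pairs, and the
# strength returned is the one whose class matches the element's own type, else 0.
def _sketchPredictionStrength(eType, predictions):
    if predictions == None:
        return 0
    for prediction in predictions.split(","):
        predClass, predStrength = prediction.split(":")
        if predClass == eType:
            return float(predStrength)
    return 0

# Example: _sketchPredictionStrength("Phosphorylation", "Phosphorylation:1.23,neg:-0.45")
# returns 1.23, while an element with no prediction for its own class returns 0.
# ---------------------------------------------------------------------------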
class AsymmetricEventExampleBuilder(ExampleBuilder): def __init__(self, style=["typed", "directed"], length=None, types=[], featureSet=None, classSet=None): if featureSet == None: featureSet = IdSet() if classSet == None: classSet = IdSet(1) else: classSet = classSet assert (classSet.getId("neg") == 1) ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet) if style.find(",") != -1: style = style.split(",") self.styles = style self.negFrac = None self.posPairGaz = POSPairGazetteer() for s in style: if s.find("negFrac") != -1: self.negFrac = float(s.split("_")[-1]) print >> sys.stderr, "Downsampling negatives to", self.negFrac self.negRand = random.Random(15) elif s.find("posPairGaz") != -1: self.posPairGaz = POSPairGazetteer( loadFrom=s.split("_", 1)[-1]) self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet) self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet) if "graph_kernel" in self.styles: from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder( self.featureSet) if "noAnnType" in self.styles: self.multiEdgeFeatureBuilder.noAnnType = True if "noMasking" in self.styles: self.multiEdgeFeatureBuilder.maskNamedEntities = False if "maxFeatures" in self.styles: self.multiEdgeFeatureBuilder.maximum = True self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet) if "ontology" in self.styles: self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder( self.featureSet) if "nodalida" in self.styles: self.nodalidaFeatureBuilder = NodalidaFeatureBuilder( self.featureSet) #IF LOCAL if "bioinfer_limits" in self.styles: self.bioinferOntologies = OntologyUtils.getBioInferTempOntology() #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName) #ENDIF self.pathLengths = length assert (self.pathLengths == None) self.types = types if "random" in self.styles: from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet) #self.outFile = open("exampleTempFile.txt","wt") @classmethod def run(cls, input, output, parse, tokenization, style, idFileTag=None): classSet, featureSet = cls.getIdSets(idFileTag) if style != None: e = cls(style=style, classSet=classSet, featureSet=featureSet) else: e = cls(classSet=classSet, featureSet=featureSet) sentences = cls.getSentences(input, parse, tokenization) e.buildExamplesForSentences(sentences, output, idFileTag) if "printClassIds" in e.styles: print >> sys.stderr, e.classSet.Ids def definePredictedValueRange(self, sentences, elementName): self.multiEdgeFeatureBuilder.definePredictedValueRange( sentences, elementName) def getPredictedValueRange(self): return self.multiEdgeFeatureBuilder.predictedRange def filterEdgesByType(self, edges, typesToInclude): if len(typesToInclude) == 0: return edges edgesToKeep = [] for edge in edges: if edge.get("type") in typesToInclude: edgesToKeep.append(edge) return edgesToKeep def getCategoryNameFromTokens(self, sentenceGraph, t1, t2, directed=True): types = set() themeE1Types = set() intEdges = [] if sentenceGraph.interactionGraph.has_edge(t1, t2): intEdges = sentenceGraph.interactionGraph.get_edge_data(t1, t2, default={}) # NOTE: Only works if keys are ordered integers for i in range(len(intEdges)): types.add(intEdges[i]["element"].get("type")) # if (not directed) and sentenceGraph.interactionGraph.has_edge(t2, t1): # intEdgesReverse = 
sentenceGraph.interactionGraph.get_edge(t2, t1, default={}) # # NOTE: Only works if keys are ordered integers # for i in range(len(intEdgesReverse)): # intElement = intEdgesReverse[i]["element"] # intType = intElement.get("type") # types.add(intType) # intEdges.extend(intEdgesReverse) for i in range(len(intEdges)): intElement = intEdges[i]["element"] intType = intElement.get("type") if intType == "Theme": e1Entity = sentenceGraph.entitiesById[intElement.get("e1")] themeE1Types.add(e1Entity.get("type")) #types.add(intType) if len(themeE1Types) != 0: themeE1Types = list(themeE1Types) themeE1Types.sort() categoryName = "" for name in themeE1Types: if categoryName != "": categoryName += "---" categoryName += name return categoryName else: types = list(types) types.sort() categoryName = "" for name in types: if categoryName != "": categoryName += "---" categoryName += name if categoryName != "": return categoryName else: return "neg" def getCategoryName(self, sentenceGraph, e1, e2, directed=True): interactions = sentenceGraph.getInteractions(e1, e2) if not directed: interactions.extend(sentenceGraph.getInteractions(e2, e1)) types = set() for interaction in interactions: types.add(interaction.attrib["type"]) types = list(types) types.sort() categoryName = "" for name in types: if categoryName != "": categoryName += "---" categoryName += name if categoryName != "": return categoryName else: return "neg" def preProcessExamples(self, allExamples): # Duplicates cannot be removed here, as they should only be removed from the training set. This is done # in the classifier. # if "no_duplicates" in self.styles: # count = len(allExamples) # print >> sys.stderr, " Removing duplicates,", # allExamples = ExampleUtils.removeDuplicates(allExamples) # print >> sys.stderr, "removed", count - len(allExamples) if "normalize" in self.styles: print >> sys.stderr, " Normalizing feature vectors" ExampleUtils.normalizeFeatureVectors(allExamples) return allExamples def isPotentialGeniaInteraction(self, e1, e2): if e1.get("isName") == "True": return False else: return True #IF LOCAL def getBioInferParentType(self, eType): if eType == "Physical_entity" or OntologyUtils.hasParent( eType, "Physical_entity", self.bioinferOntologies): return "Physical" elif eType == "Property_entity" or OntologyUtils.hasParent( eType, "Property_entity", self.bioinferOntologies): return "Property" elif OntologyUtils.hasParent(eType, "Relationship", self.bioinferOntologies): return "Process" else: assert False, eType # if self.bioinferOntologies["Entity"].has_key(eType): # if OntologyUtils.hasParent(eType, "Physical_entity", self.bioinferOntologies): # assert not OntologyUtils.hasParent(eType, "Property_entity", self.bioinferOntologies), eType # return "Physical" # else: # assert OntologyUtils.hasParent(eType, "Property_entity", self.bioinferOntologies), eType # return "Property" # # else: # assert self.bioinferOntologies.has_key(eType), eType # #assert OntologyUtils.hasParent(eType, "Process_entity", self.bioinferOntologies["Relationship"]), eType # return "Process" def isPotentialBioInferInteraction(self, e1, e2, categoryName): e1Type = self.getBioInferParentType(e1.get("type")) e2Type = self.getBioInferParentType(e2.get("type")) if e1Type == "Process" or e1Type == "Property": return True elif e1Type == "Physical" and e2Type == "Physical": return True elif e1Type == "Physical" and e2Type == "Process": # hack return True else: assert ( categoryName == "neg" ), categoryName + " category for " + e1Type + " and " + e2Type return False #ENDIF def 
nxMultiDiGraphToUndirected(self, graph): undirected = NX10.MultiGraph(name=graph.name) undirected.add_nodes_from(graph) undirected.add_edges_from(graph.edges_iter()) return undirected def buildExamples(self, sentenceGraph): examples = [] exampleIndex = 0 clearGraph = sentenceGraph.getCleared() #undirected = sentenceGraph.getUndirectedDependencyGraph() undirected = self.nxMultiDiGraphToUndirected( sentenceGraph.dependencyGraph) ##undirected = sentenceGraph.dependencyGraph.to_undirected() ###undirected = NX10.MultiGraph(sentenceGraph.dependencyGraph) This didn't work paths = NX10.all_pairs_shortest_path(undirected, cutoff=999) self.triggerFeatureBuilder.initSentence(clearGraph) # Generate examples based on interactions between entities or interactions between tokens if "entities" in self.styles: loopRange = len(sentenceGraph.entities) else: loopRange = len(sentenceGraph.tokens) #for i in range(loopRange-1): for i in range(loopRange): # allow self-interactions #for j in range(i+1,loopRange): for j in range(i, loopRange): # allow self-interactions eI = None eJ = None if "entities" in self.styles: eI = sentenceGraph.entities[i] eJ = sentenceGraph.entities[j] tI = sentenceGraph.entityHeadTokenByEntity[eI] tJ = sentenceGraph.entityHeadTokenByEntity[eJ] #if "no_ne_interactions" in self.styles and eI.get("isName") == "True" and eJ.get("isName") == "True": # continue if eI.get("type") == "neg" or eJ.get("type") == "neg": continue else: tI = sentenceGraph.tokens[i] tJ = sentenceGraph.tokens[j] # # only consider paths between entities (NOTE! entities, not only named entities) # if "headsOnly" in self.styles: # if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0): # continue if "directed" in self.styles: # define forward if "entities" in self.styles: categoryName = self.getCategoryName( sentenceGraph, eI, eJ, True) else: categoryName = self.getCategoryNameFromTokens( sentenceGraph, tI, tJ, True) self.exampleStats.beginExample(categoryName) if self.negFrac == None or categoryName != "neg" or ( categoryName == "neg" and self.negRand.random() < self.negFrac): makeExample = True if ("genia_limits" in self.styles ) and not self.isPotentialGeniaInteraction(eI, eJ): makeExample = False self.exampleStats.filter("genia_limits") if self.posPairGaz.getNegFrac( (tI.get("POS"), tJ.get("POS"))) == 1.0: makeExample = False self.exampleStats.filter("pos_pair") if makeExample: if not sentenceGraph.tokenIsName[tI]: examples.append( self.buildExample(tI, tJ, paths, clearGraph, categoryName, exampleIndex, eI, eJ)) exampleIndex += 1 else: self.exampleStats.filter("genia_token_limits") else: self.exampleStats.filter("neg_frac") self.exampleStats.endExample() # define reverse if "entities" in self.styles: categoryName = self.getCategoryName( sentenceGraph, eJ, eI, True) else: categoryName = self.getCategoryNameFromTokens( sentenceGraph, tJ, tI, True) self.exampleStats.beginExample(categoryName) if self.negFrac == None or categoryName != "neg" or ( categoryName == "neg" and self.negRand.random() < self.negFrac): makeExample = True if ("genia_limits" in self.styles ) and not self.isPotentialGeniaInteraction(eJ, eI): makeExample = False self.exampleStats.filter("genia_limits") if ("bioinfer_limits" in self.styles ) and not self.isPotentialBioInferInteraction( eJ, eI, categoryName): makeExample = False self.exampleStats.filter("bioinfer_limits") if self.posPairGaz.getNegFrac( (tJ.get("POS"), tI.get("POS"))) == 1.0: makeExample = False self.exampleStats.filter("pos_pair") if 
makeExample: if not sentenceGraph.tokenIsName[tJ]: examples.append( self.buildExample(tJ, tI, paths, clearGraph, categoryName, exampleIndex, eJ, eI)) exampleIndex += 1 else: self.exampleStats.filter("genia_token_limits") else: self.exampleStats.filter("neg_frac") self.exampleStats.endExample() # else: # if "entities" in self.styles: # categoryName = self.getCategoryName(sentenceGraph, eI, eJ, False) # else: # categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, False) # forwardExample = self.buildExample(tI, tJ, paths, clearGraph, categoryName, exampleIndex, eI, eJ) # if not "graph_kernel" in self.styles: # reverseExample = self.buildExample(tJ, tI, paths, clearGraph, categoryName, exampleIndex, eJ, eI) # forwardExample[2].update(reverseExample[2]) # examples.append(forwardExample) # exampleIndex += 1 return examples def buildExample(self, token1, token2, paths, sentenceGraph, categoryName, exampleIndex, entity1=None, entity2=None): # define features features = {} if True: #token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2): if token1 != token2 and paths.has_key( token1) and paths[token1].has_key(token2): path = paths[token1][token2] else: path = [token1, token2] assert (self.pathLengths == None) if self.pathLengths == None or len(path) - 1 in self.pathLengths: if not "no_trigger": self.triggerFeatureBuilder.setFeatureVector(self.features) self.triggerFeatureBuilder.tag = "trg_t1_" self.triggerFeatureBuilder.buildFeatures(eventToken) self.triggerFeatureBuilder.tag = "trg_t2_" self.triggerFeatureBuilder.buildFeatures(eventToken) # if not "no_ontology" in self.styles: # self.ontologyFeatureBuilder.setFeatureVector(features) # self.ontologyFeatureBuilder.buildOntologyFeaturesForPath(sentenceGraph, path) # self.ontologyFeatureBuilder.setFeatureVector(None) if "graph_kernel" in self.styles or not "no_dependency" in self.styles: if token1 != token2 and paths.has_key( token1) and paths[token1].has_key(token2): edges = self.multiEdgeFeatureBuilder.getEdges( sentenceGraph.dependencyGraph, path) else: edges = None if "graph_kernel" in self.styles: self.graphKernelFeatureBuilder.setFeatureVector( features, entity1, entity2) self.graphKernelFeatureBuilder.buildGraphKernelFeatures( sentenceGraph, path, edges) self.graphKernelFeatureBuilder.setFeatureVector(None) if "entity_type" in self.styles: features[self.featureSet.getId("e1_" + entity1.attrib["type"])] = 1 features[self.featureSet.getId("e2_" + entity2.attrib["type"])] = 1 features[self.featureSet.getId("distance_" + str(len(path)))] = 1 if not "no_dependency" in self.styles: if token1 == token2: features[self.featureSet.getId("tokenSelfLoop")] = 1 self.multiEdgeFeatureBuilder.setFeatureVector( features, entity1, entity2) #self.multiEdgeFeatureBuilder.buildStructureFeatures(sentenceGraph, paths) # remove for fast if not "disable_entity_features" in self.styles: self.multiEdgeFeatureBuilder.buildEntityFeatures( sentenceGraph) self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path) if not "disable_terminus_features" in self.styles: self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures( path, sentenceGraph) # remove for fast if not "disable_single_element_features" in self.styles: self.multiEdgeFeatureBuilder.buildSingleElementFeatures( path, edges, sentenceGraph) if not "disable_ngram_features" in self.styles: self.multiEdgeFeatureBuilder.buildPathGrams( 2, path, edges, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams( 3, path, edges, sentenceGraph) # remove for fast 
self.multiEdgeFeatureBuilder.buildPathGrams( 4, path, edges, sentenceGraph) # remove for fast #self.buildEdgeCombinations(path, edges, sentenceGraph, features) # remove for fast #if edges != None: # self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[0], edges[0][1]+edges[1][0], "t1", sentenceGraph) # remove for fast # self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[-1], edges[len(path)-1][len(path)-2]+edges[len(path)-2][len(path)-1], "t2", sentenceGraph) # remove for fast if not "disable_path_edge_features" in self.styles: self.multiEdgeFeatureBuilder.buildPathEdgeFeatures( path, edges, sentenceGraph) self.multiEdgeFeatureBuilder.buildSentenceFeatures( sentenceGraph) self.multiEdgeFeatureBuilder.setFeatureVector(None) if "nodalida" in self.styles: self.nodalidaFeatureBuilder.setFeatureVector( features, entity1, entity2) shortestPaths = self.nodalidaFeatureBuilder.buildShortestPaths( sentenceGraph.dependencyGraph, path) print shortestPaths if len(shortestPaths) > 0: self.nodalidaFeatureBuilder.buildNGrams( shortestPaths, sentenceGraph) self.nodalidaFeatureBuilder.setFeatureVector(None) if not "no_linear" in self.styles: self.tokenFeatureBuilder.setFeatureVector(features) for i in range(len(sentenceGraph.tokens)): if sentenceGraph.tokens[i] == token1: token1Index = i if sentenceGraph.tokens[i] == token2: token2Index = i linearPreTag = "linfw_" if token1Index > token2Index: token1Index, token2Index = token2Index, token1Index linearPreTag = "linrv_" self.tokenFeatureBuilder.buildLinearOrderFeatures( token1Index, sentenceGraph, 2, 2, preTag="linTok1") self.tokenFeatureBuilder.buildLinearOrderFeatures( token2Index, sentenceGraph, 2, 2, preTag="linTok2") # Before, middle, after # self.tokenFeatureBuilder.buildTokenGrams(0, token1Index-1, sentenceGraph, "bf") # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, "bw") # self.tokenFeatureBuilder.buildTokenGrams(token2Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, "af") # before-middle, middle, middle-after # self.tokenFeatureBuilder.buildTokenGrams(0, token2Index-1, sentenceGraph, linearPreTag+"bf", max=2) # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, linearPreTag+"bw", max=2) # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, linearPreTag+"af", max=2) self.tokenFeatureBuilder.setFeatureVector(None) if "random" in self.styles: self.randomFeatureBuilder.setFeatureVector(features) self.randomFeatureBuilder.buildRandomFeatures(100, 0.01) self.randomFeatureBuilder.setFeatureVector(None) if "genia_limits" in self.styles: e1Type = entity1.get("type") e2Type = entity2.get("type") assert (entity1.get("isName") == "False") if entity2.get("isName") == "True": features[self.featureSet.getId( "GENIA_target_protein")] = 1 else: features[self.featureSet.getId( "GENIA_nested_event")] = 1 if e1Type.find( "egulation" ) != -1: # leave r out to avoid problems with capitalization if entity2.get("isName") == "True": features[self.featureSet.getId( "GENIA_regulation_of_protein")] = 1 else: features[self.featureSet.getId( "GENIA_regulation_of_event")] = 1 else: features[self.featureSet.getId("always_negative")] = 1 if "subset" in self.styles: features[self.featureSet.getId("out_of_scope")] = 1 else: features[self.featureSet.getId("always_negative")] = 1 if "subset" in self.styles: features[self.featureSet.getId("out_of_scope")] = 1 path = [token1, token2] self.triggerFeatureBuilder.tag = "" 
self.triggerFeatureBuilder.setFeatureVector(None) # define extra attributes # if int(path[0].attrib["id"].split("_")[-1]) < int(path[-1].attrib["id"].split("_")[-1]): # #extra = {"xtype":"edge","type":"i","t1":path[0],"t2":path[-1]} # extra = {"xtype":"asym","type":"i","t1":path[0].get("id"),"t2":path[-1].get("id")} # extra["deprev"] = False # else: # #extra = {"xtype":"edge","type":"i","t1":path[-1],"t2":path[0]} # extra = {"xtype":"asym","type":"i","t1":path[-1].get("id"),"t2":path[0].get("id")} # extra["deprev"] = True extra = { "xtype": "asym", "type": "i", "t1": token1.get("id"), "t2": token2.get("id") } if entity1 != None: #extra["e1"] = entity1 extra["e1"] = entity1.get("id") if entity2 != None: #extra["e2"] = entity2 extra["e2"] = entity2.get("id") extra["categoryName"] = categoryName sentenceOrigId = sentenceGraph.sentenceElement.get("origId") if sentenceOrigId != None: extra["SOID"] = sentenceOrigId # make example if "binary" in self.styles: if categoryName != "neg": category = 1 else: category = -1 categoryName = "i" else: category = self.classSet.getId(categoryName) return (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra)
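# ---------------------------------------------------------------------------
# Added note (a sketch, not part of the original code): buildExample above
# returns a 4-tuple of the form
#     (exampleId, classId, featureDict, extraDict)
# e.g. ("GENIA.d10.s5.x3", classSet.getId("neg"), {12: 1, 345: 1}, {"xtype": "asym", ...})
# where exampleId is "<sentence id>.x<running example index>", classId is an
# integer from the class IdSet (with the "binary" style simply +1 for positives
# and -1 for "neg"), featureDict is a sparse mapping of feature IdSet ids to
# values, and extraDict carries bookkeeping attributes ("t1"/"t2" token ids,
# "e1"/"e2" entity ids, "categoryName", optional "SOID"). The concrete ids and
# feature numbers shown here are hypothetical; the tuple layout follows the
# return statement of buildExample directly.
# ---------------------------------------------------------------------------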
class MultiEdgeExampleBuilder(ExampleBuilder): """ This example builder makes edge examples, i.e. examples describing the event arguments. """ def __init__(self, style=None, length=None, types=[], featureSet=None, classSet=None): if featureSet == None: featureSet = IdSet() if classSet == None: classSet = IdSet(1) else: classSet = classSet assert (classSet.getId("neg") == 1 or (len(classSet.Ids) == 2 and classSet.getId("neg") == -1)) ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet) self.styles = self.getParameters(style, [ "typed", "directed", "headsOnly", "graph_kernel", "noAnnType", "noMasking", "maxFeatures", "genia_limits", "epi_limits", "id_limits", "rel_limits", "bb_limits", "bi_limits", "co_limits", "genia_task1", "ontology", "nodalida", "bacteria_renaming", "trigger_features", "rel_features", "ddi_features", "ddi_mtmx", "evex", "giuliano", "random", "themeOnly", "causeOnly", "no_path", "entities", "skip_extra_triggers", "headsOnly", "graph_kernel", "trigger_features", "no_task", "no_dependency", "disable_entity_features", "disable_terminus_features", "disable_single_element_features", "disable_ngram_features", "disable_path_edge_features", "no_linear", "subset", "binary", "pos_only", "entity_type" ]) if style == None: # no parameters given, fall back to the default flags self.styles["typed"] = self.styles["directed"] = self.styles["headsOnly"] = True # self.styles = style # if "selftrain_group" in self.styles: # self.selfTrainGroups = set() # if "selftrain_group-1" in self.styles: # self.selfTrainGroups.add("-1") # if "selftrain_group0" in self.styles: # self.selfTrainGroups.add("0") # if "selftrain_group1" in self.styles: # self.selfTrainGroups.add("1") # if "selftrain_group2" in self.styles: # self.selfTrainGroups.add("2") # if "selftrain_group3" in self.styles: # self.selfTrainGroups.add("3") # print >> sys.stderr, "Self-train-groups:", self.selfTrainGroups self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet) # NOTE Temporarily re-enabling predicted range #self.multiEdgeFeatureBuilder.definePredictedValueRange([], None) if self.styles["graph_kernel"]: from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder( self.featureSet) if self.styles["noAnnType"]: self.multiEdgeFeatureBuilder.noAnnType = True if self.styles["noMasking"]: self.multiEdgeFeatureBuilder.maskNamedEntities = False if self.styles["maxFeatures"]: self.multiEdgeFeatureBuilder.maximum = True if self.styles["genia_task1"]: self.multiEdgeFeatureBuilder.filterAnnTypes.add("Entity") self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet) if self.styles["ontology"]: self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder( self.featureSet) if self.styles["nodalida"]: self.nodalidaFeatureBuilder = NodalidaFeatureBuilder( self.featureSet) if self.styles["bacteria_renaming"]: self.bacteriaRenamingFeatureBuilder = BacteriaRenamingFeatureBuilder( self.featureSet) if self.styles["trigger_features"]: self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet) self.triggerFeatureBuilder.useNonNameEntities = True if self.styles["genia_task1"]: self.triggerFeatureBuilder.filterAnnTypes.add("Entity") #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName) if self.styles["rel_features"]: self.relFeatureBuilder = RELFeatureBuilder(featureSet) if self.styles["ddi_features"]: self.drugFeatureBuilder = DrugFeatureBuilder(featureSet) if self.styles["evex"]: self.evexFeatureBuilder = 
EVEXFeatureBuilder(featureSet) if self.styles["giuliano"]: self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet) self.pathLengths = length assert (self.pathLengths == None) self.types = types if self.styles["random"]: from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet) def definePredictedValueRange(self, sentences, elementName): self.multiEdgeFeatureBuilder.definePredictedValueRange( sentences, elementName) def getPredictedValueRange(self): return self.multiEdgeFeatureBuilder.predictedRange def filterEdgesByType(self, edges, typesToInclude): if len(typesToInclude) == 0: return edges edgesToKeep = [] for edge in edges: if edge.get("type") in typesToInclude: edgesToKeep.append(edge) return edgesToKeep def getCategoryNameFromTokens(self, sentenceGraph, t1, t2, directed=True): """ Example class. Multiple overlapping edges create a merged type. """ types = set() # if sentenceGraph.interactionGraph.has_edge(t1, t2): # intEdges = sentenceGraph.interactionGraph.get_edge_data(t1, t2, default={}) # # NOTE: Only works if keys are ordered integers # for i in range(len(intEdges)): # types.add(intEdges[i]["element"].get("type")) # if (not directed) and sentenceGraph.interactionGraph.has_edge(t2, t1): # intEdges = sentenceGraph.interactionGraph.get_edge(t2, t1, default={}) # # NOTE: Only works if keys are ordered integers # for i in range(len(intEdges)): # types.add(intEdges[i]["element"].get("type")) intEdges = sentenceGraph.interactionGraph.getEdges(t1, t2) if (not directed): intEdges = intEdges + sentenceGraph.interactionGraph.getEdges( t2, t1) for intEdge in intEdges: types.add(intEdge[2].get("type")) types = list(types) types.sort() categoryName = "" for name in types: if categoryName != "": categoryName += "---" categoryName += name if categoryName != "": return categoryName else: return "neg" def getCategoryName(self, sentenceGraph, e1, e2, directed=True, duplicateEntities=None): """ Example class. Multiple overlapping edges create a merged type. 
""" # interactions = [] # e1s = [e1] # if duplicateEntities != None and e1 in duplicateEntities: # e1s += duplicateEntities[e1] # e2s = [e2] # if duplicateEntities != None and e2 in duplicateEntities: # e2s += duplicateEntities[e2] # for entity1 in e1s: # for entity2 in e2s: # interactions = interactions + sentenceGraph.getInteractions(entity1, entity2) # if not directed: # interactions = interactions + sentenceGraph.getInteractions(entity2, entity1) interactions = sentenceGraph.getInteractions(e1, e2, True) #print interactions types = set() for interaction in interactions: types.add(interaction[2].get("type")) types = list(types) types.sort() categoryName = "" for name in types: if self.styles["causeOnly"] and name != "Cause": continue if self.styles["themeOnly"] and name != "Theme": continue if categoryName != "": categoryName += "---" categoryName += name if categoryName != "": return categoryName else: return "neg" def isPotentialRELInteraction(self, e1, e2): if e1.get("type") == "Protein" and e2.get("type") == "Entity": return True else: return False def isPotentialBBInteraction(self, e1, e2, sentenceGraph): #if e1.get("type") == "Bacterium" and e2.get("type") in ["Host", "HostPart", "Geographical", "Environmental", "Food", "Medical", "Soil", "Water"]: # Note: "Environment" type is misspelled as "Environmental" in the BB-task documentation if e1.get("type") == "Bacterium" and e2.get("type") in [ "Host", "HostPart", "Geographical", "Environment", "Food", "Medical", "Soil", "Water" ]: return True elif e1.get("type") == "Host" and e2.get("type") == "HostPart": return True else: return False def getBISuperType(self, eType): if eType in [ "GeneProduct", "Protein", "ProteinFamily", "PolymeraseComplex" ]: return "ProteinEntity" elif eType in [ "Gene", "GeneFamily", "GeneComplex", "Regulon", "Site", "Promoter" ]: return "GeneEntity" else: return None def isPotentialBIInteraction(self, e1, e2, sentenceGraph, stats): e1Type = e1.get("type") e1SuperType = self.getBISuperType(e1Type) e2Type = e2.get("type") e2SuperType = self.getBISuperType(e2Type) tag = "(" + e1Type + "/" + e2Type + ")" if e1Type == "Regulon": if e2SuperType in ["GeneEntity", "ProteinEntity"]: return True if e1SuperType == "ProteinEntity": if e2Type in ["Site", "Promoter", "Gene", "GeneComplex"]: return True if e1Type in ["Action", "Transcription", "Expression"]: return True if e1Type == "Site": if e2SuperType == "GeneEntity": return True if e1Type == "Promoter": if e2SuperType in ["GeneEntity", "ProteinEntity"]: return True if e1SuperType in ["GeneEntity", "ProteinEntity"]: if e2SuperType in ["GeneEntity", "ProteinEntity"]: return True stats.filter("bi_limits") #+tag) return False def isPotentialEPIInteraction(self, e1, e2, sentenceGraph): if e1.get("type") != "Catalysis": if e1.get("type") in ["Protein", "Entity"]: return False elif e2.get("type") in ["Protein", "Entity"]: return True else: return False else: # Catalysis if e2.get("type") != "Entity": return True else: return False assert False, (e1.get("type"), e2.get("type")) def isPotentialIDInteraction(self, e1, e2, sentenceGraph): e1Type = e1.get("type") e2Type = e2.get("type") e1IsCore = e1Type in [ "Protein", "Regulon-operon", "Two-component-system", "Chemical", "Organism" ] e2IsCore = e2Type in [ "Protein", "Regulon-operon", "Two-component-system", "Chemical", "Organism" ] if e1IsCore: return False elif e1Type in ["Gene_expression", "Transcription"]: if e2Type in ["Protein", "Regulon-operon"]: return True else: return False elif e1Type in ["Protein_catabolism", 
"Phosphorylation"]: if e2Type == "Protein": return True else: return False elif e1Type == "Localization": if e2IsCore or e2Type == "Entity": return True else: return False elif e1Type in ["Binding", "Process"]: if e2IsCore: return True else: return False elif "egulation" in e1Type: if e2Type != "Entity": return True else: return False elif e1Type == "Entity": if e2IsCore: return True else: return False assert False, (e1Type, e2Type) def isPotentialCOInteraction(self, e1, e2, sentenceGraph): if e1.get("type") == "Exp" and e2.get("type") == "Exp": anaphoraTok = sentenceGraph.entityHeadTokenByEntity[e1] antecedentTok = sentenceGraph.entityHeadTokenByEntity[e2] antecedentTokenFound = False for token in sentenceGraph.tokens: if token == antecedentTok: antecedentTokenFound = True if token == anaphoraTok: # if, not elif, to take into accoutn cases where e1Tok == e2Tok if antecedentTokenFound: return True else: return False assert False elif e1.get("type") == "Exp" and e2.get("type") == "Protein": return True else: return False def isPotentialGeniaInteraction(self, e1, e2): e1Type = e1.get("type") e2Type = e2.get("type") if e1Type == "Protein": return False elif e1Type in [ "Entity", "Gene_expression", "Transcription", "Protein_catabolism", "Phosphorylation", "Binding" ]: if e2Type == "Protein": return True else: return False elif e1Type == "Localization": if e2Type in ["Protein", "Entity"]: return True else: return False elif "egulation" in e1Type: if e2Type != "Entity": return True else: return False assert False, (e1Type, e2Type) def getGoldCategoryName(self, goldGraph, entityToGold, e1, e2, directed=True): if len(entityToGold[e1]) > 0 and len(entityToGold[e2]) > 0: return self.getCategoryName(goldGraph, entityToGold[e1][0], entityToGold[e2][0], directed=directed) else: return "neg" def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None): """ Build examples for a single sentence. Returns a list of examples. See Core/ExampleUtils for example format. 
""" #examples = [] exampleIndex = 0 if self.styles["trigger_features"]: self.triggerFeatureBuilder.initSentence(sentenceGraph) if self.styles["evex"]: self.evexFeatureBuilder.initSentence(sentenceGraph) # Filter entities, if needed #mergedIds = None #duplicateEntities = None #entities = sentenceGraph.entities #entities, mergedIds, duplicateEntities = self.mergeEntities(sentenceGraph, False) # "no_duplicates" in self.styles) sentenceGraph.mergeInteractionGraph(True) entities = sentenceGraph.mergedEntities entityToDuplicates = sentenceGraph.mergedEntityToDuplicates self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities)) # Connect to optional gold graph if goldGraph != None: entityToGold = EvaluateInteractionXML.mapEntities( entities, goldGraph.entities) paths = None if not self.styles["no_path"]: ##undirected = sentenceGraph.getUndirectedDependencyGraph() #undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph) ###undirected = sentenceGraph.dependencyGraph.to_undirected() ####undirected = NX10.MultiGraph(sentenceGraph.dependencyGraph) This didn't work undirected = sentenceGraph.dependencyGraph.toUndirected() #paths = NX10.all_pairs_shortest_path(undirected, cutoff=999) paths = undirected #for edge in sentenceGraph.dependencyGraph.edges: # assert edge[2] != None #for edge in undirected.edges: # assert edge[2] != None #if sentenceGraph.sentenceElement.get("id") == "GENIA.d70.s5": # print [(x[0].get("id"), x[1].get("id"), x[2].get("id")) for x in sentenceGraph.dependencyGraph.edges] # Generate examples based on interactions between entities or interactions between tokens if self.styles["entities"]: loopRange = len(entities) else: loopRange = len(sentenceGraph.tokens) for i in range(loopRange - 1): for j in range(i + 1, loopRange): eI = None eJ = None if self.styles["entities"]: eI = entities[i] eJ = entities[j] tI = sentenceGraph.entityHeadTokenByEntity[eI] tJ = sentenceGraph.entityHeadTokenByEntity[eJ] #if "no_ne_interactions" in self.styles and eI.get("isName") == "True" and eJ.get("isName") == "True": # continue if eI.get("type") == "neg" or eJ.get("type") == "neg": continue if self.styles["skip_extra_triggers"]: if eI.get("source") != None or eJ.get( "source") != None: continue else: tI = sentenceGraph.tokens[i] tJ = sentenceGraph.tokens[j] # only consider paths between entities (NOTE! 
entities, not only named entities) if self.styles["headsOnly"]: if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len( sentenceGraph.tokenIsEntityHead[tJ]) == 0): continue if self.styles["directed"]: # define forward if self.styles["entities"]: categoryName = self.getCategoryName( sentenceGraph, eI, eJ, True) if goldGraph != None: categoryName = self.getGoldCategoryName( goldGraph, entityToGold, eI, eJ, True) else: categoryName = self.getCategoryNameFromTokens( sentenceGraph, tI, tJ, True) # make forward self.exampleStats.beginExample(categoryName) makeExample = True if self.styles[ "genia_limits"] and not self.isPotentialGeniaInteraction( eI, eJ): makeExample = False self.exampleStats.filter("genia_limits") if self.styles["genia_task1"] and ( eI.get("type") == "Entity" or eJ.get("type") == "Entity"): makeExample = False self.exampleStats.filter("genia_task1") if self.styles[ "rel_limits"] and not self.isPotentialRELInteraction( eI, eJ): makeExample = False self.exampleStats.filter("rel_limits") if self.styles[ "co_limits"] and not self.isPotentialCOInteraction( eI, eJ, sentenceGraph): makeExample = False self.exampleStats.filter("co_limits") if self.styles[ "bb_limits"] and not self.isPotentialBBInteraction( eI, eJ, sentenceGraph): makeExample = False self.exampleStats.filter("bb_limits") if categoryName != "neg": self.exampleStats.filter("bb_limits(" + categoryName + ":" + eI.get("type") + "/" + eJ.get("type") + ")") if self.styles[ "bi_limits"] and not self.isPotentialBIInteraction( eI, eJ, sentenceGraph, self.exampleStats): makeExample = False #self.exampleStats.filter("bi_limits") if self.styles[ "epi_limits"] and not self.isPotentialEPIInteraction( eI, eJ, sentenceGraph): makeExample = False self.exampleStats.filter("epi_limits") if self.styles[ "id_limits"] and not self.isPotentialIDInteraction( eI, eJ, sentenceGraph): makeExample = False self.exampleStats.filter("id_limits") # if self.styles["selftrain_limits"] and (eI.get("selftrain") == "False" or eJ.get("selftrain") == "False"): # makeExample = False # self.exampleStats.filter("selftrain_limits") # if self.styles["selftrain_group"] and (eI.get("selftraingroup") not in self.selfTrainGroups or eJ.get("selftraingroup") not in self.selfTrainGroups): # makeExample = False # self.exampleStats.filter("selftrain_group") if self.styles["pos_only"] and categoryName == "neg": makeExample = False self.exampleStats.filter("pos_only") if makeExample: #examples.append( self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ) ) ExampleUtils.appendExamples([ self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ) ], outfile) exampleIndex += 1 self.exampleStats.endExample() # define reverse if self.styles["entities"]: categoryName = self.getCategoryName( sentenceGraph, eJ, eI, True) if goldGraph != None: categoryName = self.getGoldCategoryName( goldGraph, entityToGold, eJ, eI, True) else: categoryName = self.getCategoryNameFromTokens( sentenceGraph, tJ, tI, True) # make reverse self.exampleStats.beginExample(categoryName) makeExample = True if self.styles[ "genia_limits"] and not self.isPotentialGeniaInteraction( eJ, eI): makeExample = False self.exampleStats.filter("genia_limits") if self.styles["genia_task1"] and ( eI.get("type") == "Entity" or eJ.get("type") == "Entity"): makeExample = False self.exampleStats.filter("genia_task1") if self.styles[ "rel_limits"] and not self.isPotentialRELInteraction( eJ, eI): makeExample = False self.exampleStats.filter("rel_limits") if self.styles[ 
"co_limits"] and not self.isPotentialCOInteraction( eJ, eI, sentenceGraph): makeExample = False self.exampleStats.filter("co_limits") if self.styles[ "bb_limits"] and not self.isPotentialBBInteraction( eJ, eI, sentenceGraph): makeExample = False self.exampleStats.filter("bb_limits") if categoryName != "neg": self.exampleStats.filter("bb_limits(" + categoryName + ":" + eJ.get("type") + "/" + eI.get("type") + ")") if self.styles[ "bi_limits"] and not self.isPotentialBIInteraction( eJ, eI, sentenceGraph, self.exampleStats): makeExample = False #self.exampleStats.filter("bi_limits") if self.styles[ "epi_limits"] and not self.isPotentialEPIInteraction( eJ, eI, sentenceGraph): makeExample = False self.exampleStats.filter("epi_limits") if self.styles[ "id_limits"] and not self.isPotentialIDInteraction( eJ, eI, sentenceGraph): makeExample = False self.exampleStats.filter("id_limits") # if self.styles["selftrain_limits"] and (eI.get("selftrain") == "False" or eJ.get("selftrain") == "False"): # makeExample = False # self.exampleStats.filter("selftrain_limits") # if self.styles["selftrain_group"] and (eI.get("selftraingroup") not in self.selfTrainGroups or eJ.get("selftraingroup") not in self.selfTrainGroups): # makeExample = False # self.exampleStats.filter("selftrain_group") if self.styles["pos_only"] and categoryName == "neg": makeExample = False self.exampleStats.filter("pos_only") if makeExample: #examples.append( self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI) ) ExampleUtils.appendExamples([ self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI) ], outfile) exampleIndex += 1 self.exampleStats.endExample() else: if self.styles["entities"]: categoryName = self.getCategoryName( sentenceGraph, eI, eJ, False) else: categoryName = self.getCategoryNameFromTokens( sentenceGraph, tI, tJ, False) self.exampleStats.beginExample(categoryName) forwardExample = self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ) if not self.styles["graph_kernel"]: reverseExample = self.buildExample( tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI) forwardExample[2].update(reverseExample[2]) #examples.append(forwardExample) ExampleUtils.appendExamples([forwardExample], outfile) exampleIndex += 1 self.exampleStats.endExample() #return examples return exampleIndex def buildExample(self, token1, token2, paths, sentenceGraph, categoryName, exampleIndex, entity1=None, entity2=None): """ Build a single directed example for the potential edge between token1 and token2 """ # dummy return for speed testing #return (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),1,{},{}) # define features features = {} if True: #token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2): #if token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2): # path = paths[token1][token2] #else: # path = [token1, token2] if not self.styles["no_path"]: # directedPath reduces performance by 0.01 pp #directedPath = sentenceGraph.dependencyGraph.getPaths(token1, token2) #if len(directedPath) == 0: # directedPath = sentenceGraph.dependencyGraph.getPaths(token2, token1) # for dp in directedPath: # dp.reverse() #if len(directedPath) == 0: # path = paths.getPaths(token1, token2) #else: # path = directedPath path = paths.getPaths(token1, token2) if len(path) > 0: #if len(path) > 1: # print len(path) path = path[0] pathExists = True else: path = [token1, token2] pathExists = False else: path = [token1, token2] 
pathExists = False #print token1.get("id"), token2.get("id") assert (self.pathLengths == None) if self.pathLengths == None or len(path) - 1 in self.pathLengths: # if not "no_ontology" in self.styles: # self.ontologyFeatureBuilder.setFeatureVector(features) # self.ontologyFeatureBuilder.buildOntologyFeaturesForPath(sentenceGraph, path) # self.ontologyFeatureBuilder.setFeatureVector(None) if self.styles["trigger_features"]: # F 85.52 -> 85.55 self.triggerFeatureBuilder.setFeatureVector(features) self.triggerFeatureBuilder.tag = "trg1_" self.triggerFeatureBuilder.buildFeatures(token1) self.triggerFeatureBuilder.tag = "trg2_" self.triggerFeatureBuilder.buildFeatures(token2) self.triggerFeatureBuilder.setFeatureVector(None) # REL features if self.styles["rel_features"] and not self.styles["no_task"]: self.relFeatureBuilder.setFeatureVector(features) self.relFeatureBuilder.tag = "rel1_" self.relFeatureBuilder.buildAllFeatures( sentenceGraph.tokens, sentenceGraph.tokens.index(token1)) self.relFeatureBuilder.tag = "rel2_" self.relFeatureBuilder.buildAllFeatures( sentenceGraph.tokens, sentenceGraph.tokens.index(token2)) self.relFeatureBuilder.setFeatureVector(None) if self.styles[ "bacteria_renaming"] and not self.styles["no_task"]: self.bacteriaRenamingFeatureBuilder.setFeatureVector( features) self.bacteriaRenamingFeatureBuilder.buildPairFeatures( entity1, entity2) #self.bacteriaRenamingFeatureBuilder.buildSubstringFeatures(entity1, entity2) # decreases perf. 74.76 -> 72.41 self.bacteriaRenamingFeatureBuilder.setFeatureVector(None) if self.styles["co_limits"] and not self.styles["no_task"]: e1Offset = Range.charOffsetToSingleTuple( entity1.get("charOffset")) e2Offset = Range.charOffsetToSingleTuple( entity2.get("charOffset")) if Range.contains(e1Offset, e2Offset): features[self.featureSet.getId("e1_contains_e2")] = 1 if entity2.get("isName") == "True": features[self.featureSet.getId( "e1_contains_e2name")] = 1 if Range.contains(e2Offset, e1Offset): features[self.featureSet.getId("e2_contains_e1")] = 1 if entity1.get("isName") == "True": features[self.featureSet.getId( "e2_contains_e1name")] = 1 if self.styles["ddi_features"]: self.drugFeatureBuilder.setFeatureVector(features) self.drugFeatureBuilder.tag = "ddi_" self.drugFeatureBuilder.buildPairFeatures(entity1, entity2) if self.styles["ddi_mtmx"]: self.drugFeatureBuilder.buildMTMXFeatures( entity1, entity2) self.drugFeatureBuilder.setFeatureVector(None) #if "graph_kernel" in self.styles or not "no_dependency" in self.styles: # #print "Getting edges" # if token1 != token2 and pathExists: # #print "g1" # edges = self.multiEdgeFeatureBuilder.getEdges(sentenceGraph.dependencyGraph, path) # #print "g2" # else: # edges = None if self.styles["graph_kernel"]: self.graphKernelFeatureBuilder.setFeatureVector( features, entity1, entity2) self.graphKernelFeatureBuilder.buildGraphKernelFeatures( sentenceGraph, path) self.graphKernelFeatureBuilder.setFeatureVector(None) if self.styles["entity_type"]: features[self.featureSet.getId("e1_" + entity1.get("type"))] = 1 features[self.featureSet.getId("e2_" + entity2.get("type"))] = 1 features[self.featureSet.getId("distance_" + str(len(path)))] = 1 if not self.styles["no_dependency"]: #print "Dep features" self.multiEdgeFeatureBuilder.setFeatureVector( features, entity1, entity2) #self.multiEdgeFeatureBuilder.buildStructureFeatures(sentenceGraph, paths) # remove for fast if not self.styles["disable_entity_features"]: self.multiEdgeFeatureBuilder.buildEntityFeatures( sentenceGraph) 
self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path) if not self.styles["disable_terminus_features"]: self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures( path, sentenceGraph) # remove for fast if not self.styles["disable_single_element_features"]: self.multiEdgeFeatureBuilder.buildSingleElementFeatures( path, sentenceGraph) if not self.styles["disable_ngram_features"]: #print "NGrams" self.multiEdgeFeatureBuilder.buildPathGrams( 2, path, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams( 3, path, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams( 4, path, sentenceGraph) # remove for fast #self.buildEdgeCombinations(path, edges, sentenceGraph, features) # remove for fast #if edges != None: # self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[0], edges[0][1]+edges[1][0], "t1", sentenceGraph) # remove for fast # self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[-1], edges[len(path)-1][len(path)-2]+edges[len(path)-2][len(path)-1], "t2", sentenceGraph) # remove for fast if not self.styles["disable_path_edge_features"]: self.multiEdgeFeatureBuilder.buildPathEdgeFeatures( path, sentenceGraph) self.multiEdgeFeatureBuilder.buildSentenceFeatures( sentenceGraph) self.multiEdgeFeatureBuilder.setFeatureVector(None) if self.styles["nodalida"]: self.nodalidaFeatureBuilder.setFeatureVector( features, entity1, entity2) shortestPaths = self.nodalidaFeatureBuilder.buildShortestPaths( sentenceGraph.dependencyGraph, path) print shortestPaths if len(shortestPaths) > 0: self.nodalidaFeatureBuilder.buildNGrams( shortestPaths, sentenceGraph) self.nodalidaFeatureBuilder.setFeatureVector(None) if not self.styles["no_linear"]: self.tokenFeatureBuilder.setFeatureVector(features) for i in range(len(sentenceGraph.tokens)): if sentenceGraph.tokens[i] == token1: token1Index = i if sentenceGraph.tokens[i] == token2: token2Index = i linearPreTag = "linfw_" if token1Index > token2Index: token1Index, token2Index = token2Index, token1Index linearPreTag = "linrv_" self.tokenFeatureBuilder.buildLinearOrderFeatures( token1Index, sentenceGraph, 2, 2, preTag="linTok1") self.tokenFeatureBuilder.buildLinearOrderFeatures( token2Index, sentenceGraph, 2, 2, preTag="linTok2") # Before, middle, after # self.tokenFeatureBuilder.buildTokenGrams(0, token1Index-1, sentenceGraph, "bf") # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, "bw") # self.tokenFeatureBuilder.buildTokenGrams(token2Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, "af") # before-middle, middle, middle-after # self.tokenFeatureBuilder.buildTokenGrams(0, token2Index-1, sentenceGraph, linearPreTag+"bf", max=2) # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, linearPreTag+"bw", max=2) # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, linearPreTag+"af", max=2) self.tokenFeatureBuilder.setFeatureVector(None) if self.styles["random"]: self.randomFeatureBuilder.setFeatureVector(features) self.randomFeatureBuilder.buildRandomFeatures(100, 0.01) self.randomFeatureBuilder.setFeatureVector(None) if self.styles["genia_limits"] and not self.styles["no_task"]: e1Type = entity1.get("type") e2Type = entity2.get("type") assert (entity1.get("isName") == "False") if entity2.get("isName") == "True": features[self.featureSet.getId( "GENIA_target_protein")] = 1 else: features[self.featureSet.getId( "GENIA_nested_event")] = 1 if e1Type.find( "egulation" ) != -1: # leave r out to avoid 
problems with capitalization if entity2.get("isName") == "True": features[self.featureSet.getId( "GENIA_regulation_of_protein")] = 1 else: features[self.featureSet.getId( "GENIA_regulation_of_event")] = 1 if self.styles["bi_limits"]: # Make features based on entity types e1Type = entity1.get("type") e2Type = entity2.get("type") e1SuperType = str(self.getBISuperType(e1Type)) e2SuperType = str(self.getBISuperType(e2Type)) features[self.featureSet.getId("BI_e1_" + e1Type)] = 1 features[self.featureSet.getId("BI_e2_" + e2Type)] = 1 features[self.featureSet.getId("BI_e1sup_" + e1SuperType)] = 1 features[self.featureSet.getId("BI_e2sup_" + e2SuperType)] = 1 features[self.featureSet.getId("BI_e1e2_" + e1Type + "_" + e2Type)] = 1 features[self.featureSet.getId("BI_e1e2sup_" + e1SuperType + "_" + e2SuperType)] = 1 if self.styles["evex"]: self.evexFeatureBuilder.setFeatureVector( features, entity1, entity2) self.evexFeatureBuilder.buildEdgeFeatures( entity1, entity2, token1, token2, path, sentenceGraph) self.evexFeatureBuilder.setFeatureVector(None) if self.styles["giuliano"]: self.giulianoFeatureBuilder.setFeatureVector( features, entity1, entity2) self.giulianoFeatureBuilder.buildEdgeFeatures( entity1, entity2, token1, token2, path, sentenceGraph) self.giulianoFeatureBuilder.setFeatureVector(None) else: features[self.featureSet.getId("always_negative")] = 1 if self.styles["subset"]: features[self.featureSet.getId("out_of_scope")] = 1 else: features[self.featureSet.getId("always_negative")] = 1 if self.styles["subset"]: features[self.featureSet.getId("out_of_scope")] = 1 path = [token1, token2] # define extra attributes #if int(path[0].get("id").split("_")[-1]) < int(path[-1].get("id").split("_")[-1]): if int(path[0].get("charOffset").split("-")[0]) < int( path[-1].get("charOffset").split("-")[0]): #extra = {"xtype":"edge","type":"i","t1":path[0],"t2":path[-1]} extra = { "xtype": "edge", "type": "i", "t1": path[0].get("id"), "t2": path[-1].get("id") } extra["deprev"] = False else: #extra = {"xtype":"edge","type":"i","t1":path[-1],"t2":path[0]} extra = { "xtype": "edge", "type": "i", "t1": path[-1].get("id"), "t2": path[0].get("id") } extra["deprev"] = True if entity1 != None: #extra["e1"] = entity1 extra["e1"] = entity1.get("id") if sentenceGraph.mergedEntityToDuplicates != None: #extra["e1GoldIds"] = mergedEntityIds[entity1] extra["e1DuplicateIds"] = ",".join([ x.get("id") for x in sentenceGraph.mergedEntityToDuplicates[entity1] ]) if entity2 != None: #extra["e2"] = entity2 extra["e2"] = entity2.get("id") if sentenceGraph.mergedEntityToDuplicates != None: extra["e2DuplicateIds"] = ",".join([ x.get("id") for x in sentenceGraph.mergedEntityToDuplicates[entity2] ]) #extra["e2GoldIds"] = mergedEntityIds[entity2] extra["categoryName"] = categoryName if self.styles["bacteria_renaming"]: if entity1.get("text") != None and entity1.get("text") != "": extra["e1t"] = entity1.get("text").replace(" ", "---").replace( ":", "-COL-") if entity2.get("text") != None and entity2.get("text") != "": extra["e2t"] = entity2.get("text").replace(" ", "---").replace( ":", "-COL-") sentenceOrigId = sentenceGraph.sentenceElement.get("origId") if sentenceOrigId != None: extra["SOID"] = sentenceOrigId # make example if self.styles["binary"]: if categoryName != "neg": category = 1 else: category = -1 categoryName = "i" else: category = self.classSet.getId(categoryName) # NOTE: temporarily disable for replicating 110310 experiment #features[self.featureSet.getId("extra_constant")] = 1 return (sentenceGraph.getSentenceId() + ".x" 
+ str(exampleIndex), category, features, extra)
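# Added illustration (not part of the original code): getCategoryName and
# getCategoryNameFromTokens above merge all interaction types found between a
# candidate pair into a single class label by sorting the unique type strings
# and joining them with "---", falling back to "neg" when no interaction
# exists. A minimal stand-alone sketch of that naming convention:
def _mergedCategoryNameSketch(interactionTypes):
    # e.g. ["Theme", "Cause", "Theme"] -> "Cause---Theme"; [] -> "neg"
    uniqueTypes = sorted(set(interactionTypes))
    return "---".join(uniqueTypes) if len(uniqueTypes) > 0 else "neg"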
class AsymmetricEventExampleBuilder(ExampleBuilder): def __init__(self, style=["typed","directed"], length=None, types=[], featureSet=None, classSet=None): if featureSet == None: featureSet = IdSet() if classSet == None: classSet = IdSet(1) else: classSet = classSet assert( classSet.getId("neg") == 1 ) ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet) if isinstance(style, basestring) and style.find(",") != -1: style = style.split(",") self.styles = style self.negFrac = None self.posPairGaz = POSPairGazetteer() for s in style: if s.find("negFrac") != -1: self.negFrac = float(s.split("_")[-1]) print >> sys.stderr, "Downsampling negatives to", self.negFrac self.negRand = random.Random(15) elif s.find("posPairGaz") != -1: self.posPairGaz = POSPairGazetteer(loadFrom=s.split("_", 1)[-1]) self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet) self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet) if "graph_kernel" in self.styles: from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet) if "noAnnType" in self.styles: self.multiEdgeFeatureBuilder.noAnnType = True if "noMasking" in self.styles: self.multiEdgeFeatureBuilder.maskNamedEntities = False if "maxFeatures" in self.styles: self.multiEdgeFeatureBuilder.maximum = True self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet) if "ontology" in self.styles: self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet) if "nodalida" in self.styles: self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(self.featureSet) #IF LOCAL if "bioinfer_limits" in self.styles: self.bioinferOntologies = OntologyUtils.getBioInferTempOntology() #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName) #ENDIF self.pathLengths = length assert(self.pathLengths == None) self.types = types if "random" in self.styles: from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet) #self.outFile = open("exampleTempFile.txt","wt") @classmethod def run(cls, input, output, parse, tokenization, style, idFileTag=None): classSet, featureSet = cls.getIdSets(idFileTag) if style != None: e = cls(style=style, classSet=classSet, featureSet=featureSet) else: e = cls(classSet=classSet, featureSet=featureSet) sentences = cls.getSentences(input, parse, tokenization) e.buildExamplesForSentences(sentences, output, idFileTag) if "printClassIds" in e.styles: print >> sys.stderr, e.classSet.Ids def definePredictedValueRange(self, sentences, elementName): self.multiEdgeFeatureBuilder.definePredictedValueRange(sentences, elementName) def getPredictedValueRange(self): return self.multiEdgeFeatureBuilder.predictedRange def filterEdgesByType(self, edges, typesToInclude): if len(typesToInclude) == 0: return edges edgesToKeep = [] for edge in edges: if edge.get("type") in typesToInclude: edgesToKeep.append(edge) return edgesToKeep def getCategoryNameFromTokens(self, sentenceGraph, t1, t2, directed=True): types = set() themeE1Types = set() intEdges = [] if sentenceGraph.interactionGraph.has_edge(t1, t2): intEdges = sentenceGraph.interactionGraph.get_edge_data(t1, t2, default={}) # NOTE: Only works if keys are ordered integers for i in range(len(intEdges)): types.add(intEdges[i]["element"].get("type")) # if (not directed) and sentenceGraph.interactionGraph.has_edge(t2, t1): # intEdgesReverse = sentenceGraph.interactionGraph.get_edge(t2, 
t1, default={}) # # NOTE: Only works if keys are ordered integers # for i in range(len(intEdgesReverse)): # intElement = intEdgesReverse[i]["element"] # intType = intElement.get("type") # types.add(intType) # intEdges.extend(intEdgesReverse) for i in range(len(intEdges)): intElement = intEdges[i]["element"] intType = intElement.get("type") if intType == "Theme": e1Entity = sentenceGraph.entitiesById[intElement.get("e1")] themeE1Types.add(e1Entity.get("type")) #types.add(intType) if len(themeE1Types) != 0: themeE1Types = list(themeE1Types) themeE1Types.sort() categoryName = "" for name in themeE1Types: if categoryName != "": categoryName += "---" categoryName += name return categoryName else: types = list(types) types.sort() categoryName = "" for name in types: if categoryName != "": categoryName += "---" categoryName += name if categoryName != "": return categoryName else: return "neg" def getCategoryName(self, sentenceGraph, e1, e2, directed=True): interactions = sentenceGraph.getInteractions(e1, e2) if not directed: interactions.extend(sentenceGraph.getInteractions(e2, e1)) types = set() for interaction in interactions: types.add(interaction.attrib["type"]) types = list(types) types.sort() categoryName = "" for name in types: if categoryName != "": categoryName += "---" categoryName += name if categoryName != "": return categoryName else: return "neg" def preProcessExamples(self, allExamples): # Duplicates cannot be removed here, as they should only be removed from the training set. This is done # in the classifier. # if "no_duplicates" in self.styles: # count = len(allExamples) # print >> sys.stderr, " Removing duplicates,", # allExamples = ExampleUtils.removeDuplicates(allExamples) # print >> sys.stderr, "removed", count - len(allExamples) if "normalize" in self.styles: print >> sys.stderr, " Normalizing feature vectors" ExampleUtils.normalizeFeatureVectors(allExamples) return allExamples def isPotentialGeniaInteraction(self, e1, e2): if e1.get("isName") == "True": return False else: return True #IF LOCAL def getBioInferParentType(self, eType): if eType == "Physical_entity" or OntologyUtils.hasParent(eType, "Physical_entity", self.bioinferOntologies): return "Physical" elif eType == "Property_entity" or OntologyUtils.hasParent(eType, "Property_entity", self.bioinferOntologies): return "Property" elif OntologyUtils.hasParent(eType, "Relationship", self.bioinferOntologies): return "Process" else: assert False, eType # if self.bioinferOntologies["Entity"].has_key(eType): # if OntologyUtils.hasParent(eType, "Physical_entity", self.bioinferOntologies): # assert not OntologyUtils.hasParent(eType, "Property_entity", self.bioinferOntologies), eType # return "Physical" # else: # assert OntologyUtils.hasParent(eType, "Property_entity", self.bioinferOntologies), eType # return "Property" # # else: # assert self.bioinferOntologies.has_key(eType), eType # #assert OntologyUtils.hasParent(eType, "Process_entity", self.bioinferOntologies["Relationship"]), eType # return "Process" def isPotentialBioInferInteraction(self, e1, e2, categoryName): e1Type = self.getBioInferParentType(e1.get("type")) e2Type = self.getBioInferParentType(e2.get("type")) if e1Type == "Process" or e1Type == "Property": return True elif e1Type == "Physical" and e2Type == "Physical": return True elif e1Type == "Physical" and e2Type == "Process": # hack return True else: assert(categoryName == "neg"), categoryName + " category for " + e1Type + " and " + e2Type return False #ENDIF def nxMultiDiGraphToUndirected(self, graph): 
undirected = NX10.MultiGraph(name=graph.name) undirected.add_nodes_from(graph) undirected.add_edges_from(graph.edges_iter()) return undirected def buildExamples(self, sentenceGraph): examples = [] exampleIndex = 0 clearGraph = sentenceGraph.getCleared() #undirected = sentenceGraph.getUndirectedDependencyGraph() undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph) ##undirected = sentenceGraph.dependencyGraph.to_undirected() ###undirected = NX10.MultiGraph(sentenceGraph.dependencyGraph) This didn't work paths = NX10.all_pairs_shortest_path(undirected, cutoff=999) self.triggerFeatureBuilder.initSentence(clearGraph) # Generate examples based on interactions between entities or interactions between tokens if "entities" in self.styles: loopRange = len(sentenceGraph.entities) else: loopRange = len(sentenceGraph.tokens) #for i in range(loopRange-1): for i in range(loopRange): # allow self-interactions #for j in range(i+1,loopRange): for j in range(i,loopRange): # allow self-interactions eI = None eJ = None if "entities" in self.styles: eI = sentenceGraph.entities[i] eJ = sentenceGraph.entities[j] tI = sentenceGraph.entityHeadTokenByEntity[eI] tJ = sentenceGraph.entityHeadTokenByEntity[eJ] #if "no_ne_interactions" in self.styles and eI.get("isName") == "True" and eJ.get("isName") == "True": # continue if eI.get("type") == "neg" or eJ.get("type") == "neg": continue else: tI = sentenceGraph.tokens[i] tJ = sentenceGraph.tokens[j] # # only consider paths between entities (NOTE! entities, not only named entities) # if "headsOnly" in self.styles: # if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0): # continue if "directed" in self.styles: # define forward if "entities" in self.styles: categoryName = self.getCategoryName(sentenceGraph, eI, eJ, True) else: categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, True) self.exampleStats.beginExample(categoryName) if self.negFrac == None or categoryName != "neg" or (categoryName == "neg" and self.negRand.random() < self.negFrac): makeExample = True if ("genia_limits" in self.styles) and not self.isPotentialGeniaInteraction(eI, eJ): makeExample = False self.exampleStats.filter("genia_limits") if self.posPairGaz.getNegFrac((tI.get("POS"), tJ.get("POS"))) == 1.0: makeExample = False self.exampleStats.filter("pos_pair") if makeExample: if not sentenceGraph.tokenIsName[tI]: examples.append( self.buildExample(tI, tJ, paths, clearGraph, categoryName, exampleIndex, eI, eJ) ) exampleIndex += 1 else: self.exampleStats.filter("genia_token_limits") else: self.exampleStats.filter("neg_frac") self.exampleStats.endExample() # define reverse if "entities" in self.styles: categoryName = self.getCategoryName(sentenceGraph, eJ, eI, True) else: categoryName = self.getCategoryNameFromTokens(sentenceGraph, tJ, tI, True) self.exampleStats.beginExample(categoryName) if self.negFrac == None or categoryName != "neg" or (categoryName == "neg" and self.negRand.random() < self.negFrac): makeExample = True if ("genia_limits" in self.styles) and not self.isPotentialGeniaInteraction(eJ, eI): makeExample = False self.exampleStats.filter("genia_limits") if ("bioinfer_limits" in self.styles) and not self.isPotentialBioInferInteraction(eJ, eI, categoryName): makeExample = False self.exampleStats.filter("bioinfer_limits") if self.posPairGaz.getNegFrac((tJ.get("POS"), tI.get("POS"))) == 1.0: makeExample = False self.exampleStats.filter("pos_pair") if makeExample: if not sentenceGraph.tokenIsName[tJ]: 
examples.append( self.buildExample(tJ, tI, paths, clearGraph, categoryName, exampleIndex, eJ, eI) ) exampleIndex += 1 else: self.exampleStats.filter("genia_token_limits") else: self.exampleStats.filter("neg_frac") self.exampleStats.endExample() # else: # if "entities" in self.styles: # categoryName = self.getCategoryName(sentenceGraph, eI, eJ, False) # else: # categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, False) # forwardExample = self.buildExample(tI, tJ, paths, clearGraph, categoryName, exampleIndex, eI, eJ) # if not "graph_kernel" in self.styles: # reverseExample = self.buildExample(tJ, tI, paths, clearGraph, categoryName, exampleIndex, eJ, eI) # forwardExample[2].update(reverseExample[2]) # examples.append(forwardExample) # exampleIndex += 1 return examples def buildExample(self, token1, token2, paths, sentenceGraph, categoryName, exampleIndex, entity1=None, entity2=None): # define features features = {} if True: #token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2): if token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2): path = paths[token1][token2] else: path = [token1, token2] assert(self.pathLengths == None) if self.pathLengths == None or len(path)-1 in self.pathLengths: if not "no_trigger" in self.styles: self.triggerFeatureBuilder.setFeatureVector(features) self.triggerFeatureBuilder.tag = "trg_t1_" self.triggerFeatureBuilder.buildFeatures(token1) self.triggerFeatureBuilder.tag = "trg_t2_" self.triggerFeatureBuilder.buildFeatures(token2) # if not "no_ontology" in self.styles: # self.ontologyFeatureBuilder.setFeatureVector(features) # self.ontologyFeatureBuilder.buildOntologyFeaturesForPath(sentenceGraph, path) # self.ontologyFeatureBuilder.setFeatureVector(None) if "graph_kernel" in self.styles or not "no_dependency" in self.styles: if token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2): edges = self.multiEdgeFeatureBuilder.getEdges(sentenceGraph.dependencyGraph, path) else: edges = None if "graph_kernel" in self.styles: self.graphKernelFeatureBuilder.setFeatureVector(features, entity1, entity2) self.graphKernelFeatureBuilder.buildGraphKernelFeatures(sentenceGraph, path, edges) self.graphKernelFeatureBuilder.setFeatureVector(None) if "entity_type" in self.styles: features[self.featureSet.getId("e1_"+entity1.attrib["type"])] = 1 features[self.featureSet.getId("e2_"+entity2.attrib["type"])] = 1 features[self.featureSet.getId("distance_"+str(len(path)))] = 1 if not "no_dependency" in self.styles: if token1 == token2: features[self.featureSet.getId("tokenSelfLoop")] = 1 self.multiEdgeFeatureBuilder.setFeatureVector(features, entity1, entity2) #self.multiEdgeFeatureBuilder.buildStructureFeatures(sentenceGraph, paths) # remove for fast if not "disable_entity_features" in self.styles: self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph) self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path) if not "disable_terminus_features" in self.styles: self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast if not "disable_single_element_features" in self.styles: self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, edges, sentenceGraph) if not "disable_ngram_features" in self.styles: self.multiEdgeFeatureBuilder.buildPathGrams(2, path, edges, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams(3, path, edges, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams(4, path, edges, sentenceGraph) # 
remove for fast #self.buildEdgeCombinations(path, edges, sentenceGraph, features) # remove for fast #if edges != None: # self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[0], edges[0][1]+edges[1][0], "t1", sentenceGraph) # remove for fast # self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[-1], edges[len(path)-1][len(path)-2]+edges[len(path)-2][len(path)-1], "t2", sentenceGraph) # remove for fast if not "disable_path_edge_features" in self.styles: self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, edges, sentenceGraph) self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph) self.multiEdgeFeatureBuilder.setFeatureVector(None) if "nodalida" in self.styles: self.nodalidaFeatureBuilder.setFeatureVector(features, entity1, entity2) shortestPaths = self.nodalidaFeatureBuilder.buildShortestPaths(sentenceGraph.dependencyGraph, path) print shortestPaths if len(shortestPaths) > 0: self.nodalidaFeatureBuilder.buildNGrams(shortestPaths, sentenceGraph) self.nodalidaFeatureBuilder.setFeatureVector(None) if not "no_linear" in self.styles: self.tokenFeatureBuilder.setFeatureVector(features) for i in range(len(sentenceGraph.tokens)): if sentenceGraph.tokens[i] == token1: token1Index = i if sentenceGraph.tokens[i] == token2: token2Index = i linearPreTag = "linfw_" if token1Index > token2Index: token1Index, token2Index = token2Index, token1Index linearPreTag = "linrv_" self.tokenFeatureBuilder.buildLinearOrderFeatures(token1Index, sentenceGraph, 2, 2, preTag="linTok1") self.tokenFeatureBuilder.buildLinearOrderFeatures(token2Index, sentenceGraph, 2, 2, preTag="linTok2") # Before, middle, after # self.tokenFeatureBuilder.buildTokenGrams(0, token1Index-1, sentenceGraph, "bf") # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, "bw") # self.tokenFeatureBuilder.buildTokenGrams(token2Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, "af") # before-middle, middle, middle-after # self.tokenFeatureBuilder.buildTokenGrams(0, token2Index-1, sentenceGraph, linearPreTag+"bf", max=2) # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, linearPreTag+"bw", max=2) # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, linearPreTag+"af", max=2) self.tokenFeatureBuilder.setFeatureVector(None) if "random" in self.styles: self.randomFeatureBuilder.setFeatureVector(features) self.randomFeatureBuilder.buildRandomFeatures(100, 0.01) self.randomFeatureBuilder.setFeatureVector(None) if "genia_limits" in self.styles: e1Type = entity1.get("type") e2Type = entity2.get("type") assert(entity1.get("isName") == "False") if entity2.get("isName") == "True": features[self.featureSet.getId("GENIA_target_protein")] = 1 else: features[self.featureSet.getId("GENIA_nested_event")] = 1 if e1Type.find("egulation") != -1: # leave r out to avoid problems with capitalization if entity2.get("isName") == "True": features[self.featureSet.getId("GENIA_regulation_of_protein")] = 1 else: features[self.featureSet.getId("GENIA_regulation_of_event")] = 1 else: features[self.featureSet.getId("always_negative")] = 1 if "subset" in self.styles: features[self.featureSet.getId("out_of_scope")] = 1 else: features[self.featureSet.getId("always_negative")] = 1 if "subset" in self.styles: features[self.featureSet.getId("out_of_scope")] = 1 path = [token1, token2] self.triggerFeatureBuilder.tag = "" self.triggerFeatureBuilder.setFeatureVector(None) # define extra attributes # if int(path[0].attrib["id"].split("_")[-1]) < 
int(path[-1].attrib["id"].split("_")[-1]): # #extra = {"xtype":"edge","type":"i","t1":path[0],"t2":path[-1]} # extra = {"xtype":"asym","type":"i","t1":path[0].get("id"),"t2":path[-1].get("id")} # extra["deprev"] = False # else: # #extra = {"xtype":"edge","type":"i","t1":path[-1],"t2":path[0]} # extra = {"xtype":"asym","type":"i","t1":path[-1].get("id"),"t2":path[0].get("id")} # extra["deprev"] = True extra = {"xtype":"asym","type":"i","t1":token1.get("id"),"t2":token2.get("id")} if entity1 != None: #extra["e1"] = entity1 extra["e1"] = entity1.get("id") if entity2 != None: #extra["e2"] = entity2 extra["e2"] = entity2.get("id") extra["categoryName"] = categoryName sentenceOrigId = sentenceGraph.sentenceElement.get("origId") if sentenceOrigId != None: extra["SOID"] = sentenceOrigId # make example if "binary" in self.styles: if categoryName != "neg": category = 1 else: category = -1 categoryName = "i" else: category = self.classSet.getId(categoryName) return (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)
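# ---------------------------------------------------------------------------
# Added note (a sketch, not part of the original code): the "negFrac_<x>" style
# parameter of AsymmetricEventExampleBuilder subsamples negative examples with
# a fixed-seed RNG (random.Random(15)), so the kept subset is reproducible
# across runs. Roughly, an example is built only when
#     categoryName != "neg" or negRand.random() < negFrac
# i.e. positive examples are always kept and only "neg" examples are dropped,
# with the skipped ones counted under the "neg_frac" filter in the example
# statistics.
# ---------------------------------------------------------------------------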
class UnmergedEdgeExampleBuilder(ExampleBuilder): def __init__(self, style=["typed", "directed", "headsOnly"], length=None, types=[], featureSet=None, classSet=None): if featureSet == None: featureSet = IdSet() if classSet == None: classSet = IdSet(1) else: classSet = classSet assert (classSet.getId("neg") == 1) ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet) self.styles = style self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet) if "noAnnType" in self.styles: self.multiEdgeFeatureBuilder.noAnnType = True if "noMasking" in self.styles: self.multiEdgeFeatureBuilder.maskNamedEntities = False if "maxFeatures" in self.styles: self.multiEdgeFeatureBuilder.maximum = True self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet) self.pathLengths = length assert (self.pathLengths == None) self.types = types if "random" in self.styles: from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet) #self.outFile = open("exampleTempFile.txt","wt") @classmethod def run(cls, input, output, parse, tokenization, style, idFileTag=None): classSet, featureSet = cls.getIdSets(idFileTag) if style == None: e = UnmergedEdgeExampleBuilder(classSet=classSet, featureSet=featureSet) else: e = UnmergedEdgeExampleBuilder(style=style, classSet=classSet, featureSet=featureSet) sentences = cls.getSentences(input, parse, tokenization) e.buildExamplesForSentences(sentences, output, idFileTag) print e.classSet.Ids def definePredictedValueRange(self, sentences, elementName): self.multiEdgeFeatureBuilder.definePredictedValueRange( sentences, elementName) def getPredictedValueRange(self): return self.multiEdgeFeatureBuilder.predictedRange def filterEdgesByType(self, edges, typesToInclude): if len(typesToInclude) == 0: return edges edgesToKeep = [] for edge in edges: if edge.get("type") in typesToInclude: edgesToKeep.append(edge) return edgesToKeep def getCategoryName(self, sentenceGraph, e1, e2, directed=True): # Dummies are potential entities that do not exist in the # training data. If both entities of an interaction are dummies # it can't exist in the training data and is therefore a negative if e1[2] or e2[2]: return "neg" e1 = e1[0] e2 = e2[0] interactions = sentenceGraph.getInteractions(e1, e2) if not directed: interactions.extend(sentenceGraph.getInteractions(e2, e1)) types = set() for interaction in interactions: types.add(interaction.attrib["type"]) types = list(types) types.sort() categoryName = "" for name in types: if categoryName != "": categoryName += "---" categoryName += name if categoryName != "": return categoryName else: return "neg" def preProcessExamples(self, allExamples): if "normalize" in self.styles: print >> sys.stderr, " Normalizing feature vectors" ExampleUtils.normalizeFeatureVectors(allExamples) return allExamples def isPotentialGeniaInteraction(self, e1, e2): if e1.get("isName") == "True": return False else: return True def nxMultiDiGraphToUndirected(self, graph): undirected = NX10.MultiGraph(name=graph.name) undirected.add_nodes_from(graph) undirected.add_edges_from(graph.edges_iter()) return undirected def getInteractionEdgeLengths(self, sentenceGraph, paths): """ Return dependency and linear length of all interaction edges (measured between the two tokens). 
""" interactionLengths = {} for interaction in sentenceGraph.interactions: # Calculated interaction edge dep and lin length e1 = sentenceGraph.entitiesById[interaction.get("e1")] e2 = sentenceGraph.entitiesById[interaction.get("e2")] t1 = sentenceGraph.entityHeadTokenByEntity[e1] t2 = sentenceGraph.entityHeadTokenByEntity[e2] # Get dep path length if t1 != t2 and paths.has_key(t1) and paths[t1].has_key(t2): pathLength = len(paths[t1][t2]) else: # no dependencyPath pathLength = 999999 # more than any real path # Linear distance t1Pos = -1 t2Pos = -1 for i in range(len(sentenceGraph.tokens)): if sentenceGraph.tokens[i] == t1: t1Pos = i if t2Pos != -1: break if sentenceGraph.tokens[i] == t2: t2Pos = i if t1Pos != -1: break linLength = abs(t1Pos - t2Pos) interactionLengths[interaction] = (pathLength, linLength) return interactionLengths def getPrecedenceLevels(self, sentenceGraph, paths): """ Get overlapping entity precedence """ interactionLengths = self.getInteractionEdgeLengths( sentenceGraph, paths) interactionsByEntity = {} # Convenience mapping entityPrecedenceValues = {} for entity in sentenceGraph.entities: interactionsByEntity[entity] = [] eId = entity.get("id") # Add access to interactions argDepDist = 0 # Sum of lengths of shortest paths argLinDist = 0 # Sum of linear distances for interaction in sentenceGraph.interactions: if interaction.get( "e1" ) == eId: # An argument of the entity defined by the node interactionsByEntity[entity].append(interaction) argDepDist += interactionLengths[interaction][0] argLinDist += interactionLengths[interaction][1] # Store precedence counts (num args, sum of dep lengths, sum of lin lengths) entityPrecedenceValues[entity] = (len(interactionsByEntity), argDepDist, argLinDist, entity) # Determine level of entity from precedence counts levelByEntity = {} # slot number #levelByInteraction = {} # slot number of parent node # There is one slot group per token, per type for token in sentenceGraph.tokens: # per token entitiesByType = {} for entity in sentenceGraph.tokenIsEntityHead[token]: # per type if entity.get( "isName") == "True": # Names can never have duplicates assert not levelByEntity.has_key(entity) levelByEntity[entity] = 0 continue eType = entity.get("type") if eType == "neg": continue if not entitiesByType.has_key(eType): entitiesByType[eType] = [] entitiesByType[eType].append(entity) for eType in sorted(entitiesByType.keys()): # Slot ordering by precedence sortedEntities = [] for entity in entitiesByType[eType]: sortedEntities.append(entityPrecedenceValues[entity]) sortedEntities.sort(compareEntityPrecedence) level = 0 for precedenceTuple in sortedEntities: entity = precedenceTuple[3] assert not levelByEntity.has_key(entity) levelByEntity[entity] = level # Interactions have the same slot as their parent entity #for interaction in interactionsByEntity[entity]: # assert not levelByInteraction.has_key(interaction) # levelByInteraction[interaction] = level level += 1 return levelByEntity #, levelByInteraction def buildExamples(self, sentenceGraph): examples = [] exampleIndex = 0 #undirected = sentenceGraph.getUndirectedDependencyGraph() undirected = self.nxMultiDiGraphToUndirected( sentenceGraph.dependencyGraph) ##undirected = sentenceGraph.dependencyGraph.to_undirected() ###undirected = NX10.MultiGraph(sentenceGraph.dependencyGraph) This didn't work paths = NX10.all_pairs_shortest_path(undirected, cutoff=999) # Determine overlapping entity precedence #levelByEntity, levelByInteraction = self.getPrecedenceLevels(sentenceGraph, paths) levelByEntity 
= self.getPrecedenceLevels(sentenceGraph, paths) entities = [] # There is one entity group for each token, for each type of entity for token in sentenceGraph.tokens: # per token entitiesByType = {} for entity in sentenceGraph.tokenIsEntityHead[token]: # per type if entity.get( "isName") == "True": # Names can never have duplicates entities.append((entity, 0, False)) continue eType = entity.get("type") if eType == "neg": continue if not entitiesByType.has_key(eType): entitiesByType[eType] = [] entitiesByType[eType].append(entity) # Create slot groups for tokens for which exists at least one entity eTypes = sorted(entitiesByType.keys()) if len(eTypes) == 0: continue # Create slot groups and insert GS data there for eType in eTypes: # Use first entity of a type as the dummy entity for unfilled slots dummyEntity = entitiesByType[eType][0] # Define entity slots entityGroup = [None, None, None, None] #entityGroup = [None, None] # Insert existing entities into slots for entity in entitiesByType[eType]: if levelByEntity.has_key(entity): level = levelByEntity[entity] if level < len(entityGroup): entityGroup[level] = (entity, level, False) # Create dummies for potential entities for i in range(len(entityGroup)): if entityGroup[i] == None: entityGroup[i] = (dummyEntity, i, True) # Put all slots into one potential entity list #print entityGroup for e in entityGroup: entities.append(e) # Generate examples based on interactions between entities for i in range(len(entities) - 1): for j in range(i + 1, len(entities)): eI = entities[i][0] eJ = entities[j][0] tI = sentenceGraph.entityHeadTokenByEntity[eI] tJ = sentenceGraph.entityHeadTokenByEntity[eJ] # define forward example categoryName = self.getCategoryName(sentenceGraph, entities[i], entities[j], True) if (not "genia_limits" in self.styles) or self.isPotentialGeniaInteraction( eI, eJ): examples.append( self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, entities[i], entities[j])) exampleIndex += 1 # define reverse categoryName = self.getCategoryName(sentenceGraph, entities[j], entities[i], True) if (not "genia_limits" in self.styles) or self.isPotentialGeniaInteraction( eJ, eI): examples.append( self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, entities[j], entities[i])) exampleIndex += 1 return examples def buildExample(self, token1, token2, paths, sentenceGraph, categoryName, exampleIndex, e1=None, e2=None): entity1 = e1[0] entity2 = e2[0] # define features features = {} features[self.featureSet.getId("gov_level")] = e1[1] features[self.featureSet.getId("gov_level_" + str(e1[1]))] = 1 features[self.featureSet.getId("dep_level")] = e2[1] features[self.featureSet.getId("dep_level_" + str(e2[1]))] = 1 features[self.featureSet.getId("level_pair_" + str(e1[1]) + "_" + str(e2[1]))] = 1 if True: #token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2): if token1 != token2 and paths.has_key( token1) and paths[token1].has_key(token2): path = paths[token1][token2] else: path = [token1, token2] assert (self.pathLengths == None) if self.pathLengths == None or len(path) - 1 in self.pathLengths: if not "no_dependency" in self.styles: if token1 != token2 and paths.has_key( token1) and paths[token1].has_key(token2): edges = self.multiEdgeFeatureBuilder.getEdges( sentenceGraph.dependencyGraph, path) else: edges = None if "entity_type" in self.styles: features[self.featureSet.getId("e1_" + entity1.attrib["type"])] = 1 features[self.featureSet.getId("e2_" + entity2.attrib["type"])] = 1 
features[self.featureSet.getId("distance_" + str(len(path)))] = 1 if not "no_dependency" in self.styles: self.multiEdgeFeatureBuilder.setFeatureVector( features, entity1, entity2) #self.multiEdgeFeatureBuilder.buildStructureFeatures(sentenceGraph, paths) # remove for fast if not "disable_entity_features" in self.styles: self.multiEdgeFeatureBuilder.buildEntityFeatures( sentenceGraph) self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path) if not "disable_terminus_features" in self.styles: self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures( path, sentenceGraph) # remove for fast if not "disable_single_element_features" in self.styles: self.multiEdgeFeatureBuilder.buildSingleElementFeatures( path, edges, sentenceGraph) if not "disable_ngram_features" in self.styles: self.multiEdgeFeatureBuilder.buildPathGrams( 2, path, edges, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams( 3, path, edges, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams( 4, path, edges, sentenceGraph) # remove for fast #self.buildEdgeCombinations(path, edges, sentenceGraph, features) # remove for fast #if edges != None: # self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[0], edges[0][1]+edges[1][0], "t1", sentenceGraph) # remove for fast # self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[-1], edges[len(path)-1][len(path)-2]+edges[len(path)-2][len(path)-1], "t2", sentenceGraph) # remove for fast if not "disable_path_edge_features" in self.styles: self.multiEdgeFeatureBuilder.buildPathEdgeFeatures( path, edges, sentenceGraph) self.multiEdgeFeatureBuilder.buildSentenceFeatures( sentenceGraph) self.multiEdgeFeatureBuilder.setFeatureVector(None) if not "no_linear" in self.styles: self.tokenFeatureBuilder.setFeatureVector(features) for i in range(len(sentenceGraph.tokens)): if sentenceGraph.tokens[i] == token1: token1Index = i if sentenceGraph.tokens[i] == token2: token2Index = i linearPreTag = "linfw_" if token1Index > token2Index: token1Index, token2Index = token2Index, token1Index linearPreTag = "linrv_" self.tokenFeatureBuilder.buildLinearOrderFeatures( token1Index, sentenceGraph, 2, 2, preTag="linTok1") self.tokenFeatureBuilder.buildLinearOrderFeatures( token2Index, sentenceGraph, 2, 2, preTag="linTok2") # Before, middle, after # self.tokenFeatureBuilder.buildTokenGrams(0, token1Index-1, sentenceGraph, "bf") # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, "bw") # self.tokenFeatureBuilder.buildTokenGrams(token2Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, "af") # before-middle, middle, middle-after # self.tokenFeatureBuilder.buildTokenGrams(0, token2Index-1, sentenceGraph, linearPreTag+"bf", max=2) # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, linearPreTag+"bw", max=2) # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, linearPreTag+"af", max=2) self.tokenFeatureBuilder.setFeatureVector(None) if "random" in self.styles: self.randomFeatureBuilder.setFeatureVector(features) self.randomFeatureBuilder.buildRandomFeatures(100, 0.01) self.randomFeatureBuilder.setFeatureVector(None) if "genia_limits" in self.styles: e1Type = entity1.get("type") e2Type = entity2.get("type") assert (entity1.get("isName") == "False") if entity2.get("isName") == "True": features[self.featureSet.getId( "GENIA_target_protein")] = 1 else: features[self.featureSet.getId( "GENIA_nested_event")] = 1 if e1Type.find( "egulation" ) != -1: # leave r 
out to avoid problems with capitalization if entity2.get("isName") == "True": features[self.featureSet.getId( "GENIA_regulation_of_protein")] = 1 else: features[self.featureSet.getId( "GENIA_regulation_of_event")] = 1 else: features[self.featureSet.getId("always_negative")] = 1 if "subset" in self.styles: features[self.featureSet.getId("out_of_scope")] = 1 else: features[self.featureSet.getId("always_negative")] = 1 if "subset" in self.styles: features[self.featureSet.getId("out_of_scope")] = 1 path = [token1, token2] # define extra attributes if int(path[0].attrib["id"].split("_")[-1]) < int( path[-1].attrib["id"].split("_")[-1]): #extra = {"xtype":"edge","type":"i","t1":path[0],"t2":path[-1]} extra = { "xtype": "ue", "type": "i", "t1": path[0].get("id"), "t2": path[-1].get("id") } extra["deprev"] = False else: #extra = {"xtype":"edge","type":"i","t1":path[-1],"t2":path[0]} extra = { "xtype": "ue", "type": "i", "t1": path[-1].get("id"), "t2": path[0].get("id") } extra["deprev"] = True if entity1 != None: extra["e1"] = entity1.get("id") extra["l1"] = str(e1[1]) extra["d1"] = str(e1[2])[ 0] # is a dummy node (an entity not in existing triggers) if entity2 != None: extra["e2"] = entity2.get("id") extra["l2"] = str(e2[1]) extra["d2"] = str(e2[2])[ 0] # is a dummy node (an entity not in existing triggers) extra["categoryName"] = categoryName sentenceOrigId = sentenceGraph.sentenceElement.get("origId") if sentenceOrigId != None: extra["SOID"] = sentenceOrigId # make example if "binary" in self.styles: if categoryName != "neg": category = 1 else: category = -1 categoryName = "i" else: category = self.classSet.getId(categoryName) return (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra)
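# Usage sketch for the class above: only the run() signature comes from the
# code; the file names, parse name and tokenization name are placeholder
# assumptions, not values taken from this module.
#
#     UnmergedEdgeExampleBuilder.run(
#         "train-interactions.xml",            # input corpus (assumed path)
#         "unmerged-edge-train-examples",      # output example file (assumed)
#         "split-parse",                       # parse name (assumed)
#         "split-parse",                       # tokenization name (assumed)
#         ["typed", "directed", "headsOnly"],  # style, as in the __init__ default
#         idFileTag=None)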
class EntityRelationExampleBuilder(ExampleBuilder): """ BioNLP'11 REL subtask examples """ def __init__(self, style=["typed","directed","headsOnly"], featureSet=None, classSet=None): if featureSet == None: featureSet = IdSet() if classSet == None: classSet = IdSet(1) else: classSet = classSet assert( classSet.getId("neg") == 1 ) ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet) self.styles = style self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet) #if "noAnnType" in self.styles: self.multiEdgeFeatureBuilder.noAnnType = True #if "noMasking" in self.styles: self.multiEdgeFeatureBuilder.maskNamedEntities = False #if "maxFeatures" in self.styles: self.multiEdgeFeatureBuilder.maximum = True self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet) self.triggerFeatureBuilder.useNonNameEntities = False @classmethod def run(cls, input, output, parse, tokenization, style, idFileTag=None): """ An interface for running the example builder without needing to create a class """ classSet, featureSet = cls.getIdSets(idFileTag) if style != None: e = EntityRelationExampleBuilder(style=style, classSet=classSet, featureSet=featureSet) else: e = EntityRelationExampleBuilder(classSet=classSet, featureSet=featureSet) sentences = cls.getSentences(input, parse, tokenization) e.buildExamplesForSentences(sentences, output, idFileTag) def getCategoryNameFromTokens(self, sentenceGraph, t1, t2, directed=True): """ Example class. Multiple overlapping edges create a merged type. """ types = set() intEdges = sentenceGraph.interactionGraph.getEdges(t1, t2) if (not directed): intEdges = intEdges + sentenceGraph.interactionGraph.getEdges(t2, t1) for intEdge in intEdges: types.add(intEdge[2].get("type")) types = list(types) types.sort() categoryName = "" for name in types: if categoryName != "": categoryName += "---" categoryName += name if categoryName != "": return categoryName else: return "neg" # def isPotentialTargetEntityHead(self, namedEntityToken, token): # if token.get("POS") in ["CD","JJ","NN","NNS","RB"]: # return True # else: # return False def buildExamples(self, sentenceGraph): """ Build examples for a single sentence. Returns a list of examples. See Core/ExampleUtils for example format. 
""" examples = [] exampleIndex = 0 if "trigger_features" in self.styles: self.triggerFeatureBuilder.initSentence(sentenceGraph) undirectedDepGraph = sentenceGraph.dependencyGraph.toUndirected() namedEntities = [] for entity in sentenceGraph.entities: if entity.get("isName") == "True": namedEntities.append(entity) potentialTargetEntities = {} for i in range(len(sentenceGraph.tokens)): potentialTargetEntities[i] = sentenceGraph.tokens[i].get("POS") in ["CD","JJ","NN","NNS","RB"] for namedEntity in namedEntities: for i in range(len(sentenceGraph.tokens)): if not potentialTargetEntities[i]: continue namedEntityToken = sentenceGraph.entityHeadTokenByEntity[namedEntity] token = sentenceGraph.tokens[i] categoryName = self.getCategoryNameFromTokens(sentenceGraph, namedEntityToken, token, True) #if (not "genia_limits" in self.styles) or self.isPotentialRelation(namedEntityToken, token): examples.append( self.buildExample(entity, i, undirectedDepGraph, sentenceGraph, categoryName, exampleIndex) ) exampleIndex += 1 return examples def buildExample(self, namedEntity, tokenIndex, undirectedDepGraph, sentenceGraph, categoryName, exampleIndex): """ Build a single directed example for the potential edge between token1 and token2 """ namedEntityToken = sentenceGraph.entityHeadTokenByEntity[namedEntity] token = sentenceGraph.tokens[tokenIndex] # define features features = {} paths = undirectedDepGraph.getPaths(namedEntityToken, token) if len(paths) > 0: path = paths[0] else: path = [namedEntityToken, token] if "trigger_features" in self.styles: self.triggerFeatureBuilder.setFeatureVector(features) self.triggerFeatureBuilder.tag = "trg1_" self.triggerFeatureBuilder.buildFeatures(namedEntityToken) self.triggerFeatureBuilder.tag = "trg2_" self.triggerFeatureBuilder.buildFeatures(token) self.triggerFeatureBuilder.setFeatureVector(None) if not "no_dependency" in self.styles: #print "Dep features" self.multiEdgeFeatureBuilder.setFeatureVector(features, None, None) #self.multiEdgeFeatureBuilder.buildStructureFeatures(sentenceGraph, paths) # remove for fast if not "disable_entity_features" in self.styles: self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph) self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path) if not "disable_terminus_features" in self.styles: self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast if not "disable_single_element_features" in self.styles: self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, sentenceGraph) if not "disable_ngram_features" in self.styles: #print "NGrams" self.multiEdgeFeatureBuilder.buildPathGrams(2, path, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams(3, path, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams(4, path, sentenceGraph) # remove for fast #self.buildEdgeCombinations(path, edges, sentenceGraph, features) # remove for fast #if edges != None: # self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[0], edges[0][1]+edges[1][0], "t1", sentenceGraph) # remove for fast # self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[-1], edges[len(path)-1][len(path)-2]+edges[len(path)-2][len(path)-1], "t2", sentenceGraph) # remove for fast if not "disable_path_edge_features" in self.styles: self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, sentenceGraph) self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph) self.multiEdgeFeatureBuilder.setFeatureVector(None) if not "no_linear" in self.styles: 
self.tokenFeatureBuilder.setFeatureVector(features) for i in range(len(sentenceGraph.tokens)): if sentenceGraph.tokens[i] == token1: token1Index = i if sentenceGraph.tokens[i] == token2: token2Index = i linearPreTag = "linfw_" if token1Index > token2Index: token1Index, token2Index = token2Index, token1Index linearPreTag = "linrv_" self.tokenFeatureBuilder.buildLinearOrderFeatures(token1Index, sentenceGraph, 2, 2, preTag="linTok1") self.tokenFeatureBuilder.buildLinearOrderFeatures(token2Index, sentenceGraph, 2, 2, preTag="linTok2") # Before, middle, after # self.tokenFeatureBuilder.buildTokenGrams(0, token1Index-1, sentenceGraph, "bf") # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, "bw") # self.tokenFeatureBuilder.buildTokenGrams(token2Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, "af") # before-middle, middle, middle-after # self.tokenFeatureBuilder.buildTokenGrams(0, token2Index-1, sentenceGraph, linearPreTag+"bf", max=2) # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, linearPreTag+"bw", max=2) # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, linearPreTag+"af", max=2) self.tokenFeatureBuilder.setFeatureVector(None) # define extra attributes extra = {"xtype":"entRel","type":"i","t1":namedEntityToken.get("id"),"t2":token.get("id")} extra["e1"] = namedEntity.get("id") # list gold entities in extra, if present e2s = set() for entity in sentenceGraph.tokenIsEntityHead[token]: e2s.add(entity.get("id")) if len(e2s) != 0: extra["e2"] = ",".join(sorted(e2s)) else: extra["e2"] = "None" extra["categoryName"] = categoryName sentenceOrigId = sentenceGraph.sentenceElement.get("origId") if sentenceOrigId != None: extra["SOID"] = sentenceOrigId # make example category = self.classSet.getId(categoryName) return (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)
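# Sketch of the merged class naming used by getCategoryNameFromTokens() above:
# the types of all interactions between the two tokens are deduplicated, sorted
# and joined with "---"; "neg" is returned when there is no interaction. With
# two hypothetical REL relation types:
#
#     types found: {"Subunit-Complex", "Protein-Component"}
#     category   : "Protein-Component---Subunit-Complex"
#     no types   : "neg"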
class UnmergingExampleBuilder(ExampleBuilder): """ This example builder makes unmerging examples, i.e. examples describing potential events. """ def __init__( self, style="trigger_features:typed:directed:no_linear:entities:genia_limits:noMasking:maxFeatures", length=None, types=[], featureSet=None, classSet=None): # reset style regardless of input style = "trigger_features:typed:directed:no_linear:entities:genia_limits:noMasking:maxFeatures" if featureSet == None: featureSet = IdSet() if classSet == None: classSet = IdSet(1) else: classSet = classSet assert (classSet.getId("neg") == 1) ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet) self.styles = self.getParameters(style, [ "trigger_features", "typed", "directed", "no_linear", "entities", "genia_limits", "noAnnType", "noMasking", "maxFeatures", "no_merge", "disable_entity_features", "disable_single_element_features", "disable_ngram_features", "disable_path_edge_features" ]) self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet) self.multiEdgeFeatureBuilder.noAnnType = self.styles["noAnnType"] self.multiEdgeFeatureBuilder.maskNamedEntities = not self.styles[ "noMasking"] self.multiEdgeFeatureBuilder.maximum = self.styles["maxFeatures"] self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet) self.pathLengths = length assert (self.pathLengths == None) self.types = types self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet) self.triggerFeatureBuilder.useNonNameEntities = True #self.outFile = open("exampleTempFile.txt","wt") def getInteractionEdgeLengths(self, sentenceGraph, paths): """ Return dependency and linear length of all interaction edges (measured between the two tokens). """ interactionLengths = {} for interaction in sentenceGraph.interactions: # Calculated interaction edge dep and lin length e1 = sentenceGraph.entitiesById[interaction.get("e1")] e2 = sentenceGraph.entitiesById[interaction.get("e2")] t1 = sentenceGraph.entityHeadTokenByEntity[e1] t2 = sentenceGraph.entityHeadTokenByEntity[e2] # Get dep path length if t1 != t2: path = paths.getPaths(t1, t2) if t1 != t2 and len(path) > 0: pathLength = min(len(x) for x in path) #len(paths[t1][t2]) else: # no dependencyPath pathLength = 999999 # more than any real path # Linear distance t1Pos = -1 t2Pos = -1 for i in range(len(sentenceGraph.tokens)): if sentenceGraph.tokens[i] == t1: t1Pos = i if t2Pos != -1: break if sentenceGraph.tokens[i] == t2: t2Pos = i if t1Pos != -1: break linLength = abs(t1Pos - t2Pos) interactionLengths[interaction] = (interaction, pathLength, linLength, t2Pos) return interactionLengths def eventIsGold(self, entity, arguments, sentenceGraph, goldGraph, goldEntitiesByOffset): offset = entity.get("headOffset") if not goldEntitiesByOffset.has_key(offset): return False eType = entity.get("type") goldEntities = goldEntitiesByOffset[offset] # Check all gold entities for a match for goldEntity in goldEntities: isGold = True # The entity type must match if goldEntity.get("type") != eType: isGold = False continue goldEntityId = goldEntity.get("id") # Collect the gold interactions goldInteractions = [] for goldInteraction in goldGraph.interactions: if goldInteraction.get("e1") == goldEntityId: goldInteractions.append(goldInteraction) # Argument count rules if len(goldInteractions) != len( arguments): # total number of edges differs isGold = False continue # count number of edges per type argTypeCounts = {} for argument in arguments: argType = argument.get("type") if not argTypeCounts.has_key(argType): 
argTypeCounts[argType] = 0 argTypeCounts[argType] += 1 # count number of gold edges per type goldTypeCounts = {} for argument in goldInteractions: argType = argument.get("type") if not goldTypeCounts.has_key(argType): goldTypeCounts[argType] = 0 goldTypeCounts[argType] += 1 # argument edge counts per type must match if argTypeCounts != goldTypeCounts: isGold = False continue # Exact argument matching for argument in arguments: # check all edges e1 = argument.get("e1") e2 = argument.get("e2") e2Entity = sentenceGraph.entitiesById[e2] e2Offset = e2Entity.get("headOffset") e2Type = e2Entity.get("type") argType = argument.get("type") found = False for goldInteraction in goldInteractions: if goldInteraction.get("type") == argType: goldE2Entity = goldGraph.entitiesById[ goldInteraction.get("e2")] if goldE2Entity.get( "headOffset") == e2Offset and goldE2Entity.get( "type") == e2Type: found = True break if found == False: # this edge did not have a corresponding gold edge isGold = False break # Event is in gold if isGold: break return isGold def getArgumentCombinations(self, eType, interactions, entityId=None): combs = [] if eType == "Binding": # Making examples for only all-together/all-separate cases # doesn't work, since even gold data has several cases of # overlapping bindings with different numbers of arguments #if len(interactions) > 0: # return [interactions] #else: # return interactions # Skip causes themes = [] for interaction in interactions: if interaction.get("type") == "Theme": themes.append(interaction) for i in range(len(themes)): # Looking at a2-normalize.pl reveals that there can be max 6 themes # Based on training+devel data, four is maximum if i < 10: #4: for j in combinations(themes, i + 1): combs.append(j) # if len(combs) >= 100: # print >> sys.stderr, "Warning, truncating unmerging examples at 100 for Binding entity", entityId # break return combs elif eType == "Process": # For ID-task argCombinations = [] argCombinations.append([]) # process can have 0 interactions for interaction in interactions: if interaction.get("type") == "Participant": argCombinations.append([interaction]) return argCombinations else: # one of the regulation-types, or one of the simple types themes = [] causes = [] siteArgs = [] contextGenes = [] sideChains = [] locTargets = [] for interaction in interactions: iType = interaction.get("type") #assert iType in ["Theme", "Cause"], (iType, ETUtils.toStr(interaction)) if iType not in [ "Theme", "Cause", "SiteArg", "Contextgene", "Sidechain" ]: # "AtLoc", "ToLoc"]: continue if iType == "Theme": themes.append(interaction) elif iType == "Cause": causes.append(interaction) elif iType == "SiteArg": siteArgs.append(interaction) elif iType == "Contextgene": contextGenes.append(interaction) elif iType == "Sidechain": sideChains.append(interaction) elif iType in ["AtLoc", "ToLoc"]: locTargets.append(iType) else: assert False, (iType, interaction.get("id")) # Limit arguments to event types that can have them if eType.find("egulation") == -1 and eType != "Catalysis": causes = [] if eType != "Glycosylation": sideChains = [] if eType not in ["Acetylation", "Methylation"]: contextGenes = [] if eType == "Catalysis": siteArgs = [] # Themes can always appear alone themeAloneCombinations = [] for theme in themes: themeAloneCombinations.append([theme]) #print "Combine", combine.combine(themes, causes), "TA", themeAloneCombinations return combine.combine(themes, causes) \ + combine.combine(themes, siteArgs) \ + combine.combine(themes, sideChains) \ + combine.combine(themes, 
contextGenes) \ + combine.combine(themes, siteArgs, sideChains) \ + combine.combine(themes, siteArgs, contextGenes) \ + combine.combine(themes, locTargets) \ + themeAloneCombinations def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None): """ Build examples for a single sentence. Returns a list of examples. See Core/ExampleUtils for example format. """ self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True) self.triggerFeatureBuilder.initSentence(sentenceGraph) #examples = [] exampleIndex = 0 #undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph) #paths = NX10.all_pairs_shortest_path(undirected, cutoff=999) undirected = sentenceGraph.dependencyGraph.toUndirected() paths = undirected # Get argument order self.interactionLenghts = self.getInteractionEdgeLengths( sentenceGraph, paths) # Map tokens to character offsets tokenByOffset = {} for i in range(len(sentenceGraph.tokens)): token = sentenceGraph.tokens[i] if goldGraph != None: # check that the tokenizations match goldToken = goldGraph.tokens[i] assert token.get("id") == goldToken.get("id") and token.get( "charOffset") == goldToken.get("charOffset") tokenByOffset[token.get("charOffset")] = token.get("id") # Map gold entities to their head offsets goldEntitiesByOffset = {} if goldGraph != None: for entity in goldGraph.entities: offset = entity.get("headOffset") assert offset != None if not goldEntitiesByOffset.has_key(offset): goldEntitiesByOffset[offset] = [] goldEntitiesByOffset[offset].append(entity) # Generate examples based on interactions between entities or interactions between tokens # interactionsByEntityId = {} # for entity in sentenceGraph.entities: # interactionsByEntityId[entity.get("id")] = [] # for interaction in sentenceGraph.interactions: # if interaction.get("type") == "neg": # continue # e1Id = interaction.get("e1") # interactionsByEntityId[e1Id].append(interaction) if self.styles["no_merge"]: mergeInput = False entities = sentenceGraph.entities else: mergeInput = True sentenceGraph.mergeInteractionGraph(True) entities = sentenceGraph.mergedEntities exampleIndex = 0 for entity in entities: # sentenceGraph.entities: eType = entity.get("type") assert eType != None, entity.attrib eType = str(eType) #if eType not in ["Binding", "Positive_regulation", "Negative_regulation", "Regulation"]: # continue #if not goldEntitiesByOffset.has_key(entity.get("headOffset")): # continue #interactions = interactionsByEntityId[entity.get("id")] interactions = [ x[2] for x in sentenceGraph.getOutInteractions(entity, mergeInput) ] argCombinations = self.getArgumentCombinations( eType, interactions, entity.get("id")) #if len(argCombinations) <= 1: # continue assert argCombinations != None, (entity.get("id"), entity.get("type")) for argCombination in argCombinations: if eType != "Process": assert len(argCombination ) > 0, eType + ": " + str(argCombinations) # Originally binary classification if goldGraph != None: isGoldEvent = self.eventIsGold(entity, argCombination, sentenceGraph, goldGraph, goldEntitiesByOffset) #if eType == "Binding": # print argCombination[0].get("e1"), len(argCombination), isGoldEvent else: isGoldEvent = False # Named (multi-)class if isGoldEvent: #category = "event" category = eType if category.find("egulation") != -1: category = "All_regulation" elif category != "Binding": category = "Other" #"simple6" else: category = "neg" features = {} argString = "" for arg in argCombination: argString += "," + arg.get("id") extra = { "xtype": "um", "e": entity.get("id"), "i": 
argString[1:], "etype": eType, "class": category } assert type(extra["etype"]) == types.StringType, extra self.exampleStats.addExample(category) example = self.buildExample(sentenceGraph, paths, entity, argCombination, interactions) example[0] = sentenceGraph.getSentenceId() + ".x" + str( exampleIndex) example[1] = self.classSet.getId(category) example[3] = extra #examples.append( example ) ExampleUtils.appendExamples([example], outfile) exampleIndex += 1 #return examples return exampleIndex def buildExample(self, sentenceGraph, paths, eventEntity, argCombination, allInteractions): #themeEntities, causeEntities=None): # NOTE!!!! TODO # add also features for arguments present, but not in this combination features = {} self.features = features self.buildInterArgumentBagOfWords(argCombination, sentenceGraph) eventEntityType = eventEntity.get("type") if eventEntityType == "Binding": interactionIndex = {} groupInteractionLengths = [] for interaction in allInteractions: groupInteractionLengths.append( self.interactionLenghts[interaction]) groupInteractionLengths.sort(compareInteractionPrecedence) #print groupInteractionLengths for i in range(len(groupInteractionLengths)): interactionIndex[groupInteractionLengths[i][0]] = i eventToken = sentenceGraph.entityHeadTokenByEntity[eventEntity] self.triggerFeatureBuilder.setFeatureVector(self.features) self.triggerFeatureBuilder.tag = "trg_" self.triggerFeatureBuilder.buildFeatures(eventToken) self.triggerFeatureBuilder.tag = None #self.setFeature("rootType_"+eventEntity.get("type"), 1) #argThemeCount = 0 #argCauseCount = 0 argCounts = {} # Current example's edge combination for arg in argCombination: argType = arg.get("type") if argType not in argCounts: argCounts[argType] = 0 argCounts[argType] += 1 tag = "arg" + argType if eventEntityType == "Binding" and argType == "Theme": tag += str(interactionIndex[arg]) self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, arg, tag) ## if arg.get("type") == "Theme": ## #argThemeCount += 1 ## tag = "argTheme" ## self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, arg, tag) # #elif arg.get("type") == "Cause": # Cause # # #argCauseCount += 1 # # self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, arg, "argCause") # else: # self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, arg, "arg"+argType) # Edge group context #contextThemeCount = 0 #contextCauseCount = 0 contextCounts = {} for interaction in allInteractions: if interaction in argCombination: # Already part of current example's combination continue contextArgType = interaction.get("type") if contextArgType not in contextCounts: contextCounts[contextArgType] = 0 contextCounts[contextArgType] += 1 tag = "conArg" + contextArgType if eventEntityType == "Binding" and contextArgType == "Theme": tag += str(interactionIndex[interaction]) self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, interaction, tag) # if interaction.get("type") == "Theme": # contextThemeCount += 1 # tag = "conTheme" # self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, interaction, tag) # if eventEntityType == "Binding": # tag += str(interactionIndex[interaction]) # self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, interaction, tag) # else: # Cause # contextCauseCount += 1 # self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, interaction, "conCause") self.setFeature("argCount", len(argCombination)) self.setFeature("argCount_" + 
str(len(argCombination)), 1) self.setFeature("interactionCount", len(allInteractions)) self.setFeature("interactionCount_" + str(len(allInteractions)), 1) #self.setFeature("argThemeCount", argThemeCount) #self.setFeature("argThemeCount_" + str(argThemeCount), 1) #self.setFeature("argCauseCount", argCauseCount) #self.setFeature("argCauseCount_" + str(argCauseCount), 1) for key in sorted(argCounts.keys()): self.setFeature("arg" + key + "Count", argCounts[key]) self.setFeature("arg" + key + "Count_" + str(argCounts[key]), 1) #self.setFeature("interactionThemeCount", contextThemeCount) #self.setFeature("interactionThemeCount_" + str(contextThemeCount), 1) #self.setFeature("interactionCauseCount", contextCauseCount) #self.setFeature("interactionCauseCount_" + str(contextCauseCount), 1) for key in sorted(contextCounts.keys()): self.setFeature("contextArg" + key + "Count", contextCounts[key]) self.setFeature( "contextArg" + key + "Count_" + str(contextCounts[key]), 1) self.triggerFeatureBuilder.tag = "" self.triggerFeatureBuilder.setFeatureVector(None) # Common features # if e1Type.find("egulation") != -1: # leave r out to avoid problems with capitalization # if entity2.get("isName") == "True": # features[self.featureSet.getId("GENIA_regulation_of_protein")] = 1 # else: # features[self.featureSet.getId("GENIA_regulation_of_event")] = 1 # define extra attributes return [None, None, features, None] def buildArgumentFeatures(self, sentenceGraph, paths, features, eventToken, arg, tag): argEntity = sentenceGraph.entitiesById[arg.get("e2")] argToken = sentenceGraph.entityHeadTokenByEntity[argEntity] self.buildEdgeFeatures(sentenceGraph, paths, features, eventToken, argToken, tag) self.triggerFeatureBuilder.tag = tag + "trg_" self.triggerFeatureBuilder.buildFeatures(argToken) if argEntity.get("isName") == "True": self.setFeature(tag + "Protein", 1) else: self.setFeature(tag + "Event", 1) self.setFeature("nestingEvent", 1) self.setFeature(tag + "_" + argEntity.get("type"), 1) def buildEdgeFeatures(self, sentenceGraph, paths, features, eventToken, argToken, tag): #eventToken = sentenceGraph.entityHeadTokenByEntity[eventNode] #argToken = sentenceGraph.entityHeadTokenByEntity[argNode] self.multiEdgeFeatureBuilder.tag = tag + "_" self.multiEdgeFeatureBuilder.setFeatureVector(features, None, None, False) self.setFeature(tag + "_present", 1) path = paths.getPaths(eventToken, argToken) if eventToken != argToken and len(path) > 0: path = path[0] else: path = [eventToken, argToken] #edges = None if not self.styles["disable_entity_features"]: self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph) self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path) #if not "disable_terminus_features" in self.styles: # self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast if not self.styles["disable_single_element_features"]: self.multiEdgeFeatureBuilder.buildSingleElementFeatures( path, sentenceGraph) if not self.styles["disable_ngram_features"]: self.multiEdgeFeatureBuilder.buildPathGrams( 2, path, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams( 3, path, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams( 4, path, sentenceGraph) # remove for fast if not self.styles["disable_path_edge_features"]: self.multiEdgeFeatureBuilder.buildPathEdgeFeatures( path, sentenceGraph) #self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph) self.multiEdgeFeatureBuilder.setFeatureVector(None, None, None, False) 
self.multiEdgeFeatureBuilder.tag = "" def buildInterArgumentBagOfWords(self, arguments, sentenceGraph): if len(arguments) < 2: return indexByToken = {} for i in range(len(sentenceGraph.tokens)): indexByToken[sentenceGraph.tokens[i]] = i argTokenIndices = set() for arg in arguments: argEntity = sentenceGraph.entitiesById[arg.get("e2")] argToken = sentenceGraph.entityHeadTokenByEntity[argEntity] argTokenIndices.add(indexByToken[argToken]) minIndex = min(argTokenIndices) maxIndex = max(argTokenIndices) self.setFeature("argBoWRange", (maxIndex - minIndex)) self.setFeature("argBoWRange_" + str(maxIndex - minIndex), 1) bow = set() for i in range(minIndex + 1, maxIndex): token = sentenceGraph.tokens[i] if len(sentenceGraph.tokenIsEntityHead[token] ) == 0 and not sentenceGraph.tokenIsName[token]: bow.add(token.get("text")) bow = sorted(list(bow)) for word in bow: self.setFeature("argBoW_" + word, 1) if word in ["/", "-"]: self.setFeature("argBoW_slashOrHyphen", 1) if len(bow) == 1: self.setFeature("argBoWonly_" + bow[0], 1) if bow[0] in ["/", "-"]: self.setFeature("argBoWonly_slashOrHyphen", 1)
class UnmergingExampleBuilder(ExampleBuilder): """ This example builder makes unmerging examples, i.e. examples describing potential events. """ #def __init__(self, style="trigger_features:typed:directed:no_linear:entities:genia_limits:noMasking:maxFeatures", length=None, types=[], featureSet=None, classSet=None): def __init__(self, style=None, length=None, types=[], featureSet=None, classSet=None): # reset style regardless of input #style="trigger_features:typed:directed:no_linear:entities:genia_limits:noMasking:maxFeatures" if featureSet == None: featureSet = IdSet() if classSet == None: classSet = IdSet(1) else: classSet = classSet assert( classSet.getId("neg") == 1 ) ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet) defaultNone = ["binary", "trigger_features","typed","directed","no_linear","entities","genia_limits", "noAnnType", "noMasking", "maxFeatures", "no_merge", "disable_entity_features", "disable_single_element_features", "disable_ngram_features", "disable_path_edge_features"] defaultParameters = {} for name in defaultNone: defaultParameters[name] = None defaultParameters["keep_intersentence"] = False defaultParameters["keep_intersentence_gold"] = True defaultParameters["no_arg_count_upper_limit"] = False self.styles = self._setDefaultParameters(defaultParameters) self.styles = self.getParameters(style) self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet) self.multiEdgeFeatureBuilder.noAnnType = self.styles["noAnnType"] self.multiEdgeFeatureBuilder.maskNamedEntities = not self.styles["noMasking"] self.multiEdgeFeatureBuilder.maximum = self.styles["maxFeatures"] #self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet) self.pathLengths = length assert(self.pathLengths == None) self.types = types self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet) self.triggerFeatureBuilder.useNonNameEntities = True #self.outFile = open("exampleTempFile.txt","wt") def getInteractionEdgeLengths(self, sentenceGraph, paths): """ Return dependency and linear length of all interaction edges (measured between the two tokens). 
""" interactionLengths = {} count = 0 for interaction in sentenceGraph.interactions: # Calculated interaction edge dep and lin length e1Id = interaction.get("e1") e2Id = interaction.get("e2") if e2Id not in sentenceGraph.entitiesById: # intersentence interaction interactionLengths[interaction] = (interaction, -count, -count, -count) continue e1 = sentenceGraph.entitiesById[e1Id] e2 = sentenceGraph.entitiesById[e2Id] t1 = sentenceGraph.entityHeadTokenByEntity[e1] t2 = sentenceGraph.entityHeadTokenByEntity[e2] # Get dep path length if t1 != t2: path = paths.getPaths(t1, t2) if t1 != t2 and len(path) > 0: pathLength = min(len(x) for x in path) #len(paths[t1][t2]) else: # no dependencyPath pathLength = 999999 # more than any real path # Linear distance t1Pos = -1 t2Pos = -1 for i in range(len(sentenceGraph.tokens)): if sentenceGraph.tokens[i] == t1: t1Pos = i if t2Pos != -1: break if sentenceGraph.tokens[i] == t2: t2Pos = i if t1Pos != -1: break linLength = abs(t1Pos - t2Pos) interactionLengths[interaction] = (interaction, pathLength, linLength, t2Pos) count += 1 return interactionLengths def eventIsGold(self, entity, arguments, sentenceGraph, goldGraph, goldEntitiesByOffset, allGoldInteractions): offset = entity.get("headOffset") if not goldEntitiesByOffset.has_key(offset): return False eType = entity.get("type") goldEntities = goldEntitiesByOffset[offset] # Check all gold entities for a match for goldEntity in goldEntities: isGold = True # The entity type must match if goldEntity.get("type") != eType: isGold = False continue goldEntityId = goldEntity.get("id") # Collect the gold interactions goldInteractions = [] for goldInteraction in allGoldInteractions: #goldGraph.interactions: if goldInteraction.get("e1") == goldEntityId and goldInteraction.get("event") == "True": goldInteractions.append(goldInteraction) # Argument count rules if len(goldInteractions) != len(arguments): # total number of edges differs isGold = False continue # count number of edges per type argTypeCounts = {} for argument in arguments: argType = argument.get("type") if not argTypeCounts.has_key(argType): argTypeCounts[argType] = 0 argTypeCounts[argType] += 1 # count number of gold edges per type goldTypeCounts = {} for argument in goldInteractions: argType = argument.get("type") if not goldTypeCounts.has_key(argType): goldTypeCounts[argType] = 0 goldTypeCounts[argType] += 1 # argument edge counts per type must match if argTypeCounts != goldTypeCounts: isGold = False continue # Exact argument matching for argument in arguments: # check all edges e1 = argument.get("e1") e2 = argument.get("e2") if e2 not in sentenceGraph.entitiesById: # intersentence argument, assumed to be correct found = True continue e2Entity = sentenceGraph.entitiesById[e2] e2Offset = e2Entity.get("headOffset") e2Type = e2Entity.get("type") argType = argument.get("type") found = False for goldInteraction in goldInteractions: if goldInteraction.get("type") == argType: if goldInteraction.get("e2") in goldGraph.entitiesById: # if not, assume this goldInteraction is an intersentence interaction goldE2Entity = goldGraph.entitiesById[goldInteraction.get("e2")] if goldE2Entity.get("headOffset") == e2Offset and goldE2Entity.get("type") == e2Type: found = True break if found == False: # this edge did not have a corresponding gold edge isGold = False break # Event is in gold if isGold: break return isGold def sortInteractionsById(self, interactions): # The order of the interactions affects the order of the unmerging examples, and this # affects performance. 
It's not clear whether this is what really happens, or whether # the order of the interactions has some effect on the consistency of the unmerging # features (it shouldn't). However, in case it does, this function is left here for now, # although it shouldn't be needed at all. In any case the impact is minimal, for GE # 53.22 vs 53.28 on the development set. pairs = [] for interaction in interactions: pairs.append( (int(interaction.get("id").split(".i")[-1]), interaction) ) pairs.sort() return [x[1] for x in pairs] def processDocument(self, sentences, goldSentences, outfile, structureAnalyzer=None): self.documentEntitiesById = {} for sentence in sentences: for entity in sentence.entities: assert entity.get("id") not in self.documentEntitiesById self.documentEntitiesById[entity.get("id")] = entity for i in range(len(sentences)): sentence = sentences[i] goldSentence = None if goldSentences != None: goldSentence = goldSentences[i] self.progress.update(1, "Building examples ("+sentence.sentence.get("id")+"): ") self.processSentence(sentence, outfile, goldSentence, structureAnalyzer=structureAnalyzer) def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None): """ Build examples for a single sentence. Returns a list of examples. See Core/ExampleUtils for example format. """ self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True) self.triggerFeatureBuilder.initSentence(sentenceGraph) exampleIndex = 0 undirected = sentenceGraph.dependencyGraph.toUndirected() paths = undirected # Get argument order self.interactionLenghts = self.getInteractionEdgeLengths(sentenceGraph, paths) # Map tokens to character offsets tokenByOffset = {} for i in range(len(sentenceGraph.tokens)): token = sentenceGraph.tokens[i] if goldGraph != None: # check that the tokenizations match goldToken = goldGraph.tokens[i] assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset") tokenByOffset[token.get("charOffset")] = token.get("id") # Map gold entities to their head offsets goldEntitiesByOffset = {} if goldGraph != None: for entity in goldGraph.entities: offset = entity.get("headOffset") assert offset != None if not goldEntitiesByOffset.has_key(offset): goldEntitiesByOffset[offset] = [] goldEntitiesByOffset[offset].append(entity) if self.styles["no_merge"]: mergeInput = False entities = sentenceGraph.entities else: mergeInput = True sentenceGraph.mergeInteractionGraph(True) entities = sentenceGraph.mergedEntities self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities)) exampleIndex = 0 for entity in entities: # sentenceGraph.entities: if type(entity) in types.StringTypes: # dummy entity for intersentence interactions continue eType = entity.get("type") assert eType != None, entity.attrib eType = str(eType) interactions = [x[2] for x in sentenceGraph.getOutInteractions(entity, mergeInput)] interactions = self.sortInteractionsById(interactions) interactionCounts = defaultdict(int) validInteractionsByType = defaultdict(list) for interaction in interactions: if interaction.get("event") != "True": continue e1 = sentenceGraph.entitiesById[interaction.get("e1")] if interaction.get("e2") in sentenceGraph.entitiesById: e2 = sentenceGraph.entitiesById[interaction.get("e2")] if interaction.get("type") in structureAnalyzer.getValidEdgeTypes(e1.get("type"), e2.get("type")): validInteractionsByType[interaction.get("type")].append(interaction) else: # intersentence 
validInteractionsByType[interaction.get("type")].append(interaction) interactionCounts[interaction.get("type")] += 1 interactionCountString = ",".join([key + "=" + str(interactionCounts[key]) for key in sorted(interactionCounts.keys())]) #argCombinations = self.getArgumentCombinations(eType, interactions, entity.get("id")) intCombinations = [] validIntTypeCount = 0 maxArgCount = 0 if self.debug: print >> sys.stderr, entity.get("id"), entity.get("type"), "int:" + interactionCountString, "validInt:" + str(validInteractionsByType) for intType in sorted(validInteractionsByType.keys()): # for each argument type the event can have validIntTypeCount += 1 intCombinations.append([]) minArgs, maxArgs = structureAnalyzer.getArgLimits(entity.get("type"), intType) if maxArgs > maxArgCount: maxArgCount = maxArgs #if maxArgs > 1: # allow any number of arguments for cases like Binding # maxArgs = len(validInteractionsByType[intType]) for combLen in range(minArgs, maxArgs+1): # for each valid argument count, get all possible combinations. note that there may be zero-lenght combination for singleTypeArgCombination in combinations(validInteractionsByType[intType], combLen): intCombinations[-1].append(singleTypeArgCombination) # e.g. theme:[a,b], cause:[d] = [[ # intCombinations now contains a list of lists, each of which has a tuple for each valid combination # of one argument type. Next, we'll make all valid combinations of multiple argument types if self.debug: print >> sys.stderr, " ", "intCombinations", intCombinations argCombinations = combine.combine(*intCombinations) if self.debug: print >> sys.stderr, " ", "argCombinations", argCombinations for i in range(len(argCombinations)): argCombinations[i] = sum(argCombinations[i], ()) #sum(argCombinations, []) # flatten nested list if self.debug: print >> sys.stderr, " ", "argCombinations flat", argCombinations for argCombination in argCombinations: # Originally binary classification if goldGraph != None: isGoldEvent = self.eventIsGold(entity, argCombination, sentenceGraph, goldGraph, goldEntitiesByOffset, goldGraph.interactions) #if eType == "Binding": # print argCombination[0].get("e1"), len(argCombination), isGoldEvent else: isGoldEvent = False # Named (multi-)class if isGoldEvent: # category = "zeroArg" # if validIntTypeCount == 1: # category = "singleArg" # event has 0-1 arguments (old simple6) # if validIntTypeCount > 1: # category = "multiType" # event has arguments of several types, 0-1 of each (old Regulation) # if maxArgCount > 1: # category = "multiArg" # event can have 2-n of at least one argument type (old Binding) if self.styles["binary"]: category = "pos" else: category = entity.get("type") assert category != None else: category = "neg" self.exampleStats.beginExample(category) issues = defaultdict(int) # early out for proteins etc. 
if validIntTypeCount == 0 and entity.get("given") == "True": self.exampleStats.filter("given-leaf:" + entity.get("type")) if self.debug: print >> sys.stderr, " ", category +"("+eType+")", "arg combination", argCombination, "LEAF" elif structureAnalyzer.isValidEntity(entity) or structureAnalyzer.isValidEvent(entity, argCombination, self.documentEntitiesById, noUpperLimitBeyondOne=self.styles["no_arg_count_upper_limit"], issues=issues): if self.debug: print >> sys.stderr, " ", category, "arg combination", argCombination, "VALID" argString = "" for arg in argCombination: argString += "," + arg.get("type") + "=" + arg.get("id") extra = {"xtype":"um","e":entity.get("id"),"i":argString[1:],"etype":eType,"class":category} extra["allInt"] = interactionCountString assert type(extra["etype"]) in types.StringTypes, extra assert type(extra["class"]) in types.StringTypes, category assert type(extra["i"]) in types.StringTypes, argString example = self.buildExample(sentenceGraph, paths, entity, argCombination, interactions) example[0] = sentenceGraph.getSentenceId()+".x"+str(exampleIndex) example[1] = self.classSet.getId(category) example[3] = extra #examples.append( example ) ExampleUtils.appendExamples([example], outfile) exampleIndex += 1 else: # not a valid event or valid entity if len(issues) == 0: # must be > 0 so that it gets filtered if not structureAnalyzer.isValidEntity(entity): issues["INVALID_ENTITY:"+eType] += 1 else: issues["UNKNOWN_ISSUE_FOR:"+eType] += 1 for key in issues: self.exampleStats.filter(key) if self.debug: print >> sys.stderr, " ", category, "arg combination", argCombination, "INVALID", issues self.exampleStats.endExample() #return examples return exampleIndex def buildExample(self, sentenceGraph, paths, eventEntity, argCombination, allInteractions): #themeEntities, causeEntities=None): # NOTE!!!! 
TODO # add also features for arguments present, but not in this combination features = {} self.features = features self.buildInterArgumentBagOfWords(argCombination, sentenceGraph) eventEntityType = eventEntity.get("type") if eventEntityType == "Binding": interactionIndex = {} groupInteractionLengths = [] for interaction in allInteractions: groupInteractionLengths.append(self.interactionLenghts[interaction]) groupInteractionLengths.sort(compareInteractionPrecedence) #print groupInteractionLengths for i in range(len(groupInteractionLengths)): interactionIndex[groupInteractionLengths[i][0]] = i eventToken = sentenceGraph.entityHeadTokenByEntity[eventEntity] self.triggerFeatureBuilder.setFeatureVector(self.features) self.triggerFeatureBuilder.tag = "trg_" self.triggerFeatureBuilder.buildFeatures(eventToken) self.triggerFeatureBuilder.tag = None #self.setFeature("rootType_"+eventEntity.get("type"), 1) argThemeCount = 0 argCauseCount = 0 argCounts = {} # Current example's edge combination for arg in argCombination: if arg.get("type") == "Theme": argThemeCount += 1 tag = "argTheme" self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, arg, tag) if eventEntityType == "Binding": tag += str(interactionIndex[arg]) self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, arg, tag) elif arg.get("type") == "Cause": # Cause argCauseCount += 1 self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, arg, "argCause") else: argType = arg.get("type") if argType not in argCounts: argCounts[argType] = 0 self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, arg, "arg"+argType) argCounts[argType] += 1 # Edge group context contextThemeCount = 0 contextCauseCount = 0 for interaction in allInteractions: if interaction in argCombination: # Already part of current example's combination continue if interaction.get("type") == "Theme": contextThemeCount += 1 tag = "conTheme" self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, interaction, tag) if eventEntityType == "Binding": tag += str(interactionIndex[interaction]) self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, interaction, tag) else: # Cause contextCauseCount += 1 self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, interaction, "conCause") self.setFeature("argCount", len(argCombination)) self.setFeature("argCount_" + str(len(argCombination)), 1) self.setFeature("interactionCount", len(allInteractions)) self.setFeature("interactionCount_" + str(len(allInteractions)), 1) self.setFeature("argThemeCount", argThemeCount) self.setFeature("argThemeCount_" + str(argThemeCount), 1) self.setFeature("argCauseCount", argCauseCount) self.setFeature("argCauseCount_" + str(argCauseCount), 1) for key in sorted(argCounts.keys()): self.setFeature("arg" + key + "Count", argCounts[key]) self.setFeature("arg" + key + "Count_" + str(argCounts[key]), 1) self.setFeature("interactionThemeCount", contextThemeCount) self.setFeature("interactionThemeCount_" + str(contextThemeCount), 1) self.setFeature("interactionCauseCount", contextCauseCount) self.setFeature("interactionCauseCount_" + str(contextCauseCount), 1) self.triggerFeatureBuilder.tag = "" self.triggerFeatureBuilder.setFeatureVector(None) # Common features # if e1Type.find("egulation") != -1: # leave r out to avoid problems with capitalization # if entity2.get("given") == "True": # features[self.featureSet.getId("GENIA_regulation_of_protein")] = 1 # else: # 
features[self.featureSet.getId("GENIA_regulation_of_event")] = 1 # define extra attributes return [None,None,features,None] def buildArgumentFeatures(self, sentenceGraph, paths, features, eventToken, arg, tag): if arg.get("e2") not in sentenceGraph.entitiesById: # intersentence argument return argEntity = sentenceGraph.entitiesById[arg.get("e2")] argToken = sentenceGraph.entityHeadTokenByEntity[argEntity] self.buildEdgeFeatures(sentenceGraph, paths, features, eventToken, argToken, tag) self.triggerFeatureBuilder.tag = tag + "trg_" self.triggerFeatureBuilder.buildFeatures(argToken) if argEntity.get("given") == "True": self.setFeature(tag+"Protein", 1) else: self.setFeature(tag+"Event", 1) self.setFeature("nestingEvent", 1) self.setFeature(tag+"_"+argEntity.get("type"), 1) def buildEdgeFeatures(self, sentenceGraph, paths, features, eventToken, argToken, tag): #eventToken = sentenceGraph.entityHeadTokenByEntity[eventNode] #argToken = sentenceGraph.entityHeadTokenByEntity[argNode] self.multiEdgeFeatureBuilder.tag = tag + "_" self.multiEdgeFeatureBuilder.setFeatureVector(features, None, None, False) self.setFeature(tag+"_present", 1) path = paths.getPaths(eventToken, argToken) if eventToken != argToken and len(path) > 0: path = path[0] else: path = [eventToken, argToken] #edges = None if not self.styles["disable_entity_features"]: self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph) self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path) #if not "disable_terminus_features" in self.styles: # self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast if not self.styles["disable_single_element_features"]: self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, sentenceGraph) if not self.styles["disable_ngram_features"]: self.multiEdgeFeatureBuilder.buildPathGrams(2, path, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams(3, path, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams(4, path, sentenceGraph) # remove for fast if not self.styles["disable_path_edge_features"]: self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, sentenceGraph) #self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph) self.multiEdgeFeatureBuilder.setFeatureVector(None, None, None, False) self.multiEdgeFeatureBuilder.tag = "" def buildInterArgumentBagOfWords(self, arguments, sentenceGraph): if len(arguments) < 2: return indexByToken = {} for i in range(len(sentenceGraph.tokens)): indexByToken[sentenceGraph.tokens[i]] = i argTokenIndices = set() for arg in arguments: if arg.get("e2") in sentenceGraph.entitiesById: # skip intersentence interactions argEntity = sentenceGraph.entitiesById[arg.get("e2")] argToken = sentenceGraph.entityHeadTokenByEntity[argEntity] argTokenIndices.add(indexByToken[argToken]) if len(argTokenIndices) < 1: return minIndex = min(argTokenIndices) maxIndex = max(argTokenIndices) self.setFeature("argBoWRange", (maxIndex-minIndex)) self.setFeature("argBoWRange_" + str(maxIndex-minIndex), 1) bow = set() for i in range(minIndex+1, maxIndex): token = sentenceGraph.tokens[i] if len(sentenceGraph.tokenIsEntityHead[token]) == 0 and not sentenceGraph.tokenIsName[token]: bow.add(token.get("text")) bow = sorted(list(bow)) for word in bow: self.setFeature("argBoW_"+word, 1) if word in ["/", "-"]: self.setFeature("argBoW_slashOrHyphen", 1) if len(bow) == 1: self.setFeature("argBoWonly_"+bow[0], 1) if bow[0] in ["/", "-"]: self.setFeature("argBoWonly_slashOrHyphen", 1)
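# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original builder): a standalone rerun of
# the argument-combination enumeration performed in buildExamplesFromGraph
# above. The real code takes minArgs/maxArgs from structureAnalyzer.getArgLimits
# and merges the per-type combinations with combine.combine; here
# itertools.product is assumed to play that cross-product role, and all names
# are hypothetical.
import itertools

def _sketchArgCombinations(validInteractionsByType, argLimitsByType):
    # validInteractionsByType: e.g. {"Theme": ["t1", "t2"], "Cause": ["c1"]}
    # argLimitsByType: e.g. {"Theme": (1, 2), "Cause": (0, 1)}
    perTypeCombinations = []
    for intType in sorted(validInteractionsByType.keys()):
        minArgs, maxArgs = argLimitsByType[intType]
        typeCombinations = []
        for combLen in range(minArgs, maxArgs + 1): # may include the empty combination
            typeCombinations.extend(itertools.combinations(validInteractionsByType[intType], combLen))
        perTypeCombinations.append(typeCombinations)
    # Cross product over argument types, each result flattened into one tuple
    flattened = []
    for combo in itertools.product(*perTypeCombinations):
        flattened.append(sum(combo, ()))
    return flattened
# Usage sketch: for the example inputs above the candidate argument sets are
# ('t1',), ('t2',), ('t1', 't2'), ('c1', 't1'), ('c1', 't2'), ('c1', 't1', 't2').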
class UnmergingExampleBuilder(ExampleBuilder): """ This example builder makes unmerging examples, i.e. examples describing potential events. """ #def __init__(self, style="trigger_features:typed:directed:no_linear:entities:genia_limits:noMasking:maxFeatures", length=None, types=[], featureSet=None, classSet=None): def __init__(self, style=None, length=None, types=[], featureSet=None, classSet=None): # reset style regardless of input #style="trigger_features:typed:directed:no_linear:entities:genia_limits:noMasking:maxFeatures" if featureSet == None: featureSet = IdSet() if classSet == None: classSet = IdSet(1) else: classSet = classSet assert( classSet.getId("neg") == 1 ) ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet) defaultNone = ["binary", "trigger_features","typed","directed","no_linear","entities","genia_limits", "noAnnType", "noMasking", "maxFeatures", "no_merge", "disable_entity_features", "disable_single_element_features", "disable_ngram_features", "disable_path_edge_features"] defaultParameters = {} for name in defaultNone: defaultParameters[name] = None defaultParameters["keep_intersentence"] = False defaultParameters["keep_intersentence_gold"] = True self.styles = self._setDefaultParameters(defaultParameters) self.styles = self.getParameters(style) self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet) self.multiEdgeFeatureBuilder.noAnnType = self.styles["noAnnType"] self.multiEdgeFeatureBuilder.maskNamedEntities = not self.styles["noMasking"] self.multiEdgeFeatureBuilder.maximum = self.styles["maxFeatures"] #self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet) self.pathLengths = length assert(self.pathLengths == None) self.types = types self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet) self.triggerFeatureBuilder.useNonNameEntities = True #self.outFile = open("exampleTempFile.txt","wt") def getInteractionEdgeLengths(self, sentenceGraph, paths): """ Return dependency and linear length of all interaction edges (measured between the two tokens). 
""" interactionLengths = {} count = 0 for interaction in sentenceGraph.interactions: # Calculated interaction edge dep and lin length e1Id = interaction.get("e1") e2Id = interaction.get("e2") if e2Id not in sentenceGraph.entitiesById: # intersentence interaction interactionLengths[interaction] = (interaction, -count, -count, -count) continue e1 = sentenceGraph.entitiesById[e1Id] e2 = sentenceGraph.entitiesById[e2Id] t1 = sentenceGraph.entityHeadTokenByEntity[e1] t2 = sentenceGraph.entityHeadTokenByEntity[e2] # Get dep path length if t1 != t2: path = paths.getPaths(t1, t2) if t1 != t2 and len(path) > 0: pathLength = min(len(x) for x in path) #len(paths[t1][t2]) else: # no dependencyPath pathLength = 999999 # more than any real path # Linear distance t1Pos = -1 t2Pos = -1 for i in range(len(sentenceGraph.tokens)): if sentenceGraph.tokens[i] == t1: t1Pos = i if t2Pos != -1: break if sentenceGraph.tokens[i] == t2: t2Pos = i if t1Pos != -1: break linLength = abs(t1Pos - t2Pos) interactionLengths[interaction] = (interaction, pathLength, linLength, t2Pos) count += 1 return interactionLengths def eventIsGold(self, entity, arguments, sentenceGraph, goldGraph, goldEntitiesByOffset, allGoldInteractions): offset = entity.get("headOffset") if not goldEntitiesByOffset.has_key(offset): return False eType = entity.get("type") goldEntities = goldEntitiesByOffset[offset] # Check all gold entities for a match for goldEntity in goldEntities: isGold = True # The entity type must match if goldEntity.get("type") != eType: isGold = False continue goldEntityId = goldEntity.get("id") # Collect the gold interactions goldInteractions = [] for goldInteraction in allGoldInteractions: #goldGraph.interactions: if goldInteraction.get("e1") == goldEntityId and goldInteraction.get("event") == "True": goldInteractions.append(goldInteraction) # Argument count rules if len(goldInteractions) != len(arguments): # total number of edges differs isGold = False continue # count number of edges per type argTypeCounts = {} for argument in arguments: argType = argument.get("type") if not argTypeCounts.has_key(argType): argTypeCounts[argType] = 0 argTypeCounts[argType] += 1 # count number of gold edges per type goldTypeCounts = {} for argument in goldInteractions: argType = argument.get("type") if not goldTypeCounts.has_key(argType): goldTypeCounts[argType] = 0 goldTypeCounts[argType] += 1 # argument edge counts per type must match if argTypeCounts != goldTypeCounts: isGold = False continue # Exact argument matching for argument in arguments: # check all edges e1 = argument.get("e1") e2 = argument.get("e2") if e2 not in sentenceGraph.entitiesById: # intersentence argument, assumed to be correct found = True continue e2Entity = sentenceGraph.entitiesById[e2] e2Offset = e2Entity.get("headOffset") e2Type = e2Entity.get("type") argType = argument.get("type") found = False for goldInteraction in goldInteractions: if goldInteraction.get("type") == argType: if goldInteraction.get("e2") in goldGraph.entitiesById: # if not, assume this goldInteraction is an intersentence interaction goldE2Entity = goldGraph.entitiesById[goldInteraction.get("e2")] if goldE2Entity.get("headOffset") == e2Offset and goldE2Entity.get("type") == e2Type: found = True break if found == False: # this edge did not have a corresponding gold edge isGold = False break # Event is in gold if isGold: break return isGold def sortInteractionsById(self, interactions): # The order of the interactions affects the order of the unmerging examples, and this # affects performance. 
It's not clear whether this is what really happens, or whether # the order of the interactions has some effect on the consistency of the unmerging # features (it shouldn't). However, in case it does, this function is left here for now, # although it shouldn't be needed at all. In any case the impact is minimal, for GE # 53.22 vs 53.28 on the development set. pairs = [] for interaction in interactions: pairs.append( (int(interaction.get("id").split(".i")[-1]), interaction) ) pairs.sort() return [x[1] for x in pairs] def processDocument(self, sentences, goldSentences, outfile, structureAnalyzer=None): self.documentEntitiesById = {} for sentence in sentences: for entity in sentence.entities: assert entity.get("id") not in self.documentEntitiesById self.documentEntitiesById[entity.get("id")] = entity for i in range(len(sentences)): sentence = sentences[i] goldSentence = None if goldSentences != None: goldSentence = goldSentences[i] self.progress.update(1, "Building examples ("+sentence.sentence.get("id")+"): ") self.processSentence(sentence, outfile, goldSentence, structureAnalyzer=structureAnalyzer) def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None): """ Build examples for a single sentence. Returns a list of examples. See Core/ExampleUtils for example format. """ self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True) self.triggerFeatureBuilder.initSentence(sentenceGraph) exampleIndex = 0 undirected = sentenceGraph.dependencyGraph.toUndirected() paths = undirected # Get argument order self.interactionLenghts = self.getInteractionEdgeLengths(sentenceGraph, paths) # Map tokens to character offsets tokenByOffset = {} for i in range(len(sentenceGraph.tokens)): token = sentenceGraph.tokens[i] if goldGraph != None: # check that the tokenizations match goldToken = goldGraph.tokens[i] assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset") tokenByOffset[token.get("charOffset")] = token.get("id") # Map gold entities to their head offsets goldEntitiesByOffset = {} if goldGraph != None: for entity in goldGraph.entities: offset = entity.get("headOffset") assert offset != None if not goldEntitiesByOffset.has_key(offset): goldEntitiesByOffset[offset] = [] goldEntitiesByOffset[offset].append(entity) if self.styles["no_merge"]: mergeInput = False entities = sentenceGraph.entities else: mergeInput = True sentenceGraph.mergeInteractionGraph(True) entities = sentenceGraph.mergedEntities self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities)) exampleIndex = 0 for entity in entities: # sentenceGraph.entities: if type(entity) in types.StringTypes: # dummy entity for intersentence interactions continue eType = entity.get("type") assert eType != None, entity.attrib eType = str(eType) interactions = [x[2] for x in sentenceGraph.getOutInteractions(entity, mergeInput)] interactions = self.sortInteractionsById(interactions) interactionCounts = defaultdict(int) validInteractionsByType = defaultdict(list) for interaction in interactions: if interaction.get("event") != "True": continue e1 = sentenceGraph.entitiesById[interaction.get("e1")] if interaction.get("e2") in sentenceGraph.entitiesById: e2 = sentenceGraph.entitiesById[interaction.get("e2")] if interaction.get("type") in structureAnalyzer.getValidEdgeTypes(e1.get("type"), e2.get("type")): validInteractionsByType[interaction.get("type")].append(interaction) else: # intersentence 
validInteractionsByType[interaction.get("type")].append(interaction) interactionCounts[interaction.get("type")] += 1 interactionCountString = ",".join([key + "=" + str(interactionCounts[key]) for key in sorted(interactionCounts.keys())]) #argCombinations = self.getArgumentCombinations(eType, interactions, entity.get("id")) intCombinations = [] validIntTypeCount = 0 maxArgCount = 0 if self.debug: print >> sys.stderr, entity.get("id"), entity.get("type"), "int:" + interactionCountString, "validInt:" + str(validInteractionsByType) for intType in sorted(validInteractionsByType.keys()): # for each argument type the event can have validIntTypeCount += 1 intCombinations.append([]) minArgs, maxArgs = structureAnalyzer.getArgLimits(entity.get("type"), intType) if maxArgs > maxArgCount: maxArgCount = maxArgs #if maxArgs > 1: # allow any number of arguments for cases like Binding # maxArgs = len(validInteractionsByType[intType]) for combLen in range(minArgs, maxArgs+1): # for each valid argument count, get all possible combinations. note that there may be zero-lenght combination for singleTypeArgCombination in combinations(validInteractionsByType[intType], combLen): intCombinations[-1].append(singleTypeArgCombination) # e.g. theme:[a,b], cause:[d] = [[ # intCombinations now contains a list of lists, each of which has a tuple for each valid combination # of one argument type. Next, we'll make all valid combinations of multiple argument types if self.debug: print >> sys.stderr, " ", "intCombinations", intCombinations argCombinations = combine.combine(*intCombinations) if self.debug: print >> sys.stderr, " ", "argCombinations", argCombinations for i in range(len(argCombinations)): argCombinations[i] = sum(argCombinations[i], ()) #sum(argCombinations, []) # flatten nested list if self.debug: print >> sys.stderr, " ", "argCombinations flat", argCombinations for argCombination in argCombinations: # Originally binary classification if goldGraph != None: isGoldEvent = self.eventIsGold(entity, argCombination, sentenceGraph, goldGraph, goldEntitiesByOffset, goldGraph.interactions) #if eType == "Binding": # print argCombination[0].get("e1"), len(argCombination), isGoldEvent else: isGoldEvent = False # Named (multi-)class if isGoldEvent: # category = "zeroArg" # if validIntTypeCount == 1: # category = "singleArg" # event has 0-1 arguments (old simple6) # if validIntTypeCount > 1: # category = "multiType" # event has arguments of several types, 0-1 of each (old Regulation) # if maxArgCount > 1: # category = "multiArg" # event can have 2-n of at least one argument type (old Binding) if self.styles["binary"]: category = "pos" else: category = entity.get("type") assert category != None else: category = "neg" self.exampleStats.beginExample(category) issues = defaultdict(int) # early out for proteins etc. 
if validIntTypeCount == 0 and entity.get("given") == "True": self.exampleStats.filter("given-leaf:" + entity.get("type")) if self.debug: print >> sys.stderr, " ", category +"("+eType+")", "arg combination", argCombination, "LEAF" elif not structureAnalyzer.isValidEvent(entity, argCombination, self.documentEntitiesById, issues=issues): for key in issues: self.exampleStats.filter(key) if self.debug: print >> sys.stderr, " ", category, "arg combination", argCombination, "INVALID", issues else: if self.debug: print >> sys.stderr, " ", category, "arg combination", argCombination, "VALID" features = {} argString = "" for arg in argCombination: argString += "," + arg.get("type") + "=" + arg.get("id") extra = {"xtype":"um","e":entity.get("id"),"i":argString[1:],"etype":eType,"class":category} extra["allInt"] = interactionCountString assert type(extra["etype"]) in types.StringTypes, extra assert type(extra["class"]) in types.StringTypes, category assert type(extra["i"]) in types.StringTypes, argString example = self.buildExample(sentenceGraph, paths, entity, argCombination, interactions) example[0] = sentenceGraph.getSentenceId()+".x"+str(exampleIndex) example[1] = self.classSet.getId(category) example[3] = extra #examples.append( example ) ExampleUtils.appendExamples([example], outfile) exampleIndex += 1 self.exampleStats.endExample() #return examples return exampleIndex def buildExample(self, sentenceGraph, paths, eventEntity, argCombination, allInteractions): #themeEntities, causeEntities=None): # NOTE!!!! TODO # add also features for arguments present, but not in this combination features = {} self.features = features self.buildInterArgumentBagOfWords(argCombination, sentenceGraph) eventEntityType = eventEntity.get("type") if eventEntityType == "Binding": interactionIndex = {} groupInteractionLengths = [] for interaction in allInteractions: groupInteractionLengths.append(self.interactionLenghts[interaction]) groupInteractionLengths.sort(compareInteractionPrecedence) #print groupInteractionLengths for i in range(len(groupInteractionLengths)): interactionIndex[groupInteractionLengths[i][0]] = i eventToken = sentenceGraph.entityHeadTokenByEntity[eventEntity] self.triggerFeatureBuilder.setFeatureVector(self.features) self.triggerFeatureBuilder.tag = "trg_" self.triggerFeatureBuilder.buildFeatures(eventToken) self.triggerFeatureBuilder.tag = None #self.setFeature("rootType_"+eventEntity.get("type"), 1) argThemeCount = 0 argCauseCount = 0 argCounts = {} # Current example's edge combination for arg in argCombination: if arg.get("type") == "Theme": argThemeCount += 1 tag = "argTheme" self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, arg, tag) if eventEntityType == "Binding": tag += str(interactionIndex[arg]) self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, arg, tag) elif arg.get("type") == "Cause": # Cause argCauseCount += 1 self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, arg, "argCause") else: argType = arg.get("type") if argType not in argCounts: argCounts[argType] = 0 self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, arg, "arg"+argType) argCounts[argType] += 1 # Edge group context contextThemeCount = 0 contextCauseCount = 0 for interaction in allInteractions: if interaction in argCombination: # Already part of current example's combination continue if interaction.get("type") == "Theme": contextThemeCount += 1 tag = "conTheme" self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, interaction, tag) 
if eventEntityType == "Binding": tag += str(interactionIndex[interaction]) self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, interaction, tag) else: # Cause contextCauseCount += 1 self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, interaction, "conCause") self.setFeature("argCount", len(argCombination)) self.setFeature("argCount_" + str(len(argCombination)), 1) self.setFeature("interactionCount", len(allInteractions)) self.setFeature("interactionCount_" + str(len(allInteractions)), 1) self.setFeature("argThemeCount", argThemeCount) self.setFeature("argThemeCount_" + str(argThemeCount), 1) self.setFeature("argCauseCount", argCauseCount) self.setFeature("argCauseCount_" + str(argCauseCount), 1) for key in sorted(argCounts.keys()): self.setFeature("arg" + key + "Count", argCounts[key]) self.setFeature("arg" + key + "Count_" + str(argCounts[key]), 1) self.setFeature("interactionThemeCount", contextThemeCount) self.setFeature("interactionThemeCount_" + str(contextThemeCount), 1) self.setFeature("interactionCauseCount", contextCauseCount) self.setFeature("interactionCauseCount_" + str(contextCauseCount), 1) self.triggerFeatureBuilder.tag = "" self.triggerFeatureBuilder.setFeatureVector(None) # Common features # if e1Type.find("egulation") != -1: # leave r out to avoid problems with capitalization # if entity2.get("given") == "True": # features[self.featureSet.getId("GENIA_regulation_of_protein")] = 1 # else: # features[self.featureSet.getId("GENIA_regulation_of_event")] = 1 # define extra attributes return [None,None,features,None] def buildArgumentFeatures(self, sentenceGraph, paths, features, eventToken, arg, tag): if arg.get("e2") not in sentenceGraph.entitiesById: # intersentence argument return argEntity = sentenceGraph.entitiesById[arg.get("e2")] argToken = sentenceGraph.entityHeadTokenByEntity[argEntity] self.buildEdgeFeatures(sentenceGraph, paths, features, eventToken, argToken, tag) self.triggerFeatureBuilder.tag = tag + "trg_" self.triggerFeatureBuilder.buildFeatures(argToken) if argEntity.get("given") == "True": self.setFeature(tag+"Protein", 1) else: self.setFeature(tag+"Event", 1) self.setFeature("nestingEvent", 1) self.setFeature(tag+"_"+argEntity.get("type"), 1) def buildEdgeFeatures(self, sentenceGraph, paths, features, eventToken, argToken, tag): #eventToken = sentenceGraph.entityHeadTokenByEntity[eventNode] #argToken = sentenceGraph.entityHeadTokenByEntity[argNode] self.multiEdgeFeatureBuilder.tag = tag + "_" self.multiEdgeFeatureBuilder.setFeatureVector(features, None, None, False) self.setFeature(tag+"_present", 1) path = paths.getPaths(eventToken, argToken) if eventToken != argToken and len(path) > 0: path = path[0] else: path = [eventToken, argToken] #edges = None if not self.styles["disable_entity_features"]: self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph) self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path) #if not "disable_terminus_features" in self.styles: # self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast if not self.styles["disable_single_element_features"]: self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, sentenceGraph) if not self.styles["disable_ngram_features"]: self.multiEdgeFeatureBuilder.buildPathGrams(2, path, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams(3, path, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams(4, path, sentenceGraph) # remove for fast if not 
self.styles["disable_path_edge_features"]: self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, sentenceGraph) #self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph) self.multiEdgeFeatureBuilder.setFeatureVector(None, None, None, False) self.multiEdgeFeatureBuilder.tag = "" def buildInterArgumentBagOfWords(self, arguments, sentenceGraph): if len(arguments) < 2: return indexByToken = {} for i in range(len(sentenceGraph.tokens)): indexByToken[sentenceGraph.tokens[i]] = i argTokenIndices = set() for arg in arguments: if arg.get("e2") in sentenceGraph.entitiesById: # skip intersentence interactions argEntity = sentenceGraph.entitiesById[arg.get("e2")] argToken = sentenceGraph.entityHeadTokenByEntity[argEntity] argTokenIndices.add(indexByToken[argToken]) if len(argTokenIndices) < 1: return minIndex = min(argTokenIndices) maxIndex = max(argTokenIndices) self.setFeature("argBoWRange", (maxIndex-minIndex)) self.setFeature("argBoWRange_" + str(maxIndex-minIndex), 1) bow = set() for i in range(minIndex+1, maxIndex): token = sentenceGraph.tokens[i] if len(sentenceGraph.tokenIsEntityHead[token]) == 0 and not sentenceGraph.tokenIsName[token]: bow.add(token.get("text")) bow = sorted(list(bow)) for word in bow: self.setFeature("argBoW_"+word, 1) if word in ["/", "-"]: self.setFeature("argBoW_slashOrHyphen", 1) if len(bow) == 1: self.setFeature("argBoWonly_"+bow[0], 1) if bow[0] in ["/", "-"]: self.setFeature("argBoWonly_slashOrHyphen", 1)
class IntersentenceEdgeExampleBuilder(ExampleBuilder): """ This example builder makes edge examples, i.e. examples describing the event arguments. """ def __init__(self, style=["typed", "directed", "headsOnly"], length=None, types=[], featureSet=None, classSet=None): if featureSet == None: featureSet = IdSet() if classSet == None: classSet = IdSet(1) else: classSet = classSet assert (classSet.getId("neg") == 1) ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet) self.styles = style self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet) if "noAnnType" in self.styles: self.multiEdgeFeatureBuilder.noAnnType = True if "noMasking" in self.styles: self.multiEdgeFeatureBuilder.maskNamedEntities = False if "maxFeatures" in self.styles: self.multiEdgeFeatureBuilder.maximum = True self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet) self.types = types # @classmethod # def run(cls, input, output, parse, tokenization, style, idFileTag=None): # """ # An interface for running the example builder without needing to create a class # """ # classSet, featureSet = cls.getIdSets(idFileTag) # if style != None: # e = MultiEdgeExampleBuilder(style=style, classSet=classSet, featureSet=featureSet) # else: # e = MultiEdgeExampleBuilder(classSet=classSet, featureSet=featureSet) # sentences = cls.getSentences(input, parse, tokenization) # e.buildExamplesForSentences(sentences, output, idFileTag) @classmethod def run(cls, input, output, parse, tokenization, style, idFileTag=None): """ An interface for running the example builder without needing to create a class """ classSet, featureSet = cls.getIdSets(idFileTag) if style != None: e = IntersentenceEdgeExampleBuilder(style=style, classSet=classSet, featureSet=featureSet) else: e = IntersentenceEdgeExampleBuilder(classSet=classSet, featureSet=featureSet) # Load documents if type(input) != types.ListType: # Load corpus and make sentence graphs corpusElements = SentenceGraph.loadCorpus(input, parse, tokenization, False, True) else: # assume input is already a list of sentences assert (removeNameInfo == False) return input # run examplebuilder e.buildExamplesForDocuments(corpusElements.documentSentences, output, idFileTag) def buildExamplesForDocuments(self, documentSentences, output, idFileTag=None): examples = [] counter = ProgressCounter(len(documentSentences), "Build examples") #calculatePredictedRange(self, sentences) outfile = open(output, "wt") exampleCount = 0 for document in documentSentences: counter.update( 1, "Building examples (" + document[0].sentence.get("id") + "): ") examples = self.buildExamples(document) exampleCount += len(examples) #examples = self.preProcessExamples(examples) ExampleUtils.appendExamples(examples, outfile) outfile.close() print >> sys.stderr, "Examples built:", exampleCount print >> sys.stderr, "Features:", len(self.featureSet.getNames()) #IF LOCAL if self.exampleStats.getExampleCount() > 0: self.exampleStats.printStats() #ENDIF # Save Ids if idFileTag != None: print >> sys.stderr, "Saving class names to", idFileTag + ".class_names" self.classSet.write(idFileTag + ".class_names") print >> sys.stderr, "Saving feature names to", idFileTag + ".feature_names" self.featureSet.write(idFileTag + ".feature_names") def getCategoryName(self, sentence1, sentence2, e1, e2, directed=True): """ Example class. Multiple overlapping edges create a merged type. 
""" e1Id = e1.get("id") e2Id = e2.get("id") allInteractions = sentence1.interSentenceInteractions + sentence2.interSentenceInteractions interactions = [] #if len(allInteractions) > 0: # print len(allInteractions) for interaction in allInteractions: if interaction.get("e1") == e1Id and interaction.get("e2") == e2Id: interactions.append(interaction) types = set() for interaction in interactions: types.add(interaction.get("type")) types = list(types) types.sort() categoryName = "" for name in types: if categoryName != "": categoryName += "---" categoryName += name if categoryName != "": return categoryName else: return "neg" def isPotentialCOInteraction(self, e1, e2): if e1.get("type") == "Exp" and e2.get("type") == "Exp": return True else: return False def buildExamples(self, documentSentences): """ Build examples for a single sentence. Returns a list of examples. See Core/ExampleUtils for example format. """ examples = [] exampleIndex = 0 for documentSentence in documentSentences: if documentSentence.sentenceGraph != None: documentSentence.sentenceGraph.undirected = documentSentence.sentenceGraph.dependencyGraph.toUndirected( ) documentSentence.triggerFeatureBuilder = TriggerFeatureBuilder( self.featureSet) documentSentence.triggerFeatureBuilder.useNonNameEntities = True documentSentence.triggerFeatureBuilder.initSentence( documentSentence.sentenceGraph) # Generate examples based on interactions between entities or interactions between tokens maxDistance = 1 for sentence1Index in range(len(documentSentences)): sentence1 = documentSentences[sentence1Index] if sentence1.sentenceGraph == None: continue for sentence2Index in range( sentence1Index + 1, min(sentence1Index + 1 + maxDistance, len(documentSentences))): sentence2 = documentSentences[sentence2Index] if sentence2.sentenceGraph == None: continue if "entities" in self.styles: loopRange1 = len(sentence1.sentenceGraph.entities) loopRange2 = len(sentence2.sentenceGraph.entities) else: loopRange = len(sentenceGraph.tokens) for i in range(loopRange1): for j in range(loopRange2): eI = None eJ = None if "entities" in self.styles: eI = sentence1.sentenceGraph.entities[i] eJ = sentence2.sentenceGraph.entities[j] tI = sentence1.sentenceGraph.entityHeadTokenByEntity[ eI] tJ = sentence2.sentenceGraph.entityHeadTokenByEntity[ eJ] #if "no_ne_interactions" in self.styles and eI.get("isName") == "True" and eJ.get("isName") == "True": # continue if eI.get("type") == "neg" or eJ.get( "type") == "neg": continue else: tI = sentenceGraph.tokens[i] tJ = sentenceGraph.tokens[j] # only consider paths between entities (NOTE! 
entities, not only named entities) if "headsOnly" in self.styles: if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len( sentenceGraph.tokenIsEntityHead[tJ]) == 0): continue if "directed" in self.styles: # define forward if "entities" in self.styles: categoryName = self.getCategoryName( sentence1, sentence2, eI, eJ, True) else: categoryName = self.getCategoryNameFromTokens( sentenceGraph, tI, tJ, True) # make forward self.exampleStats.beginExample(categoryName) makeExample = True if ("co_limits" in self.styles ) and not self.isPotentialCOInteraction( eI, eJ): makeExample = False self.exampleStats.filter("co_limits") if makeExample: examples.append( self.buildExample(sentence1, sentence2, categoryName, exampleIndex, eI, eJ)) exampleIndex += 1 self.exampleStats.endExample() # define reverse if "entities" in self.styles: categoryName = self.getCategoryName( sentence2, sentence1, eJ, eI, True) else: categoryName = self.getCategoryNameFromTokens( sentenceGraph, tJ, tI, True) # make reverse self.exampleStats.beginExample(categoryName) makeExample = True if ("co_limits" in self.styles ) and not self.isPotentialCOInteraction( eJ, eI): makeExample = False self.exampleStats.filter("co_limits") if makeExample: examples.append( self.buildExample(sentence2, sentence1, categoryName, exampleIndex, eJ, eI)) exampleIndex += 1 self.exampleStats.endExample() else: if "entities" in self.styles: categoryName = self.getCategoryName( sentenceGraph, eI, eJ, False) else: categoryName = self.getCategoryNameFromTokens( sentenceGraph, tI, tJ, False) self.exampleStats.beginExample(categoryName) forwardExample = self.buildExample( tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ) if not "graph_kernel" in self.styles: reverseExample = self.buildExample( tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI) forwardExample[2].update(reverseExample[2]) examples.append(forwardExample) exampleIndex += 1 self.exampleStats.endExample() return examples def getRootToken(self, sentenceGraph, token, visited=None, level=0): if visited == None: visited = set() inEdges = sentenceGraph.dependencyGraph.getInEdges(token) rv = None for inEdge in inEdges: if inEdge not in visited: visited.add(inEdge) rvNew = self.getRootToken(sentenceGraph, inEdge[0], visited, level + 1) if rv == None or rvNew[1] > rv[1]: rv = rvNew if rv == None: return (token, level) else: return rv def buildExample(self, sentence1, sentence2, categoryName, exampleIndex, entity1=None, entity2=None): """ Build a single directed example for the potential edge between token1 and token2 """ # define features features = {} e1Token = sentence1.sentenceGraph.entityHeadTokenByEntity[entity1] e2Token = sentence2.sentenceGraph.entityHeadTokenByEntity[entity2] e1RootToken = self.getRootToken(sentence1.sentenceGraph, e1Token)[0] e2RootToken = self.getRootToken(sentence2.sentenceGraph, e2Token)[0] e1Path = sentence1.sentenceGraph.undirected.getPaths( e1Token, e1RootToken) e2Path = sentence2.sentenceGraph.undirected.getPaths( e2RootToken, e2Token) if len(e1Path) > 0: e1Path = e1Path[0] else: e1Path = [e1Token, e1RootToken] if len(e2Path) > 0: e2Path = e2Path[0] else: e2Path = [e2RootToken, e2Token] # build features if "trigger_features" in self.styles: # F 85.52 -> 85.55 sentence1.triggerFeatureBuilder.setFeatureVector(features) sentence1.triggerFeatureBuilder.tag = "trg1_" sentence1.triggerFeatureBuilder.buildFeatures(e1Token) sentence1.triggerFeatureBuilder.setFeatureVector(None) sentence2.triggerFeatureBuilder.setFeatureVector(features) 
sentence2.triggerFeatureBuilder.tag = "trg2_" sentence2.triggerFeatureBuilder.buildFeatures(e2Token) sentence2.triggerFeatureBuilder.setFeatureVector(None) if "entity_type" in self.styles: features[self.featureSet.getId("e1_" + entity1.get("type"))] = 1 features[self.featureSet.getId("e2_" + entity2.get("type"))] = 1 features[self.featureSet.getId("distance_" + str(len(e1Path) + len(e2Path)))] = 1 if not "no_dependency" in self.styles: for pair in ([e1Path, "e1Edge_", entity1, None, sentence1], [e2Path, "e2Edge_", None, entity2, sentence2]): self.multiEdgeFeatureBuilder.tag = pair[1] self.multiEdgeFeatureBuilder.setFeatureVector( features, pair[2], pair[3]) #self.multiEdgeFeatureBuilder.buildStructureFeatures(sentenceGraph, paths) # remove for fast if not "disable_entity_features" in self.styles: self.multiEdgeFeatureBuilder.buildEntityFeatures( pair[4].sentenceGraph) self.multiEdgeFeatureBuilder.buildPathLengthFeatures(pair[0]) if not "disable_terminus_features" in self.styles: self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures( pair[0], pair[4].sentenceGraph) # remove for fast if not "disable_single_element_features" in self.styles: self.multiEdgeFeatureBuilder.buildSingleElementFeatures( pair[0], pair[4].sentenceGraph) if not "disable_ngram_features" in self.styles: #print "NGrams" self.multiEdgeFeatureBuilder.buildPathGrams( 2, pair[0], pair[4].sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams( 3, pair[0], pair[4].sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams( 4, pair[0], pair[4].sentenceGraph) # remove for fast #self.buildEdgeCombinations(path, edges, sentenceGraph, features) # remove for fast #if edges != None: # self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[0], edges[0][1]+edges[1][0], "t1", sentenceGraph) # remove for fast # self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[-1], edges[len(path)-1][len(path)-2]+edges[len(path)-2][len(path)-1], "t2", sentenceGraph) # remove for fast if not "disable_path_edge_features" in self.styles: self.multiEdgeFeatureBuilder.buildPathEdgeFeatures( pair[0], pair[4].sentenceGraph) self.multiEdgeFeatureBuilder.buildSentenceFeatures( pair[4].sentenceGraph) self.multiEdgeFeatureBuilder.setFeatureVector(None) # if not "no_linear" in self.styles: # self.tokenFeatureBuilder.setFeatureVector(features) # for i in range(len(sentenceGraph.tokens)): # if sentenceGraph.tokens[i] == token1: # token1Index = i # if sentenceGraph.tokens[i] == token2: # token2Index = i # linearPreTag = "linfw_" # if token1Index > token2Index: # token1Index, token2Index = token2Index, token1Index # linearPreTag = "linrv_" # self.tokenFeatureBuilder.buildLinearOrderFeatures(token1Index, sentenceGraph, 2, 2, preTag="linTok1") # self.tokenFeatureBuilder.buildLinearOrderFeatures(token2Index, sentenceGraph, 2, 2, preTag="linTok2") # Before, middle, after # self.tokenFeatureBuilder.buildTokenGrams(0, token1Index-1, sentenceGraph, "bf") # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, "bw") # self.tokenFeatureBuilder.buildTokenGrams(token2Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, "af") # before-middle, middle, middle-after # self.tokenFeatureBuilder.buildTokenGrams(0, token2Index-1, sentenceGraph, linearPreTag+"bf", max=2) # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, linearPreTag+"bw", max=2) # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, linearPreTag+"af", max=2) # 
        #    self.tokenFeatureBuilder.setFeatureVector(None)
        # if "random" in self.styles:
        #    self.randomFeatureBuilder.setFeatureVector(features)
        #    self.randomFeatureBuilder.buildRandomFeatures(100, 0.01)
        #    self.randomFeatureBuilder.setFeatureVector(None)
        # define extra attributes
        extra = {"xtype": "edge", "type": "i", "t1": e1Token.get("id"), "t2": e2Token.get("id")}
        if entity1 != None:
            #extra["e1"] = entity1
            extra["e1"] = entity1.get("id")
        if entity2 != None:
            #extra["e2"] = entity2
            extra["e2"] = entity2.get("id")
        extra["categoryName"] = categoryName
        # make example
        if "binary" in self.styles:
            if categoryName != "neg":
                category = 1
            else:
                category = -1
            categoryName = "i"
        else:
            category = self.classSet.getId(categoryName)
        return (sentence1.sentence.get("id") + ".x" + str(exampleIndex), category, features, extra)
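# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original builder): the class-label
# convention used by getCategoryName above, reduced to plain data. All
# interaction types holding between the same ordered pair of entities are
# merged into one class name joined with "---"; a pair with no interaction is
# labelled "neg". The function and argument names are hypothetical.
def _sketchCategoryName(interactions, e1Id, e2Id):
    # interactions: list of (e1Id, e2Id, interactionType) tuples
    types = set()
    for i1, i2, intType in interactions:
        if i1 == e1Id and i2 == e2Id:
            types.add(intType)
    if len(types) == 0:
        return "neg"
    return "---".join(sorted(types))
# Usage sketch: [("e1", "e2", "Theme"), ("e1", "e2", "Cause")] yields
# "Cause---Theme" for the pair ("e1", "e2") and "neg" for ("e2", "e1").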
class UnmergedEdgeExampleBuilder(ExampleBuilder): def __init__(self, style=["typed","directed","headsOnly"], length=None, types=[], featureSet=None, classSet=None): if featureSet == None: featureSet = IdSet() if classSet == None: classSet = IdSet(1) else: classSet = classSet assert( classSet.getId("neg") == 1 ) ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet) self.styles = style self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet) if "noAnnType" in self.styles: self.multiEdgeFeatureBuilder.noAnnType = True if "noMasking" in self.styles: self.multiEdgeFeatureBuilder.maskNamedEntities = False if "maxFeatures" in self.styles: self.multiEdgeFeatureBuilder.maximum = True self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet) self.pathLengths = length assert(self.pathLengths == None) self.types = types if "random" in self.styles: from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet) #self.outFile = open("exampleTempFile.txt","wt") @classmethod def run(cls, input, output, parse, tokenization, style, idFileTag=None): classSet, featureSet = cls.getIdSets(idFileTag) if style == None: e = UnmergedEdgeExampleBuilder(classSet=classSet, featureSet=featureSet) else: e = UnmergedEdgeExampleBuilder(style=style, classSet=classSet, featureSet=featureSet) sentences = cls.getSentences(input, parse, tokenization) e.buildExamplesForSentences(sentences, output, idFileTag) print e.classSet.Ids def definePredictedValueRange(self, sentences, elementName): self.multiEdgeFeatureBuilder.definePredictedValueRange(sentences, elementName) def getPredictedValueRange(self): return self.multiEdgeFeatureBuilder.predictedRange def filterEdgesByType(self, edges, typesToInclude): if len(typesToInclude) == 0: return edges edgesToKeep = [] for edge in edges: if edge.get("type") in typesToInclude: edgesToKeep.append(edge) return edgesToKeep def getCategoryName(self, sentenceGraph, e1, e2, directed=True): # Dummies are potential entities that do not exist in the # training data. If both entities of an interaction are dummies # it can't exist in the training data and is therefore a negative if e1[2] or e2[2]: return "neg" e1 = e1[0] e2 = e2[0] interactions = sentenceGraph.getInteractions(e1, e2) if not directed: interactions.extend(sentenceGraph.getInteractions(e2, e1)) types = set() for interaction in interactions: types.add(interaction.attrib["type"]) types = list(types) types.sort() categoryName = "" for name in types: if categoryName != "": categoryName += "---" categoryName += name if categoryName != "": return categoryName else: return "neg" def preProcessExamples(self, allExamples): if "normalize" in self.styles: print >> sys.stderr, " Normalizing feature vectors" ExampleUtils.normalizeFeatureVectors(allExamples) return allExamples def isPotentialGeniaInteraction(self, e1, e2): if e1.get("isName") == "True": return False else: return True def nxMultiDiGraphToUndirected(self, graph): undirected = NX10.MultiGraph(name=graph.name) undirected.add_nodes_from(graph) undirected.add_edges_from(graph.edges_iter()) return undirected def getInteractionEdgeLengths(self, sentenceGraph, paths): """ Return dependency and linear length of all interaction edges (measured between the two tokens). 
""" interactionLengths = {} for interaction in sentenceGraph.interactions: # Calculated interaction edge dep and lin length e1 = sentenceGraph.entitiesById[interaction.get("e1")] e2 = sentenceGraph.entitiesById[interaction.get("e2")] t1 = sentenceGraph.entityHeadTokenByEntity[e1] t2 = sentenceGraph.entityHeadTokenByEntity[e2] # Get dep path length if t1 != t2 and paths.has_key(t1) and paths[t1].has_key(t2): pathLength = len(paths[t1][t2]) else: # no dependencyPath pathLength = 999999 # more than any real path # Linear distance t1Pos = -1 t2Pos = -1 for i in range(len(sentenceGraph.tokens)): if sentenceGraph.tokens[i] == t1: t1Pos = i if t2Pos != -1: break if sentenceGraph.tokens[i] == t2: t2Pos = i if t1Pos != -1: break linLength = abs(t1Pos - t2Pos) interactionLengths[interaction] = (pathLength, linLength) return interactionLengths def getPrecedenceLevels(self, sentenceGraph, paths): """ Get overlapping entity precedence """ interactionLengths = self.getInteractionEdgeLengths(sentenceGraph, paths) interactionsByEntity = {} # Convenience mapping entityPrecedenceValues = {} for entity in sentenceGraph.entities: interactionsByEntity[entity] = [] eId = entity.get("id") # Add access to interactions argDepDist = 0 # Sum of lengths of shortest paths argLinDist = 0 # Sum of linear distances for interaction in sentenceGraph.interactions: if interaction.get("e1") == eId: # An argument of the entity defined by the node interactionsByEntity[entity].append(interaction) argDepDist += interactionLengths[interaction][0] argLinDist += interactionLengths[interaction][1] # Store precedence counts (num args, sum of dep lengths, sum of lin lengths) entityPrecedenceValues[entity] = (len(interactionsByEntity), argDepDist, argLinDist, entity) # Determine level of entity from precedence counts levelByEntity = {} # slot number #levelByInteraction = {} # slot number of parent node # There is one slot group per token, per type for token in sentenceGraph.tokens: # per token entitiesByType = {} for entity in sentenceGraph.tokenIsEntityHead[token]: # per type if entity.get("isName") == "True": # Names can never have duplicates assert not levelByEntity.has_key(entity) levelByEntity[entity] = 0 continue eType = entity.get("type") if eType == "neg": continue if not entitiesByType.has_key(eType): entitiesByType[eType] = [] entitiesByType[eType].append(entity) for eType in sorted(entitiesByType.keys()): # Slot ordering by precedence sortedEntities = [] for entity in entitiesByType[eType]: sortedEntities.append(entityPrecedenceValues[entity]) sortedEntities.sort(compareEntityPrecedence) level = 0 for precedenceTuple in sortedEntities: entity = precedenceTuple[3] assert not levelByEntity.has_key(entity) levelByEntity[entity] = level # Interactions have the same slot as their parent entity #for interaction in interactionsByEntity[entity]: # assert not levelByInteraction.has_key(interaction) # levelByInteraction[interaction] = level level += 1 return levelByEntity#, levelByInteraction def buildExamples(self, sentenceGraph): examples = [] exampleIndex = 0 #undirected = sentenceGraph.getUndirectedDependencyGraph() undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph) ##undirected = sentenceGraph.dependencyGraph.to_undirected() ###undirected = NX10.MultiGraph(sentenceGraph.dependencyGraph) This didn't work paths = NX10.all_pairs_shortest_path(undirected, cutoff=999) # Determine overlapping entity precedence #levelByEntity, levelByInteraction = self.getPrecedenceLevels(sentenceGraph, paths) levelByEntity = 
self.getPrecedenceLevels(sentenceGraph, paths) entities = [] # There is one entity group for each token, for each type of entity for token in sentenceGraph.tokens: # per token entitiesByType = {} for entity in sentenceGraph.tokenIsEntityHead[token]: # per type if entity.get("isName") == "True": # Names can never have duplicates entities.append( (entity, 0, False) ) continue eType = entity.get("type") if eType == "neg": continue if not entitiesByType.has_key(eType): entitiesByType[eType] = [] entitiesByType[eType].append(entity) # Create slot groups for tokens for which exists at least one entity eTypes = sorted(entitiesByType.keys()) if len(eTypes) == 0: continue # Create slot groups and insert GS data there for eType in eTypes: # Use first entity of a type as the dummy entity for unfilled slots dummyEntity = entitiesByType[eType][0] # Define entity slots entityGroup = [None, None, None, None] #entityGroup = [None, None] # Insert existing entities into slots for entity in entitiesByType[eType]: if levelByEntity.has_key(entity): level = levelByEntity[entity] if level < len(entityGroup): entityGroup[level] = (entity, level, False) # Create dummies for potential entities for i in range(len(entityGroup)): if entityGroup[i] == None: entityGroup[i] = (dummyEntity, i, True) # Put all slots into one potential entity list #print entityGroup for e in entityGroup: entities.append(e) # Generate examples based on interactions between entities for i in range(len(entities)-1): for j in range(i+1,len(entities)): eI = entities[i][0] eJ = entities[j][0] tI = sentenceGraph.entityHeadTokenByEntity[eI] tJ = sentenceGraph.entityHeadTokenByEntity[eJ] # define forward example categoryName = self.getCategoryName(sentenceGraph, entities[i], entities[j], True) if (not "genia_limits" in self.styles) or self.isPotentialGeniaInteraction(eI, eJ): examples.append( self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, entities[i], entities[j]) ) exampleIndex += 1 # define reverse categoryName = self.getCategoryName(sentenceGraph, entities[j], entities[i], True) if (not "genia_limits" in self.styles) or self.isPotentialGeniaInteraction(eJ, eI): examples.append( self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, entities[j], entities[i]) ) exampleIndex += 1 return examples def buildExample(self, token1, token2, paths, sentenceGraph, categoryName, exampleIndex, e1=None, e2=None): entity1=e1[0] entity2=e2[0] # define features features = {} features[self.featureSet.getId("gov_level")] = e1[1] features[self.featureSet.getId("gov_level_"+str(e1[1]))] = 1 features[self.featureSet.getId("dep_level")] = e2[1] features[self.featureSet.getId("dep_level_"+str(e2[1]))] = 1 features[self.featureSet.getId("level_pair_"+str(e1[1])+"_"+str(e2[1]))] = 1 if True: #token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2): if token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2): path = paths[token1][token2] else: path = [token1, token2] assert(self.pathLengths == None) if self.pathLengths == None or len(path)-1 in self.pathLengths: if not "no_dependency" in self.styles: if token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2): edges = self.multiEdgeFeatureBuilder.getEdges(sentenceGraph.dependencyGraph, path) else: edges = None if "entity_type" in self.styles: features[self.featureSet.getId("e1_"+entity1.attrib["type"])] = 1 features[self.featureSet.getId("e2_"+entity2.attrib["type"])] = 1 
features[self.featureSet.getId("distance_"+str(len(path)))] = 1 if not "no_dependency" in self.styles: self.multiEdgeFeatureBuilder.setFeatureVector(features, entity1, entity2) #self.multiEdgeFeatureBuilder.buildStructureFeatures(sentenceGraph, paths) # remove for fast if not "disable_entity_features" in self.styles: self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph) self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path) if not "disable_terminus_features" in self.styles: self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast if not "disable_single_element_features" in self.styles: self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, edges, sentenceGraph) if not "disable_ngram_features" in self.styles: self.multiEdgeFeatureBuilder.buildPathGrams(2, path, edges, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams(3, path, edges, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams(4, path, edges, sentenceGraph) # remove for fast #self.buildEdgeCombinations(path, edges, sentenceGraph, features) # remove for fast #if edges != None: # self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[0], edges[0][1]+edges[1][0], "t1", sentenceGraph) # remove for fast # self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[-1], edges[len(path)-1][len(path)-2]+edges[len(path)-2][len(path)-1], "t2", sentenceGraph) # remove for fast if not "disable_path_edge_features" in self.styles: self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, edges, sentenceGraph) self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph) self.multiEdgeFeatureBuilder.setFeatureVector(None) if not "no_linear" in self.styles: self.tokenFeatureBuilder.setFeatureVector(features) for i in range(len(sentenceGraph.tokens)): if sentenceGraph.tokens[i] == token1: token1Index = i if sentenceGraph.tokens[i] == token2: token2Index = i linearPreTag = "linfw_" if token1Index > token2Index: token1Index, token2Index = token2Index, token1Index linearPreTag = "linrv_" self.tokenFeatureBuilder.buildLinearOrderFeatures(token1Index, sentenceGraph, 2, 2, preTag="linTok1") self.tokenFeatureBuilder.buildLinearOrderFeatures(token2Index, sentenceGraph, 2, 2, preTag="linTok2") # Before, middle, after # self.tokenFeatureBuilder.buildTokenGrams(0, token1Index-1, sentenceGraph, "bf") # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, "bw") # self.tokenFeatureBuilder.buildTokenGrams(token2Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, "af") # before-middle, middle, middle-after # self.tokenFeatureBuilder.buildTokenGrams(0, token2Index-1, sentenceGraph, linearPreTag+"bf", max=2) # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, linearPreTag+"bw", max=2) # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, linearPreTag+"af", max=2) self.tokenFeatureBuilder.setFeatureVector(None) if "random" in self.styles: self.randomFeatureBuilder.setFeatureVector(features) self.randomFeatureBuilder.buildRandomFeatures(100, 0.01) self.randomFeatureBuilder.setFeatureVector(None) if "genia_limits" in self.styles: e1Type = entity1.get("type") e2Type = entity2.get("type") assert(entity1.get("isName") == "False") if entity2.get("isName") == "True": features[self.featureSet.getId("GENIA_target_protein")] = 1 else: features[self.featureSet.getId("GENIA_nested_event")] = 1 if e1Type.find("egulation") != -1: # leave r out to avoid 
problems with capitalization if entity2.get("isName") == "True": features[self.featureSet.getId("GENIA_regulation_of_protein")] = 1 else: features[self.featureSet.getId("GENIA_regulation_of_event")] = 1 else: features[self.featureSet.getId("always_negative")] = 1 if "subset" in self.styles: features[self.featureSet.getId("out_of_scope")] = 1 else: features[self.featureSet.getId("always_negative")] = 1 if "subset" in self.styles: features[self.featureSet.getId("out_of_scope")] = 1 path = [token1, token2] # define extra attributes if int(path[0].attrib["id"].split("_")[-1]) < int(path[-1].attrib["id"].split("_")[-1]): #extra = {"xtype":"edge","type":"i","t1":path[0],"t2":path[-1]} extra = {"xtype":"ue","type":"i","t1":path[0].get("id"),"t2":path[-1].get("id")} extra["deprev"] = False else: #extra = {"xtype":"edge","type":"i","t1":path[-1],"t2":path[0]} extra = {"xtype":"ue","type":"i","t1":path[-1].get("id"),"t2":path[0].get("id")} extra["deprev"] = True if entity1 != None: extra["e1"] = entity1.get("id") extra["l1"] = str(e1[1]) extra["d1"] = str(e1[2])[0] # is a dummy node (an entity not in existing triggers) if entity2 != None: extra["e2"] = entity2.get("id") extra["l2"] = str(e2[1]) extra["d2"] = str(e2[2])[0] # is a dummy node (an entity not in existing triggers) extra["categoryName"] = categoryName sentenceOrigId = sentenceGraph.sentenceElement.get("origId") if sentenceOrigId != None: extra["SOID"] = sentenceOrigId # make example if "binary" in self.styles: if categoryName != "neg": category = 1 else: category = -1 categoryName = "i" else: category = self.classSet.getId(categoryName) return (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)
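# --- Illustrative sketch, not part of the original module -----------------
# getPrecedenceLevels() above orders duplicate entities of the same token and
# type by (argument count, summed dependency distance, summed linear
# distance) and assigns each one a slot level. The standalone helper below
# mirrors only that slot numbering on plain tuples; it assumes a simple
# ascending sort, whereas the real ordering is defined by
# compareEntityPrecedence elsewhere in the codebase.
def _sketchAssignSlots(precedenceTuples):
    # precedenceTuples: list of (numArgs, depDist, linDist, entityId)
    levelById = {}
    level = 0
    for precedenceTuple in sorted(precedenceTuples):
        levelById[precedenceTuple[3]] = level
        level += 1
    return levelById

# Example: _sketchAssignSlots([(2, 7, 4, "e2"), (1, 3, 2, "e1")])
# returns {'e1': 0, 'e2': 1}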
class Round2TriggerExampleBuilder(ExampleBuilder): def nxMultiDiGraphToUndirected(self, graph): undirected = NX10.MultiGraph(name=graph.name) undirected.add_nodes_from(graph) undirected.add_edges_from(graph.edges_iter()) return undirected def getPredictionStrength(self, element): eType = element.get("type") predictions = element.get("predictions") if predictions == None: return 0 predictions = predictions.split(",") for prediction in predictions: predClass, predStrength = prediction.split(":") if predClass == eType: predStrength = float(predStrength) return predStrength return 0 def getInteractionEdgeLengths(self, sentenceGraph, paths): """ Return dependency and linear length of all interaction edges (measured between the two tokens). """ interactionLengths = {} for interaction in sentenceGraph.interactions: # Calculated interaction edge dep and lin length e1 = sentenceGraph.entitiesById[interaction.get("e1")] e2 = sentenceGraph.entitiesById[interaction.get("e2")] t1 = sentenceGraph.entityHeadTokenByEntity[e1] t2 = sentenceGraph.entityHeadTokenByEntity[e2] # Get dep path length if t1 != t2 and paths.has_key(t1) and paths[t1].has_key(t2): pathLength = len(paths[t1][t2]) else: # no dependencyPath pathLength = 999999 # more than any real path # Linear distance t1Pos = -1 t2Pos = -1 for i in range(len(sentenceGraph.tokens)): if sentenceGraph.tokens[i] == t1: t1Pos = i if t2Pos != -1: break if sentenceGraph.tokens[i] == t2: t2Pos = i if t1Pos != -1: break linLength = abs(t1Pos - t2Pos) interactionLengths[interaction] = (interaction, pathLength, linLength, t2Pos) return interactionLengths def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None, skiplist=None): if classSet == None: classSet = IdSet(1) assert classSet.getId("neg") == 1 if featureSet == None: featureSet = IdSet() ExampleBuilder.__init__(self, classSet, featureSet) # gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train" if gazetteerFileName != None: self.gazetteer = Gazetteer.loadGztr(gazetteerFileName) print >>sys.stderr, "Loaded gazetteer from", gazetteerFileName else: print >>sys.stderr, "No gazetteer loaded" self.gazetteer = None self.styles = style self.skiplist = set() if skiplist != None: f = open(skiplist, "rt") for line in f.readlines(): self.skiplist.add(line.strip()) f.close() self.styles = [ "trigger_features", "typed", "directed", "no_linear", "entities", "genia_limits", "noMasking", "maxFeatures", ] self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet) if "graph_kernel" in self.styles: from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet) if "noAnnType" in self.styles: self.multiEdgeFeatureBuilder.noAnnType = True if "noMasking" in self.styles: self.multiEdgeFeatureBuilder.maskNamedEntities = False if "maxFeatures" in self.styles: self.multiEdgeFeatureBuilder.maximum = True self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet) @classmethod def run(cls, input, gold, output, parse, tokenization, style, idFileTag=None, append=False): """ An interface for running the example builder without needing to create a class """ classSet, featureSet = cls.getIdSets(idFileTag) if style != None: e = Round2TriggerExampleBuilder(style=style, classSet=classSet, featureSet=featureSet) else: e = Round2TriggerExampleBuilder(classSet=classSet, featureSet=featureSet) sentences = cls.getSentences(input, parse, tokenization) if gold != None: 
goldSentences = cls.getSentences(gold, parse, tokenization) else: goldSentences = None e.buildExamplesForSentences(sentences, goldSentences, output, idFileTag, append=append) def buildExamplesForSentences(self, sentences, goldSentences, output, idFileTag=None, append=False): examples = [] counter = ProgressCounter(len(sentences), "Build examples") if append: outfile = open(output, "at") else: outfile = open(output, "wt") exampleCount = 0 for i in range(len(sentences)): sentence = sentences[i] goldSentence = [None] if goldSentences != None: goldSentence = goldSentences[i] counter.update(1, "Building examples (" + sentence[0].getSentenceId() + "): ") examples = self.buildExamples(sentence[0], goldSentence[0], append=append) exampleCount += len(examples) examples = self.preProcessExamples(examples) ExampleUtils.appendExamples(examples, outfile) outfile.close() print >>sys.stderr, "Examples built:", exampleCount print >>sys.stderr, "Features:", len(self.featureSet.getNames()) # IF LOCAL if self.exampleStats.getExampleCount() > 0: self.exampleStats.printStats() # ENDIF # Save Ids if idFileTag != None: print >>sys.stderr, "Saving class names to", idFileTag + ".class_names" self.classSet.write(idFileTag + ".class_names") print >>sys.stderr, "Saving feature names to", idFileTag + ".feature_names" self.featureSet.write(idFileTag + ".feature_names") def preProcessExamples(self, allExamples): if "normalize" in self.styles: print >>sys.stderr, " Normalizing feature vectors" ExampleUtils.normalizeFeatureVectors(allExamples) return allExamples def getMergedEntityType(self, entities): """ If a single token belongs to multiple entities of different types, a new, composite type is defined. This type is the alphabetically ordered types of these entities joined with '---'. """ types = set() for entity in entities: types.add(entity.get("type")) types = list(types) types.sort() typeString = "" for type in types: if type == "Protein" and "all_tokens" in self.styles: continue if typeString != "": typeString += "---" typeString += type if typeString == "": return "neg" if "limit_merged_types" in self.styles: if typeString.find("---") != -1: if typeString == "Gene_expression---Positive_regulation": return typeString else: return typeString.split("---")[0] else: return typeString return typeString def getTokenFeatures(self, token, sentenceGraph): """ Returns a list of features based on the attributes of a token. These can be used to define more complex features. """ # These features are cached when this method is first called # for a token. 
if self.tokenFeatures.has_key(token): return self.tokenFeatures[token] tokTxt = sentenceGraph.getTokenText(token) features = {} features["_txt_" + tokTxt] = 1 # F 69.35 -> 68.22 # normalizedText = tokTxt.replace("-","").replace("/","").replace(",","").replace("\\","").replace(" ","").lower() # features["_norTxt_"+normalizedText]=1 # features["_norStem_" + PorterStemmer.stem(normalizedText)]=1 features["_POS_" + token.get("POS")] = 1 if sentenceGraph.tokenIsName[token]: features["_isName"] = 1 for entity in sentenceGraph.tokenIsEntityHead[token]: if entity.get("isName") == "True": features["_annType_" + entity.get("type")] = 1 # Filip's gazetteer based features (can be used separately from exclude_gazetteer) if "gazetteer_features" in self.styles: tokTxtLower = tokTxt.lower() if "stem_gazetteer" in self.styles: tokTxtLower = PorterStemmer.stem(tokTxtLower) if self.gazetteer and tokTxtLower in self.gazetteer: for label, weight in self.gazetteer[tokTxtLower].items(): features["_knownLabel_" + label] = weight # 1 performs slightly worse self.tokenFeatures[token] = features return features def buildLinearOrderFeatures(self, sentenceGraph, index, tag, features): """ Linear features are built by marking token features with a tag that defines their relative position in the linear order. """ tag = "linear_" + tag for tokenFeature, w in self.getTokenFeatures(sentenceGraph.tokens[index], sentenceGraph).iteritems(): features[self.featureSet.getId(tag + tokenFeature)] = w def buildExamples(self, sentenceGraph, goldGraph, append=False): examples = self.buildExamplesInner(sentenceGraph, goldGraph) entityCounts = {} exampleCounts = {} for entity in sentenceGraph.entities: eType = entity.get("type") if eType == "Protein": continue if not entityCounts.has_key(eType): entityCounts[eType] = 0 exampleCounts[eType] = 0 entityCounts[eType] += 1 for example in examples: eTypes = self.classSet.getName(example[1]).split("---") for eType in eTypes: if not exampleCounts.has_key(eType): exampleCounts[eType] = 0 exampleCounts[eType] += 1 # for key in sorted(entityCounts.keys()): # if entityCounts[key] != exampleCounts[key]: # print >> sys.stderr, "Warning, sentence", sentenceGraph.getSentenceId(), "example", key, "diff", entityCounts[key] - exampleCounts[key] return examples def buildExamplesInner(self, sentenceGraph, goldGraph): """ Build one example for each token of the sentence """ if sentenceGraph.sentenceElement.get("origId") in self.skiplist: print >>sys.stderr, "Skipping sentence", sentenceGraph.sentenceElement.get("origId") return [] self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True) self.triggerFeatureBuilder.initSentence(sentenceGraph) undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph) paths = NX10.all_pairs_shortest_path(undirected, cutoff=999) # Get argument order self.interactionLengths = self.getInteractionEdgeLengths(sentenceGraph, paths) self.interactionLengths = self.interactionLengths.values() self.interactionLengths.sort(compareInteractionPrecedence) # Map tokens to entities tokenByOffset = {} for i in range(len(sentenceGraph.tokens)): token = sentenceGraph.tokens[i] if goldGraph != None: goldToken = goldGraph.tokens[i] assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset") tokenByOffset[token.get("charOffset")] = token.get("id") # Map gold entities to their head offsets goldEntitiesByOffset = {} for token in sentenceGraph.tokens: goldEntitiesByOffset[token.get("charOffset")] = [] entityToGold = {} for entity in 
sentenceGraph.entities: entityToGold[entity] = [] if goldGraph != None: for entity in goldGraph.entities: offset = entity.get("headOffset") assert offset != None goldEntitiesByOffset[offset].append(entity) # Map predicted entities to gold entities for entity in sentenceGraph.entities: eType = entity.get("type") eOffset = entity.get("headOffset") for goldEntity in goldEntitiesByOffset[eOffset]: if goldEntity.get("type") == eType: entityToGold[entity].append(goldEntity) # Map entities to interactions # interactionsByEntityId = {} # for entity in sentenceGraph.entities: # interactionsByEntityId[entity.get("id")] = [] # Map tokens to interactions interactionsByToken = {} for token in sentenceGraph.tokens: interactionsByToken[token] = [] for interactionTuple in self.interactionLengths: interaction = interactionTuple[0] if interaction.get("type") == "neg": continue e1Id = interaction.get("e1") token = sentenceGraph.entityHeadTokenByEntity[sentenceGraph.entitiesById[e1Id]] interactionsByToken[token].append(interaction) examples = [] exampleIndex = 0 self.tokenFeatures = {} # namedEntityNorStrings = set() namedEntityHeadTokens = [] if not "names" in self.styles: namedEntityCount = 0 for entity in sentenceGraph.entities: if entity.get("isName") == "True": # known data which can be used for features namedEntityCount += 1 # namedEntityNorStrings.add( entity.get("text").replace("-","").replace("/","").replace(",","").replace("\\","").replace(" ","").lower() ) namedEntityCountFeature = "nameCount_" + str(namedEntityCount) # if namedEntityCount == 0: # no names, no need for triggers # return [] if "pos_pairs" in self.styles: namedEntityHeadTokens = self.getNamedEntityHeadTokens(sentenceGraph) # neFeatures = {} # F: 69.35 -> 69.14 # for norString in namedEntityNorStrings: # neFeatures[self.featureSet.getId("norNE_" + norString)] = 1 bagOfWords = {} for token in sentenceGraph.tokens: text = "bow_" + token.get("text") if not bagOfWords.has_key(text): bagOfWords[text] = 0 bagOfWords[text] += 1 if sentenceGraph.tokenIsName[token]: text = "ne_" + text if not bagOfWords.has_key(text): bagOfWords[text] = 0 bagOfWords[text] += 1 bowFeatures = {} for k, v in bagOfWords.iteritems(): bowFeatures[self.featureSet.getId(k)] = v self.inEdgesByToken = {} self.outEdgesByToken = {} self.edgeSetByToken = {} for token in sentenceGraph.tokens: inEdges = sentenceGraph.dependencyGraph.in_edges(token, data=True) fixedInEdges = [] for edge in inEdges: fixedInEdges.append((edge[0], edge[1], edge[2]["element"])) inEdges = fixedInEdges inEdges.sort(compareDependencyEdgesById) self.inEdgesByToken[token] = inEdges outEdges = sentenceGraph.dependencyGraph.out_edges(token, data=True) fixedOutEdges = [] for edge in outEdges: fixedOutEdges.append((edge[0], edge[1], edge[2]["element"])) outEdges = fixedOutEdges outEdges.sort(compareDependencyEdgesById) self.outEdgesByToken[token] = outEdges self.edgeSetByToken[token] = set(inEdges + outEdges) for i in range(len(sentenceGraph.tokens)): token = sentenceGraph.tokens[i] # Recognize only non-named entities (i.e. 
interaction words) if sentenceGraph.tokenIsName[token] and not "names" in self.styles and not "all_tokens" in self.styles: continue # CLASS # if len(sentenceGraph.tokenIsEntityHead[token]) > 0: # category = self.classSet.getId(self.getMergedEntityType(sentenceGraph.tokenIsEntityHead[token])) # else: # category = 1 offset = token.get("charOffset") if len(goldEntitiesByOffset[offset]) > 0: category = self.classSet.getId(self.getMergedEntityType(goldEntitiesByOffset[offset])) else: category = 1 tokenText = token.get("text").lower() if "stem_gazetteer" in self.styles: tokenText = PorterStemmer.stem(tokenText) if ("exclude_gazetteer" in self.styles) and self.gazetteer and tokenText not in self.gazetteer: features = {} features[self.featureSet.getId("exclude_gazetteer")] = 1 extra = {"xtype": "token", "t": token.get("id"), "excluded": "True"} examples.append((sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra)) exampleIndex += 1 continue # FEATURES features = {} self.features = features if not "names" in self.styles: features[self.featureSet.getId(namedEntityCountFeature)] = 1 # for k,v in bagOfWords.iteritems(): # features[self.featureSet.getId(k)] = v # pre-calculate bow _features_ features.update(bowFeatures) # features.update(neFeatures) # for j in range(len(sentenceGraph.tokens)): # text = "bow_" + sentenceGraph.tokens[j].get("text") # if j < i: # features[self.featureSet.getId("bf_" + text)] = 1 # elif j > i: # features[self.featureSet.getId("af_" + text)] = 1 # Main features text = token.get("text") features[self.featureSet.getId("txt_" + text)] = 1 features[self.featureSet.getId("POS_" + token.get("POS"))] = 1 stem = PorterStemmer.stem(text) features[self.featureSet.getId("stem_" + stem)] = 1 features[self.featureSet.getId("nonstem_" + text[len(stem) :])] = 1 # Normalized versions of the string (if same as non-normalized, overlap without effect) normalizedText = ( text.replace("-", "").replace("/", "").replace(",", "").replace("\\", "").replace(" ", "").lower() ) if normalizedText == "bound": # should be for all irregular verbs normalizedText = "bind" features[self.featureSet.getId("txt_" + normalizedText)] = 1 norStem = PorterStemmer.stem(normalizedText) features[self.featureSet.getId("stem_" + norStem)] = 1 features[self.featureSet.getId("nonstem_" + normalizedText[len(norStem) :])] = 1 if "gazetteer_features_maintoken" in self.styles: tokTxtLower = text.lower() if "stem_gazetteer" in self.styles: tokTxtLower = PorterStemmer.stem(tokTxtLower) if self.gazetteer and tokTxtLower in self.gazetteer: for label, weight in self.gazetteer[tokTxtLower].items(): features[self.featureSet.getId("gaz_knownLabel_" + label)] = weight # 1 performs slightly worse # Linear order features # for index in [-3,-2,-1,1,2,3,4,5]: # 69.35 -> 68.97 for index in [-3, -2, -1, 1, 2, 3]: if i + index > 0 and i + index < len(sentenceGraph.tokens): self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features) # Content if i > 0 and text[0].isalpha() and text[0].isupper(): features[self.featureSet.getId("upper_case_start")] = 1 for j in range(len(text)): if j > 0 and text[j].isalpha() and text[j].isupper(): features[self.featureSet.getId("upper_case_middle")] = 1 # numbers and special characters if text[j].isdigit(): features[self.featureSet.getId("has_digits")] = 1 if j > 0 and text[j - 1] == "-": features[self.featureSet.getId("has_hyphenated_digit")] = 1 elif text[j] == "-": features[self.featureSet.getId("has_hyphen")] = 1 elif text[j] == "/": 
features[self.featureSet.getId("has_fslash")] = 1 elif text[j] == "\\": features[self.featureSet.getId("has_bslash")] = 1 # duplets if j > 0: features[self.featureSet.getId("dt_" + text[j - 1 : j + 1].lower())] = 1 # triplets if j > 1: features[self.featureSet.getId("tt_" + text[j - 2 : j + 1].lower())] = 1 # Attached edges (Hanging in and out edges) t1InEdges = self.inEdgesByToken[token] for edge in t1InEdges: edgeType = edge[2].get("type") features[self.featureSet.getId("t1HIn_" + edgeType)] = 1 features[self.featureSet.getId("t1HIn_" + edge[0].get("POS"))] = 1 features[self.featureSet.getId("t1HIn_" + edgeType + "_" + edge[0].get("POS"))] = 1 tokenText = sentenceGraph.getTokenText(edge[0]) features[self.featureSet.getId("t1HIn_" + tokenText)] = 1 features[self.featureSet.getId("t1HIn_" + edgeType + "_" + tokenText)] = 1 t1OutEdges = self.outEdgesByToken[token] for edge in t1OutEdges: edgeType = edge[2].get("type") features[self.featureSet.getId("t1HOut_" + edgeType)] = 1 features[self.featureSet.getId("t1HOut_" + edge[1].get("POS"))] = 1 features[self.featureSet.getId("t1HOut_" + edgeType + "_" + edge[1].get("POS"))] = 1 tokenText = sentenceGraph.getTokenText(edge[1]) features[self.featureSet.getId("t1HOut_" + tokenText)] = 1 features[self.featureSet.getId("t1HOut_" + edgeType + "_" + tokenText)] = 1 extra = {"xtype": "token", "t": token.get("id")} examples.append((sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra)) exampleIndex += 1 # chains self.buildChains(token, sentenceGraph, features) if "pos_pairs" in self.styles: self.buildPOSPairs(token, namedEntityHeadTokens, features) self.buildPredictionFeatures(sentenceGraph, paths, token, interactionsByToken[token]) return examples def buildChains(self, token, sentenceGraph, features, depthLeft=3, chain="", visited=None): if depthLeft == 0: return strDepthLeft = "dist_" + str(depthLeft) if visited == None: visited = set() inEdges = self.inEdgesByToken[token] outEdges = self.outEdgesByToken[token] edgeSet = visited.union(self.edgeSetByToken[token]) for edge in inEdges: if not edge in visited: edgeType = edge[2].get("type") features[self.featureSet.getId("dep_" + strDepthLeft + edgeType)] = 1 nextToken = edge[0] for tokenFeature, w in self.getTokenFeatures(nextToken, sentenceGraph).iteritems(): features[self.featureSet.getId(strDepthLeft + tokenFeature)] = w # for entity in sentenceGraph.tokenIsEntityHead[nextToken]: # if entity.get("isName") == "True": # features[self.featureSet.getId("name_dist_"+strDepthLeft)] = 1 # features[self.featureSet.getId("name_dist_"+strDepthLeft+entity.get("type"))] = 1 # features[self.featureSet.getId("POS_dist_"+strDepthLeft+nextToken.get("POS"))] = 1 # tokenText = sentenceGraph.getTokenText(nextToken) # features[self.featureSet.getId("text_dist_"+strDepthLeft+tokenText)] = 1 if sentenceGraph.tokenIsName[nextToken]: features[self.featureSet.getId("name_chain_dist_" + strDepthLeft + chain + "-frw_" + edgeType)] = 1 features[self.featureSet.getId("chain_dist_" + strDepthLeft + chain + "-frw_" + edgeType)] = 1 self.buildChains(nextToken, sentenceGraph, features, depthLeft - 1, chain + "-frw_" + edgeType, edgeSet) for edge in outEdges: if not edge in visited: edgeType = edge[2].get("type") features[self.featureSet.getId("dep_dist_" + strDepthLeft + edgeType)] = 1 nextToken = edge[1] for tokenFeature, w in self.getTokenFeatures(nextToken, sentenceGraph).iteritems(): features[self.featureSet.getId(strDepthLeft + tokenFeature)] = w # for entity in 
sentenceGraph.tokenIsEntityHead[nextToken]: # if entity.get("isName") == "True": # features[self.featureSet.getId("name_dist_"+strDepthLeft)] = 1 # features[self.featureSet.getId("name_dist_"+strDepthLeft+entity.get("type"))] = 1 # features[self.featureSet.getId("POS_dist_"+strDepthLeft+nextToken.get("POS"))] = 1 # tokenText = sentenceGraph.getTokenText(nextToken) # features[self.featureSet.getId("text_dist_"+strDepthLeft+tokenText)] = 1 if sentenceGraph.tokenIsName[nextToken]: features[self.featureSet.getId("name_chain_dist_" + strDepthLeft + chain + "-rev_" + edgeType)] = 1 features[self.featureSet.getId("chain_dist_" + strDepthLeft + chain + "-rev_" + edgeType)] = 1 self.buildChains(nextToken, sentenceGraph, features, depthLeft - 1, chain + "-rev_" + edgeType, edgeSet) def getNamedEntityHeadTokens(self, sentenceGraph): headTokens = [] for entity in sentenceGraph.entities: if entity.get("isName") == "True": # known data which can be used for features headTokens.append(sentenceGraph.entityHeadTokenByEntity[entity]) return headTokens def buildPOSPairs(self, token, namedEntityHeadTokens, features): tokenPOS = token.get("POS") assert tokenPOS != None for headToken in namedEntityHeadTokens: headPOS = headToken.get("POS") features[self.featureSet.getId("POS_pair_NE_" + tokenPOS + "-" + headPOS)] = 1 ###################################################### # Unmerging-style features ###################################################### def buildPredictionFeatures(self, sentenceGraph, paths, token, interactions): # themeEntities, causeEntities=None): # NOTE!!!! TODO # add also features for arguments present, but not in this combination self.buildInterArgumentBagOfWords(interactions, sentenceGraph) if sentenceGraph.entitiesByToken.has_key(token): for eventEntity in sentenceGraph.entitiesByToken[token]: eventEntityType = eventEntity.get("type") self.setFeature("rootType_" + eventEntity.get("type"), 1) self.setFeature("predStrength" + eventEntityType, self.getPredictionStrength(eventEntity)) self.triggerFeatureBuilder.setFeatureVector(self.features) self.triggerFeatureBuilder.tag = "trg" + eventEntityType + "_" self.triggerFeatureBuilder.buildFeatures(token) self.triggerFeatureBuilder.tag = None argThemeCount = 0 argCauseCount = 0 # Current example's edge combination for i in range(len(interactions)): arg = interactions[i] if arg.get("type") == "Theme": argThemeCount += 1 self.buildArgumentFeatures(sentenceGraph, paths, self.features, token, arg, "argTheme") self.buildArgumentFeatures(sentenceGraph, paths, self.features, token, arg, "argTheme" + str(i)) else: # Cause argCauseCount += 1 self.buildArgumentFeatures(sentenceGraph, paths, self.features, token, arg, "argCause") self.buildArgumentFeatures(sentenceGraph, paths, self.features, token, arg, "argCause" + str(i)) self.setFeature("argCount", len(interactions)) self.setFeature("argCount_" + str(len(interactions)), 1) self.setFeature("argThemeCount", argThemeCount) self.setFeature("argThemeCount_" + str(argThemeCount), 1) self.setFeature("argCauseCount", argCauseCount) self.setFeature("argCauseCount_" + str(argCauseCount), 1) self.triggerFeatureBuilder.tag = "" self.triggerFeatureBuilder.setFeatureVector(None) def buildArgumentFeatures(self, sentenceGraph, paths, features, eventToken, arg, tag): argEntity = sentenceGraph.entitiesById[arg.get("e2")] argToken = sentenceGraph.entityHeadTokenByEntity[argEntity] self.buildEdgeFeatures(sentenceGraph, paths, features, eventToken, argToken, tag) self.triggerFeatureBuilder.tag = tag + "trg_" 
self.triggerFeatureBuilder.buildFeatures(argToken) if argEntity.get("isName") == "True": self.setFeature(tag + "Protein", 1) else: self.setFeature(tag + "Event", 1) self.setFeature("nestingEvent", 1) self.setFeature(tag + "_" + argEntity.get("type"), 1) def buildEdgeFeatures(self, sentenceGraph, paths, features, eventToken, argToken, tag): # eventToken = sentenceGraph.entityHeadTokenByEntity[eventNode] # argToken = sentenceGraph.entityHeadTokenByEntity[argNode] self.multiEdgeFeatureBuilder.tag = tag + "_" self.multiEdgeFeatureBuilder.setFeatureVector(features, None, None, False) self.setFeature(tag + "_present", 1) if eventToken != argToken and paths.has_key(eventToken) and paths[eventToken].has_key(argToken): path = paths[eventToken][argToken] edges = self.multiEdgeFeatureBuilder.getEdges(sentenceGraph.dependencyGraph, path) else: path = [eventToken, argToken] edges = None if not "disable_entity_features" in self.styles: self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph) self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path) # if not "disable_terminus_features" in self.styles: # self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast if not "disable_single_element_features" in self.styles: self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, edges, sentenceGraph) if not "disable_ngram_features" in self.styles: self.multiEdgeFeatureBuilder.buildPathGrams(2, path, edges, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams(3, path, edges, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams(4, path, edges, sentenceGraph) # remove for fast if not "disable_path_edge_features" in self.styles: self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, edges, sentenceGraph) # self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph) self.multiEdgeFeatureBuilder.setFeatureVector(None, None, None, False) self.multiEdgeFeatureBuilder.tag = "" def buildInterArgumentBagOfWords(self, arguments, sentenceGraph): if len(arguments) < 2: return indexByToken = {} for i in range(len(sentenceGraph.tokens)): indexByToken[sentenceGraph.tokens[i]] = i argTokenIndices = set() for arg in arguments: argEntity = sentenceGraph.entitiesById[arg.get("e2")] argToken = sentenceGraph.entityHeadTokenByEntity[argEntity] argTokenIndices.add(indexByToken[argToken]) minIndex = min(argTokenIndices) maxIndex = max(argTokenIndices) self.setFeature("argBoWRange", (maxIndex - minIndex)) self.setFeature("argBoWRange_" + str(maxIndex - minIndex), 1) bow = set() for i in range(minIndex + 1, maxIndex): token = sentenceGraph.tokens[i] if len(sentenceGraph.tokenIsEntityHead[token]) == 0 and not sentenceGraph.tokenIsName[token]: bow.add(token.get("text")) bow = sorted(list(bow)) for word in bow: self.setFeature("argBoW_" + word, 1) if word in ["/", "-"]: self.setFeature("argBoW_slashOrHyphen", 1) if len(bow) == 1: self.setFeature("argBoWonly_" + bow[0], 1) if bow[0] in ["/", "-"]: self.setFeature("argBoWonly_slashOrHyphen", 1)
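# --- Illustrative sketch, not part of the original module -----------------
# buildInterArgumentBagOfWords() above collects the words that fall strictly
# between the leftmost and rightmost argument head tokens of an event. The
# standalone helper below reproduces only that span selection on a plain
# list of token strings; it omits the entity and named-entity filtering that
# the real method applies.
def _sketchInterArgumentWords(tokenTexts, argIndices):
    if len(argIndices) < 2:
        return []
    minIndex = min(argIndices)
    maxIndex = max(argIndices)
    return sorted(set(tokenTexts[minIndex + 1:maxIndex]))

# Example: _sketchInterArgumentWords(["A", "binds", "to", "B"], [0, 3])
# returns ['binds', 'to']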