def duplicateEquiv(event, duplDict, debug): """ If the event (event tree) has arguments which have Equiv-statements, create a new event for each combination. Otherwise, return just the existing event. """ argList = [] # depth-first argument list hasEquiv = getArgs(event, argList) if not hasEquiv: return [event] if debug: print "----------------------------------------------" print "Event:", event.id, event.type, event.arguments print " Orig. Duplicates:", argList combinations = combine.combine(*argList) # make all combinations if debug: print " Dup. Combinations:", combinations newEvents = [] count = 0 # used only for marking duplicates' ids for combination in combinations: createdEvents = makeEvent(event, combination, count, duplDict=duplDict, debug=debug) newEvent = createdEvents[0] if debug: for createdEvent in createdEvents: if createdEvent == newEvent: print " New Event (root):", createdEvent.id, createdEvent.type, createdEvent.arguments else: print " New Event:", createdEvent.id, createdEvent.type, createdEvent.arguments Validate.validate([createdEvent], simulation=True) newEvents.append(newEvent) count += 1 return newEvents
def duplicateEquiv(event, duplDict, debug): """ If the event (event tree) has arguments which have Equiv-statements, create a new event for each combination. Otherwise, return just the existing event. """ argList = [] # depth-first argument list hasEquiv = getArgs(event, argList) if not hasEquiv: return [event] if debug: print "----------------------------------------------" print "Event:", event.id, event.type, event.arguments print " Orig. Duplicates:", argList combinations = combine.combine(*argList) # make all combinations if debug: print " Dup. Combinations:", combinations newEvents = [] count = 0 # used only for marking duplicates' ids for combination in combinations: createdEvents = makeEvent(event, combination, count, duplDict=duplDict, debug=debug) newEvent = createdEvents[0] if debug: for createdEvent in createdEvents: if createdEvent == newEvent: print " New Event (root):", createdEvent.id, createdEvent.type, createdEvent.arguments else: print " New Event:", createdEvent.id, createdEvent.type, createdEvent.arguments #Validate.validate([createdEvent], simulation=True) newEvents.append(newEvent) count += 1 return newEvents
def addEvent(self, example, arguments, sentenceObject, forceAdd=False, predictionStrength=None, exampleNotes=None):
    """
    Add an event entity and its argument interactions for every combination
    of the entities the given arguments can point to.

    With no arguments, the trigger is looked up from example[3]["e"] and a
    single argument-less event is added. Otherwise the trigger is taken from
    the "e1" attribute of the first argument; later arguments must name an
    "e1" listed in self.entityToDuplicates for that trigger. Each argument's
    "e2" target is resolved through self.entitiesByHeadByType (head offset,
    then type). A target that is not in sentenceObject.entitiesById is
    represented by the string "INTERSENTENCE".

    If a target has no candidate entities, forceAdd must be set: a target
    whose "given" attribute is "True" is used as is, any other target is
    first copied with self.addEntity(). When predictionStrength is given it
    is stored on each new root entity as "umConf".
    """
    if len(arguments) == 0:  # A zero-argument event
        origE1 = sentenceObject.entitiesById[example[3]["e"]]
        entityCombinations = [None]
    else:  # Collect e2 entities linked by this event
        triggerId = None
        origE1 = None
        candidatesPerArgument = []
        for argument in arguments:
            # Take the entity trigger node from the e1 attribute of the argument
            argE1Id = argument.get("e1")
            if triggerId is None:
                # find the trigger (any of the original identical triggers is OK)
                triggerId = argE1Id
                origE1 = sentenceObject.entitiesById[argE1Id]
            else:
                # trigger has already been found
                assert argE1Id in self.entityToDuplicates[triggerId], ((triggerId, argE1Id), example[3], arguments)

            e2Id = argument.get("e2")
            if e2Id not in sentenceObject.entitiesById:
                candidatesPerArgument.append(["INTERSENTENCE"])
                continue
            origE2 = sentenceObject.entitiesById[e2Id]
            e2HeadOffset = origE2.get("headOffset")
            e2Type = origE2.get("type")
            candidates = self.entitiesByHeadByType[e2HeadOffset][e2Type]
            if len(candidates) == 0:
                assert forceAdd
                if origE2.get("given") == "True":
                    candidates = [origE2]
                else:
                    candidates = [self.addEntity(origE2)]
            candidatesPerArgument.append(candidates)
        entityCombinations = combine.combine(*candidatesPerArgument)

    for combination in entityCombinations:
        assert origE1 is not None, (
            sentenceObject.sentence.get("id"),
            exampleNotes,
            [(x.get("id"), x.get("e1"), x.get("e2")) for x in arguments],
        )
        root = self.addEntity(origE1)
        if predictionStrength is not None:
            root.set("umConf", str(predictionStrength))
        for position, argument in enumerate(arguments):
            self.addInteraction(root, combination[position], argument)
def addEvent(self, example, arguments, sentenceObject, forceAdd=False, predictionStrength=None, exampleNotes=None):
    """
    Create one event (a copy of the trigger entity plus one interaction per
    argument) for each combination of argument target entities.

    example -- example whose extra data (example[3]["e"]) names the trigger
               of a zero-argument event
    arguments -- argument elements; "e1" is the trigger id, "e2" the target id
    sentenceObject -- provides entitiesById and, for messages, sentence
    forceAdd -- must be true when a target has no entity registered in
                self.entitiesByHeadByType for its head offset and type
    predictionStrength -- if not None, written to the new root as "umConf"
    exampleNotes -- only included in the assertion message
    """
    argumentCount = len(arguments)
    if argumentCount == 0:  # A zero-argument event
        e1Id = example[3]["e"]
        origE1 = sentenceObject.entitiesById[e1Id]
        entityCombinations = [None]
    else:  # Collect e2 entities linked by this event
        e1Id = None
        origE1 = None
        targetLists = []
        for arg in arguments:
            # Take the entity trigger node from the e1 attribute of the argument
            argE1Id = arg.get("e1")
            triggerKnown = e1Id is not None
            if triggerKnown:
                assert argE1Id in self.entityToDuplicates[e1Id], ((e1Id, argE1Id), example[3], arguments)
            else:
                # find the trigger (any of the original identical triggers is OK)
                e1Id = argE1Id
                origE1 = sentenceObject.entitiesById[argE1Id]

            e2Id = arg.get("e2")
            if e2Id in sentenceObject.entitiesById:
                origE2 = sentenceObject.entitiesById[e2Id]
                e2HeadOffset = origE2.get("headOffset")
                e2Type = origE2.get("type")
                targets = self.entitiesByHeadByType[e2HeadOffset][e2Type]
                if len(targets) == 0:
                    assert forceAdd
                    # entities that are not "given" are copied before use
                    isGiven = origE2.get("given") == "True"
                    targets = [origE2] if isGiven else [self.addEntity(origE2)]
            else:
                # the target entity is outside this sentence
                targets = ["INTERSENTENCE"]
            targetLists.append(targets)
        entityCombinations = combine.combine(*targetLists)

    for combination in entityCombinations:
        assert origE1 is not None, (sentenceObject.sentence.get("id"), exampleNotes, [(x.get("id"), x.get("e1"), x.get("e2")) for x in arguments])
        root = self.addEntity(origE1)
        if predictionStrength is not None:
            root.set("umConf", str(predictionStrength))
        for index in range(argumentCount):
            self.addInteraction(root, combination[index], arguments[index])
def addEvent(self, arguments, sentenceObject, umType="unknown", forceAdd=False, predictionStrength=None, exampleNotes=None):
    """
    Create one event (a copy of the trigger entity plus one interaction per
    argument) for each combination of argument target entities.

    arguments -- non-empty list of argument elements; every argument must
                 share the same "e1" (trigger) id, "e2" is the target id
    sentenceObject -- provides entitiesById and, for messages, sentence
    umType -- stored on each new root entity as "umType"
    forceAdd -- must be true when a target has no entity registered in
                self.entitiesByHeadByType for its head offset and type
    predictionStrength -- if not None, stored on the root as "umStrength"
    exampleNotes -- only included in assertion messages
    """
    assert len(arguments) > 0, (sentenceObject.sentence.get("id"), exampleNotes)
    # Collect e2 entities linked by this event
    e1Id = None
    origE1 = None
    argEntities = []
    for arg in arguments:
        # Take the entity trigger node from the e1 attribute of the argument
        argE1Id = arg.get("e1")
        if e1Id is not None:  # trigger has already been found
            assert e1Id == argE1Id
        else:  # find the trigger
            e1Id = argE1Id
            origE1 = sentenceObject.entitiesById[argE1Id]

        origE2 = sentenceObject.entitiesById[arg.get("e2")]
        e2HeadOffset = origE2.get("headOffset")
        e2Type = origE2.get("type")
        candidates = self.entitiesByHeadByType[e2HeadOffset][e2Type]
        if len(candidates) == 0:
            assert forceAdd
            if origE2.get("isName") != "True":
                candidates = [self.addEntity(origE2)]
            else:
                # Wrap the named entity in a list: combine.combine() is given
                # one list of candidates per argument, and every other branch
                # here supplies a list. The bare element used previously would
                # have been treated as the candidate collection itself.
                candidates = [origE2]
        argEntities.append(candidates)

    entityCombinations = combine.combine(*argEntities)
    for combination in entityCombinations:
        assert origE1 is not None, (sentenceObject.sentence.get("id"), exampleNotes, [(x.get("id"), x.get("e1"), x.get("e2")) for x in arguments])
        root = self.addEntity(origE1)
        root.set("umType", umType)
        if predictionStrength is not None:
            root.set("umStrength", str(predictionStrength))
        for index, arg in enumerate(arguments):
            self.addInteraction(root, combination[index], arg)
def getCombinations(parameters, order=None):
    """
    Expand a parameter definition into one dictionary per value combination.

    The definition is first passed through get(). A parameter whose value is
    a list or tuple contributes each of its items; any other value
    contributes itself as the only choice. combine.combine() produces the
    combinations, and each is returned as a {name: value} dictionary.

    If order is given it must contain exactly the parameter names (checked
    with an assertion); the names are then processed in that order instead
    of in sorted order.
    """
    parameters = get(parameters)
    names = sorted(parameters.keys())
    if order is not None:
        assert sorted(order) == names
        names = order

    choicesPerName = []
    for name in names:
        values = parameters[name]
        if not isinstance(values, (list, tuple)):
            values = [values]  # a single value is its own only choice
        choicesPerName.append([(name, value) for value in values])

    return [
        dict((pair[0], pair[1]) for pair in chosen)
        for chosen in combine.combine(*choicesPerName)
    ]
def getCombinations(parameters, order=None):
    """
    Return every combination of parameter values as a list of dictionaries.

    parameters -- parameter definition, passed through get() before use
    order -- optional list with exactly the parameter names; when given it
             replaces the default sorted name order

    List and tuple values are treated as alternatives, anything else as a
    single fixed value.
    """
    parameters = get(parameters)
    sortedNames = sorted(parameters.keys())
    if order is None:
        nameOrder = sortedNames
    else:
        assert sorted(order) == sortedNames
        nameOrder = order

    # One list of (name, value) pairs per parameter
    pairLists = []
    for name in nameOrder:
        value = parameters[name]
        if isinstance(value, (list, tuple)):
            pairs = [(name, alternative) for alternative in value]
        else:
            pairs = [(name, value)]
        pairLists.append(pairs)

    result = []
    for selection in combine.combine(*pairLists):
        combination = {}
        for pair in selection:
            combination[pair[0]] = pair[1]
        result.append(combination)
    return result
def doGrid(self):
    """
    Run the parameter grid search over trigger classifier, recall booster
    and edge classifier settings, then store the best setting in the models.

    Trigger examples are built once. Every (trigger, booster, edge)
    combination is then classified with the pre-trained models and scored
    with self.evaluateGrid(), which carries the best result so far. Trigger
    predictions are recomputed only when the trigger or booster parameter
    differs from the previous combination. The best booster value is saved
    as "recallAdjustParameter" in self.model and self.combinedModel; in full
    grid mode the best trigger and edge models are also registered.
    """
    print >> sys.stderr, "--------- Parameter grid search ---------"
    gridStem = self.workDir + "grid-"
    # Build trigger examples
    self.triggerDetector.buildExamples(self.model, [self.optData], [gridStem + "trigger-examples"])

    # The two modes differ only in which model string holds the classifier parameters
    parameterKey = "classifier-parameters-train" if self.fullGrid else "classifier-parameter"
    stepParams = {
        "trigger": Parameters.get(self.model.getStr(self.triggerDetector.tag + parameterKey, defaultIfNotExist=""), valueListKey="c"),
        "booster": [float(i) for i in self.recallAdjustParameters.split(",")],
        "edge": Parameters.get(self.model.getStr(self.edgeDetector.tag + parameterKey, defaultIfNotExist=""), valueListKey="c"),
    }
    # Expand the classifier parameters and turn each combination into its string form
    for step in ["trigger", "edge"]:
        stepParams[step] = [Parameters.toString(c) for c in Parameters.getCombinations(stepParams[step])]

    orderedSteps = [stepParams[x] for x in ["trigger", "booster", "edge"]]
    print >> sys.stderr, "Parameters", orderedSteps
    paramCombinations = combine(*orderedSteps)
    print >> sys.stderr, "Combinations", paramCombinations
    paramCombinations = [
        {"trigger": values[0], "booster": values[1], "edge": values[2]}
        for values in paramCombinations
    ]

    edgeModelStem = os.path.join(self.edgeDetector.workDir, os.path.normpath(self.model.path) + "-edge-models/model")
    triggerModelStem = os.path.join(self.triggerDetector.workDir, os.path.normpath(self.model.path) + "-trigger-models/model")
    self.structureAnalyzer.load(self.model)

    total = len(paramCombinations)
    prevParams = None
    bestResults = None
    for index, params in enumerate(paramCombinations):
        print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
        print >> sys.stderr, "Processing params", str(index + 1) + "/" + str(total), params
        print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
        # Triggers and Boost (the trigger predictions are recalculated only when the relevant parameters change)
        triggerOutdated = prevParams is None or any(prevParams[key] != params[key] for key in ("trigger", "booster"))
        if triggerOutdated:
            print >> sys.stderr, "Classifying trigger examples for parameters", "trigger:" + str(params["trigger"]), "booster:" + str(params["booster"])
            xml = self.triggerDetector.classifyToXML(
                self.optData,
                self.model,
                gridStem + "trigger-examples",
                gridStem,
                classifierModel=triggerModelStem + Parameters.toId(params["trigger"]),
                recallAdjust=params["booster"],
                useExistingExamples=True)
        prevParams = params
        # Classify with pre-defined model
        # NOTE(review): xml is rebound to the edge output below, so when the
        # trigger step is skipped the next edge classification receives the
        # previous edge output as input -- confirm this is intended.
        edgeClassifierModel = edgeModelStem + Parameters.toId(params["edge"])
        xml = self.edgeDetector.classifyToXML(
            xml,
            self.model,
            gridStem + "edge-examples",
            gridStem,
            classifierModel=edgeClassifierModel,
            goldData=self.optData)
        bestResults = self.evaluateGrid(xml, params, bestResults)

    # Remove remaining intermediate grid files
    for tag1 in ["edge", "trigger", "unmerging"]:
        for tag2 in ["examples", "pred.xml.gz"]:
            leftover = gridStem + tag1 + "-" + tag2
            if os.path.exists(leftover):
                os.remove(leftover)

    print >> sys.stderr, "Parameter grid search complete"
    print >> sys.stderr, "Tested", total, "combinations"
    print >> sys.stderr, "Best parameters:", bestResults[0]
    print >> sys.stderr, "Best result:", bestResults[2]  # f-score

    # Save grid model
    bestParams = bestResults[0]
    self.saveStr("recallAdjustParameter", str(bestParams["booster"]), self.model)
    self.saveStr("recallAdjustParameter", str(bestParams["booster"]), self.combinedModel, False)
    if self.fullGrid:  # define best models
        self.triggerDetector.addClassifierModel(self.model, triggerModelStem + str(bestParams["trigger"]), bestParams["trigger"])
        self.edgeDetector.addClassifierModel(self.model, edgeModelStem + str(bestParams["edge"]), bestParams["edge"])

    # Remove work files
    # NOTE(review): "examples.gz" and "pred.xml.gz" are appended without a
    # separating hyphen (e.g. "grid-triggerexamples.gz"), unlike the cleanup
    # loop above -- confirm these are the intended file names.
    for stepTag in [gridStem + "trigger", gridStem + "edge", gridStem + "unmerging"]:
        for fileStem in ["-classifications", "-classifications.log", "examples.gz", "pred.xml.gz"]:
            workFile = stepTag + fileStem
            if os.path.exists(workFile):
                os.remove(workFile)
def doGrid(self):
    """
    Run the parameter grid search over trigger classifier, recall booster
    and edge classifier settings, then store the best setting in the models.

    Trigger examples are built once. Every (trigger, booster, edge)
    combination is then classified with the pre-trained models and scored
    with self.evaluateGrid(), which carries the best result so far. Trigger
    predictions are recomputed whenever the trigger OR the booster parameter
    differs from the previous combination. The best booster value is saved
    as "recallAdjustParameter" in self.model and self.combinedModel; in full
    grid mode the best trigger and edge models are also registered.
    """
    print >> sys.stderr, "--------- Parameter grid search ---------"
    # Build trigger examples
    self.triggerDetector.buildExamples(self.model, [self.optData], [self.workDir+"grid-trigger-examples.gz"])

    # The two modes differ only in which model string holds the classifier parameters
    if self.fullGrid:
        parameterKey = "classifier-parameters-train"
    else:
        parameterKey = "classifier-parameter"
    stepParams = {
        "trigger":Parameters.get(self.model.getStr(self.triggerDetector.tag+parameterKey, defaultIfNotExist=""), valueListKey="c"),
        "booster":[float(i) for i in self.recallAdjustParameters.split(",")],
        "edge":Parameters.get(self.model.getStr(self.edgeDetector.tag+parameterKey, defaultIfNotExist=""), valueListKey="c")}

    # Expand the classifier parameters and turn each combination into its string form
    for step in ["trigger", "edge"]:
        stepParams[step] = Parameters.getCombinations(stepParams[step])
        for i in range(len(stepParams[step])):
            stepParams[step][i] = Parameters.toString(stepParams[step][i])
    print >> sys.stderr, [stepParams[x] for x in ["trigger", "booster", "edge"]]
    paramCombinations = combine(*[stepParams[x] for x in ["trigger", "booster", "edge"]])
    print >> sys.stderr, paramCombinations
    for i in range(len(paramCombinations)):
        paramCombinations[i] = {"trigger":paramCombinations[i][0], "booster":paramCombinations[i][1], "edge":paramCombinations[i][2]}

    prevParams = None
    EDGE_MODEL_STEM = os.path.join(self.edgeDetector.workDir, os.path.normpath(self.model.path)+"-edge-models/model")
    TRIGGER_MODEL_STEM = os.path.join(self.triggerDetector.workDir, os.path.normpath(self.model.path)+"-trigger-models/model")
    self.structureAnalyzer.load(self.model)
    bestResults = None
    for i, params in enumerate(paramCombinations):
        print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
        print >> sys.stderr, "Processing params", str(i+1) + "/" + str(len(paramCombinations)), params
        print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
        # Triggers and Boost. The trigger predictions depend on both the
        # trigger classifier and the recall booster (recallAdjust), so they
        # must be recalculated when either changes. The booster comparison
        # was previously a duplicate of the trigger comparison, which left
        # stale trigger predictions in use after a booster change.
        if prevParams is None or prevParams["trigger"] != params["trigger"] or prevParams["booster"] != params["booster"]:
            print >> sys.stderr, "Classifying trigger examples for parameters", "trigger:" + str(params["trigger"]), "booster:" + str(params["booster"])
            xml = self.triggerDetector.classifyToXML(self.optData, self.model, self.workDir+"grid-trigger-examples", self.workDir+"grid-", classifierModel=TRIGGER_MODEL_STEM + Parameters.toId(params["trigger"]), recallAdjust=params["booster"])
        prevParams = params
        # Classify with pre-defined model
        # NOTE(review): xml is rebound to the edge output below, so when the
        # trigger step is skipped the next edge classification receives the
        # previous edge output as input -- confirm this is intended.
        edgeClassifierModel = EDGE_MODEL_STEM + Parameters.toId(params["edge"])
        xml = self.edgeDetector.classifyToXML(xml, self.model, self.workDir+"grid-edge-examples", self.workDir+"grid-", classifierModel=edgeClassifierModel, goldData=self.optData)
        bestResults = self.evaluateGrid(xml, params, bestResults)

    # Remove remaining intermediate grid files
    for tag1 in ["edge", "trigger", "unmerging"]:
        for tag2 in ["examples", "pred.xml.gz"]:
            if os.path.exists(self.workDir+"grid-"+tag1+"-"+tag2):
                os.remove(self.workDir+"grid-"+tag1+"-"+tag2)

    print >> sys.stderr, "Parameter grid search complete"
    print >> sys.stderr, "Tested", len(paramCombinations), "combinations"
    print >> sys.stderr, "Best parameters:", bestResults[0]
    print >> sys.stderr, "Best result:", bestResults[2] # f-score

    # Save grid model
    self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]), self.model)
    self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]), self.combinedModel, False)
    if self.fullGrid: # define best models
        self.triggerDetector.addClassifierModel(self.model, TRIGGER_MODEL_STEM+str(bestResults[0]["trigger"]), bestResults[0]["trigger"])
        self.edgeDetector.addClassifierModel(self.model, EDGE_MODEL_STEM+str(bestResults[0]["edge"]), bestResults[0]["edge"])

    # Remove work files
    # NOTE(review): "examples.gz" and "pred.xml.gz" are appended without a
    # separating hyphen (e.g. "grid-triggerexamples.gz"), so the
    # "grid-trigger-examples.gz" file built above is not matched by either
    # cleanup loop -- confirm the intended file names.
    for stepTag in [self.workDir+"grid-trigger", self.workDir+"grid-edge", self.workDir+"grid-unmerging"]:
        for fileStem in ["-classifications", "-classifications.log", "examples.gz", "pred.xml.gz"]:
            if os.path.exists(stepTag+fileStem):
                os.remove(stepTag+fileStem)
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
    """
    Build the unmerging examples of a single sentence and append them to outfile.

    For every entity (merged entities unless the "no_merge" style is set),
    all combinations of its valid outgoing event arguments are generated.
    A combination accepted by self.eventIsGold() is labelled with the entity
    type (or "pos" in the "binary" style); every other combination is
    labelled "neg". Combinations that pass structureAnalyzer.isValidEvent()
    are converted with self.buildExample() and written with
    ExampleUtils.appendExamples(); the rest are recorded as filtered in
    self.exampleStats.

    Returns the number of examples written for this sentence.
    See Core/ExampleUtils for example format.
    """
    self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
    self.triggerFeatureBuilder.initSentence(sentenceGraph)
    paths = sentenceGraph.dependencyGraph.toUndirected()
    # Get argument order
    self.interactionLenghts = self.getInteractionEdgeLengths(sentenceGraph, paths)

    hasGold = goldGraph is not None

    # Map tokens to character offsets
    tokenByOffset = {}
    for tokenIndex, token in enumerate(sentenceGraph.tokens):
        if hasGold:  # check that the tokenizations match
            goldToken = goldGraph.tokens[tokenIndex]
            assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset")
        tokenByOffset[token.get("charOffset")] = token.get("id")

    # Map gold entities to their head offsets
    goldEntitiesByOffset = {}
    if hasGold:
        for goldEntity in goldGraph.entities:
            headOffset = goldEntity.get("headOffset")
            assert headOffset is not None
            goldEntitiesByOffset.setdefault(headOffset, []).append(goldEntity)

    mergeInput = not self.styles["no_merge"]
    if mergeInput:
        sentenceGraph.mergeInteractionGraph(True)
        entities = sentenceGraph.mergedEntities
        self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))
    else:
        entities = sentenceGraph.entities

    exampleIndex = 0
    for entity in entities:
        if type(entity) in types.StringTypes:  # dummy entity for intersentence interactions
            continue
        eType = entity.get("type")
        assert eType is not None, entity.attrib
        eType = str(eType)

        outEdges = sentenceGraph.getOutInteractions(entity, mergeInput)
        interactions = self.sortInteractionsById([edge[2] for edge in outEdges])

        # Count all event arguments by type and keep the structurally valid ones
        interactionCounts = defaultdict(int)
        validInteractionsByType = defaultdict(list)
        for interaction in interactions:
            if interaction.get("event") != "True":
                continue
            e1 = sentenceGraph.entitiesById[interaction.get("e1")]
            if interaction.get("e2") not in sentenceGraph.entitiesById:
                # intersentence
                validInteractionsByType[interaction.get("type")].append(interaction)
            else:
                e2 = sentenceGraph.entitiesById[interaction.get("e2")]
                if interaction.get("type") in structureAnalyzer.getValidEdgeTypes(e1.get("type"), e2.get("type")):
                    validInteractionsByType[interaction.get("type")].append(interaction)
            interactionCounts[interaction.get("type")] += 1
        interactionCountString = ",".join(
            name + "=" + str(interactionCounts[name]) for name in sorted(interactionCounts.keys()))

        if self.debug:
            print >> sys.stderr, entity.get("id"), entity.get("type"), "int:" + interactionCountString, "validInt:" + str(validInteractionsByType)

        # For each argument type the event can have, collect every allowed
        # combination of arguments of that one type. Note that there may be
        # a zero-length combination.
        intCombinations = []
        validIntTypeCount = 0
        maxArgCount = 0
        for intType in sorted(validInteractionsByType.keys()):
            validIntTypeCount += 1
            minArgs, maxArgs = structureAnalyzer.getArgLimits(entity.get("type"), intType)
            maxArgCount = max(maxArgCount, maxArgs)
            singleTypeCombinations = []
            for combLen in range(minArgs, maxArgs + 1):
                singleTypeCombinations.extend(combinations(validInteractionsByType[intType], combLen))
            intCombinations.append(singleTypeCombinations)

        # intCombinations now contains a list of lists, each of which has a
        # tuple for each valid combination of one argument type. Next, we'll
        # make all valid combinations of multiple argument types.
        if self.debug:
            print >> sys.stderr, " ", "intCombinations", intCombinations
        argCombinations = combine.combine(*intCombinations)
        if self.debug:
            print >> sys.stderr, " ", "argCombinations", argCombinations
        # flatten each nested combination into a single tuple of interactions
        argCombinations = [sum(nested, ()) for nested in argCombinations]
        if self.debug:
            print >> sys.stderr, " ", "argCombinations flat", argCombinations

        for argCombination in argCombinations:
            # Originally binary classification
            isGoldEvent = False
            if hasGold:
                isGoldEvent = self.eventIsGold(entity, argCombination, sentenceGraph, goldGraph, goldEntitiesByOffset, goldGraph.interactions)
            # Named (multi-)class
            if not isGoldEvent:
                category = "neg"
            else:
                category = "pos" if self.styles["binary"] else entity.get("type")
                assert category is not None
            self.exampleStats.beginExample(category)
            issues = defaultdict(int)

            # early out for proteins etc.
            if validIntTypeCount == 0 and entity.get("given") == "True":
                self.exampleStats.filter("given-leaf:" + entity.get("type"))
                if self.debug:
                    print >> sys.stderr, " ", category +"("+eType+")", "arg combination", argCombination, "LEAF"
            elif structureAnalyzer.isValidEvent(entity, argCombination, self.documentEntitiesById, issues=issues):
                if self.debug:
                    print >> sys.stderr, " ", category, "arg combination", argCombination, "VALID"
                argString = "".join("," + arg.get("type") + "=" + arg.get("id") for arg in argCombination)
                extra = {"xtype":"um","e":entity.get("id"),"i":argString[1:],"etype":eType,"class":category}
                extra["allInt"] = interactionCountString
                assert type(extra["etype"]) in types.StringTypes, extra
                assert type(extra["class"]) in types.StringTypes, category
                assert type(extra["i"]) in types.StringTypes, argString
                example = self.buildExample(sentenceGraph, paths, entity, argCombination, interactions)
                example[0] = sentenceGraph.getSentenceId()+".x"+str(exampleIndex)
                example[1] = self.classSet.getId(category)
                example[3] = extra
                ExampleUtils.appendExamples([example], outfile)
                exampleIndex += 1
            else:
                for issue in issues:
                    self.exampleStats.filter(issue)
                if self.debug:
                    print >> sys.stderr, " ", category, "arg combination", argCombination, "INVALID", issues
            self.exampleStats.endExample()
    return exampleIndex
def buildExamplesFromGraph(sentenceGraph, args, goldGraph=None, structureAnalyzer=None, debug=False):
    """
    Count gold and non-gold unmerging candidates for a single sentence.

    The sentence graph is merged, and for every merged entity all
    combinations of its valid outgoing event arguments are enumerated. Each
    combination containing a 'Theme' argument is checked with eventIsGold()
    and counted under the entity's type as "tp" (gold) or "fp" (not gold).
    No examples are built or written by this function.

    sentenceGraph -- unmerged sentence graph; mergeInteractionGraph(True) is
                     called on it here
    args -- options object; only args.apply_alg is read. When it is true,
            only the longest argument combinations are kept for Regulation
            types, Binding, Localization and Phosphorylation
    goldGraph -- unmerged gold graph, or None (then nothing counts as gold)
    structureAnalyzer -- provides getValidEdgeTypes() and getArgLimits()
    debug -- print intermediate results to stderr, and stop in pdb on
             non-gold Positive_regulation candidates

    Returns (exampleIndex, exampleCounter, dupEntityCnt). exampleIndex is
    never incremented here and is therefore always 0; exampleCounter maps
    event type to {"tp": n, "fp": n}; dupEntityCnt is the number of entities
    removed by merging.
    """
    exampleIndex = 0
    exampleCounter = defaultdict(dict)  # exampleCounter['Binding']: {"tp":xxx, "fp": xxx}

    # Check that the tokenizations match
    if goldGraph is not None:
        for i, token in enumerate(sentenceGraph.tokens):
            goldToken = goldGraph.tokens[i]
            assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset")

    # Map gold entities to their head offsets
    goldEntitiesByOffset = {}
    if goldGraph is not None:
        for goldEntity in goldGraph.entities:
            offset = goldEntity.get("headOffset")
            assert offset is not None
            goldEntitiesByOffset.setdefault(offset, []).append(goldEntity)

    # The entities include both named entities (Protein) and event triggers.
    # Merging converts the original annotation, where a trigger can have
    # multiple trigger annotations, to the merged version.
    mergeInput = True
    assert sentenceGraph.mergedEntityToDuplicates is None  # the sentence graph must still be unmerged
    sentenceGraph.mergeInteractionGraph(True)
    assert sentenceGraph.mergedEntityToDuplicates is not None  # now it is the merged graph
    if goldGraph is not None:
        # goldGraph defaults to None, so only check it when it was given
        assert goldGraph.mergedEntityToDuplicates is None  # the gold graph stays unmerged
    entities = sentenceGraph.mergedEntities
    # Use the sentenceGraph parameter; the previous "sentence.sentenceGraph"
    # referred to a name that is not defined in this function.
    dupEntityCnt = len(sentenceGraph.entities) - len(entities)

    # Event types for which only the longest argument combinations are kept
    # when args.apply_alg is set (e.g. (Theme, Cause) is preferred over (Theme))
    longestOnlyTypes = ['Negative_regulation', 'Positive_regulation', 'Regulation',
                        'Binding', 'Localization', 'Phosphorylation']

    for entity in entities:
        if type(entity) in types.StringTypes:  # dummy entity for intersentence interactions
            continue
        eType = entity.get("type")
        assert eType is not None, entity.attrib
        eType = str(eType)
        interactions = [x[2] for x in sentenceGraph.getOutInteractions(entity, mergeInput)]
        interactions = sortInteractionsById(interactions)

        # interactions are the outgoing edges of the current merged entity
        interactionCounts = defaultdict(int)
        validInteractionsByType = defaultdict(list)
        for interaction in interactions:
            if interaction.get("event") != "True":
                continue
            e1 = sentenceGraph.entitiesById[interaction.get("e1")]
            if interaction.get("e2") in sentenceGraph.entitiesById:
                e2 = sentenceGraph.entitiesById[interaction.get("e2")]
                if interaction.get("type") in structureAnalyzer.getValidEdgeTypes(e1.get("type"), e2.get("type")):
                    validInteractionsByType[interaction.get("type")].append(interaction)
            else:  # intersentence
                print("found inter-sent")
                validInteractionsByType[interaction.get("type")].append(interaction)
            interactionCounts[interaction.get("type")] += 1
        interactionCountString = ",".join([key + "=" + str(interactionCounts[key]) for key in sorted(interactionCounts.keys())])

        if debug:
            print >> sys.stderr, entity.get("id"), entity.get("type"), "int:" + interactionCountString, "validInt:" + str(validInteractionsByType)

        # For each argument type the event can have, get all possible
        # combinations for each valid argument count. Note that there may be
        # a zero-length combination.
        # e.g. theme:[a,b], cause:[d] = [[(), (d,)], [(a,), (b,)]]
        intCombinations = []
        for intType in sorted(validInteractionsByType.keys()):
            minArgs, maxArgs = structureAnalyzer.getArgLimits(entity.get("type"), intType)
            singleTypeCombinations = []
            for combLen in range(minArgs, maxArgs+1):
                singleTypeCombinations.extend(combinations(validInteractionsByType[intType], combLen))
            intCombinations.append(singleTypeCombinations)

        # intCombinations now contains a list of lists, each of which has a
        # tuple for each valid combination of one argument type. Next, we'll
        # make all valid combinations of multiple argument types.
        if debug:
            print >> sys.stderr, " ", "intCombinations", intCombinations
        argCombinations = combine.combine(*intCombinations)
        if debug:
            print >> sys.stderr, " ", "argCombinations", argCombinations
        # Flatten each nested combination; after this all possible
        # interaction combinations for this trigger are known.
        for i in range(len(argCombinations)):
            argCombinations[i] = sum(argCombinations[i], ())
        if debug:
            print >> sys.stderr, " ", "argCombinations flat", argCombinations

        if argCombinations ==[()]:
            if entity.get('type') not in ['Protein', 'Entity']:
                # This is an event trigger with no outgoing edges, possibly
                # due to removed inter-sentence interactions. Skip it to
                # prevent generating a false positive.
                # TODO: decide how inter-sentence interactions should be handled
                continue

        # Length of the longest combination; guarded because max() fails on
        # an empty sequence and the loop below is then skipped anyway
        if argCombinations:
            maxArgCombinationLen = max(len(c) for c in argCombinations)
        else:
            maxArgCombinationLen = 0

        for argCombination in argCombinations:
            # Filter out the combinations where the mandatory 'Theme'
            # argument is not present. This can be due to an inter-sentence
            # interaction.
            if 'Theme' not in [arg.get('type') for arg in argCombination]:
                continue
            if args.apply_alg and entity.get('type') in longestOnlyTypes:
                if len(argCombination) != maxArgCombinationLen:
                    continue

            if goldGraph is not None:
                isGoldEvent = eventIsGold(entity, argCombination, sentenceGraph, goldGraph, goldEntitiesByOffset, goldGraph.interactions)
            else:
                isGoldEvent = False

            # Named (multi-)class
            if isGoldEvent:
                category = entity.get("type")
                assert category is not None
            else:
                category = "neg"

            # Count the candidate under the entity's type: gold candidates
            # as "tp", all others (unmerging false positives) as "fp"
            counts = exampleCounter.setdefault(entity.get("type"), {"tp":0, "fp":0})
            if category != "neg":
                counts["tp"] += 1
            else:
                counts["fp"] += 1

            # For debugging - investigate why single-argument events produce
            # false positives. Only active in debug mode so that normal runs
            # are not halted by the breakpoint.
            if debug and category == 'neg' and entity.get("type") == 'Positive_regulation':
                pdb.set_trace()

    return exampleIndex, exampleCounter, dupEntityCnt
def buildExamplesFromGraph(self, sentenceGraph, examples, goldGraph=None):
    """
    Build unmerging examples for one sentence and append them to 'examples'.

    For every entity of the sentence, the outgoing event interactions are
    grouped by type, all argument combinations allowed by the structure
    analyzer's argument limits are enumerated, and every combination that
    the structure analyzer accepts becomes one example dictionary with the
    keys "id", "labels", "features", "extra" and "doc". Rejected
    combinations are only recorded through self.exampleStats.

    sentenceGraph -- graph of the sentence being processed
    examples      -- list that receives the generated example dictionaries
    goldGraph     -- optional gold graph; when given, combinations for which
                     self.eventIsGold is true get a label ("pos" in binary
                     style, otherwise the entity type), all others stay
                     unlabelled

    Returns the local example counter.
    NOTE(review): only self.exampleIndex is advanced here, the local counter
    never is, so the returned value is always 0 -- confirm that no caller
    relies on it.
    """
    depGraph = sentenceGraph.dependencyGraph
    undirected = depGraph.toUndirected()
    # Number of incoming plus outgoing dependency edges of each token
    edgeCounts = {}
    for depToken in sentenceGraph.tokens:
        edgeCounts[depToken] = len(depGraph.getInEdges(depToken) + depGraph.getOutEdges(depToken))
    paths = undirected

    hasGold = goldGraph != None

    # Map gold entities to their head offsets
    goldEntitiesByOffset = {}
    if hasGold:
        for goldEntity in goldGraph.entities:
            headOffset = goldEntity.get("headOffset")
            assert headOffset is not None
            goldEntitiesByOffset.setdefault(headOffset, []).append(goldEntity)

    # Unless the "no_merge" style is set, work on the merged interaction graph
    mergeInput = "no_merge" not in self.styles
    if mergeInput:
        sentenceGraph.mergeInteractionGraph(True)
        entities = sentenceGraph.mergedEntities
        self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))
    else:
        entities = sentenceGraph.entities

    # Pre-generate features for all tokens in the sentence
    tokens, tokenMap = self.getTokenFeatures(sentenceGraph)

    exampleIndex = 0
    for entity in entities:
        if type(entity) in types.StringTypes:  # dummy entity for intersentence interactions
            continue

        eType = entity.get("type")
        assert eType != None, entity.attrib
        eType = str(eType)

        # Outgoing interactions of this entity, in id order
        interactions = sorted(
            (edge[2] for edge in sentenceGraph.getOutInteractions(entity, mergeInput)),
            key=lambda k: k.get("id"))

        # Forget the interaction markers left on the tokens by the previous entity
        for tokenFeatures in tokens:
            tokenFeatures["interaction"] = None

        interactionCounts = defaultdict(int)
        validInteractionsByType = defaultdict(list)
        for interaction in interactions:
            if interaction.get("event") != "True":
                continue
            argType = interaction.get("type")
            e1 = sentenceGraph.entitiesById[interaction.get("e1")]
            e2Id = interaction.get("e2")
            if e2Id not in sentenceGraph.entitiesById:
                # intersentence: the target is not in this sentence, keep the edge as it is
                validInteractionsByType[argType].append(interaction)
            else:
                e2 = sentenceGraph.entitiesById[e2Id]
                if argType in self.structureAnalyzer.getValidEdgeTypes(e1.get("type"), e2.get("type")):
                    validInteractionsByType[argType].append(interaction)
                    e2Token = sentenceGraph.entityHeadTokenByEntity[e2]
                    tokenMap[e2Token]["interaction"] = interaction
            interactionCounts[argType] += 1
        interactionCountString = ",".join(
            [name + "=" + str(interactionCounts[name]) for name in sorted(interactionCounts)])

        if self.debug:
            print >> sys.stderr, entity.get("id"), entity.get("type"), "int:" + interactionCountString, "validInt:" + str(validInteractionsByType)

        # For each argument type the event can have, collect every combination
        # with an allowed argument count (this may include the empty combination)
        intCombinations = []
        for intType in sorted(validInteractionsByType):
            minArgs, maxArgs = self.structureAnalyzer.getArgLimits(entity.get("type"), intType)
            singleTypeCombinations = []
            for combLen in range(minArgs, maxArgs + 1):
                singleTypeCombinations.extend(combinations(validInteractionsByType[intType], combLen))
            intCombinations.append(singleTypeCombinations)
        validIntTypeCount = len(intCombinations)

        # intCombinations is now a list of lists, each holding one tuple per valid
        # combination of a single argument type. Next, combine the argument types.
        if self.debug:
            print >> sys.stderr, " ", "intCombinations", intCombinations
        argCombinations = combine.combine(*intCombinations)
        if self.debug:
            print >> sys.stderr, " ", "argCombinations", argCombinations
        # Flatten each combination of per-type tuples into one tuple of interactions
        for position, nested in enumerate(argCombinations):
            argCombinations[position] = sum(nested, ())
        if self.debug:
            print >> sys.stderr, " ", "argCombinations flat", argCombinations

        for argCombination in argCombinations:
            isGoldEvent = False
            if hasGold:
                isGoldEvent = self.eventIsGold(entity, argCombination, sentenceGraph, goldGraph, goldEntitiesByOffset, goldGraph.interactions)

            # Named (multi-)class; non-gold combinations carry no label
            if not isGoldEvent:
                labels = []
            elif "binary" in self.styles:
                labels = ["pos"]
            else:
                labels = [entity.get("type")]
            labelString = ",".join(labels)
            self.exampleStats.beginExample(labelString)

            issues = defaultdict(int)
            if validIntTypeCount == 0 and entity.get("given") == "True":
                # early out for proteins etc.
                self.exampleStats.filter("given-leaf:" + entity.get("type"))
                if self.debug:
                    print >> sys.stderr, " ", labelString +"("+eType+")", "arg combination", argCombination, "LEAF"
            elif self.structureAnalyzer.isValidEntity(entity) or self.structureAnalyzer.isValidEvent(entity, argCombination, self.documentEntitiesById, noUpperLimitBeyondOne="no_arg_count_upper_limit" in self.styles, issues=issues):
                if self.debug:
                    print >> sys.stderr, " ", labelString, "arg combination", argCombination, "VALID"
                argString = "".join(
                    ["," + arg.get("type") + "=" + arg.get("id") for arg in argCombination])
                extra = {"xtype":"um", "e":entity.get("id"), "i":argString[1:], "etype":eType, "class":labelString}
                extra["allInt"] = interactionCountString
                assert type(extra["etype"]) in types.StringTypes, extra
                assert type(extra["class"]) in types.StringTypes, labelString
                assert type(extra["i"]) in types.StringTypes, argString
                features = self.buildFeatures(sentenceGraph, paths, entity, argCombination, interactions, tokens, tokenMap, undirected, edgeCounts)
                exampleId = sentenceGraph.getSentenceId() + ".x" + str(self.exampleIndex)
                examples.append({
                    "id": exampleId,
                    "labels": labels,
                    "features": features,
                    "extra": extra,
                    "doc": sentenceGraph.documentElement.get("id")})
                self.exampleIndex += 1
            else:
                # not a valid event or valid entity
                if not issues:  # must hold at least one issue so that the example gets filtered
                    if self.structureAnalyzer.isValidEntity(entity):
                        issues["UNKNOWN_ISSUE_FOR:" + eType] += 1
                    else:
                        issues["INVALID_ENTITY:" + eType] += 1
                for issue in issues:
                    self.exampleStats.filter(issue)
                if self.debug:
                    print >> sys.stderr, " ", labels, "arg combination", argCombination, "INVALID", issues
            self.exampleStats.endExample()

    return exampleIndex
def buildExamplesFromGraph(self, sentenceGraph, examples, goldGraph=None):
    """
    Build unmerging examples for one sentence and append them to 'examples'.

    For every entity of the sentence, the outgoing event interactions are
    grouped by type, all argument combinations allowed by the structure
    analyzer's argument limits are enumerated, and every combination that
    the structure analyzer accepts becomes one example dictionary with the
    keys "id", "labels", "features", "extra" and "doc". Rejected
    combinations are only recorded through self.exampleStats.

    sentenceGraph -- graph of the sentence being processed
    examples      -- list that receives the generated example dictionaries
    goldGraph     -- optional gold graph; when given, combinations for which
                     self.eventIsGold is true get a label ("pos" in binary
                     style, otherwise the entity type), all others stay
                     unlabelled

    Returns the local example counter.
    NOTE(review): only self.exampleIndex is advanced here, the local counter
    never is, so the returned value is always 0 -- confirm that no caller
    relies on it.
    """
    depGraph = sentenceGraph.dependencyGraph
    undirected = depGraph.toUndirected()
    # Number of incoming plus outgoing dependency edges of each token
    edgeCounts = {}
    for depToken in sentenceGraph.tokens:
        edgeCounts[depToken] = len(depGraph.getInEdges(depToken) + depGraph.getOutEdges(depToken))
    paths = undirected

    hasGold = goldGraph != None

    # Map gold entities to their head offsets
    goldEntitiesByOffset = {}
    if hasGold:
        for goldEntity in goldGraph.entities:
            headOffset = goldEntity.get("headOffset")
            assert headOffset is not None
            goldEntitiesByOffset.setdefault(headOffset, []).append(goldEntity)

    # Unless the "no_merge" style is set, work on the merged interaction graph
    mergeInput = "no_merge" not in self.styles
    if mergeInput:
        sentenceGraph.mergeInteractionGraph(True)
        entities = sentenceGraph.mergedEntities
        self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))
    else:
        entities = sentenceGraph.entities

    # Pre-generate features for all tokens in the sentence
    tokens, tokenMap = self.getTokenFeatures(sentenceGraph)

    exampleIndex = 0
    for entity in entities:
        if type(entity) in types.StringTypes:  # dummy entity for intersentence interactions
            continue

        eType = entity.get("type")
        assert eType != None, entity.attrib
        eType = str(eType)

        # Outgoing interactions of this entity, in id order
        interactions = sorted(
            (edge[2] for edge in sentenceGraph.getOutInteractions(entity, mergeInput)),
            key=lambda k: k.get("id"))

        # Forget the interaction markers left on the tokens by the previous entity
        for tokenFeatures in tokens:
            tokenFeatures["interaction"] = None

        interactionCounts = defaultdict(int)
        validInteractionsByType = defaultdict(list)
        for interaction in interactions:
            if interaction.get("event") != "True":
                continue
            argType = interaction.get("type")
            e1 = sentenceGraph.entitiesById[interaction.get("e1")]
            e2Id = interaction.get("e2")
            if e2Id not in sentenceGraph.entitiesById:
                # intersentence: the target is not in this sentence, keep the edge as it is
                validInteractionsByType[argType].append(interaction)
            else:
                e2 = sentenceGraph.entitiesById[e2Id]
                if argType in self.structureAnalyzer.getValidEdgeTypes(e1.get("type"), e2.get("type")):
                    validInteractionsByType[argType].append(interaction)
                    e2Token = sentenceGraph.entityHeadTokenByEntity[e2]
                    tokenMap[e2Token]["interaction"] = interaction
            interactionCounts[argType] += 1
        interactionCountString = ",".join(
            [name + "=" + str(interactionCounts[name]) for name in sorted(interactionCounts)])

        if self.debug:
            print >> sys.stderr, entity.get("id"), entity.get("type"), "int:" + interactionCountString, "validInt:" + str(validInteractionsByType)

        # For each argument type the event can have, collect every combination
        # with an allowed argument count (this may include the empty combination)
        intCombinations = []
        for intType in sorted(validInteractionsByType):
            minArgs, maxArgs = self.structureAnalyzer.getArgLimits(entity.get("type"), intType)
            singleTypeCombinations = []
            for combLen in range(minArgs, maxArgs + 1):
                singleTypeCombinations.extend(combinations(validInteractionsByType[intType], combLen))
            intCombinations.append(singleTypeCombinations)
        validIntTypeCount = len(intCombinations)

        # intCombinations is now a list of lists, each holding one tuple per valid
        # combination of a single argument type. Next, combine the argument types.
        if self.debug:
            print >> sys.stderr, " ", "intCombinations", intCombinations
        argCombinations = combine.combine(*intCombinations)
        if self.debug:
            print >> sys.stderr, " ", "argCombinations", argCombinations
        # Flatten each combination of per-type tuples into one tuple of interactions
        for position, nested in enumerate(argCombinations):
            argCombinations[position] = sum(nested, ())
        if self.debug:
            print >> sys.stderr, " ", "argCombinations flat", argCombinations

        for argCombination in argCombinations:
            isGoldEvent = False
            if hasGold:
                isGoldEvent = self.eventIsGold(entity, argCombination, sentenceGraph, goldGraph, goldEntitiesByOffset, goldGraph.interactions)

            # Named (multi-)class; non-gold combinations carry no label
            if not isGoldEvent:
                labels = []
            elif "binary" in self.styles:
                labels = ["pos"]
            else:
                labels = [entity.get("type")]
            labelString = ",".join(labels)
            self.exampleStats.beginExample(labelString)

            issues = defaultdict(int)
            if validIntTypeCount == 0 and entity.get("given") == "True":
                # early out for proteins etc.
                self.exampleStats.filter("given-leaf:" + entity.get("type"))
                if self.debug:
                    print >> sys.stderr, " ", labelString +"("+eType+")", "arg combination", argCombination, "LEAF"
            elif self.structureAnalyzer.isValidEntity(entity) or self.structureAnalyzer.isValidEvent(entity, argCombination, self.documentEntitiesById, noUpperLimitBeyondOne="no_arg_count_upper_limit" in self.styles, issues=issues):
                if self.debug:
                    print >> sys.stderr, " ", labelString, "arg combination", argCombination, "VALID"
                argString = "".join(
                    ["," + arg.get("type") + "=" + arg.get("id") for arg in argCombination])
                extra = {"xtype":"um", "e":entity.get("id"), "i":argString[1:], "etype":eType, "class":labelString}
                extra["allInt"] = interactionCountString
                assert type(extra["etype"]) in types.StringTypes, extra
                assert type(extra["class"]) in types.StringTypes, labelString
                assert type(extra["i"]) in types.StringTypes, argString
                features = self.buildFeatures(sentenceGraph, paths, entity, argCombination, interactions, tokens, tokenMap, undirected, edgeCounts)
                exampleId = sentenceGraph.getSentenceId() + ".x" + str(self.exampleIndex)
                examples.append({
                    "id": exampleId,
                    "labels": labels,
                    "features": features,
                    "extra": extra,
                    "doc": sentenceGraph.documentElement.get("id")})
                self.exampleIndex += 1
            else:
                # not a valid event or valid entity
                if not issues:  # must hold at least one issue so that the example gets filtered
                    if self.structureAnalyzer.isValidEntity(entity):
                        issues["UNKNOWN_ISSUE_FOR:" + eType] += 1
                    else:
                        issues["INVALID_ENTITY:" + eType] += 1
                for issue in issues:
                    self.exampleStats.filter(issue)
                if self.debug:
                    print >> sys.stderr, " ", labels, "arg combination", argCombination, "INVALID", issues
            self.exampleStats.endExample()

    return exampleIndex
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
    """
    Build unmerging examples for a single sentence and write them to 'outfile'.

    For every entity of the sentence, the outgoing event interactions are
    grouped by type, all argument combinations allowed by the structure
    analyzer's argument limits are enumerated, and every combination that
    the structure analyzer accepts is turned into an example with
    self.buildExample and appended to 'outfile' through
    ExampleUtils.appendExamples. Rejected combinations are only recorded
    through self.exampleStats. See Core/ExampleUtils for example format.

    sentenceGraph     -- graph of the sentence being processed
    outfile           -- destination handed to ExampleUtils.appendExamples
    goldGraph         -- optional gold graph; when given, its tokenization is
                         asserted to match and combinations for which
                         self.eventIsGold is true get a positive class
                         ("pos" in binary style, otherwise the entity type);
                         everything else is class "neg"
    structureAnalyzer -- used without a None check, so although the default
                         is None a real analyzer has to be passed in

    Returns the number of examples written for this sentence.
    """
    self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
    self.triggerFeatureBuilder.initSentence(sentenceGraph)

    exampleIndex = 0
    undirected = sentenceGraph.dependencyGraph.toUndirected()
    paths = undirected

    # Get argument order (the attribute name is spelled this way on purpose, it is kept as is)
    self.interactionLenghts = self.getInteractionEdgeLengths(sentenceGraph, paths)

    hasGold = goldGraph != None

    # Map tokens to character offsets
    tokenByOffset = {}
    for position, token in enumerate(sentenceGraph.tokens):
        if hasGold:  # check that the tokenizations match
            goldToken = goldGraph.tokens[position]
            assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset")
        tokenByOffset[token.get("charOffset")] = token.get("id")

    # Map gold entities to their head offsets
    goldEntitiesByOffset = {}
    if hasGold:
        for goldEntity in goldGraph.entities:
            headOffset = goldEntity.get("headOffset")
            assert headOffset is not None
            goldEntitiesByOffset.setdefault(headOffset, []).append(goldEntity)

    # Unless the "no_merge" style is set, work on the merged interaction graph
    if self.styles["no_merge"]:
        mergeInput = False
        entities = sentenceGraph.entities
    else:
        mergeInput = True
        sentenceGraph.mergeInteractionGraph(True)
        entities = sentenceGraph.mergedEntities
        self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))

    exampleIndex = 0
    for entity in entities:
        if type(entity) in types.StringTypes:  # dummy entity for intersentence interactions
            continue

        eType = entity.get("type")
        assert eType != None, entity.attrib
        eType = str(eType)

        outEdges = sentenceGraph.getOutInteractions(entity, mergeInput)
        interactions = self.sortInteractionsById([edge[2] for edge in outEdges])

        interactionCounts = defaultdict(int)
        validInteractionsByType = defaultdict(list)
        for interaction in interactions:
            if interaction.get("event") != "True":
                continue
            argType = interaction.get("type")
            e1 = sentenceGraph.entitiesById[interaction.get("e1")]
            e2Id = interaction.get("e2")
            if e2Id not in sentenceGraph.entitiesById:
                # intersentence: the target is not in this sentence, keep the edge as it is
                validInteractionsByType[argType].append(interaction)
            else:
                e2 = sentenceGraph.entitiesById[e2Id]
                if argType in structureAnalyzer.getValidEdgeTypes(e1.get("type"), e2.get("type")):
                    validInteractionsByType[argType].append(interaction)
            interactionCounts[argType] += 1
        interactionCountString = ",".join(
            [name + "=" + str(interactionCounts[name]) for name in sorted(interactionCounts)])

        if self.debug:
            print >> sys.stderr, entity.get("id"), entity.get("type"), "int:" + interactionCountString, "validInt:" + str(validInteractionsByType)

        # For each argument type the event can have, collect every combination
        # with an allowed argument count (this may include the empty combination)
        intCombinations = []
        for intType in sorted(validInteractionsByType):
            minArgs, maxArgs = structureAnalyzer.getArgLimits(entity.get("type"), intType)
            singleTypeCombinations = []
            for combLen in range(minArgs, maxArgs + 1):
                singleTypeCombinations.extend(combinations(validInteractionsByType[intType], combLen))
            intCombinations.append(singleTypeCombinations)
        validIntTypeCount = len(intCombinations)

        # intCombinations is now a list of lists, each holding one tuple per valid
        # combination of a single argument type. Next, combine the argument types.
        if self.debug:
            print >> sys.stderr, " ", "intCombinations", intCombinations
        argCombinations = combine.combine(*intCombinations)
        if self.debug:
            print >> sys.stderr, " ", "argCombinations", argCombinations
        # Flatten each combination of per-type tuples into one tuple of interactions
        for position, nested in enumerate(argCombinations):
            argCombinations[position] = sum(nested, ())
        if self.debug:
            print >> sys.stderr, " ", "argCombinations flat", argCombinations

        for argCombination in argCombinations:
            isGoldEvent = False
            if hasGold:
                isGoldEvent = self.eventIsGold(entity, argCombination, sentenceGraph, goldGraph, goldEntitiesByOffset, goldGraph.interactions)

            # Named (multi-)class
            if not isGoldEvent:
                category = "neg"
            elif self.styles["binary"]:
                category = "pos"
            else:
                category = entity.get("type")
                assert category != None
            self.exampleStats.beginExample(category)

            issues = defaultdict(int)
            if validIntTypeCount == 0 and entity.get("given") == "True":
                # early out for proteins etc.
                self.exampleStats.filter("given-leaf:" + entity.get("type"))
                if self.debug:
                    print >> sys.stderr, " ", category +"("+eType+")", "arg combination", argCombination, "LEAF"
            elif structureAnalyzer.isValidEntity(entity) or structureAnalyzer.isValidEvent(entity, argCombination, self.documentEntitiesById, noUpperLimitBeyondOne=self.styles["no_arg_count_upper_limit"], issues=issues):
                if self.debug:
                    print >> sys.stderr, " ", category, "arg combination", argCombination, "VALID"
                argString = "".join(
                    ["," + arg.get("type") + "=" + arg.get("id") for arg in argCombination])
                extra = {"xtype":"um", "e":entity.get("id"), "i":argString[1:], "etype":eType, "class":category}
                extra["allInt"] = interactionCountString
                assert type(extra["etype"]) in types.StringTypes, extra
                assert type(extra["class"]) in types.StringTypes, category
                assert type(extra["i"]) in types.StringTypes, argString
                example = self.buildExample(sentenceGraph, paths, entity, argCombination, interactions)
                example[0] = sentenceGraph.getSentenceId() + ".x" + str(exampleIndex)
                example[1] = self.classSet.getId(category)
                example[3] = extra
                ExampleUtils.appendExamples([example], outfile)
                exampleIndex += 1
            else:
                # not a valid event or valid entity
                if not issues:  # must hold at least one issue so that the example gets filtered
                    if structureAnalyzer.isValidEntity(entity):
                        issues["UNKNOWN_ISSUE_FOR:" + eType] += 1
                    else:
                        issues["INVALID_ENTITY:" + eType] += 1
                for issue in issues:
                    self.exampleStats.filter(issue)
                if self.debug:
                    print >> sys.stderr, " ", category, "arg combination", argCombination, "INVALID", issues
            self.exampleStats.endExample()

    return exampleIndex
def getArgumentCombinations(self, eType, interactions, entityId=None):
    """
    Enumerate the candidate argument combinations of one event.

    eType        -- type of the event entity
    interactions -- the event's outgoing interactions; each must provide
                    get("type")
    entityId     -- not used by the current implementation

    Returns a list of combinations, built according to the event type:
    - "Binding": every subset of the Theme interactions with 1 to 10
      members, as tuples ordered by subset size. All-together/all-separate
      shortcuts are not enough, since even gold data has overlapping
      bindings with different numbers of arguments. Causes are skipped.
    - "Process" (ID task): the empty combination, followed by each
      Participant interaction on its own.
    - anything else (regulation and simple types): the concatenated
      results of combine.combine for themes paired with causes, site
      arguments, side chains, context genes and location targets,
      followed by each theme on its own.
    """
    if eType == "Binding":
        themes = [x for x in interactions if x.get("type") == "Theme"]
        # a2-normalize.pl suggests at most 6 themes and the training+devel data
        # at most 4; the subset size is capped at 10
        combs = []
        for size in range(1, min(len(themes), 10) + 1):
            combs.extend(combinations(themes, size))
        return combs

    if eType == "Process":  # For ID-task
        # a process can have 0 interactions, hence the leading empty combination
        return [[]] + [[x] for x in interactions if x.get("type") == "Participant"]

    # One of the regulation-types, or one of the simple types.
    # Interactions of any other type (including AtLoc / ToLoc) are ignored.
    byType = {"Theme": [], "Cause": [], "SiteArg": [], "Contextgene": [], "Sidechain": []}
    for interaction in interactions:
        bucket = byType.get(interaction.get("type"))
        if bucket is not None:
            bucket.append(interaction)
    themes = byType["Theme"]
    causes = byType["Cause"]
    siteArgs = byType["SiteArg"]
    contextGenes = byType["Contextgene"]
    sideChains = byType["Sidechain"]
    # Location targets are never collected: AtLoc / ToLoc do not pass the type
    # filter above. The list is still handed to combine.combine below so that
    # the returned value stays exactly what it was.
    locTargets = []

    # Limit arguments to event types that can have them
    if "egulation" not in eType and eType != "Catalysis":
        causes = []
    if eType != "Glycosylation":
        sideChains = []
    if eType not in ["Acetylation", "Methylation"]:
        contextGenes = []
    if eType == "Catalysis":
        siteArgs = []

    # Themes can always appear alone
    themeAloneCombinations = [[theme] for theme in themes]
    return combine.combine(themes, causes) \
        + combine.combine(themes, siteArgs) \
        + combine.combine(themes, sideChains) \
        + combine.combine(themes, contextGenes) \
        + combine.combine(themes, siteArgs, sideChains) \
        + combine.combine(themes, siteArgs, contextGenes) \
        + combine.combine(themes, locTargets) \
        + themeAloneCombinations