Example #1
0
File: Equiv.py Project: ninjin/TEES
def duplicateEquiv(event, duplDict, debug):
    """
    If the event (event tree) has arguments which have Equiv-statements, create a new event
    for each combination. Otherwise, return just the existing event.
    """
    argList = [] # depth-first argument list
    hasEquiv = getArgs(event, argList)
    if not hasEquiv:
        return [event]
    if debug:
        print "----------------------------------------------"
        print "Event:", event.id, event.type, event.arguments
        print " Orig. Duplicates:", argList
    combinations = combine.combine(*argList) # make all combinations
    if debug:
        print " Dup. Combinations:", combinations
    newEvents = []
    count = 0 # used only for marking duplicates' ids
    for combination in combinations:
        createdEvents = makeEvent(event, combination, count, duplDict=duplDict, debug=debug)
        newEvent = createdEvents[0]
        if debug:
            for createdEvent in createdEvents:
                if createdEvent == newEvent:
                    print " New Event (root):", createdEvent.id, createdEvent.type, createdEvent.arguments
                else:
                    print " New Event:", createdEvent.id, createdEvent.type, createdEvent.arguments
                Validate.validate([createdEvent], simulation=True)
        newEvents.append(newEvent)
        count += 1
    return newEvents
Example #2
0
def duplicateEquiv(event, duplDict, debug):
    """
    If the event (event tree) has arguments which have Equiv-statements, create a new event
    for each combination. Otherwise, return just the existing event.
    """
    argList = []  # depth-first argument list
    hasEquiv = getArgs(event, argList)
    if not hasEquiv:
        return [event]
    if debug:
        print "----------------------------------------------"
        print "Event:", event.id, event.type, event.arguments
        print " Orig. Duplicates:", argList
    combinations = combine.combine(*argList)  # make all combinations
    if debug:
        print " Dup. Combinations:", combinations
    newEvents = []
    count = 0  # used only for marking duplicates' ids
    for combination in combinations:
        createdEvents = makeEvent(event,
                                  combination,
                                  count,
                                  duplDict=duplDict,
                                  debug=debug)
        newEvent = createdEvents[0]
        if debug:
            for createdEvent in createdEvents:
                if createdEvent == newEvent:
                    print " New Event (root):", createdEvent.id, createdEvent.type, createdEvent.arguments
                else:
                    print " New Event:", createdEvent.id, createdEvent.type, createdEvent.arguments
                #Validate.validate([createdEvent], simulation=True)
        newEvents.append(newEvent)
        count += 1
    return newEvents
    def addEvent(self, example, arguments, sentenceObject, forceAdd=False, predictionStrength=None, exampleNotes=None):
        """
        Add an event for every combination of candidate argument entities.

        The trigger entity is read from example[3]["e"] for a zero-argument
        event, otherwise from the "e1" attribute of the first argument. Each
        further argument must have its "e1" listed in
        self.entityToDuplicates for that trigger. For each argument the
        candidates are looked up in self.entitiesByHeadByType by the head
        offset and type of its "e2" entity; an "e2" that is not in the sentence
        gets the placeholder "INTERSENTENCE". For each resulting combination a
        new root entity is added with self.addEntity and linked to its
        arguments with self.addInteraction. predictionStrength, when given, is
        stored on the root as the "umConf" attribute.
        """
        if len(arguments) == 0:  # A zero-argument event
            origE1 = sentenceObject.entitiesById[example[3]["e"]]
            entityCombinations = [None]
        else:
            # Collect the candidate e2 entities, one list per argument
            triggerId = None
            origE1 = None
            candidateLists = []
            for arg in arguments:
                argTriggerId = arg.get("e1")
                # Take the entity trigger node from the e1 attribute of the argument
                if triggerId is None:  # find the trigger (any of the original identical triggers is OK)
                    triggerId = argTriggerId
                    origE1 = sentenceObject.entitiesById[argTriggerId]
                else:  # trigger has already been found
                    assert argTriggerId in self.entityToDuplicates[triggerId], ((triggerId, argTriggerId), example[3], arguments)

                e2Id = arg.get("e2")
                if e2Id not in sentenceObject.entitiesById:
                    candidateLists.append(["INTERSENTENCE"])
                    continue
                origE2 = sentenceObject.entitiesById[e2Id]
                candidates = self.entitiesByHeadByType[origE2.get("headOffset")][origE2.get("type")]
                if len(candidates) == 0:
                    # No entity exists yet for this head and type
                    assert forceAdd
                    if origE2.get("given") != "True":
                        candidates = [self.addEntity(origE2)]
                    else:
                        candidates = [origE2]
                candidateLists.append(candidates)
            entityCombinations = combine.combine(*candidateLists)

        for combination in entityCombinations:
            assert origE1 is not None, (
                sentenceObject.sentence.get("id"),
                exampleNotes,
                [(x.get("id"), x.get("e1"), x.get("e2")) for x in arguments],
            )
            root = self.addEntity(origE1)
            if predictionStrength is not None:
                root.set("umConf", str(predictionStrength))
            for position, arg in enumerate(arguments):
                self.addInteraction(root, combination[position], arg)
 def addEvent(self, example, arguments, sentenceObject, forceAdd=False, predictionStrength=None, exampleNotes=None):
     """
     Create one event per combination of the candidate entities of its arguments.

     A zero-argument event takes its trigger from example[3]["e"]. Otherwise
     the trigger comes from the "e1" attribute of the first argument, and the
     "e1" of every later argument is asserted to be among
     self.entityToDuplicates of that trigger. Candidate argument entities are
     read from self.entitiesByHeadByType using the head offset and type of the
     argument's "e2" entity; "INTERSENTENCE" stands in for an "e2" outside the
     sentence. Every combination gets a fresh root entity (self.addEntity)
     connected to its arguments with self.addInteraction; a given
     predictionStrength is written to the root's "umConf" attribute.
     """
     hasArguments = len(arguments) != 0
     if not hasArguments:
         # A zero-argument event
         origE1 = sentenceObject.entitiesById[example[3]["e"]]
         entityCombinations = [None]
     else:
         # Collect e2 entities linked by this event
         rootId = None
         origE1 = None
         perArgument = []
         for argument in arguments:
             currentE1 = argument.get("e1")
             # Take the entity trigger node from the e1 attribute of the argument
             if rootId is not None:
                 # trigger has already been found
                 assert currentE1 in self.entityToDuplicates[rootId], ((rootId, currentE1), example[3], arguments)
             else:
                 # find the trigger (any of the original identical triggers is OK)
                 rootId = currentE1
                 origE1 = sentenceObject.entitiesById[currentE1]

             targetId = argument.get("e2")
             if targetId in sentenceObject.entitiesById:
                 target = sentenceObject.entitiesById[targetId]
                 headOffset = target.get("headOffset")
                 targetType = target.get("type")
                 options = self.entitiesByHeadByType[headOffset][targetType]
                 if len(options) == 0:
                     assert forceAdd
                     isGiven = target.get("given") == "True"
                     options = [target] if isGiven else [self.addEntity(target)]
             else:
                 options = ["INTERSENTENCE"]
             perArgument.append(options)
         entityCombinations = combine.combine(*perArgument)

     for combination in entityCombinations:
         assert origE1 is not None, (sentenceObject.sentence.get("id"), exampleNotes, [(x.get("id"), x.get("e1"), x.get("e2")) for x in arguments])
         root = self.addEntity(origE1)
         if predictionStrength is not None:
             root.set("umConf", str(predictionStrength))
         for index, argument in enumerate(arguments):
             self.addInteraction(root, combination[index], argument)
Example #5
0
 def addEvent(self, arguments, sentenceObject, umType="unknown", forceAdd=False, predictionStrength=None, exampleNotes=None):
     """
     Add an event for every combination of candidate argument entities.

     arguments -- non-empty list of argument elements; all of them must share
                  the same "e1" (trigger) id
     sentenceObject -- provides entitiesById and the sentence element
     umType -- stored on each new root entity as the "umType" attribute
     forceAdd -- must be true whenever an argument has no candidate entity yet
                 (asserted); the missing candidate is then created or reused
     predictionStrength -- when not None, stored on the root as "umStrength"
     exampleNotes -- only used in assertion messages

     For each argument the candidates are read from self.entitiesByHeadByType
     using the head offset and type of its "e2" entity. For each combination of
     candidates a new root entity is created with self.addEntity and linked to
     the arguments with self.addInteraction.
     """
     assert len(arguments) > 0, (sentenceObject.sentence.get("id"), exampleNotes)
     # Collect e2 entities linked by this event
     e1Id = None
     origE1 = None
     argEntities = []  # one list of candidate entities per argument
     for arg in arguments:
         argE1Id = arg.get("e1")
         # Take the entity trigger node from the e1 attribute of the argument
         if e1Id != None: # trigger has already been found
             assert e1Id == argE1Id
         else: # find the trigger
             e1Id = argE1Id
             origE1 = sentenceObject.entitiesById[argE1Id]

         e2Id = arg.get("e2")
         origE2 = sentenceObject.entitiesById[e2Id]
         e2HeadOffset = origE2.get("headOffset")
         e2Type = origE2.get("type")
         candidates = self.entitiesByHeadByType[e2HeadOffset][e2Type]
         if len(candidates) == 0:
             assert forceAdd
             if origE2.get("isName") != "True":
                 candidates = [self.addEntity(origE2)]
             else:
                 # Every slot handed to combine.combine is a list of candidates,
                 # as in the other branches; previously the bare element was
                 # stored here instead of a one-element list.
                 candidates = [origE2]
         argEntities.append(candidates)

     entityCombinations = combine.combine(*argEntities)
     for combination in entityCombinations:
         assert origE1 != None, (sentenceObject.sentence.get("id"), exampleNotes, [(x.get("id"), x.get("e1"), x.get("e2")) for x in arguments])
         root = self.addEntity(origE1)
         root.set("umType", umType)
         if predictionStrength != None:
             root.set("umStrength", str(predictionStrength))
         for i, arg in enumerate(arguments):
             self.addInteraction(root, combination[i], arg)
Example #6
0
def getCombinations(parameters, order=None):
    """
    Expand a parameter definition into every concrete parameter assignment.

    The input is first normalized with get(). A list or tuple value offers
    one alternative per element; any other value is a single alternative.
    The alternatives of all parameters are combined with combine.combine and
    each combination is returned as a dict mapping parameter name to the
    chosen value.

    order -- optional list holding exactly the parameter names (asserted);
             it replaces the default sorted name order used for combining.
    """
    parameters = get(parameters)
    names = sorted(parameters.keys())
    if order is not None:
        assert sorted(order) == names
        names = order
    alternatives = []
    for name in names:
        candidates = parameters[name]
        if not isinstance(candidates, (list, tuple)):
            candidates = [candidates]  # a scalar is its own single alternative
        alternatives.append([(name, candidate) for candidate in candidates])
    return [dict((pair[0], pair[1]) for pair in chosen)
            for chosen in combine.combine(*alternatives)]
Example #7
0
def getCombinations(parameters, order=None):
    """
    Return a list of dicts, one for each combination of parameter values.

    parameters is normalized with get(). Each parameter contributes its
    list/tuple elements as alternatives, or its single value when it is not a
    list or tuple. combine.combine builds the combinations and every one of
    them becomes a {name: value} dict. If order is given it must contain
    exactly the parameter names (asserted) and is used instead of the sorted
    name order.
    """
    parameters = get(parameters)
    parameterNames = sorted(parameters.keys())
    if order is not None:
        assert sorted(order) == parameterNames
        parameterNames = order

    def namedValues(name):
        # Pair the parameter name with each of its alternative values
        values = parameters[name]
        if isinstance(values, (list, tuple)):
            return [(name, value) for value in values]
        return [(name, values)]

    valueLists = [namedValues(name) for name in parameterNames]
    result = []
    for combinationList in combine.combine(*valueLists):
        assignment = {}
        for key, value in ((item[0], item[1]) for item in combinationList):
            assignment[key] = value
        result.append(assignment)
    return result
Example #8
0
    def doGrid(self):
        """
        Run the parameter grid search over trigger, booster and edge parameters.

        Trigger examples are built once for self.optData. Every
        (trigger, booster, edge) combination is then classified with the
        trigger and edge detectors and passed to self.evaluateGrid, which
        carries the best result forward. The best booster value is saved as
        "recallAdjustParameter" for self.model and self.combinedModel, and for
        a full grid the best trigger and edge classifier models are added to
        self.model. Intermediate grid files are removed along the way.
        """
        print >> sys.stderr, "--------- Parameter grid search ---------"
        # Build trigger examples
        self.triggerDetector.buildExamples(self.model, [self.optData], [self.workDir + "grid-trigger-examples"])

        # The two grid modes differ only in which model string is read for each detector
        if self.fullGrid:
            modelKey = "classifier-parameters-train"
        else:
            modelKey = "classifier-parameter"
        stepParams = {}
        stepParams["trigger"] = Parameters.get(
            self.model.getStr(self.triggerDetector.tag + modelKey, defaultIfNotExist=""), valueListKey="c")
        stepParams["booster"] = [float(i) for i in self.recallAdjustParameters.split(",")]
        stepParams["edge"] = Parameters.get(
            self.model.getStr(self.edgeDetector.tag + modelKey, defaultIfNotExist=""), valueListKey="c")

        # Expand the classifier parameters into strings, one per combination
        for step in ["trigger", "edge"]:
            stepParams[step] = [Parameters.toString(single) for single in Parameters.getCombinations(stepParams[step])]
        gridAxes = [stepParams[x] for x in ["trigger", "booster", "edge"]]
        print >> sys.stderr, "Parameters", gridAxes
        paramCombinations = combine(*gridAxes)
        print >> sys.stderr, "Combinations", paramCombinations
        paramCombinations = [{"trigger": combo[0], "booster": combo[1], "edge": combo[2]} for combo in paramCombinations]

        #paramCombinations = Parameters.getCombinations(ALL_PARAMS, ["trigger", "booster", "edge"])
        prevParams = None
        normalizedModelPath = os.path.normpath(self.model.path)
        EDGE_MODEL_STEM = os.path.join(self.edgeDetector.workDir, normalizedModelPath + "-edge-models/model")
        TRIGGER_MODEL_STEM = os.path.join(self.triggerDetector.workDir, os.path.normpath(self.model.path) + "-trigger-models/model")
        self.structureAnalyzer.load(self.model)
        bestResults = None
        banner = "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
        for index, params in enumerate(paramCombinations):
            print >> sys.stderr, banner
            print >> sys.stderr, "Processing params", str(index + 1) + "/" + str(len(paramCombinations)), params
            print >> sys.stderr, banner
            # Triggers and Boost (the trigger predictions are recalculated only when the relevant parameters change)
            triggerIsCurrent = (prevParams is not None
                                and prevParams["trigger"] == params["trigger"]
                                and prevParams["booster"] == params["booster"])
            if not triggerIsCurrent:
                print >> sys.stderr, "Classifying trigger examples for parameters", "trigger:" + str(params["trigger"]), "booster:" + str(params["booster"])
                xml = self.triggerDetector.classifyToXML(
                    self.optData,
                    self.model,
                    self.workDir + "grid-trigger-examples",
                    self.workDir + "grid-",
                    classifierModel=TRIGGER_MODEL_STEM + Parameters.toId(params["trigger"]),
                    recallAdjust=params["booster"],
                    useExistingExamples=True)
            prevParams = params
            ## Build edge examples
            #self.edgeDetector.buildExamples(self.model, [xml], [self.workDir+"grid-edge-examples"], [self.optData])
            # Classify with pre-defined model
            edgeClassifierModel = EDGE_MODEL_STEM + Parameters.toId(params["edge"])
            xml = self.edgeDetector.classifyToXML(
                xml,
                self.model,
                self.workDir + "grid-edge-examples",
                self.workDir + "grid-",
                classifierModel=edgeClassifierModel,
                goldData=self.optData)
            bestResults = self.evaluateGrid(xml, params, bestResults)

        # Remove remaining intermediate grid files
        for tag1 in ["edge", "trigger", "unmerging"]:
            for tag2 in ["examples", "pred.xml.gz"]:
                gridFile = self.workDir + "grid-" + tag1 + "-" + tag2
                if os.path.exists(gridFile):
                    os.remove(self.workDir + "grid-" + tag1 + "-" + tag2)
        print >> sys.stderr, "Parameter grid search complete"
        print >> sys.stderr, "Tested", len(paramCombinations), "combinations"
        print >> sys.stderr, "Best parameters:", bestResults[0]
        print >> sys.stderr, "Best result:", bestResults[2]  # f-score

        # Save grid model
        bestParams = bestResults[0]
        self.saveStr("recallAdjustParameter", str(bestParams["booster"]), self.model)
        self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]), self.combinedModel, False)
        if self.fullGrid:  # define best models
            self.triggerDetector.addClassifierModel(self.model, TRIGGER_MODEL_STEM + str(bestResults[0]["trigger"]), bestResults[0]["trigger"])
            self.edgeDetector.addClassifierModel(self.model, EDGE_MODEL_STEM + str(bestResults[0]["edge"]), bestResults[0]["edge"])

        # Remove work files
        # NOTE(review): "examples.gz" and "pred.xml.gz" are appended without a
        # leading "-", unlike the file names used above -- confirm this is intended.
        stepTags = [self.workDir + "grid-trigger", self.workDir + "grid-edge", self.workDir + "grid-unmerging"]
        fileStems = ["-classifications", "-classifications.log", "examples.gz", "pred.xml.gz"]
        for stepTag in stepTags:
            for fileStem in fileStems:
                workFile = stepTag + fileStem
                if os.path.exists(workFile):
                    os.remove(stepTag + fileStem)
Example #9
0
    def doGrid(self):
        """
        Run the parameter grid search over trigger, booster and edge parameters.

        Trigger examples are built for self.optData, after which every
        (trigger, booster, edge) combination is classified with the trigger
        and edge detectors and handed to self.evaluateGrid, which carries the
        best result forward. The best booster value is saved as
        "recallAdjustParameter" for self.model and self.combinedModel, and for
        a full grid the best trigger and edge classifier models are added to
        self.model. Intermediate grid files are removed.
        """
        print >> sys.stderr, "--------- Parameter grid search ---------"
        # Build trigger examples
        self.triggerDetector.buildExamples(self.model, [self.optData], [self.workDir+"grid-trigger-examples.gz"])

        # The full grid reads the "classifier-parameters-train" model strings,
        # otherwise the "classifier-parameter" strings are used
        if self.fullGrid:
            stepParams = {
                "trigger":Parameters.get(self.model.getStr(self.triggerDetector.tag+"classifier-parameters-train", defaultIfNotExist=""), valueListKey="c"),
                "booster":[float(i) for i in self.recallAdjustParameters.split(",")],
                "edge":Parameters.get(self.model.getStr(self.edgeDetector.tag+"classifier-parameters-train", defaultIfNotExist=""), valueListKey="c")}
        else:
            stepParams = {
                "trigger":Parameters.get(self.model.getStr(self.triggerDetector.tag+"classifier-parameter", defaultIfNotExist=""), valueListKey="c"),
                "booster":[float(i) for i in self.recallAdjustParameters.split(",")],
                "edge":Parameters.get(self.model.getStr(self.edgeDetector.tag+"classifier-parameter", defaultIfNotExist=""), valueListKey="c")}

        # Expand the classifier parameters into strings, one per combination
        for step in ["trigger", "edge"]:
            stepParams[step] = Parameters.getCombinations(stepParams[step])
            for i in range(len(stepParams[step])):
                stepParams[step][i] = Parameters.toString(stepParams[step][i])
        print >> sys.stderr, [stepParams[x] for x in ["trigger", "booster", "edge"]]
        paramCombinations = combine(*[stepParams[x] for x in ["trigger", "booster", "edge"]])
        print >> sys.stderr, paramCombinations
        for i in range(len(paramCombinations)):
            paramCombinations[i] = {"trigger":paramCombinations[i][0], "booster":paramCombinations[i][1], "edge":paramCombinations[i][2]}

        #paramCombinations = Parameters.getCombinations(ALL_PARAMS, ["trigger", "booster", "edge"])
        prevParams = None
        EDGE_MODEL_STEM = os.path.join(self.edgeDetector.workDir, os.path.normpath(self.model.path)+"-edge-models/model")
        TRIGGER_MODEL_STEM = os.path.join(self.triggerDetector.workDir, os.path.normpath(self.model.path)+"-trigger-models/model")
        self.structureAnalyzer.load(self.model)
        bestResults = None
        for i in range(len(paramCombinations)):
            params = paramCombinations[i]
            print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
            print >> sys.stderr, "Processing params", str(i+1) + "/" + str(len(paramCombinations)), params
            print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
            # Triggers and Boost: the trigger predictions depend on both the trigger
            # parameters and the booster (passed as recallAdjust), so they are
            # recalculated whenever either of them changes. The booster comparison
            # was previously missing (the trigger comparison was written twice).
            if prevParams == None or prevParams["trigger"] != params["trigger"] or prevParams["booster"] != params["booster"]:
                print >> sys.stderr, "Classifying trigger examples for parameters", "trigger:" + str(params["trigger"]), "booster:" + str(params["booster"])
                xml = self.triggerDetector.classifyToXML(self.optData, self.model, self.workDir+"grid-trigger-examples", self.workDir+"grid-", classifierModel=TRIGGER_MODEL_STEM + Parameters.toId(params["trigger"]), recallAdjust=params["booster"])
            prevParams = params
            ## Build edge examples
            #self.edgeDetector.buildExamples(self.model, [xml], [self.workDir+"grid-edge-examples"], [self.optData])
            # Classify with pre-defined model
            # NOTE(review): xml is rebound to the edge output here, so when the trigger
            # step is skipped the next edge classification receives the previous edge
            # output -- confirm the edge detector is meant to handle that.
            edgeClassifierModel = EDGE_MODEL_STEM + Parameters.toId(params["edge"])
            xml = self.edgeDetector.classifyToXML(xml, self.model, self.workDir+"grid-edge-examples", self.workDir+"grid-", classifierModel=edgeClassifierModel, goldData=self.optData)
            bestResults = self.evaluateGrid(xml, params, bestResults)
        # Remove remaining intermediate grid files
        for tag1 in ["edge", "trigger", "unmerging"]:
            for tag2 in ["examples", "pred.xml.gz"]:
                if os.path.exists(self.workDir+"grid-"+tag1+"-"+tag2):
                    os.remove(self.workDir+"grid-"+tag1+"-"+tag2)
        print >> sys.stderr, "Parameter grid search complete"
        print >> sys.stderr, "Tested", len(paramCombinations), "combinations"
        print >> sys.stderr, "Best parameters:", bestResults[0]
        print >> sys.stderr, "Best result:", bestResults[2] # f-score
        # Save grid model
        self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]), self.model)
        self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]), self.combinedModel, False)
        if self.fullGrid: # define best models
            self.triggerDetector.addClassifierModel(self.model, TRIGGER_MODEL_STEM+str(bestResults[0]["trigger"]), bestResults[0]["trigger"])
            self.edgeDetector.addClassifierModel(self.model, EDGE_MODEL_STEM+str(bestResults[0]["edge"]), bestResults[0]["edge"])
        # Remove work files
        # NOTE(review): "examples.gz" and "pred.xml.gz" are appended without a
        # leading "-", unlike the file names used above -- confirm this is intended.
        for stepTag in [self.workDir+"grid-trigger", self.workDir+"grid-edge", self.workDir+"grid-unmerging"]:
            for fileStem in ["-classifications", "-classifications.log", "examples.gz", "pred.xml.gz"]:
                if os.path.exists(stepTag+fileStem):
                    os.remove(stepTag+fileStem)
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
        """
        Build unmerging ("um") examples for a single sentence and append them to outfile.

        For each entity of the sentence, its outgoing interactions marked event="True"
        are grouped by type, the argument combinations permitted by the structure
        analyzer's argument limits are enumerated, and one example is written for every
        combination that passes structureAnalyzer.isValidEvent.

        sentenceGraph -- graph of the sentence to process
        outfile -- target passed to ExampleUtils.appendExamples for every built example
        goldGraph -- optional gold graph; when None every example gets the class "neg"
        structureAnalyzer -- defaults to None but is called unconditionally below, so
                             callers have to supply one

        Returns the number of examples written (not a list of examples).
        See Core/ExampleUtils for example format.
        """
        self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
        self.triggerFeatureBuilder.initSentence(sentenceGraph)        
        
        exampleIndex = 0
        undirected = sentenceGraph.dependencyGraph.toUndirected()
        paths = undirected
        
        # Get argument order
        self.interactionLenghts = self.getInteractionEdgeLengths(sentenceGraph, paths)
        
        # Map tokens to character offsets (tokenByOffset is not read again in this method;
        # the loop still asserts that the gold tokenization matches)
        tokenByOffset = {}
        for i in range(len(sentenceGraph.tokens)):
            token = sentenceGraph.tokens[i]
            if goldGraph != None: # check that the tokenizations match
                goldToken = goldGraph.tokens[i]
                assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset")
            tokenByOffset[token.get("charOffset")] = token.get("id")
        
        # Map gold entities to their head offsets
        goldEntitiesByOffset = {}
        if goldGraph != None:
            for entity in goldGraph.entities:
                offset = entity.get("headOffset")
                assert offset != None
                if not goldEntitiesByOffset.has_key(offset):
                    goldEntitiesByOffset[offset] = []
                goldEntitiesByOffset[offset].append(entity)
        
        # Choose between the original entities and the merged entities of the sentence
        if self.styles["no_merge"]:
            mergeInput = False
            entities = sentenceGraph.entities
        else:
            mergeInput = True
            sentenceGraph.mergeInteractionGraph(True)
            entities = sentenceGraph.mergedEntities
            self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))
        
        exampleIndex = 0
        for entity in entities: # sentenceGraph.entities:
            if type(entity) in types.StringTypes: # dummy entity for intersentence interactions
                continue
            
            eType = entity.get("type")
            assert eType != None, entity.attrib
            eType = str(eType)
            
            # Collect the event interactions leaving this entity, grouped by interaction type
            interactions = [x[2] for x in sentenceGraph.getOutInteractions(entity, mergeInput)]
            interactions = self.sortInteractionsById(interactions)
            interactionCounts = defaultdict(int)
            validInteractionsByType = defaultdict(list)
            for interaction in interactions:
                if interaction.get("event") != "True":
                    continue
                e1 = sentenceGraph.entitiesById[interaction.get("e1")]
                if interaction.get("e2") in sentenceGraph.entitiesById:
                    e2 = sentenceGraph.entitiesById[interaction.get("e2")]
                    if interaction.get("type") in structureAnalyzer.getValidEdgeTypes(e1.get("type"), e2.get("type")):
                        validInteractionsByType[interaction.get("type")].append(interaction)
                else: # intersentence
                    validInteractionsByType[interaction.get("type")].append(interaction)
                # counted for every event interaction, whether or not it was accepted as valid above
                interactionCounts[interaction.get("type")] += 1
            interactionCountString = ",".join([key + "=" + str(interactionCounts[key]) for key in sorted(interactionCounts.keys())])
            #argCombinations = self.getArgumentCombinations(eType, interactions, entity.get("id"))
            intCombinations = []
            validIntTypeCount = 0
            maxArgCount = 0
            if self.debug:
                print >> sys.stderr, entity.get("id"), entity.get("type"), "int:" + interactionCountString, "validInt:" + str(validInteractionsByType)
            for intType in sorted(validInteractionsByType.keys()): # for each argument type the event can have
                validIntTypeCount += 1
                intCombinations.append([])
                minArgs, maxArgs = structureAnalyzer.getArgLimits(entity.get("type"), intType)
                if maxArgs > maxArgCount:
                    maxArgCount = maxArgs
                #if maxArgs > 1: # allow any number of arguments for cases like Binding
                #    maxArgs = len(validInteractionsByType[intType])
                for combLen in range(minArgs, maxArgs+1): # for each valid argument count, get all possible combinations. note that there may be a zero-length combination
                    for singleTypeArgCombination in combinations(validInteractionsByType[intType], combLen):
                        intCombinations[-1].append(singleTypeArgCombination)
                # e.g. theme:[a,b] with limits 1-2 gives [(a,), (b,), (a,b)]; cause:[d] with limits 0-1 gives [(), (d,)]
            # intCombinations now contains a list of lists, each of which has a tuple for each valid combination
            # of one argument type. Next, we'll make all valid combinations of multiple argument types
            if self.debug:
                print >> sys.stderr, " ", "intCombinations", intCombinations
            argCombinations = combine.combine(*intCombinations)
            if self.debug:
                print >> sys.stderr, " ", "argCombinations", argCombinations
            # Concatenate the per-type tuples of each combination into one flat tuple of interactions
            for i in range(len(argCombinations)):
                argCombinations[i] = sum(argCombinations[i], ())
            #sum(argCombinations, []) # flatten nested list
            if self.debug:
                print >> sys.stderr, " ", "argCombinations flat", argCombinations
            
            for argCombination in argCombinations:
                # Originally binary classification
                if goldGraph != None:
                    isGoldEvent = self.eventIsGold(entity, argCombination, sentenceGraph, goldGraph, goldEntitiesByOffset, goldGraph.interactions)
                    #if eType == "Binding":
                    #    print argCombination[0].get("e1"), len(argCombination), isGoldEvent
                else:
                    isGoldEvent = False
                # Named (multi-)class
                if isGoldEvent:
#                    category = "zeroArg"
#                    if validIntTypeCount == 1:
#                        category = "singleArg" # event has 0-1 arguments (old simple6)
#                    if validIntTypeCount > 1:
#                        category = "multiType" # event has arguments of several types, 0-1 of each (old Regulation)
#                    if maxArgCount > 1:
#                        category = "multiArg" # event can have 2-n of at least one argument type (old Binding)
                    if self.styles["binary"]:
                        category = "pos"
                    else:
                        category = entity.get("type")
                        
                    assert category != None
                else:
                    category = "neg"
                # Every combination is registered with the statistics; filtered ones produce no example
                self.exampleStats.beginExample(category)
                
                issues = defaultdict(int)
                # early out for proteins etc.
                if validIntTypeCount == 0 and entity.get("given") == "True":
                    self.exampleStats.filter("given-leaf:" + entity.get("type"))
                    if self.debug:
                        print >> sys.stderr, " ", category +"("+eType+")", "arg combination", argCombination, "LEAF"
                elif not structureAnalyzer.isValidEvent(entity, argCombination, self.documentEntitiesById, issues=issues):
                    for key in issues:
                        self.exampleStats.filter(key)
                    if self.debug:
                        print >> sys.stderr, " ", category, "arg combination", argCombination, "INVALID", issues
                else:
                    if self.debug:
                        print >> sys.stderr, " ", category, "arg combination", argCombination, "VALID"                
                    features = {}
                    # "i" lists the arguments as comma-separated type=id pairs
                    argString = ""
                    for arg in argCombination:
                        argString += "," + arg.get("type") + "=" + arg.get("id")
                    extra = {"xtype":"um","e":entity.get("id"),"i":argString[1:],"etype":eType,"class":category}
                    extra["allInt"] = interactionCountString
                    assert type(extra["etype"]) in types.StringTypes, extra
                    assert type(extra["class"]) in types.StringTypes, category
                    assert type(extra["i"]) in types.StringTypes, argString
                    example = self.buildExample(sentenceGraph, paths, entity, argCombination, interactions)
                    # Fill in the example id, the class id and the extra information
                    example[0] = sentenceGraph.getSentenceId()+".x"+str(exampleIndex)
                    example[1] = self.classSet.getId(category)
                    example[3] = extra
                    #examples.append( example )
                    ExampleUtils.appendExamples([example], outfile)
                    exampleIndex += 1
                self.exampleStats.endExample()
            
        #return examples
        # Number of examples written to outfile for this sentence
        return exampleIndex
Example #11
0
def buildExamplesFromGraph(sentenceGraph, args, goldGraph=None, structureAnalyzer=None, debug=False):
    """
    Count unmerging candidates for a single sentence.

    The interaction graph of sentenceGraph is merged, the argument
    combinations allowed by structureAnalyzer are enumerated for every merged
    entity, and each combination that contains a 'Theme' argument is compared
    against goldGraph with eventIsGold(). This function does not build or
    write any examples; it only collects per-type counts.

    Arguments:
      sentenceGraph -- unmerged graph of the sentence. It is merged in place
          by calling mergeInteractionGraph(True).
      args -- options object; only args.apply_alg is read. When it is true,
          only the longest argument combinations are kept for the regulation
          types, Binding, Localization and Phosphorylation.
      goldGraph -- unmerged gold graph of the same sentence, or None. Without
          a gold graph every candidate is counted as a false positive.
      structureAnalyzer -- object providing getValidEdgeTypes() and
          getArgLimits(). It is used as soon as an entity has event
          interactions, so the None default only works for sentences that
          have none.
      debug -- when true, print diagnostics to stderr and stop in pdb on
          negative Positive_regulation candidates.

    Returns a tuple (exampleIndex, exampleCounter, dupEntityCnt):
      exampleIndex -- always 0, because no examples are generated here
      exampleCounter -- defaultdict mapping an entity type to {"tp": n, "fp": m}
      dupEntityCnt -- len(sentenceGraph.entities) minus the number of merged entities
    """
    exampleCounter = defaultdict(dict) # exampleCounter['Binding']: {"tp":xxx, "fp": xxx}

    # Check that the tokenizations of the two graphs match
    if goldGraph is not None:
        for i, token in enumerate(sentenceGraph.tokens):
            goldToken = goldGraph.tokens[i]
            assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset")

    # Map gold entities to their head offsets
    goldEntitiesByOffset = {}
    if goldGraph is not None:
        for entity in goldGraph.entities:
            offset = entity.get("headOffset")
            assert offset is not None
            goldEntitiesByOffset.setdefault(offset, []).append(entity)

    # The entities here include both named entities (Protein) and event triggers.
    # The purpose of merging the entities is to convert the original gold annotation, where
    # a trigger can have multiple trigger annotations, to the merged version. - Mu
    mergeInput = True
    assert sentenceGraph.mergedEntityToDuplicates is None # make sure here the sentenceGraph is unmerged (entities)
    sentenceGraph.mergeInteractionGraph(True)
    assert sentenceGraph.mergedEntityToDuplicates is not None # make sure now the sentenceGraph is the merged graph
    if goldGraph is not None: # the gold graph is optional, so only check it when it is given
        assert goldGraph.mergedEntityToDuplicates is None # make sure gold graph is unmerged
    entities = sentenceGraph.mergedEntities
    dupEntityCnt = len(sentenceGraph.entities) - len(entities)

    # Event types for which only the longest argument combinations are kept when args.apply_alg is set.
    # For the regulation types this prefers (Theme, Cause) over (Theme) alone.
    longestOnlyTypes = ('Negative_regulation', 'Positive_regulation', 'Regulation',
                        'Binding', 'Localization', 'Phosphorylation')

    exampleIndex = 0 # never incremented: this function only counts candidates
    for entity in entities:
        if isinstance(entity, types.StringTypes): # dummy entity for intersentence interactions
            continue

        entityType = entity.get("type")
        assert entityType is not None, entity.attrib

        # interactions are outgoing edges for the current merged entity - Mu
        interactions = sortInteractionsById([x[2] for x in sentenceGraph.getOutInteractions(entity, mergeInput)])
        interactionCounts = defaultdict(int)
        validInteractionsByType = defaultdict(list)
        for interaction in interactions:
            if interaction.get("event") != "True":
                continue
            intType = interaction.get("type")
            e1 = sentenceGraph.entitiesById[interaction.get("e1")]
            if interaction.get("e2") in sentenceGraph.entitiesById:
                e2 = sentenceGraph.entitiesById[interaction.get("e2")]
                if intType in structureAnalyzer.getValidEdgeTypes(e1.get("type"), e2.get("type")):
                    validInteractionsByType[intType].append(interaction)
            else: # intersentence
                print("found inter-sent")
                validInteractionsByType[intType].append(interaction)
            interactionCounts[intType] += 1
        if debug:
            interactionCountString = ",".join([key + "=" + str(interactionCounts[key]) for key in sorted(interactionCounts.keys())])
            print >> sys.stderr, entity.get("id"), entityType, "int:" + interactionCountString, "validInt:" + str(validInteractionsByType)

        # For each argument type the event can have, collect every combination with a valid
        # argument count. Note that there may be a zero-length combination.
        # e.g. theme:[a,b], cause:[d] = [[(), (d,)], [(a,), (b,)]] - Mu
        intCombinations = []
        for intType in sorted(validInteractionsByType.keys()):
            minArgs, maxArgs = structureAnalyzer.getArgLimits(entityType, intType)
            singleTypeCombinations = []
            for combLen in range(minArgs, maxArgs + 1):
                singleTypeCombinations.extend(combinations(validInteractionsByType[intType], combLen))
            intCombinations.append(singleTypeCombinations)

        # intCombinations now contains a list of lists, each of which has a tuple for each valid combination
        # of one argument type. Next, we'll make all valid combinations of multiple argument types
        if debug:
            print >> sys.stderr, " ", "intCombinations", intCombinations
        argCombinations = combine.combine(*intCombinations)
        if debug:
            print >> sys.stderr, " ", "argCombinations", argCombinations
        # Flatten each combination of per-type tuples into a single tuple of interactions
        argCombinations = [sum(nested, ()) for nested in argCombinations]
        if debug:
            print >> sys.stderr, " ", "argCombinations flat", argCombinations

        if argCombinations == [()] and entityType not in ['Protein', 'Entity']:
            # meaning that this is an event trigger and also it has no outgoing edges,
            # possibly due to removed inter-sentence interactions,
            # so skip this to prevent generating a false positive
            # TODO: need to think about this - how to deal with the inter-sentence interactions? view it as an error?
            continue

        # Hoisted out of the loop below; guarded because max() fails on an empty sequence
        if argCombinations:
            maxArgCombinationLen = max([len(combination) for combination in argCombinations])
        else:
            maxArgCombinationLen = 0

        for argCombination in argCombinations:
            # filter out the combinations where the mandatory 'Theme' argument is not present
            # this can be due to inter-sentence interaction, like the case in the Phosphorylation in GE09.d169.s2
            if 'Theme' not in [interaction.get('type') for interaction in argCombination]:
                continue
            if args.apply_alg and entityType in longestOnlyTypes and len(argCombination) != maxArgCombinationLen:
                continue

            if goldGraph is not None:
                isGoldEvent = eventIsGold(entity, argCombination, sentenceGraph, goldGraph, goldEntitiesByOffset, goldGraph.interactions)
            else:
                isGoldEvent = False

            # Named (multi-)class
            if isGoldEvent:
                category = entityType
                outcome = "tp"
            else:
                # the unmerging candidate is a false positive for its trigger type
                category = "neg"
                outcome = "fp"
            exampleCounter.setdefault(entityType, {"tp": 0, "fp": 0})[outcome] += 1

            # For debugging - investigate why there are false positives for single argument events.
            # Only stop in the debugger when debugging was requested, so that batch runs do not hang.
            if debug and category == 'neg' and entityType == 'Positive_regulation':
                pdb.set_trace()

    return exampleIndex, exampleCounter, dupEntityCnt
Example #12
0
    def buildExamplesFromGraph(self, sentenceGraph, examples, goldGraph=None):
        """
        Build unmerging examples for one sentence and append them to 'examples'.

        For every (merged, unless the "no_merge" style is set) entity, all
        argument combinations allowed by self.structureAnalyzer are generated.
        Each combination accepted by isValidEntity()/isValidEvent() becomes a
        dictionary with the keys "id", "labels", "features", "extra" and "doc"
        that is appended to 'examples'. Rejected combinations are reported to
        self.exampleStats.

        Arguments:
          sentenceGraph -- graph of the sentence to process
          examples -- list that receives the generated example dictionaries
          goldGraph -- optional gold graph; without it all examples are unlabeled

        Returns the local exampleIndex.
        NOTE(review): the local counter is never incremented (only
        self.exampleIndex is), so the return value is always 0 -- confirm
        with the callers whether this is intended.
        """
        depGraph = sentenceGraph.dependencyGraph
        undirected = depGraph.toUndirected()
        # Number of dependency edges (incoming + outgoing) attached to each token
        edgeCounts = {}
        for tok in sentenceGraph.tokens:
            edgeCounts[tok] = len(depGraph.getInEdges(tok) + depGraph.getOutEdges(tok))
        paths = undirected

        # Group the gold entities by the offset of their head
        goldEntitiesByOffset = {}
        if goldGraph != None:
            for goldEntity in goldGraph.entities:
                headOffset = goldEntity.get("headOffset")
                assert headOffset != None
                goldEntitiesByOffset.setdefault(headOffset, []).append(goldEntity)

        # Unless disabled by the "no_merge" style, work on the merged graph
        mergeInput = "no_merge" not in self.styles
        if mergeInput:
            sentenceGraph.mergeInteractionGraph(True)
            entities = sentenceGraph.mergedEntities
            self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))
        else:
            entities = sentenceGraph.entities

        # Token features are generated once and shared by all examples of the sentence
        tokens, tokenMap = self.getTokenFeatures(sentenceGraph)

        exampleIndex = 0
        for entity in entities:
            # Strings are dummy entities standing in for intersentence interactions
            if type(entity) in types.StringTypes:
                continue

            eType = entity.get("type")
            assert eType != None, entity.attrib
            eType = str(eType)

            outEdges = sentenceGraph.getOutInteractions(entity, mergeInput)
            interactions = sorted((edge[2] for edge in outEdges), key=lambda k: k.get("id"))
            interactionCounts = defaultdict(int)
            validInteractionsByType = defaultdict(list)
            # Forget the interactions marked on the tokens for the previous entity
            for token in tokens:
                token["interaction"] = None
            for interaction in interactions:
                if interaction.get("event") != "True":
                    continue
                edgeType = interaction.get("type")
                e1 = sentenceGraph.entitiesById[interaction.get("e1")]
                e2Id = interaction.get("e2")
                if e2Id not in sentenceGraph.entitiesById:
                    # The target is outside this sentence: accept without a structure check
                    validInteractionsByType[edgeType].append(interaction)
                else:
                    e2 = sentenceGraph.entitiesById[e2Id]
                    if edgeType in self.structureAnalyzer.getValidEdgeTypes(e1.get("type"), e2.get("type")):
                        validInteractionsByType[edgeType].append(interaction)
                        e2Token = sentenceGraph.entityHeadTokenByEntity[e2]
                        tokenMap[e2Token]["interaction"] = interaction
                interactionCounts[edgeType] += 1
            countParts = []
            for key in sorted(interactionCounts.keys()):
                countParts.append(key + "=" + str(interactionCounts[key]))
            interactionCountString = ",".join(countParts)

            if self.debug:
                print >> sys.stderr, entity.get("id"), entity.get("type"), "int:" + interactionCountString, "validInt:" + str(validInteractionsByType)

            # One list per argument type, holding every combination of that type's
            # interactions whose length is within the allowed limits (possibly zero)
            intCombinations = []
            maxArgCount = 0
            for argType in sorted(validInteractionsByType.keys()):
                minArgs, maxArgs = self.structureAnalyzer.getArgLimits(entity.get("type"), argType)
                maxArgCount = max(maxArgCount, maxArgs)
                singleTypeCombinations = []
                for combLen in range(minArgs, maxArgs + 1):
                    singleTypeCombinations.extend(combinations(validInteractionsByType[argType], combLen))
                intCombinations.append(singleTypeCombinations)
            validIntTypeCount = len(intCombinations)

            # Combine the per-type lists into combinations spanning all argument types
            if self.debug:
                print >> sys.stderr, " ", "intCombinations", intCombinations
            argCombinations = combine.combine(*intCombinations)
            if self.debug:
                print >> sys.stderr, " ", "argCombinations", argCombinations
            # Flatten each combination of per-type tuples into one tuple of interactions
            for position, nested in enumerate(argCombinations):
                argCombinations[position] = sum(nested, ())
            if self.debug:
                print >> sys.stderr, " ", "argCombinations flat", argCombinations

            for argCombination in argCombinations:
                isGoldEvent = False
                if goldGraph != None:
                    isGoldEvent = self.eventIsGold(entity, argCombination, sentenceGraph, goldGraph, goldEntitiesByOffset, goldGraph.interactions)

                # Negative examples carry no label; positive ones are named by type or "pos"
                if not isGoldEvent:
                    labels = []
                elif "binary" in self.styles:
                    labels = ["pos"]
                else:
                    labels = [entity.get("type")]
                labelString = ",".join(labels)
                self.exampleStats.beginExample(labelString)

                issues = defaultdict(int)
                if validIntTypeCount == 0 and entity.get("given") == "True":
                    # Given entities without any valid arguments (proteins etc.) are filtered out early
                    self.exampleStats.filter("given-leaf:" + entity.get("type"))
                    if self.debug:
                        print >> sys.stderr, " ", labelString + "(" + eType + ")", "arg combination", argCombination, "LEAF"
                elif self.structureAnalyzer.isValidEntity(entity) or self.structureAnalyzer.isValidEvent(
                        entity, argCombination, self.documentEntitiesById,
                        noUpperLimitBeyondOne="no_arg_count_upper_limit" in self.styles,
                        issues=issues):
                    if self.debug:
                        print >> sys.stderr, " ", labelString, "arg combination", argCombination, "VALID"
                    # Every argument contributes ",type=id"; the leading comma is cut off below
                    argString = "".join("," + arg.get("type") + "=" + arg.get("id") for arg in argCombination)
                    extra = {}
                    extra["xtype"] = "um"
                    extra["e"] = entity.get("id")
                    extra["i"] = argString[1:]
                    extra["etype"] = eType
                    extra["class"] = labelString
                    extra["allInt"] = interactionCountString
                    assert type(extra["etype"]) in types.StringTypes, extra
                    assert type(extra["class"]) in types.StringTypes, labelString
                    assert type(extra["i"]) in types.StringTypes, argString
                    features = self.buildFeatures(sentenceGraph, paths, entity, argCombination, interactions, tokens, tokenMap, undirected, edgeCounts)
                    exampleId = sentenceGraph.getSentenceId() + ".x" + str(self.exampleIndex)
                    docId = sentenceGraph.documentElement.get("id")
                    examples.append({"id": exampleId, "labels": labels, "features": features, "extra": extra, "doc": docId})
                    self.exampleIndex += 1
                else:
                    # Neither a valid event nor a valid entity. At least one issue
                    # must be recorded so that the example is counted as filtered.
                    if not issues:
                        if self.structureAnalyzer.isValidEntity(entity):
                            issues["UNKNOWN_ISSUE_FOR:" + eType] += 1
                        else:
                            issues["INVALID_ENTITY:" + eType] += 1
                    for key in issues:
                        self.exampleStats.filter(key)
                    if self.debug:
                        print >> sys.stderr, " ", labels, "arg combination", argCombination, "INVALID", issues
                self.exampleStats.endExample()

        return exampleIndex
Example #13
0
    def buildExamplesFromGraph(self, sentenceGraph, examples, goldGraph=None):
        dg = sentenceGraph.dependencyGraph
        undirected = dg.toUndirected()
        edgeCounts = {x:len(dg.getInEdges(x) + dg.getOutEdges(x)) for x in sentenceGraph.tokens}
        paths = undirected
        
        # Get argument order
        #self.interactionLengths = self.getInteractionEdgeLengths(sentenceGraph, paths)
        
#         # Map tokens to character offsets
#         tokenByOffset = {}
#         for i in range(len(sentenceGraph.tokens)):
#             token = sentenceGraph.tokens[i]
#             if goldGraph != None: # check that the tokenizations match
#                 goldToken = goldGraph.tokens[i]
#                 assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset")
#             tokenByOffset[token.get("charOffset")] = token.get("id")
        
        # Map gold entities to their head offsets
        goldEntitiesByOffset = {}
        if goldGraph != None:
            for entity in goldGraph.entities:
                offset = entity.get("headOffset")
                assert offset != None
                if not goldEntitiesByOffset.has_key(offset):
                    goldEntitiesByOffset[offset] = []
                goldEntitiesByOffset[offset].append(entity)
        
        if "no_merge" in self.styles:
            mergeInput = False
            entities = sentenceGraph.entities
        else:
            mergeInput = True
            sentenceGraph.mergeInteractionGraph(True)
            entities = sentenceGraph.mergedEntities
            self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))
        
        # Pre-generate features for all tokens in the sentence
        tokens, tokenMap = self.getTokenFeatures(sentenceGraph)
        
        exampleIndex = 0
        for entity in entities: # sentenceGraph.entities:
            if type(entity) in types.StringTypes: # dummy entity for intersentence interactions
                continue
            
            eType = entity.get("type")
            assert eType != None, entity.attrib
            eType = str(eType)
            
            interactions = [x[2] for x in sentenceGraph.getOutInteractions(entity, mergeInput)]
            interactions.sort(key=lambda k: k.get("id"))
            interactionCounts = defaultdict(int)
            validInteractionsByType = defaultdict(list)
            for token in tokens:
                token["interaction"] = None
            for interaction in interactions:
                if interaction.get("event") != "True":
                    continue
                e1 = sentenceGraph.entitiesById[interaction.get("e1")]
                #assert e1 == entity, (e1.attrib, entity.attrib)
                if interaction.get("e2") in sentenceGraph.entitiesById:
                    e2 = sentenceGraph.entitiesById[interaction.get("e2")]
                    if interaction.get("type") in self.structureAnalyzer.getValidEdgeTypes(e1.get("type"), e2.get("type")):
                        validInteractionsByType[interaction.get("type")].append(interaction)
                        e2Token = sentenceGraph.entityHeadTokenByEntity[e2]
                        tokenMap[e2Token]["interaction"] = interaction
                else: # intersentence
                    validInteractionsByType[interaction.get("type")].append(interaction)
                interactionCounts[interaction.get("type")] += 1
            interactionCountString = ",".join([key + "=" + str(interactionCounts[key]) for key in sorted(interactionCounts.keys())])
            #argCombinations = self.getArgumentCombinations(eType, interactions, entity.get("id"))
            intCombinations = []
            validIntTypeCount = 0
            maxArgCount = 0
            if self.debug:
                print >> sys.stderr, entity.get("id"), entity.get("type"), "int:" + interactionCountString, "validInt:" + str(validInteractionsByType)
            for intType in sorted(validInteractionsByType.keys()): # for each argument type the event can have
                validIntTypeCount += 1
                intCombinations.append([])
                minArgs, maxArgs = self.structureAnalyzer.getArgLimits(entity.get("type"), intType)
                if maxArgs > maxArgCount:
                    maxArgCount = maxArgs
                #if maxArgs > 1: # allow any number of arguments for cases like Binding
                #    maxArgs = len(validInteractionsByType[intType])
                for combLen in range(minArgs, maxArgs+1): # for each valid argument count, get all possible combinations. note that there may be zero-lenght combination
                    for singleTypeArgCombination in combinations(validInteractionsByType[intType], combLen):
                        intCombinations[-1].append(singleTypeArgCombination)
                # e.g. theme:[a,b], cause:[d] = [[
            # intCombinations now contains a list of lists, each of which has a tuple for each valid combination
            # of one argument type. Next, we'll make all valid combinations of multiple argument types
            if self.debug:
                print >> sys.stderr, " ", "intCombinations", intCombinations
            argCombinations = combine.combine(*intCombinations)
            if self.debug:
                print >> sys.stderr, " ", "argCombinations", argCombinations
            for i in range(len(argCombinations)):
                argCombinations[i] = sum(argCombinations[i], ())
            #sum(argCombinations, []) # flatten nested list
            if self.debug:
                print >> sys.stderr, " ", "argCombinations flat", argCombinations
            
            for argCombination in argCombinations:
                # Originally binary classification
                if goldGraph != None:
                    isGoldEvent = self.eventIsGold(entity, argCombination, sentenceGraph, goldGraph, goldEntitiesByOffset, goldGraph.interactions)
                    #if eType == "Binding":
                    #    print argCombination[0].get("e1"), len(argCombination), isGoldEvent
                else:
                    isGoldEvent = False
                # Named (multi-)class
                labels = []
                if isGoldEvent:
                    if "binary" in self.styles:
                        labels = ["pos"]
                    else:
                        labels = [entity.get("type")]
                #else:
                #    category = "neg"
                self.exampleStats.beginExample(",".join(labels))
                
                issues = defaultdict(int)
                # early out for proteins etc.
                if validIntTypeCount == 0 and entity.get("given") == "True":
                    self.exampleStats.filter("given-leaf:" + entity.get("type"))
                    if self.debug:
                        print >> sys.stderr, " ", ",".join(labels) +"("+eType+")", "arg combination", argCombination, "LEAF"
                elif self.structureAnalyzer.isValidEntity(entity) or self.structureAnalyzer.isValidEvent(entity, argCombination, self.documentEntitiesById, noUpperLimitBeyondOne = "no_arg_count_upper_limit" in self.styles, issues=issues):
                    if self.debug:
                        print >> sys.stderr, " ", ",".join(labels), "arg combination", argCombination, "VALID"
                    argString = ""
                    for arg in argCombination:
                        argString += "," + arg.get("type") + "=" + arg.get("id")
                    extra = {"xtype":"um","e":entity.get("id"),"i":argString[1:],"etype":eType,"class":",".join(labels)}
                    extra["allInt"] = interactionCountString
                    assert type(extra["etype"]) in types.StringTypes, extra
                    assert type(extra["class"]) in types.StringTypes, ",".join(labels)
                    assert type(extra["i"]) in types.StringTypes, argString
                    features = self.buildFeatures(sentenceGraph, paths, entity, argCombination, interactions, tokens, tokenMap, undirected, edgeCounts)
                    examples.append({"id":sentenceGraph.getSentenceId()+".x"+str(self.exampleIndex), "labels":labels, "features":features, "extra":extra, "doc":sentenceGraph.documentElement.get("id")}) #, "extra":{"eIds":entityIds}}
                    self.exampleIndex += 1
                else: # not a valid event or valid entity
                    if len(issues) == 0: # must be > 0 so that it gets filtered
                        if not self.structureAnalyzer.isValidEntity(entity):
                            issues["INVALID_ENTITY:"+eType] += 1
                        else:
                            issues["UNKNOWN_ISSUE_FOR:"+eType] += 1
                    for key in issues:
                        self.exampleStats.filter(key)
                    if self.debug:
                        print >> sys.stderr, " ", labels, "arg combination", argCombination, "INVALID", issues
                self.exampleStats.endExample()
            
        #return examples
        return exampleIndex
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
        """
        Build unmerging examples for a single sentence and write them to outfile.

        For each entity of the sentence, every combination of its valid outgoing
        event arguments is treated as a candidate event. Candidates accepted by
        structureAnalyzer are written with ExampleUtils.appendExamples; rejected
        ones are only recorded as filtered in self.exampleStats.

        sentenceGraph -- the sentence to build examples from
        outfile -- output target handed to ExampleUtils.appendExamples
        goldGraph -- optional gold-standard graph of the same sentence. Its
                     tokenization must match sentenceGraph (asserted). It is used
                     for labelling; without it every example gets class "neg".
        structureAnalyzer -- object providing getValidEdgeTypes, getArgLimits,
                     isValidEntity and isValidEvent. NOTE: despite the None
                     default it is used without a None check, so callers must
                     pass one.

        Returns the number of examples written (not a list of examples).
        See Core/ExampleUtils for example format.
        """
        self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
        self.triggerFeatureBuilder.initSentence(sentenceGraph)

        undirected = sentenceGraph.dependencyGraph.toUndirected()
        paths = undirected

        # Get argument order
        self.interactionLenghts = self.getInteractionEdgeLengths(sentenceGraph, paths)

        # Check that the gold tokenization matches the sentence's tokenization
        if goldGraph is not None:
            for i, token in enumerate(sentenceGraph.tokens):
                goldToken = goldGraph.tokens[i]
                assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset")

        # Map gold entities to their head offsets. A plain dict is used (not a
        # defaultdict) so that lookups of unknown offsets by eventIsGold do not
        # silently create entries.
        goldEntitiesByOffset = {}
        if goldGraph is not None:
            for goldEntity in goldGraph.entities:
                offset = goldEntity.get("headOffset")
                assert offset is not None
                goldEntitiesByOffset.setdefault(offset, []).append(goldEntity)

        if self.styles["no_merge"]:
            mergeInput = False
            entities = sentenceGraph.entities
        else:
            mergeInput = True
            sentenceGraph.mergeInteractionGraph(True)
            entities = sentenceGraph.mergedEntities
            self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))

        exampleIndex = 0
        for entity in entities:
            if type(entity) in types.StringTypes: # dummy entity for intersentence interactions
                continue

            eType = entity.get("type")
            assert eType is not None, entity.attrib
            eType = str(eType)

            # Collect the outgoing event arguments of this entity, grouped by type.
            # interactionCounts covers all event arguments, validInteractionsByType
            # only those whose type the structure analyzer allows for the entity pair.
            interactions = [x[2] for x in sentenceGraph.getOutInteractions(entity, mergeInput)]
            interactions = self.sortInteractionsById(interactions)
            interactionCounts = defaultdict(int)
            validInteractionsByType = defaultdict(list)
            for interaction in interactions:
                if interaction.get("event") != "True":
                    continue
                e1 = sentenceGraph.entitiesById[interaction.get("e1")]
                iType = interaction.get("type")
                if interaction.get("e2") in sentenceGraph.entitiesById:
                    e2 = sentenceGraph.entitiesById[interaction.get("e2")]
                    if iType in structureAnalyzer.getValidEdgeTypes(e1.get("type"), e2.get("type")):
                        validInteractionsByType[iType].append(interaction)
                else: # intersentence, the target entity cannot be checked
                    validInteractionsByType[iType].append(interaction)
                interactionCounts[iType] += 1
            interactionCountString = ",".join([key + "=" + str(interactionCounts[key]) for key in sorted(interactionCounts)])
            if self.debug:
                print >> sys.stderr, entity.get("id"), entity.get("type"), "int:" + interactionCountString, "validInt:" + str(validInteractionsByType)

            # For each argument type the event can have, list every combination
            # of that type's arguments whose size is within the allowed limits.
            # Note that there may be a zero-length combination.
            intCombinations = []
            for intType in sorted(validInteractionsByType):
                minArgs, maxArgs = structureAnalyzer.getArgLimits(entity.get("type"), intType)
                singleTypeCombinations = []
                for combLen in range(minArgs, maxArgs + 1):
                    singleTypeCombinations.extend(combinations(validInteractionsByType[intType], combLen))
                intCombinations.append(singleTypeCombinations)
            validIntTypeCount = len(intCombinations)
            # intCombinations now contains a list of lists, each of which has a tuple for each valid combination
            # of one argument type. Next, we'll make all valid combinations of multiple argument types
            if self.debug:
                print >> sys.stderr, " ", "intCombinations", intCombinations
            argCombinations = combine.combine(*intCombinations)
            if self.debug:
                print >> sys.stderr, " ", "argCombinations", argCombinations
            # Flatten each combination of per-type tuples into one tuple of arguments
            argCombinations = [sum(perTypeTuples, ()) for perTypeTuples in argCombinations]
            if self.debug:
                print >> sys.stderr, " ", "argCombinations flat", argCombinations

            for argCombination in argCombinations:
                # Originally binary classification
                if goldGraph is not None:
                    isGoldEvent = self.eventIsGold(entity, argCombination, sentenceGraph, goldGraph, goldEntitiesByOffset, goldGraph.interactions)
                else:
                    isGoldEvent = False
                # Named (multi-)class
                if isGoldEvent:
                    if self.styles["binary"]:
                        category = "pos"
                    else:
                        category = entity.get("type")
                    assert category is not None
                else:
                    category = "neg"
                self.exampleStats.beginExample(category)

                # Reasons for rejection, filled in by isValidEvent
                issues = defaultdict(int)
                # early out for proteins etc.
                if validIntTypeCount == 0 and entity.get("given") == "True":
                    self.exampleStats.filter("given-leaf:" + entity.get("type"))
                    if self.debug:
                        print >> sys.stderr, " ", category +"("+eType+")", "arg combination", argCombination, "LEAF"
                elif structureAnalyzer.isValidEntity(entity) or structureAnalyzer.isValidEvent(entity, argCombination, self.documentEntitiesById, noUpperLimitBeyondOne=self.styles["no_arg_count_upper_limit"], issues=issues):
                    if self.debug:
                        print >> sys.stderr, " ", category, "arg combination", argCombination, "VALID"
                    argString = ",".join(arg.get("type") + "=" + arg.get("id") for arg in argCombination)
                    extra = {"xtype":"um","e":entity.get("id"),"i":argString,"etype":eType,"class":category}
                    extra["allInt"] = interactionCountString
                    assert type(extra["etype"]) in types.StringTypes, extra
                    assert type(extra["class"]) in types.StringTypes, category
                    assert type(extra["i"]) in types.StringTypes, argString
                    example = self.buildExample(sentenceGraph, paths, entity, argCombination, interactions)
                    example[0] = sentenceGraph.getSentenceId()+".x"+str(exampleIndex)
                    example[1] = self.classSet.getId(category)
                    example[3] = extra
                    ExampleUtils.appendExamples([example], outfile)
                    exampleIndex += 1
                else: # not a valid event or valid entity
                    if len(issues) == 0: # must be > 0 so that it gets filtered
                        if not structureAnalyzer.isValidEntity(entity):
                            issues["INVALID_ENTITY:"+eType] += 1
                        else:
                            issues["UNKNOWN_ISSUE_FOR:"+eType] += 1
                    for key in issues:
                        self.exampleStats.filter(key)
                    if self.debug:
                        print >> sys.stderr, " ", category, "arg combination", argCombination, "INVALID", issues
                self.exampleStats.endExample()

        return exampleIndex
Example #15
0
    def getArgumentCombinations(self, eType, interactions, entityId=None):
        """
        List the candidate argument combinations for an event of type eType.

        eType -- event type name. "Binding" and "Process" have dedicated rules,
                 every other type is handled as a regulation or simple event.
        interactions -- candidate argument interactions; only their "type"
                 attribute (read with .get) is inspected here
        entityId -- currently unused

        Returns a list of combinations, each a sequence of interactions:
        - Binding: every combination of 1 to 10 Theme interactions (all other
          argument types are skipped)
        - Process: the empty combination, then one single-item list for each
          Participant interaction
        - other types: the concatenated combine.combine() results of the themes
          with each permitted group of secondary arguments, followed by every
          theme on its own
        """
        if eType == "Binding":
            # Making examples for only all-together/all-separate cases
            # doesn't work, since even gold data has several cases of
            # overlapping bindings with different numbers of arguments.
            # Causes (and any other non-Theme arguments) are skipped.
            themes = [x for x in interactions if x.get("type") == "Theme"]
            # Looking at a2-normalize.pl reveals that there can be max 6 themes.
            # Based on training+devel data, four is maximum. The cap applied
            # here is 10 themes per combination.
            maxThemes = 10
            combs = []
            for themeCount in range(1, min(len(themes), maxThemes) + 1):
                combs.extend(combinations(themes, themeCount))
            return combs

        if eType == "Process": # For ID-task
            # A process can have 0 interactions, hence the leading empty combination
            return [[]] + [[x] for x in interactions if x.get("type") == "Participant"]

        # One of the regulation-types, or one of the simple types.
        # Group the supported argument types; interactions of any other type
        # (including AtLoc and ToLoc) are ignored.
        argsByType = {"Theme":[], "Cause":[], "SiteArg":[], "Contextgene":[], "Sidechain":[]}
        for interaction in interactions:
            group = argsByType.get(interaction.get("type"))
            if group is not None:
                group.append(interaction)
        themes = argsByType["Theme"]
        causes = argsByType["Cause"]
        siteArgs = argsByType["SiteArg"]
        contextGenes = argsByType["Contextgene"]
        sideChains = argsByType["Sidechain"]
        # AtLoc/ToLoc arguments are not collected, so this group is always empty.
        # It is still passed to combine.combine below to keep the result unchanged.
        locTargets = []

        # Limit arguments to event types that can have them
        if "egulation" not in eType and eType != "Catalysis":
            causes = []
        if eType != "Glycosylation":
            sideChains = []
        if eType not in ["Acetylation", "Methylation"]:
            contextGenes = []
        if eType == "Catalysis":
            siteArgs = []

        # Themes can always appear alone
        themeAloneCombinations = [[theme] for theme in themes]
        return combine.combine(themes, causes) \
               + combine.combine(themes, siteArgs) \
               + combine.combine(themes, sideChains) \
               + combine.combine(themes, contextGenes) \
               + combine.combine(themes, siteArgs, sideChains) \
               + combine.combine(themes, siteArgs, contextGenes) \
               + combine.combine(themes, locTargets) \
               + themeAloneCombinations