예제 #1
0
class EventExampleBuilder(ExampleBuilder):
    def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
        """
        Set up the id sets, the style flags and the edge feature builder.

        style -- collection of style flag names; when None the flags
                 "typed", "directed" and "headsOnly" are used
        length -- must be None (asserted below); stored as self.pathLengths
        types -- stored as self.types; a new empty list when None
        featureSet -- feature id set; a new IdSet() when None
        classSet -- class id set; a new IdSet(1) when None. The id of the
                    class "neg" must be 1 (asserted below).
        """
        # Defaults are created per call: list defaults written in the
        # signature would be one shared object for every builder using them.
        if style is None:
            style = ["typed", "directed", "headsOnly"]
        if types is None:
            types = []
        if featureSet is None:
            featureSet = IdSet()
        if classSet is None:
            classSet = IdSet(1)
        assert classSet.getId("neg") == 1

        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        self.styles = style

        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        if "noAnnType" in self.styles:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True
        # self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        # if "ontology" in self.styles:
        #    self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
        self.pathLengths = length
        assert self.pathLengths is None
        self.types = types

        # self.outFile = open("exampleTempFile.txt","wt")

    @classmethod
    def run(cls, input, output, parse, tokenization, style, idFileTag=None):
        """
        Build examples for an input with a freshly created builder.

        The id sets are obtained from cls.getIdSets(idFileTag), an
        EventExampleBuilder is created with them and the given style, the
        sentences come from cls.getSentences(input, parse, tokenization) and
        are handed to buildExamplesForSentences together with output and
        idFileTag.
        """
        idSets = cls.getIdSets(idFileTag)
        classSet, featureSet = idSets
        builder = EventExampleBuilder(style=style, featureSet=featureSet, classSet=classSet)
        loadedSentences = cls.getSentences(input, parse, tokenization)
        builder.buildExamplesForSentences(loadedSentences, output, idFileTag)

    def definePredictedValueRange(self, sentences, elementName):
        self.multiEdgeFeatureBuilder.definePredictedValueRange(sentences, elementName)

    def getPredictedValueRange(self):
        return self.multiEdgeFeatureBuilder.predictedRange

    def preProcessExamples(self, allExamples):
        if "normalize" in self.styles:
            print >>sys.stderr, " Normalizing feature vectors"
            ExampleUtils.normalizeFeatureVectors(allExamples)
        return allExamples

    def isPotentialGeniaInteraction(self, e1, e2):
        """
        Tell whether e2 is accepted as an argument candidate of e1.

        Returns False when e1 has isName == "True" and the isName attribute
        of e2 is either "True" or "False"; returns True in every other case.
        """
        if e1.get("isName") != "True":
            return True
        return e2.get("isName") not in ("True", "False")

    def getArgumentEntities(self, sentenceGraph, entityNode):
        """
        Collect the argument entities of an entity.

        Looks at every interaction whose "e1" equals the id of entityNode and
        resolves its "e2" through sentenceGraph.entitiesById. Returns the
        tuple (themeNodes, causeNodes). The entity must have an id and each
        matching interaction must be of type "Theme" or "Cause"; both
        conditions are asserted.
        """
        entityId = entityNode.get("id")
        assert entityId is not None
        argumentsByType = {"Theme": [], "Cause": []}
        for interaction in sentenceGraph.interactions:
            if interaction.get("e1") != entityId:
                continue
            interactionType = interaction.get("type")
            assert interactionType in ("Theme", "Cause"), interactionType
            target = sentenceGraph.entitiesById[interaction.get("e2")]
            argumentsByType[interactionType].append(target)
        return argumentsByType["Theme"], argumentsByType["Cause"]

    def makeGSEvents(self, sentenceGraph):
        self.gsEvents = {}  # [token]->[event-type]->[1-n argument sets]
        for token in sentenceGraph.tokens:
            self.gsEvents[token] = {}

        for entity in sentenceGraph.entities:
            if entity.get("type") == "neg":
                continue

            eId = entity.get("id")
            eType = entity.get("type")
            arguments = set()
            for interaction in sentenceGraph.interactions:
                if interaction.get("e1") == eId:
                    arguments.add((interaction.get("type"), interaction.get("e2")))
            eHeadToken = sentenceGraph.entityHeadTokenByEntity[entity]
            if not self.gsEvents[eHeadToken].has_key(eType):
                self.gsEvents[eHeadToken][eType] = []
            self.gsEvents[eHeadToken][eType].append(arguments)

    def isGSEvent(self, sentenceGraph, entity, themeNodes, causeNodes):
        eHeadToken = sentenceGraph.entityHeadTokenByEntity[entity]
        eType = entity.get("type")
        if not self.gsEvents[eHeadToken].has_key(eType):
            return False

        argumentSet = set()
        for themeNode in themeNodes:
            if themeNode != None:
                argumentSet.add(("Theme", themeNode.get("id")))
        for causeNode in causeNodes:
            if causeNode != None:
                argumentSet.add(("Cause", causeNode.get("id")))
        if argumentSet in self.gsEvents[eHeadToken][eType]:
            return True
        else:
            return False

    #    def isEvent(self, sentenceGraph, eventNode, themeNodes, causeNodes):
    #        goldThemeNodes, goldCauseNodes = self.getArgumentEntities(sentenceGraph, eventNode)
    #        for node in themeNodes:
    #            if node != None and node not in goldThemeNodes:
    #                return False
    #        for node in causeNodes:
    #            if node != None and node not in goldCauseNodes:
    #                return False
    #        return True

    def buildExamples(self, sentenceGraph):
        """
        Build the event examples of one sentence.

        Entities of type "neg" are ignored. The remaining entities are split
        into name entities (isName == "True") and event entities. For each
        event entity:
          * the single-argument event types get one example per name entity
            accepted by isPotentialGeniaInteraction,
          * the regulation types get one example per (theme, cause)
            combination of all entities plus None, skipping combinations
            with identical members, combinations containing the event itself
            and those rejected by isPotentialGeniaInteraction,
          * "Binding" produces no examples,
          * any other type fails an assertion.

        self.gsEvents is filled for the duration of the call and reset to
        None before returning. Returns the list of examples.
        """
        singleArgumentTypes = (
            "Gene_expression",
            "Transcription",
            "Protein_catabolism",
            "Localization",
            "Phosphorylation",
        )
        regulationTypes = ("Regulation", "Positive_regulation", "Negative_regulation")

        self.makeGSEvents(sentenceGraph)

        candidates = [entity for entity in sentenceGraph.entities if entity.get("type") != "neg"]
        eventNodes = [entity for entity in candidates if entity.get("isName") != "True"]
        nameNodes = [entity for entity in candidates if entity.get("isName") == "True"]
        allNodes = eventNodes + nameNodes

        # Shortest dependency paths are computed on the undirected graph
        paths = NX.all_pairs_shortest_path(sentenceGraph.dependencyGraph.to_undirected(), cutoff=999)

        # The index given to each example is the number of examples built so far
        examples = []
        for eventNode in eventNodes:
            eventType = eventNode.get("type")
            if eventType in singleArgumentTypes:
                for nameNode in nameNodes:
                    if not self.isPotentialGeniaInteraction(eventNode, nameNode):
                        continue
                    examples.append(self.buildExample(len(examples), sentenceGraph, paths, eventNode, nameNode))
            elif eventType in regulationTypes:
                for combination in combine.combine(allNodes + [None], allNodes + [None]):
                    themeNode = combination[0]
                    causeNode = combination[1]
                    if themeNode == causeNode:
                        continue
                    if themeNode == eventNode or causeNode == eventNode:
                        continue
                    if themeNode is not None and not self.isPotentialGeniaInteraction(eventNode, themeNode):
                        continue
                    if causeNode is not None and not self.isPotentialGeniaInteraction(eventNode, causeNode):
                        continue
                    examples.append(
                        self.buildExample(len(examples), sentenceGraph, paths, eventNode, themeNode, causeNode)
                    )
            elif eventType in ("Binding",):
                continue
            else:
                assert False, eventType

        self.gsEvents = None
        return examples

    def buildExample(self, exampleIndex, sentenceGraph, paths, eventNode, themeNode, causeNode=None):
        features = {}

        if self.isGSEvent(sentenceGraph, eventNode, [themeNode], [causeNode]):
            category = self.classSet.getId("pos")
        else:
            category = self.classSet.getId("neg")

        if themeNode != None:
            self.buildArgumentFeatures(sentenceGraph, paths, features, eventNode, themeNode, "theme_")
        if causeNode != None:
            self.buildArgumentFeatures(sentenceGraph, paths, features, eventNode, causeNode, "cause_")

        # Common features
        #        eventType = eventNode.get("type")
        #        e2Type = entity2.get("type")
        #        assert(entity1.get("isName") == "False")
        #        if entity2.get("isName") == "True":
        #            features[self.featureSet.getId("GENIA_target_protein")] = 1
        #        else:
        #            features[self.featureSet.getId("GENIA_nested_event")] = 1
        #        if e1Type.find("egulation") != -1: # leave r out to avoid problems with capitalization
        #            if entity2.get("isName") == "True":
        #                features[self.featureSet.getId("GENIA_regulation_of_protein")] = 1
        #            else:
        #                features[self.featureSet.getId("GENIA_regulation_of_event")] = 1

        # define extra attributes
        extra = {"xtype": "trigger-event", "type": eventNode.get("type")}
        extra["e"] = eventNode.get("id")
        eventToken = sentenceGraph.entityHeadTokenByEntity[eventNode]
        extra["et"] = eventToken.get("id")
        if themeNode != None:
            extra["t"] = themeNode.get("id")
            themeToken = sentenceGraph.entityHeadTokenByEntity[themeNode]
            extra["tt"] = themeToken.get("id")
        if causeNode != None:
            extra["c"] = causeNode.get("id")
            causeToken = sentenceGraph.entityHeadTokenByEntity[causeNode]
            extra["ct"] = causeToken.get("id")
        sentenceOrigId = sentenceGraph.sentenceElement.get("origId")
        if sentenceOrigId != None:
            extra["SOID"] = sentenceOrigId
        # make example
        # assert (category == 1 or category == -1)
        return (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra)

    def buildArgumentFeatures(self, sentenceGraph, paths, features, eventNode, argNode, tag):
        eventToken = sentenceGraph.entityHeadTokenByEntity[eventNode]
        argToken = sentenceGraph.entityHeadTokenByEntity[argNode]
        if eventToken != argToken and paths.has_key(eventToken) and paths[eventToken].has_key(argToken):
            path = paths[eventToken][argToken]
            edges = self.multiEdgeFeatureBuilder.getEdges(sentenceGraph.dependencyGraph, path)
        else:
            path = [eventToken, argToken]
            edges = None

        self.multiEdgeFeatureBuilder.tag = tag
        self.multiEdgeFeatureBuilder.setFeatureVector(features, eventNode, argNode)
        if not "disable_entity_features" in self.styles:
            self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph)
        self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path)
        if not "disable_terminus_features" in self.styles:
            self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph)  # remove for fast
        if not "disable_single_element_features" in self.styles:
            self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, edges, sentenceGraph)
        if not "disable_ngram_features" in self.styles:
            self.multiEdgeFeatureBuilder.buildPathGrams(2, path, edges, sentenceGraph)  # remove for fast
            self.multiEdgeFeatureBuilder.buildPathGrams(3, path, edges, sentenceGraph)  # remove for fast
            self.multiEdgeFeatureBuilder.buildPathGrams(4, path, edges, sentenceGraph)  # remove for fast
        if not "disable_path_edge_features" in self.styles:
            self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, edges, sentenceGraph)
        self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph)
        self.multiEdgeFeatureBuilder.setFeatureVector(None)
        self.multiEdgeFeatureBuilder.tag = ""
예제 #2
0
class MultiEdgeExampleBuilder(ExampleBuilder):
    """
    This example builder makes edge examples, i.e. examples describing
    the event arguments.
    """
    def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
        """
        Set up the id sets, the style parameters and the feature builders
        that the styles enable.

        style -- style parameters, parsed by self.getParameters against the
                 list of known names; when None, "typed", "directed" and
                 "headsOnly" are switched on
        length -- must be None (asserted below); stored as self.pathLengths
        types -- stored as self.types; a new empty list when None
        featureSet -- feature id set; a new IdSet() when None
        classSet -- class id set; a new IdSet(1) when None. The class "neg"
                    must have id 1, or id -1 in a set with exactly two ids
                    (asserted below).
        """
        if featureSet is None:
            featureSet = IdSet()
        if classSet is None:
            classSet = IdSet(1)
        assert( classSet.getId("neg") == 1 or (len(classSet.Ids)== 2 and classSet.getId("neg") == -1) )

        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)

        self.styles = self.getParameters(style, [
            "typed", "directed", "headsOnly", "graph_kernel", "noAnnType", "noMasking", "maxFeatures",
            "genia_limits", "epi_limits", "id_limits", "rel_limits", "bb_limits", "bi_limits", "co_limits",
            "genia_task1", "ontology", "nodalida", "bacteria_renaming", "trigger_features", "rel_features",
            "ddi_features", "evex", "giuliano", "random", "themeOnly", "causeOnly", "no_path", "entities",
            "skip_extra_triggers", "no_task", "no_dependency",
            "disable_entity_features", "disable_terminus_features", "disable_single_element_features",
            "disable_ngram_features", "disable_path_edge_features", "no_linear", "subset", "binary", "pos_only",
            "entity_type"
        ])
        if style is None: # no parameters given
            # The defaults go into the parsed parameters; style itself is
            # None here and cannot be indexed.
            self.styles["typed"] = self.styles["directed"] = self.styles["headsOnly"] = True

        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        # NOTE Temporarily re-enabling predicted range
        #self.multiEdgeFeatureBuilder.definePredictedValueRange([], None)
        if self.styles["graph_kernel"]:
            from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
            self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
        if self.styles["noAnnType"]:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if self.styles["noMasking"]:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if self.styles["maxFeatures"]:
            self.multiEdgeFeatureBuilder.maximum = True
        if self.styles["genia_task1"]:
            self.multiEdgeFeatureBuilder.filterAnnTypes.add("Entity")
        self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        if self.styles["ontology"]:
            self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
        if self.styles["nodalida"]:
            self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(self.featureSet)
        if self.styles["bacteria_renaming"]:
            self.bacteriaRenamingFeatureBuilder = BacteriaRenamingFeatureBuilder(self.featureSet)
        if self.styles["trigger_features"]:
            self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
            self.triggerFeatureBuilder.useNonNameEntities = True
            if self.styles["genia_task1"]:
                self.triggerFeatureBuilder.filterAnnTypes.add("Entity")
            #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName)
        if self.styles["rel_features"]:
            self.relFeatureBuilder = RELFeatureBuilder(featureSet)
        if self.styles["ddi_features"]:
            self.drugFeatureBuilder = DrugFeatureBuilder(featureSet)
        if self.styles["evex"]:
            self.evexFeatureBuilder = EVEXFeatureBuilder(featureSet)
        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
        self.pathLengths = length
        assert(self.pathLengths is None)
        # A new list per instance instead of a list default shared by all
        self.types = [] if types is None else types
        if self.styles["random"]:
            from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
            self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)
    
    def definePredictedValueRange(self, sentences, elementName):
        self.multiEdgeFeatureBuilder.definePredictedValueRange(sentences, elementName)                        
    
    def getPredictedValueRange(self):
        return self.multiEdgeFeatureBuilder.predictedRange
    
    def filterEdgesByType(self, edges, typesToInclude):
        """
        Keep the edges whose "type" attribute is listed in typesToInclude.

        An empty typesToInclude disables the filter: edges is then returned
        as it was passed in. Otherwise a new list is returned, with the
        matching edges in their original order.
        """
        if len(typesToInclude) == 0:
            return edges
        return [candidate for candidate in edges if candidate.get("type") in typesToInclude]
    
    def getCategoryNameFromTokens(self, sentenceGraph, t1, t2, directed=True):
        """
        Example class. Multiple overlapping edges create a merged type.

        The types of the interaction graph edges from t1 to t2 are collected;
        when directed is false the edges from t2 to t1 are included as well.
        The distinct type names are sorted and joined with "---". Returns
        "neg" when this gives an empty name.
        """
        interactionGraph = sentenceGraph.interactionGraph
        intEdges = interactionGraph.getEdges(t1, t2)
        if not directed:
            intEdges = intEdges + interactionGraph.getEdges(t2, t1)
        typeNames = sorted(set(intEdge[2].get("type") for intEdge in intEdges))
        # An empty type name contributes nothing, not even a separator
        categoryName = "---".join(name for name in typeNames if name != "")
        return categoryName or "neg"
        
    def getCategoryName(self, sentenceGraph, e1, e2, directed=True, duplicateEntities=None):
        """
        Example class. Multiple overlapping edges create a merged type.
        """
#        interactions = []
#        e1s = [e1]
#        if duplicateEntities != None and e1 in duplicateEntities:
#            e1s += duplicateEntities[e1]
#        e2s = [e2]
#        if duplicateEntities != None and e2 in duplicateEntities:
#            e2s += duplicateEntities[e2]
#        for entity1 in e1s:
#            for entity2 in e2s:
#                interactions = interactions + sentenceGraph.getInteractions(entity1, entity2)
#                if not directed:
#                    interactions = interactions + sentenceGraph.getInteractions(entity2, entity1)
        interactions = sentenceGraph.getInteractions(e1, e2, True)
        #print interactions
        
        types = set()
        for interaction in interactions:
            types.add(interaction[2].get("type"))
        types = list(types)
        types.sort()
        categoryName = ""
        for name in types:
            if self.styles["causeOnly"] and name != "Cause":
                continue
            if self.styles["themeOnly"] and name != "Theme":
                continue
            if categoryName != "":
                categoryName += "---"
            categoryName += name
        if categoryName != "":
            return categoryName
        else:
            return "neg"
    
    def isPotentialRELInteraction(self, e1, e2):
        """Return True only for a pair where e1 has type "Protein" and e2 has type "Entity"."""
        if e1.get("type") != "Protein":
            return False
        return e2.get("type") == "Entity"

    def isPotentialBBInteraction(self, e1, e2, sentenceGraph):
        """
        Tell whether the entity types of a pair are accepted by the BB limits.

        Accepted pairs are a "Bacterium" with one of the location types
        listed below, and a "Host" with a "HostPart". sentenceGraph is not
        used.
        """
        # Note: "Environment" type is misspelled as "Environmental" in the BB-task documentation
        locationTypes = ["Host", "HostPart", "Geographical", "Environment", "Food", "Medical", "Soil", "Water"]
        e1Type = e1.get("type")
        if e1Type == "Bacterium":
            return e2.get("type") in locationTypes
        if e1Type == "Host":
            return e2.get("type") == "HostPart"
        return False
    
    def getBISuperType(self, eType):
        """
        Map an entity type to its supertype.

        Returns "ProteinEntity" or "GeneEntity" for the types listed below
        and None for every other type.
        """
        superTypes = (
            ("ProteinEntity", ["GeneProduct", "Protein", "ProteinFamily", "PolymeraseComplex"]),
            ("GeneEntity", ["Gene", "GeneFamily", "GeneComplex", "Regulon", "Site", "Promoter"]),
        )
        for superType, memberTypes in superTypes:
            if eType in memberTypes:
                return superType
        return None
    
    def isPotentialBIInteraction(self, e1, e2, sentenceGraph, stats):
        e1Type = e1.get("type")
        e1SuperType = self.getBISuperType(e1Type)
        e2Type = e2.get("type")
        e2SuperType = self.getBISuperType(e2Type)
        
        tag = "(" + e1Type + "/" + e2Type + ")"
        if e1Type == "Regulon":
            if e2SuperType in ["GeneEntity", "ProteinEntity"]:
                return True
        if e1SuperType == "ProteinEntity":
            if e2Type in ["Site", "Promoter", "Gene", "GeneComplex"]:
                return True
        if e1Type in ["Action", "Transcription", "Expression"]:
            return True
        if e1Type == "Site":
            if e2SuperType == "GeneEntity":
                return True
        if e1Type == "Promoter":
            if e2SuperType in ["GeneEntity", "ProteinEntity"]:
                return True
        if e1SuperType in ["GeneEntity", "ProteinEntity"]:
            if e2SuperType in ["GeneEntity", "ProteinEntity"]:
                return True
        stats.filter("bi_limits") #+tag)
        return False

    def isPotentialEPIInteraction(self, e1, e2, sentenceGraph):
        """
        Tell whether the entity types of a pair are accepted by the EPI limits.

        A "Catalysis" accepts every target except an "Entity". A "Protein"
        or "Entity" accepts no target. Every other source type accepts only
        "Protein" and "Entity" targets. sentenceGraph is not used.
        """
        proteinOrEntity = ["Protein", "Entity"]
        e1Type = e1.get("type")
        if e1Type == "Catalysis":
            return e2.get("type") != "Entity"
        if e1Type in proteinOrEntity:
            return False
        return e2.get("type") in proteinOrEntity

    def isPotentialIDInteraction(self, e1, e2, sentenceGraph):
        """
        Tell whether the entity types of a pair are accepted by the ID limits.

        The type of e1 selects the rule and the type of e2 is checked against
        it; the rules are listed in the body. A source type that matches no
        rule fails an assertion. sentenceGraph is not used.
        """
        coreTypes = ["Protein", "Regulon-operon", "Two-component-system", "Chemical", "Organism"]
        e1Type = e1.get("type")
        e2Type = e2.get("type")
        e2IsCore = e2Type in coreTypes

        # Core entities take no arguments
        if e1Type in coreTypes:
            return False
        if e1Type in ["Gene_expression", "Transcription"]:
            return e2Type in ["Protein", "Regulon-operon"]
        if e1Type in ["Protein_catabolism", "Phosphorylation"]:
            return e2Type == "Protein"
        if e1Type == "Localization":
            return e2IsCore or e2Type == "Entity"
        if e1Type in ["Binding", "Process"]:
            return e2IsCore
        # The leading letter is left out so that the test does not depend on its case
        if "egulation" in e1Type:
            return e2Type != "Entity"
        if e1Type == "Entity":
            return e2IsCore
        assert False, (e1Type, e2Type)
    
    def isPotentialCOInteraction(self, e1, e2, sentenceGraph):
        """
        Tell whether a pair of entities is accepted by the CO limits.

        Only an "Exp" source is accepted. An "Exp"/"Protein" pair is always
        accepted. An "Exp"/"Exp" pair is accepted when the head token of e2
        comes no later in sentenceGraph.tokens than the head token of e1; an
        assertion fails if the head token of e1 is not among the tokens.
        """
        if e1.get("type") != "Exp":
            return False
        e2Type = e2.get("type")
        if e2Type == "Protein":
            return True
        if e2Type != "Exp":
            return False

        anaphoraTok = sentenceGraph.entityHeadTokenByEntity[e1]
        antecedentTok = sentenceGraph.entityHeadTokenByEntity[e2]
        antecedentSeen = False
        for token in sentenceGraph.tokens:
            if token == antecedentTok:
                antecedentSeen = True
            # a separate test, not an else branch, to take into account
            # cases where both entities have the same head token
            if token == anaphoraTok:
                return antecedentSeen
        assert False
    
    def isPotentialGeniaInteraction(self, e1, e2):
        """
        Tell whether the entity types of a pair are accepted by the GENIA limits.

        The type of e1 selects the rule and the type of e2 is checked against
        it; the rules are listed in the body. A source type that matches no
        rule fails an assertion.
        """
        proteinOnlySources = ["Entity", "Gene_expression", "Transcription", "Protein_catabolism", "Phosphorylation", "Binding"]
        e1Type = e1.get("type")
        e2Type = e2.get("type")

        # Proteins take no arguments
        if e1Type == "Protein":
            return False
        if e1Type in proteinOnlySources:
            return e2Type == "Protein"
        if e1Type == "Localization":
            return e2Type in ["Protein", "Entity"]
        # The leading letter is left out so that the test does not depend on its case
        if "egulation" in e1Type:
            return e2Type != "Entity"
        assert False, (e1Type, e2Type)

    def getGoldCategoryName(self, goldGraph, entityToGold, e1, e2, directed=True):
        if len(entityToGold[e1]) > 0 and len(entityToGold[e2]) > 0:
            return self.getCategoryName(goldGraph, entityToGold[e1][0], entityToGold[e2][0], directed=directed)
        else:
            return "neg"
                
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph = None):
        """
        Build examples for a single sentence. Returns a list of examples.
        See Core/ExampleUtils for example format.
        """
        #examples = []
        exampleIndex = 0
        
        if self.styles["trigger_features"]: 
            self.triggerFeatureBuilder.initSentence(sentenceGraph)
        if self.styles["evex"]: 
            self.evexFeatureBuilder.initSentence(sentenceGraph)
            
        # Filter entities, if needed
        #mergedIds = None
        #duplicateEntities = None
        #entities = sentenceGraph.entities
        #entities, mergedIds, duplicateEntities = self.mergeEntities(sentenceGraph, False) # "no_duplicates" in self.styles)
        sentenceGraph.mergeInteractionGraph(True)
        entities = sentenceGraph.mergedEntities
        entityToDuplicates = sentenceGraph.mergedEntityToDuplicates
        self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))
        
        # Connect to optional gold graph
        if goldGraph != None:
            entityToGold = EvaluateInteractionXML.mapEntities(entities, goldGraph.entities)
        
        paths = None
        if not self.styles["no_path"]:
            ##undirected = sentenceGraph.getUndirectedDependencyGraph()
            #undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
            ###undirected = sentenceGraph.dependencyGraph.to_undirected()
            ####undirected = NX10.MultiGraph(sentenceGraph.dependencyGraph) This didn't work
            undirected = sentenceGraph.dependencyGraph.toUndirected()
            #paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
            paths = undirected
        
        #for edge in sentenceGraph.dependencyGraph.edges:
        #    assert edge[2] != None
        #for edge in undirected.edges:
        #    assert edge[2] != None
        #if sentenceGraph.sentenceElement.get("id") == "GENIA.d70.s5":
        #    print [(x[0].get("id"), x[1].get("id"), x[2].get("id")) for x in sentenceGraph.dependencyGraph.edges]
        
        # Generate examples based on interactions between entities or interactions between tokens
        if self.styles["entities"]:
            loopRange = len(entities)
        else:
            loopRange = len(sentenceGraph.tokens)
        for i in range(loopRange-1):
            for j in range(i+1,loopRange):
                eI = None
                eJ = None
                if self.styles["entities"]:
                    eI = entities[i]
                    eJ = entities[j]
                    tI = sentenceGraph.entityHeadTokenByEntity[eI]
                    tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                    #if "no_ne_interactions" in self.styles and eI.get("isName") == "True" and eJ.get("isName") == "True":
                    #    continue
                    if eI.get("type") == "neg" or eJ.get("type") == "neg":
                        continue
                    if self.styles["skip_extra_triggers"]:
                        if eI.get("source") != None or eJ.get("source") != None:
                            continue
                else:
                    tI = sentenceGraph.tokens[i]
                    tJ = sentenceGraph.tokens[j]
                # only consider paths between entities (NOTE! entities, not only named entities)
                if self.styles["headsOnly"]:
                    if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0):
                        continue
                
                if self.styles["directed"]:
                    # define forward
                    if self.styles["entities"]:
                        categoryName = self.getCategoryName(sentenceGraph, eI, eJ, True)
                        if goldGraph != None:
                            categoryName = self.getGoldCategoryName(goldGraph, entityToGold, eI, eJ, True)
                    else:
                        categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, True)
                    # make forward
                    self.exampleStats.beginExample(categoryName)
                    makeExample = True
                    if self.styles["genia_limits"] and not self.isPotentialGeniaInteraction(eI, eJ):
                        makeExample = False
                        self.exampleStats.filter("genia_limits")
                    if self.styles["genia_task1"] and (eI.get("type") == "Entity" or eJ.get("type") == "Entity"):
                        makeExample = False
                        self.exampleStats.filter("genia_task1")
                    if self.styles["rel_limits"] and not self.isPotentialRELInteraction(eI, eJ):
                        makeExample = False
                        self.exampleStats.filter("rel_limits")
                    if self.styles["co_limits"] and not self.isPotentialCOInteraction(eI, eJ, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("co_limits")
                    if self.styles["bb_limits"] and not self.isPotentialBBInteraction(eI, eJ, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("bb_limits")
                        if categoryName != "neg":
                            self.exampleStats.filter("bb_limits(" + categoryName + ":" + eI.get("type") + "/" + eJ.get("type") + ")")
                    if self.styles["bi_limits"] and not self.isPotentialBIInteraction(eI, eJ, sentenceGraph, self.exampleStats):
                        makeExample = False
                        #self.exampleStats.filter("bi_limits")
                    if self.styles["epi_limits"] and not self.isPotentialEPIInteraction(eI, eJ, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("epi_limits")
                    if self.styles["id_limits"] and not self.isPotentialIDInteraction(eI, eJ, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("id_limits")
#                    if self.styles["selftrain_limits"] and (eI.get("selftrain") == "False" or eJ.get("selftrain") == "False"):
#                        makeExample = False
#                        self.exampleStats.filter("selftrain_limits")
#                    if self.styles["selftrain_group"] and (eI.get("selftraingroup") not in self.selfTrainGroups or eJ.get("selftraingroup") not in self.selfTrainGroups):
#                        makeExample = False
#                        self.exampleStats.filter("selftrain_group")
                    if self.styles["pos_only"] and categoryName == "neg":
                        makeExample = False
                        self.exampleStats.filter("pos_only")
                    if makeExample:
                        #examples.append( self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ) )
                        ExampleUtils.appendExamples([self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ)], outfile)
                        exampleIndex += 1
                    self.exampleStats.endExample()
                    
                    # define reverse
                    if self.styles["entities"]:
                        categoryName = self.getCategoryName(sentenceGraph, eJ, eI, True)
                        if goldGraph != None:
                            categoryName = self.getGoldCategoryName(goldGraph, entityToGold, eJ, eI, True)
                    else:
                        categoryName = self.getCategoryNameFromTokens(sentenceGraph, tJ, tI, True)
                    # make reverse
                    self.exampleStats.beginExample(categoryName)
                    makeExample = True
                    if self.styles["genia_limits"] and not self.isPotentialGeniaInteraction(eJ, eI):
                        makeExample = False
                        self.exampleStats.filter("genia_limits")
                    if self.styles["genia_task1"] and (eI.get("type") == "Entity" or eJ.get("type") == "Entity"):
                        makeExample = False
                        self.exampleStats.filter("genia_task1")
                    if self.styles["rel_limits"] and not self.isPotentialRELInteraction(eJ, eI):
                        makeExample = False
                        self.exampleStats.filter("rel_limits")
                    if self.styles["co_limits"] and not self.isPotentialCOInteraction(eJ, eI, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("co_limits")
                    if self.styles["bb_limits"] and not self.isPotentialBBInteraction(eJ, eI, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("bb_limits")
                        if categoryName != "neg":
                            self.exampleStats.filter("bb_limits(" + categoryName + ":" + eJ.get("type") + "/" + eI.get("type") + ")")
                    if self.styles["bi_limits"] and not self.isPotentialBIInteraction(eJ, eI, sentenceGraph, self.exampleStats):
                        makeExample = False
                        #self.exampleStats.filter("bi_limits")
                    if self.styles["epi_limits"] and not self.isPotentialEPIInteraction(eJ, eI, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("epi_limits")
                    if self.styles["id_limits"] and not self.isPotentialIDInteraction(eJ, eI, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("id_limits")
#                    if self.styles["selftrain_limits"] and (eI.get("selftrain") == "False" or eJ.get("selftrain") == "False"):
#                        makeExample = False
#                        self.exampleStats.filter("selftrain_limits")
#                    if self.styles["selftrain_group"] and (eI.get("selftraingroup") not in self.selfTrainGroups or eJ.get("selftraingroup") not in self.selfTrainGroups):
#                        makeExample = False
#                        self.exampleStats.filter("selftrain_group")
                    if self.styles["pos_only"] and categoryName == "neg":
                        makeExample = False
                        self.exampleStats.filter("pos_only")
                    if makeExample:
                        #examples.append( self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI) )
                        ExampleUtils.appendExamples([self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI)], outfile)
                        exampleIndex += 1
                    self.exampleStats.endExample()
                else:
                    if self.styles["entities"]:
                        categoryName = self.getCategoryName(sentenceGraph, eI, eJ, False)
                    else:
                        categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, False)
                    self.exampleStats.beginExample(categoryName)
                    forwardExample = self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ)
                    if not self.styles["graph_kernel"]:
                        reverseExample = self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI)
                        forwardExample[2].update(reverseExample[2])
                    #examples.append(forwardExample)
                    ExampleUtils.appendExamples([forwardExample], outfile)
                    exampleIndex += 1
                    self.exampleStats.endExample()
        
        #return examples
        return exampleIndex
    
    def buildExample(self, token1, token2, paths, sentenceGraph, categoryName, exampleIndex, entity1=None, entity2=None):
        """
        Build a single directed example for the potential edge between token1 and token2.

        The feature vector is assembled by the feature builders that are
        switched on or off through the flags looked up in self.styles.

        Parameters:
        token1, token2 -- tokens at the start and end of the candidate edge
        paths -- object whose getPaths(token1, token2) returns the paths
                 between the two tokens; only the first returned path is used
        sentenceGraph -- graph of the sentence; provides the tokens, the
                         dependency graph, the sentence element and the
                         merged-entity duplicate map
        categoryName -- class name of the example; "neg" is the negative class
                        in "binary" style
        exampleIndex -- index appended to the sentence id to form the example id
        entity1, entity2 -- optional entity elements behind token1 and token2

        Returns:
        A tuple (exampleId, category, features, extra) where exampleId is
        "<sentence id>.x<exampleIndex>", features is a dict keyed by ids from
        self.featureSet and extra is a dict of descriptive attributes.

        NOTE(review): several style branches ("co_limits", "entity_type",
        "genia_limits", "bi_limits", "bacteria_renaming") call
        entity1.get()/entity2.get() without a None check, although both
        parameters default to None -- confirm that callers always pass
        entities when these styles are enabled.
        """
        # dummy return for speed testing
        #return (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),1,{},{})
    
        # define features
        features = {}
        # NOTE(review): the condition is the constant True, so the matching
        # "else" branch at the end of this block can never run.
        if True: #token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2):
            #if token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2):
            #    path = paths[token1][token2]
            #else:
            #    path = [token1, token2]
            if not self.styles["no_path"]:
                # directedPath reduces performance by 0.01 pp
                #directedPath = sentenceGraph.dependencyGraph.getPaths(token1, token2)
                #if len(directedPath) == 0:
                #    directedPath = sentenceGraph.dependencyGraph.getPaths(token2, token1)
                #    for dp in directedPath:
                #        dp.reverse()
                #if len(directedPath) == 0:
                #    path = paths.getPaths(token1, token2)
                #else:
                #    path = directedPath
                
                # Use the first path returned for the token pair; when there is
                # none, fall back to a two-token pseudo-path.
                path = paths.getPaths(token1, token2)
                if len(path) > 0:
                    #if len(path) > 1:
                    #    print len(path)
                    path = path[0]
                    pathExists = True
                else:
                    path = [token1, token2]
                    pathExists = False
            else:
                # "no_path" style: ignore the paths object and use only the two tokens
                path = [token1, token2]
                pathExists = False
            # NOTE(review): pathExists is only read by commented-out code below.
            #print token1.get("id"), token2.get("id")
            # With assertions enabled self.pathLengths is always None here, so
            # the length filter in the next condition is never evaluated.
            assert(self.pathLengths == None)
            if self.pathLengths == None or len(path)-1 in self.pathLengths:
#                if not "no_ontology" in self.styles:
#                    self.ontologyFeatureBuilder.setFeatureVector(features)
#                    self.ontologyFeatureBuilder.buildOntologyFeaturesForPath(sentenceGraph, path)
#                    self.ontologyFeatureBuilder.setFeatureVector(None)
                # Each builder below is pointed at the shared feature dict,
                # asked to add its features, and then detached again.
                if self.styles["trigger_features"]: # F 85.52 -> 85.55
                    self.triggerFeatureBuilder.setFeatureVector(features)
                    self.triggerFeatureBuilder.tag = "trg1_"
                    self.triggerFeatureBuilder.buildFeatures(token1)
                    self.triggerFeatureBuilder.tag = "trg2_"
                    self.triggerFeatureBuilder.buildFeatures(token2)
                    self.triggerFeatureBuilder.setFeatureVector(None)
                # REL features
                if self.styles["rel_features"] and not self.styles["no_task"]:
                    self.relFeatureBuilder.setFeatureVector(features)
                    self.relFeatureBuilder.tag = "rel1_"
                    self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token1))
                    self.relFeatureBuilder.tag = "rel2_"
                    self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token2))
                    self.relFeatureBuilder.setFeatureVector(None)
                if self.styles["bacteria_renaming"] and not self.styles["no_task"]:
                    self.bacteriaRenamingFeatureBuilder.setFeatureVector(features)
                    self.bacteriaRenamingFeatureBuilder.buildPairFeatures(entity1, entity2)
                    #self.bacteriaRenamingFeatureBuilder.buildSubstringFeatures(entity1, entity2) # decreases perf. 74.76 -> 72.41
                    self.bacteriaRenamingFeatureBuilder.setFeatureVector(None)
                if self.styles["co_limits"] and not self.styles["no_task"]:
                    # Containment features: flag when the character offset of
                    # one entity contains the offset of the other.
                    e1Offset = Range.charOffsetToSingleTuple(entity1.get("charOffset"))
                    e2Offset = Range.charOffsetToSingleTuple(entity2.get("charOffset"))
                    if Range.contains(e1Offset, e2Offset):
                        features[self.featureSet.getId("e1_contains_e2")] = 1
                        if entity2.get("isName") == "True":
                            features[self.featureSet.getId("e1_contains_e2name")] = 1
                    if Range.contains(e2Offset, e1Offset):
                        features[self.featureSet.getId("e2_contains_e1")] = 1
                        if entity1.get("isName") == "True":
                            features[self.featureSet.getId("e2_contains_e1name")] = 1
                if self.styles["ddi_features"]:
                    self.drugFeatureBuilder.setFeatureVector(features)
                    self.drugFeatureBuilder.tag = "ddi_"
                    self.drugFeatureBuilder.buildPairFeatures(entity1, entity2)  
                    if self.styles["ddi_mtmx"]:
                        self.drugFeatureBuilder.buildMTMXFeatures(entity1, entity2)
                    self.drugFeatureBuilder.setFeatureVector(None)
                #if "graph_kernel" in self.styles or not "no_dependency" in self.styles:
                #    #print "Getting edges"
                #    if token1 != token2 and pathExists:
                #        #print "g1"
                #        edges = self.multiEdgeFeatureBuilder.getEdges(sentenceGraph.dependencyGraph, path)
                #        #print "g2"
                #    else:
                #        edges = None
                if self.styles["graph_kernel"]:
                    self.graphKernelFeatureBuilder.setFeatureVector(features, entity1, entity2)
                    self.graphKernelFeatureBuilder.buildGraphKernelFeatures(sentenceGraph, path)
                    self.graphKernelFeatureBuilder.setFeatureVector(None)
                if self.styles["entity_type"]:
                    # Entity types of both ends plus the number of tokens on the path
                    features[self.featureSet.getId("e1_"+entity1.get("type"))] = 1
                    features[self.featureSet.getId("e2_"+entity2.get("type"))] = 1
                    features[self.featureSet.getId("distance_"+str(len(path)))] = 1
                if not self.styles["no_dependency"]:
                    # Dependency-path features; individual groups can be
                    # switched off with the "disable_*" styles.
                    #print "Dep features"
                    self.multiEdgeFeatureBuilder.setFeatureVector(features, entity1, entity2)
                    #self.multiEdgeFeatureBuilder.buildStructureFeatures(sentenceGraph, paths) # remove for fast
                    if not self.styles["disable_entity_features"]:
                        self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph)
                    self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path)
                    if not self.styles["disable_terminus_features"]:
                        self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast
                    if not self.styles["disable_single_element_features"]:
                        self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, sentenceGraph)
                    if not self.styles["disable_ngram_features"]:
                        #print "NGrams"
                        self.multiEdgeFeatureBuilder.buildPathGrams(2, path, sentenceGraph) # remove for fast
                        self.multiEdgeFeatureBuilder.buildPathGrams(3, path, sentenceGraph) # remove for fast
                        self.multiEdgeFeatureBuilder.buildPathGrams(4, path, sentenceGraph) # remove for fast
                    #self.buildEdgeCombinations(path, edges, sentenceGraph, features) # remove for fast
                    #if edges != None:
                    #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[0], edges[0][1]+edges[1][0], "t1", sentenceGraph) # remove for fast
                    #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[-1], edges[len(path)-1][len(path)-2]+edges[len(path)-2][len(path)-1], "t2", sentenceGraph) # remove for fast
                    if not self.styles["disable_path_edge_features"]:
                        self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, sentenceGraph)
                    self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph)
                    self.multiEdgeFeatureBuilder.setFeatureVector(None)
                if self.styles["nodalida"]:
                    self.nodalidaFeatureBuilder.setFeatureVector(features, entity1, entity2)
                    shortestPaths = self.nodalidaFeatureBuilder.buildShortestPaths(sentenceGraph.dependencyGraph, path)
                    # NOTE(review): unconditional print to stdout for every
                    # example; looks like leftover debugging output.
                    print shortestPaths
                    if len(shortestPaths) > 0:
                        self.nodalidaFeatureBuilder.buildNGrams(shortestPaths, sentenceGraph)
                    self.nodalidaFeatureBuilder.setFeatureVector(None)
                if not self.styles["no_linear"]:
                    self.tokenFeatureBuilder.setFeatureVector(features)
                    # Look up the positions of both tokens in the sentence. The
                    # loop does not stop at the first hit, and the index
                    # variables stay unbound if a token is not found.
                    for i in range(len(sentenceGraph.tokens)):
                        if sentenceGraph.tokens[i] == token1:
                            token1Index = i
                        if sentenceGraph.tokens[i] == token2:
                            token2Index = i
                    # Order the indices so that token1Index <= token2Index.
                    # linearPreTag records the original direction but is only
                    # read by the commented-out calls below.
                    linearPreTag = "linfw_"
                    if token1Index > token2Index: 
                        token1Index, token2Index = token2Index, token1Index
                        linearPreTag = "linrv_"
                    self.tokenFeatureBuilder.buildLinearOrderFeatures(token1Index, sentenceGraph, 2, 2, preTag="linTok1")
                    self.tokenFeatureBuilder.buildLinearOrderFeatures(token2Index, sentenceGraph, 2, 2, preTag="linTok2")
                    # Before, middle, after
    #                self.tokenFeatureBuilder.buildTokenGrams(0, token1Index-1, sentenceGraph, "bf")
    #                self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, "bw")
    #                self.tokenFeatureBuilder.buildTokenGrams(token2Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, "af")
                    # before-middle, middle, middle-after
#                    self.tokenFeatureBuilder.buildTokenGrams(0, token2Index-1, sentenceGraph, linearPreTag+"bf", max=2)
#                    self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, linearPreTag+"bw", max=2)
#                    self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, linearPreTag+"af", max=2)
                    self.tokenFeatureBuilder.setFeatureVector(None)
                if self.styles["random"]:
                    self.randomFeatureBuilder.setFeatureVector(features)
                    self.randomFeatureBuilder.buildRandomFeatures(100, 0.01)
                    self.randomFeatureBuilder.setFeatureVector(None)
                if self.styles["genia_limits"] and not self.styles["no_task"]:
                    # GENIA features: the first entity must not be a name; the
                    # features record whether the second entity is a name
                    # (protein) or not (nested event), and whether the first
                    # entity's type contains "egulation".
                    e1Type = entity1.get("type")
                    # NOTE(review): e2Type is not used in this branch.
                    e2Type = entity2.get("type")
                    assert(entity1.get("isName") == "False")
                    if entity2.get("isName") == "True":
                        features[self.featureSet.getId("GENIA_target_protein")] = 1
                    else:
                        features[self.featureSet.getId("GENIA_nested_event")] = 1
                    if e1Type.find("egulation") != -1: # leave r out to avoid problems with capitalization
                        if entity2.get("isName") == "True":
                            features[self.featureSet.getId("GENIA_regulation_of_protein")] = 1
                        else:
                            features[self.featureSet.getId("GENIA_regulation_of_event")] = 1
                if self.styles["bi_limits"]:
                    # Make features based on entity types
                    e1Type = entity1.get("type")
                    e2Type = entity2.get("type")
                    e1SuperType = str(self.getBISuperType(e1Type))
                    e2SuperType = str(self.getBISuperType(e2Type))
                    features[self.featureSet.getId("BI_e1_"+e1Type)] = 1
                    features[self.featureSet.getId("BI_e2_"+e2Type)] = 1
                    features[self.featureSet.getId("BI_e1sup_"+e1SuperType)] = 1
                    features[self.featureSet.getId("BI_e2sup_"+e2SuperType)] = 1
                    features[self.featureSet.getId("BI_e1e2_"+e1Type+"_"+e2Type)] = 1
                    features[self.featureSet.getId("BI_e1e2sup_"+e1SuperType+"_"+e2SuperType)] = 1
                if self.styles["evex"]:
                    self.evexFeatureBuilder.setFeatureVector(features, entity1, entity2)
                    self.evexFeatureBuilder.buildEdgeFeatures(entity1, entity2, token1, token2, path, sentenceGraph)
                    self.evexFeatureBuilder.setFeatureVector(None)
                if self.styles["giuliano"]:
                    self.giulianoFeatureBuilder.setFeatureVector(features, entity1, entity2)
                    self.giulianoFeatureBuilder.buildEdgeFeatures(entity1, entity2, token1, token2, path, sentenceGraph)
                    self.giulianoFeatureBuilder.setFeatureVector(None)
            else:
                # Path length outside self.pathLengths: emit only marker features
                features[self.featureSet.getId("always_negative")] = 1
                if self.styles["subset"]:
                    features[self.featureSet.getId("out_of_scope")] = 1
        else:
            # Unreachable while the outer condition is the constant True
            features[self.featureSet.getId("always_negative")] = 1
            if self.styles["subset"]:
                features[self.featureSet.getId("out_of_scope")] = 1
            path = [token1, token2]
        # define extra attributes
        # "t1" is the path end token whose charOffset starts first and "t2" the
        # other one; "deprev" is True when the path runs from the later token
        # to the earlier one (or both start at the same offset).
        #if int(path[0].get("id").split("_")[-1]) < int(path[-1].get("id").split("_")[-1]):
        if int(path[0].get("charOffset").split("-")[0]) < int(path[-1].get("charOffset").split("-")[0]):
            #extra = {"xtype":"edge","type":"i","t1":path[0],"t2":path[-1]}
            extra = {"xtype":"edge","type":"i","t1":path[0].get("id"),"t2":path[-1].get("id")}
            extra["deprev"] = False
        else:
            #extra = {"xtype":"edge","type":"i","t1":path[-1],"t2":path[0]}
            extra = {"xtype":"edge","type":"i","t1":path[-1].get("id"),"t2":path[0].get("id")}
            extra["deprev"] = True
        # Record the entity ids and, when the sentence graph tracks merged
        # entities, the comma-separated ids of their duplicates.
        if entity1 != None:
            #extra["e1"] = entity1
            extra["e1"] = entity1.get("id")
            if sentenceGraph.mergedEntityToDuplicates != None:
                #extra["e1GoldIds"] = mergedEntityIds[entity1]
                extra["e1DuplicateIds"] = ",".join([x.get("id") for x in sentenceGraph.mergedEntityToDuplicates[entity1]])
        if entity2 != None:
            #extra["e2"] = entity2
            extra["e2"] = entity2.get("id")
            if sentenceGraph.mergedEntityToDuplicates != None:
                extra["e2DuplicateIds"] = ",".join([x.get("id") for x in sentenceGraph.mergedEntityToDuplicates[entity2]])
                #extra["e2GoldIds"] = mergedEntityIds[entity2]
        extra["categoryName"] = categoryName
        if self.styles["bacteria_renaming"]:
            # Store the entity texts with spaces and colons replaced by
            # placeholder sequences.
            if entity1.get("text") != None and entity1.get("text") != "":
                extra["e1t"] = entity1.get("text").replace(" ", "---").replace(":","-COL-")
            if entity2.get("text") != None and entity2.get("text") != "":
                extra["e2t"] = entity2.get("text").replace(" ", "---").replace(":","-COL-")
        sentenceOrigId = sentenceGraph.sentenceElement.get("origId")
        if sentenceOrigId != None:
            extra["SOID"] = sentenceOrigId       
        # make example
        if self.styles["binary"]:
            # Binary style: +1 for every non-"neg" category, -1 for "neg"
            if categoryName != "neg":
                category = 1
            else:
                category = -1
            # NOTE(review): categoryName is not read again after this
            # assignment; extra["categoryName"] keeps the original name.
            categoryName = "i"
        else:
            category = self.classSet.getId(categoryName)
        
        # NOTE: temporarily disable for replicating 110310 experiment
        #features[self.featureSet.getId("extra_constant")] = 1
        return (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)
예제 #3
0
class EventExampleBuilder(ExampleBuilder):
    """
    Example builder that generates one example per candidate event, i.e. per
    combination of an event (trigger) entity with a theme argument and, for
    regulation events, an optional cause argument.

    An example is labelled "pos" when the sentence graph contains an event
    with the same head token, the same type and exactly the same argument
    set, and "neg" otherwise.
    """

    def __init__(self,
                 style=None,
                 length=None,
                 types=None,
                 featureSet=None,
                 classSet=None):
        """
        Parameters:
        style -- collection of style names; defaults to
                 ["typed", "directed", "headsOnly"]
        length -- must be None (path length filtering is not supported)
        types -- stored as self.types; defaults to an empty list
        featureSet -- IdSet for feature names; a new one is created if None
        classSet -- IdSet for class names; a new IdSet(1) is created if None.
                    The id of "neg" must be 1.
        """
        # Build the defaults per call: a list literal in the signature would
        # be shared (and mutable) across every instance that relies on it.
        if style is None:
            style = ["typed", "directed", "headsOnly"]
        if types is None:
            types = []
        if featureSet is None:
            featureSet = IdSet()
        if classSet is None:
            classSet = IdSet(1)
        assert (classSet.getId("neg") == 1)

        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        self.styles = style

        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        if "noAnnType" in self.styles:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True
        self.pathLengths = length
        assert (self.pathLengths is None)
        self.types = types

    @classmethod
    def run(cls, input, output, parse, tokenization, style, idFileTag=None):
        """
        Convenience entry point: load the id sets for idFileTag, read the
        sentences from input and build examples for all of them into output.
        """
        classSet, featureSet = cls.getIdSets(idFileTag)
        e = EventExampleBuilder(style=style,
                                classSet=classSet,
                                featureSet=featureSet)
        sentences = cls.getSentences(input, parse, tokenization)
        e.buildExamplesForSentences(sentences, output, idFileTag)

    def definePredictedValueRange(self, sentences, elementName):
        """Delegate predicted value range definition to the edge feature builder."""
        self.multiEdgeFeatureBuilder.definePredictedValueRange(
            sentences, elementName)

    def getPredictedValueRange(self):
        """Return the predicted value range held by the edge feature builder."""
        return self.multiEdgeFeatureBuilder.predictedRange

    def preProcessExamples(self, allExamples):
        """
        Normalize the feature vectors in place when the "normalize" style is
        set, then return allExamples.
        """
        if "normalize" in self.styles:
            # Explicit write instead of the Python 2 only "print >>" form;
            # the emitted text (including the trailing newline) is the same.
            sys.stderr.write(" Normalizing feature vectors\n")
            ExampleUtils.normalizeFeatureVectors(allExamples)
        return allExamples

    def isPotentialGeniaInteraction(self, e1, e2):
        """
        Return False when e1 is a named entity (isName == "True") and e2's
        isName attribute is "True" or "False"; return True otherwise.
        """
        if e1.get("isName") != "True":
            return True
        return e2.get("isName") not in ("True", "False")

    def getArgumentEntities(self, sentenceGraph, entityNode):
        """
        Collect the argument entities of entityNode.

        Returns a tuple (themeNodes, causeNodes) holding the e2 entities of
        all interactions whose e1 is entityNode, split by interaction type.
        Interaction types other than "Theme" and "Cause" trip an assertion.
        """
        eId = entityNode.get("id")
        assert (eId is not None)
        themeNodes = []
        causeNodes = []
        for edge in sentenceGraph.interactions:
            if edge.get("e1") != eId:
                continue
            edgeType = edge.get("type")
            assert (edgeType in ["Theme", "Cause"]), edgeType
            if edgeType == "Theme":
                themeNodes.append(sentenceGraph.entitiesById[edge.get("e2")])
            elif edgeType == "Cause":
                causeNodes.append(sentenceGraph.entitiesById[edge.get("e2")])
        return themeNodes, causeNodes

    def makeGSEvents(self, sentenceGraph):
        """
        Index the events of the sentence graph in self.gsEvents as
        [head token] -> [event type] -> list of argument sets, where each
        argument set contains (interaction type, e2 id) tuples.
        """
        self.gsEvents = {}  # [token]->[event-type]->[1-n argument sets]
        for token in sentenceGraph.tokens:
            self.gsEvents[token] = {}

        for entity in sentenceGraph.entities:
            eType = entity.get("type")
            if eType == "neg":
                continue

            eId = entity.get("id")
            arguments = set()
            for interaction in sentenceGraph.interactions:
                if interaction.get("e1") == eId:
                    arguments.add(
                        (interaction.get("type"), interaction.get("e2")))
            eHeadToken = sentenceGraph.entityHeadTokenByEntity[entity]
            self.gsEvents[eHeadToken].setdefault(eType, []).append(arguments)

    def isGSEvent(self, sentenceGraph, entity, themeNodes, causeNodes):
        """
        Return True when self.gsEvents holds an event with the head token and
        type of entity whose argument set equals exactly the given theme and
        cause nodes (None entries are ignored).
        """
        eHeadToken = sentenceGraph.entityHeadTokenByEntity[entity]
        eType = entity.get("type")
        eventsByType = self.gsEvents[eHeadToken]
        if eType not in eventsByType:
            return False

        argumentSet = set()
        for themeNode in themeNodes:
            if themeNode is not None:
                argumentSet.add(("Theme", themeNode.get("id")))
        for causeNode in causeNodes:
            if causeNode is not None:
                argumentSet.add(("Cause", causeNode.get("id")))
        return argumentSet in eventsByType[eType]

    def buildExamples(self, sentenceGraph):
        """
        Build and return the list of event examples for one sentence.

        Events of the simple types get one example per candidate name node as
        theme. Regulation events get one example per (theme, cause)
        combination drawn from all nodes plus None. "Binding" events are
        skipped, and any other event type trips an assertion.
        """
        self.makeGSEvents(sentenceGraph)

        eventNodes = []
        nameNodes = []
        for entity in sentenceGraph.entities:
            if entity.get("type") == "neg":
                continue
            if entity.get("isName") == "True":
                nameNodes.append(entity)
            else:
                eventNodes.append(entity)
        allNodes = eventNodes + nameNodes

        examples = []
        exampleIndex = 0

        undirected = sentenceGraph.dependencyGraph.to_undirected()
        paths = NX.all_pairs_shortest_path(undirected, cutoff=999)

        for eventNode in eventNodes:
            eventType = eventNode.get("type")
            if eventType in [
                    "Gene_expression", "Transcription", "Protein_catabolism",
                    "Localization", "Phosphorylation"
            ]:
                for nameNode in nameNodes:
                    if self.isPotentialGeniaInteraction(eventNode, nameNode):
                        examples.append(
                            self.buildExample(exampleIndex, sentenceGraph,
                                              paths, eventNode, nameNode))
                        exampleIndex += 1
            elif eventType in [
                    "Regulation", "Positive_regulation", "Negative_regulation"
            ]:
                combinations = combine.combine(allNodes + [None],
                                               allNodes + [None])
                for combination in combinations:
                    themeNode, causeNode = combination[0], combination[1]
                    # Theme and cause must differ (this also rules out the
                    # combination where both are None).
                    if themeNode == causeNode:
                        continue
                    # An event cannot be its own argument
                    if themeNode == eventNode or causeNode == eventNode:
                        continue
                    if themeNode is not None and not self.isPotentialGeniaInteraction(
                            eventNode, themeNode):
                        continue
                    if causeNode is not None and not self.isPotentialGeniaInteraction(
                            eventNode, causeNode):
                        continue
                    examples.append(
                        self.buildExample(exampleIndex, sentenceGraph, paths,
                                          eventNode, themeNode, causeNode))
                    exampleIndex += 1
            elif eventType in ["Binding"]:
                continue
            else:
                assert False, eventType

        # The gold event index is only valid for the sentence just processed
        self.gsEvents = None
        return examples

    def buildExample(self,
                     exampleIndex,
                     sentenceGraph,
                     paths,
                     eventNode,
                     themeNode,
                     causeNode=None):
        """
        Build one example for eventNode with the given theme and cause.

        Returns a tuple (exampleId, category, features, extra) where
        exampleId is "<sentence id>.x<exampleIndex>", category is the class
        id of "pos" or "neg", features is a dict filled by
        buildArgumentFeatures and extra describes the event, its arguments
        and their head tokens.
        """
        features = {}

        if self.isGSEvent(sentenceGraph, eventNode, [themeNode], [causeNode]):
            category = self.classSet.getId("pos")
        else:
            category = self.classSet.getId("neg")

        if themeNode is not None:
            self.buildArgumentFeatures(sentenceGraph, paths, features,
                                       eventNode, themeNode, "theme_")
        if causeNode is not None:
            self.buildArgumentFeatures(sentenceGraph, paths, features,
                                       eventNode, causeNode, "cause_")

        # define extra attributes
        extra = {"xtype": "trigger-event", "type": eventNode.get("type")}
        extra["e"] = eventNode.get("id")
        eventToken = sentenceGraph.entityHeadTokenByEntity[eventNode]
        extra["et"] = eventToken.get("id")
        if themeNode is not None:
            extra["t"] = themeNode.get("id")
            themeToken = sentenceGraph.entityHeadTokenByEntity[themeNode]
            extra["tt"] = themeToken.get("id")
        if causeNode is not None:
            extra["c"] = causeNode.get("id")
            causeToken = sentenceGraph.entityHeadTokenByEntity[causeNode]
            extra["ct"] = causeToken.get("id")
        sentenceOrigId = sentenceGraph.sentenceElement.get("origId")
        if sentenceOrigId is not None:
            extra["SOID"] = sentenceOrigId
        # make example
        return (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex),
                category, features, extra)

    def buildArgumentFeatures(self, sentenceGraph, paths, features, eventNode,
                              argNode, tag):
        """
        Add the dependency-path features between the head tokens of eventNode
        and argNode to features, using tag as the feature builder's tag.

        When the two head tokens are the same, or paths has no entry for the
        pair, a two-token pseudo-path without edges is used instead.
        """
        eventToken = sentenceGraph.entityHeadTokenByEntity[eventNode]
        argToken = sentenceGraph.entityHeadTokenByEntity[argNode]
        if (eventToken != argToken and eventToken in paths
                and argToken in paths[eventToken]):
            path = paths[eventToken][argToken]
            edges = self.multiEdgeFeatureBuilder.getEdges(
                sentenceGraph.dependencyGraph, path)
        else:
            path = [eventToken, argToken]
            edges = None

        builder = self.multiEdgeFeatureBuilder
        builder.tag = tag
        builder.setFeatureVector(features, eventNode, argNode)
        if "disable_entity_features" not in self.styles:
            builder.buildEntityFeatures(sentenceGraph)
        builder.buildPathLengthFeatures(path)
        if "disable_terminus_features" not in self.styles:
            builder.buildTerminusTokenFeatures(path, sentenceGraph)  # remove for fast
        if "disable_single_element_features" not in self.styles:
            builder.buildSingleElementFeatures(path, edges, sentenceGraph)
        if "disable_ngram_features" not in self.styles:
            builder.buildPathGrams(2, path, edges, sentenceGraph)  # remove for fast
            builder.buildPathGrams(3, path, edges, sentenceGraph)  # remove for fast
            builder.buildPathGrams(4, path, edges, sentenceGraph)  # remove for fast
        if "disable_path_edge_features" not in self.styles:
            builder.buildPathEdgeFeatures(path, edges, sentenceGraph)
        builder.buildSentenceFeatures(sentenceGraph)
        # Detach the builder from this example and restore the empty tag
        builder.setFeatureVector(None)
        builder.tag = ""
예제 #4
0
class EdgeExampleBuilder(ExampleBuilder):
    """
    This example builder makes edge examples, i.e. examples describing
    the event arguments.
    """
    def __init__(self, style=None, types=None, featureSet=None, classSet=None):
        """
        Create the id sets, parse the style definition and instantiate the
        feature builders required by the enabled styles.

        style is parsed with self.getParameters against the list of known
        style names given to self._setDefaultParameters. types is stored
        unchanged in self.types; when it is omitted each builder gets its own
        empty list. featureSet and classSet default to a new IdSet() and a new
        IdSet(1) respectively.
        """
        if featureSet is None:
            featureSet = IdSet()
        if classSet is None:
            classSet = IdSet(1)
        if types is None:
            # A literal [] default would be one list object shared by every builder
            types = []
        
        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        # "neg" has to be id 1, or id -1 when the class set holds exactly two ids
        assert( classSet.getId("neg") == 1 or (len(classSet.Ids)== 2 and classSet.getId("neg") == -1) )
        
        # Basic style = trigger_features:typed:directed:no_linear:entities:auto_limits:noMasking:maxFeatures
        self._setDefaultParameters([
            "directed", "undirected", "headsOnly", "graph_kernel", "noAnnType", "mask_nodes", "limit_features",
            "no_auto_limits", "co_features", "genia_features", "bi_features", #"genia_limits", "epi_limits", "id_limits", "rel_limits", "bb_limits", "bi_limits", "co_limits",
            "genia_task1", "ontology", "nodalida", "bacteria_renaming", "no_trigger_features", "rel_features",
            "drugbank_features", "ddi_mtmx", "evex", "giuliano", "random", "themeOnly", "causeOnly", "no_path", "token_nodes", 
            "skip_extra_triggers", "headsOnly", "graph_kernel", "no_task", "no_dependency", 
            "disable_entity_features", "disable_terminus_features", "disable_single_element_features", 
            "disable_ngram_features", "disable_path_edge_features", "linear_features", "subset", "binary", "pos_only",
            "entity_type", "filter_shortest_path", "maskTypeAsProtein", "keep_neg", "metamap", 
            "sdb_merge", "sdb_features", "ontobiotope_features", "no_self_loops", "full_entities",
            "no_features", "wordnet", "wordvector", "se10t8_undirected", "filter_types", "doc_extra",
            "entity_extra"])
        self.styles = self.getParameters(style)
        
        # The multi-edge builder is always created; the styles below only configure it
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet, self.styles)
        # NOTE Temporarily re-enabling predicted range
        #self.multiEdgeFeatureBuilder.definePredictedValueRange([], None)
        if self.styles["graph_kernel"]:
            # Imported only on demand, as in the "random" style below
            from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
            self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
        if self.styles["noAnnType"]:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if self.styles["mask_nodes"]:
            self.multiEdgeFeatureBuilder.maskNamedEntities = True
        else:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if not self.styles["limit_features"]:
            self.multiEdgeFeatureBuilder.maximum = True
        if self.styles["genia_task1"]:
            self.multiEdgeFeatureBuilder.filterAnnTypes.add("Entity")
        self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        if self.styles["ontology"]:
            self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
        if self.styles["ontobiotope_features"]:
            self.ontobiotopeFeatureBuilder = OntoBiotopeFeatureBuilder(self.featureSet)
        if self.styles["nodalida"]:
            self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(self.featureSet)
        if self.styles["bacteria_renaming"]:
            self.bacteriaRenamingFeatureBuilder = BacteriaRenamingFeatureBuilder(self.featureSet)
        if not self.styles["no_trigger_features"]:
            # Trigger features are on by default and mirror the noAnnType/genia_task1 settings
            self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet, self.styles)
            self.triggerFeatureBuilder.useNonNameEntities = True
            if self.styles["noAnnType"]:
                self.triggerFeatureBuilder.noAnnType = True
            if self.styles["genia_task1"]:
                self.triggerFeatureBuilder.filterAnnTypes.add("Entity")
            #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName)
        if self.styles["rel_features"]:
            self.relFeatureBuilder = RELFeatureBuilder(featureSet)
        if self.styles["drugbank_features"]:
            self.drugFeatureBuilder = DrugFeatureBuilder(featureSet)
        if self.styles["evex"]:
            self.evexFeatureBuilder = EVEXFeatureBuilder(featureSet)
        if self.styles["wordnet"]:
            self.wordNetFeatureBuilder = WordNetFeatureBuilder(featureSet)
        if self.styles["wordvector"]:
            self.wordVectorFeatureBuilder = WordVectorFeatureBuilder(featureSet, self.styles)
        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
        self.types = types
        if self.styles["random"]:
            from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
            self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)
    
    def definePredictedValueRange(self, sentences, elementName):
        self.multiEdgeFeatureBuilder.definePredictedValueRange(sentences, elementName)                        
    
    def getPredictedValueRange(self):
        return self.multiEdgeFeatureBuilder.predictedRange
    
    def filterEdgesByType(self, edges, typesToInclude):
        """
        Return the edges whose "type" attribute is listed in typesToInclude.
        An empty typesToInclude turns filtering off: the edges object is then
        returned as it was given.
        """
        if len(typesToInclude) == 0:
            return edges
        return [candidate for candidate in edges if candidate.get("type") in typesToInclude]
    
    def getCategoryNameFromTokens(self, sentenceGraph, t1, t2, directed=True):
        """
        Class name for a token pair. The types of all interaction graph edges
        from t1 to t2 (and from t2 to t1 when not directed) are made unique,
        sorted and joined with "---". Without any edge the class is "neg".
        """
        graph = sentenceGraph.interactionGraph
        linkingEdges = graph.getEdges(t1, t2)
        if not directed:
            linkingEdges = linkingEdges + graph.getEdges(t2, t1)
        uniqueTypes = sorted(set([linkingEdge[2].get("type") for linkingEdge in linkingEdges]))
        # An empty type name adds nothing to the merged name, so it gets no separator either
        merged = "---".join([typeName for typeName in uniqueTypes if typeName != ""])
        if merged == "":
            return "neg"
        return merged
        
    def getCategoryName(self, sentenceGraph, e1, e2, directed=True):
        """
        Example class. Multiple overlapping edges create a merged type.
        """
        interactions = sentenceGraph.getInteractions(e1, e2, True)
        if not directed and not self.styles["se10t8_undirected"]:
            interactions = interactions + sentenceGraph.getInteractions(e2, e1, True)
        
        types = set()
        for interaction in interactions:
            types.add(interaction[2].get("type"))
        types = list(types)
        types.sort()
        categoryName = ""
        for name in types:
            if self.styles["causeOnly"] and name != "Cause":
                continue
            if self.styles["themeOnly"] and name != "Theme":
                continue
            if categoryName != "":
                categoryName += "---"
            if self.styles["sdb_merge"]:
                name = self.mergeForSeeDev(name, self.structureAnalyzer)
            categoryName += name
        if categoryName != "":
            return categoryName
        else:
            return "neg"

    def getBISuperType(self, eType):
        """
        Map an entity type to its super type, "ProteinEntity" or "GeneEntity".
        Types that belong to neither group give None.
        """
        superTypeMembers = (
            ("ProteinEntity", ("GeneProduct", "Protein", "ProteinFamily", "PolymeraseComplex")),
            ("GeneEntity", ("Gene", "GeneFamily", "GeneComplex", "Regulon", "Site", "Promoter")),
        )
        for superType, members in superTypeMembers:
            if eType in members:
                return superType
        return None
    
    def getSeeDevSuperTypes(self, eType):
        """
        Return the tuple of super type names of a SeeDev entity type.
        An Exception is raised for a type that is not listed.
        """
        superTypesByMembers = (
            (("Gene", "Gene_Family", "Box", "Promoter"), ("DNA", "Molecule")),
            (("RNA",), ("RNA", "DNA_Product", "Molecule")),
            (("Protein", "Protein_Family", "Protein_Complex", "Protein_Domain"), ("Amino_acid_sequence", "DNA_Product", "Molecule")),
            (("Hormone",), ("Molecule",)),
            (("Regulatory_Network", "Pathway"), ("Dynamic_process",)),
            (("Genotype", "Tissue", "Development_Phase"), ("Biological_context", "Context")),
            (("Environmental_Factor",), ("Context",)),
        )
        for members, superTypes in superTypesByMembers:
            if eType in members:
                return superTypes
        raise Exception("Unknown SeeDev type '" + str(eType) + "'")
    
    def mergeForSeeDev(self, categoryName, structureAnalyzer):
        """
        Translate a category name through structureAnalyzer.typeMap["forward"].
        Names without an entry in that map are returned unchanged.
        """
        forwardMap = structureAnalyzer.typeMap["forward"]
        return forwardMap[categoryName] if categoryName in forwardMap else categoryName
#         for tag in ("Regulates", "Exists", "Interacts", "Is", "Occurs"):
#             if categoryName.startswith(tag):
#                 categoryName = tag
#                 break
#         return categoryName
    
    def processCorpus(self, input, output, gold=None, append=False, allowNewIds=True, structureAnalyzer=None):
        """
        Run ExampleBuilder.processCorpus with the given arguments. With the
        "sdb_merge" style the structure analyzer is first asked to determine
        its non-overlapping types and is stored in self.structureAnalyzer.
        """
        mergeTypes = self.styles["sdb_merge"]
        if mergeTypes:
            structureAnalyzer.determineNonOverlappingTypes()
            self.structureAnalyzer = structureAnalyzer
        ExampleBuilder.processCorpus(self, input, output, gold, append, allowNewIds, structureAnalyzer)
    
    def isValidInteraction(self, e1, e2, structureAnalyzer,forceUndirected=False):
        """
        Return True if the structure analyzer reports at least one valid edge
        type between the types of e1 and e2.
        """
        e1Type = e1.get("type")
        e2Type = e2.get("type")
        validTypes = structureAnalyzer.getValidEdgeTypes(e1Type, e2Type, forceUndirected=forceUndirected)
        return len(validTypes) > 0

    def getGoldCategoryName(self, goldGraph, entityToGold, e1, e2, directed=True):
        if len(entityToGold[e1]) > 0 and len(entityToGold[e2]) > 0:
            return self.getCategoryName(goldGraph, entityToGold[e1][0], entityToGold[e2][0], directed=directed)
        else:
            return "neg"
    
    def filterEdge(self, edge, edgeTypes):
        """
        Return True if the type of an edge is one of edgeTypes.

        The edge is indexed as edge[2] to reach the object whose "type"
        attribute is compared. edgeTypes is a single type or a list/tuple of
        types (subclasses included); a single value is treated as a
        one-element list. edgeTypes must not be None.
        """
        assert edgeTypes is not None
        # isinstance also accepts list/tuple subclasses, which an exact type
        # comparison would wrap into a list that can never match
        if not isinstance(edgeTypes, (list, tuple)):
            edgeTypes = [edgeTypes]
        return edge[2].get("type") in edgeTypes
    
    def keepExample(self, e1, e2, categoryName, isDirected, structureAnalyzer):
        makeExample = True
        if (not self.styles["no_auto_limits"]) and not self.isValidInteraction(e1, e2, structureAnalyzer, forceUndirected=not isDirected):
            makeExample = False
            self.exampleStats.filter("auto_limits")
        if self.styles["genia_task1"] and (e1.get("type") == "Entity" or e2.get("type") == "Entity"):
            makeExample = False
            self.exampleStats.filter("genia_task1")
        if self.styles["pos_only"] and categoryName == "neg":
            makeExample = False
            self.exampleStats.filter("pos_only")
        if self.styles["no_self_loops"] and ((e1 == e2) or (e1.get("headOffset") == e2.get("headOffset"))):
            makeExample = False
            self.exampleStats.filter("no_self_loops")
        return makeExample
    
    def getExampleCategoryName(self, e1=None, e2=None, t1=None, t2=None, sentenceGraph=None, goldGraph=None, entityToGold=None, isDirected=True, structureAnalyzer=None):
        if self.styles["token_nodes"]:
            categoryName = self.getCategoryNameFromTokens(sentenceGraph, t1, t2, isDirected)
        else:
            categoryName = self.getCategoryName(sentenceGraph, e1, e2, isDirected)
            if goldGraph != None:
                categoryName = self.getGoldCategoryName(goldGraph, entityToGold, e1, e2, isDirected)
        if self.styles["filter_types"] != None and categoryName in self.styles["filter_types"]:
            categoryName = "neg"
        if self.styles["se10t8_undirected"]:
            assert e1.get("id").endswith(".e1")
            assert e2.get("id").endswith(".e2")
        #if self.styles["sdb_merge"]:
        #    categoryName = self.mergeForSeeDev(categoryName, structureAnalyzer)
        return categoryName
                
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph = None, structureAnalyzer=None):
        """
        Build the edge examples for a single sentence and append them to outfile.

        A candidate is generated for every unordered pair of merged entities
        (or of tokens with the "token_nodes" style) and handed to
        buildExamplesForPair. Each returned example is written with
        ExampleUtils.appendExamples under the id "<sentence id>.x<index>".
        When goldGraph is given, the entities are mapped to the gold entities
        and class names are taken from the gold graph.

        Returns the number of examples written for the sentence.
        See Core/ExampleUtils for example format.
        """
        #examples = []
        exampleIndex = 0
        # example directionality
        # NOTE(review): if none of the three branches below matches (e.g. one
        # style is False while the other is None) examplesAreDirected is never
        # assigned and the buildExamplesForPair call fails -- confirm that
        # getParameters cannot produce such a combination.
        if self.styles["directed"] == None and self.styles["undirected"] == None: # determine directedness from corpus
            examplesAreDirected = structureAnalyzer.hasDirectedTargets() if structureAnalyzer != None else True
        elif self.styles["directed"]:
            assert self.styles["undirected"] in [None, False]
            examplesAreDirected = True
        elif self.styles["undirected"]:
            assert self.styles["directed"] in [None, False]
            examplesAreDirected = False
        
        # Per-sentence initialization of the feature builders that need it
        if not self.styles["no_trigger_features"]: 
            self.triggerFeatureBuilder.initSentence(sentenceGraph)
        if self.styles["evex"]: 
            self.evexFeatureBuilder.initSentence(sentenceGraph)
#         if self.styles["sdb_merge"]:
#             self.determineNonOverlappingTypes(structureAnalyzer)
            
        # Filter entities, if needed
        sentenceGraph.mergeInteractionGraph(True)
        entities = sentenceGraph.mergedEntities
        entityToDuplicates = sentenceGraph.mergedEntityToDuplicates
        self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))
        
        # Connect to optional gold graph
        entityToGold = None
        if goldGraph != None:
            entityToGold = EvaluateInteractionXML.mapEntities(entities, goldGraph.entities)
        
        # The undirected dependency graph is the object that buildExample later
        # queries with getPaths; it stays None with the "no_path" style
        paths = None
        if not self.styles["no_path"]:
            undirected = sentenceGraph.dependencyGraph.toUndirected()
            paths = undirected
            if self.styles["filter_shortest_path"] != None: # For DDI use filter_shortest_path=conj_and
                paths.resetAnalyses() # just in case
                paths.FloydWarshall(self.filterEdge, {"edgeTypes":self.styles["filter_shortest_path"]})
        
        # Generate examples based on interactions between entities or interactions between tokens
        if self.styles["token_nodes"]:
            loopRange = len(sentenceGraph.tokens)
        else:
            loopRange = len(entities)
        # Every unordered pair (i < j) is visited once; the reverse direction
        # is handled inside buildExamplesForPair
        for i in range(loopRange-1):
            for j in range(i+1,loopRange):
                eI = None
                eJ = None
                if self.styles["token_nodes"]:
                    tI = sentenceGraph.tokens[i]
                    tJ = sentenceGraph.tokens[j]
                else:
                    eI = entities[i]
                    eJ = entities[j]
                    tI = sentenceGraph.entityHeadTokenByEntity[eI]
                    tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                    # Entities typed "neg" never take part in an example
                    if eI.get("type") == "neg" or eJ.get("type") == "neg":
                        continue
                    if self.styles["skip_extra_triggers"]:
                        # Skip pairs where either entity carries a "source" attribute
                        if eI.get("source") != None or eJ.get("source") != None:
                            continue
                # only consider paths between entities (NOTE! entities, not only named entities)
                if self.styles["headsOnly"]:
                    if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0):
                        continue
                
                examples = self.buildExamplesForPair(tI, tJ, paths, sentenceGraph, goldGraph, entityToGold, eI, eJ, structureAnalyzer, examplesAreDirected)
                for categoryName, features, extra in examples:
                    # make example
                    if self.styles["binary"]:
                        # Two-class mode: every non-"neg" class is the positive class 1
                        if categoryName != "neg":
                            category = 1
                        else:
                            category = -1
                        extra["categoryName"] = "i"
                    else:
                        category = self.classSet.getId(categoryName)
                    example = [sentenceGraph.getSentenceId()+".x"+str(exampleIndex), category, features, extra]
                    ExampleUtils.appendExamples([example], outfile)
                    exampleIndex += 1

        return exampleIndex
    
    def buildExamplesForPair(self, token1, token2, paths, sentenceGraph, goldGraph, entityToGold, entity1=None, entity2=None, structureAnalyzer=None, isDirected=True):
        # define forward
        categoryName = self.getExampleCategoryName(entity1, entity2, token1, token2, sentenceGraph, goldGraph, entityToGold, isDirected, structureAnalyzer=structureAnalyzer)
        # make forward
        forwardExample = None
        self.exampleStats.beginExample(categoryName)
        if self.keepExample(entity1, entity2, categoryName, isDirected, structureAnalyzer):
            forwardExample = self.buildExample(token1, token2, paths, sentenceGraph, categoryName, entity1, entity2, structureAnalyzer, isDirected)
        
        if isDirected: # build a separate reverse example (if that is valid)
            self.exampleStats.endExample() # end forward example
            # define reverse
            categoryName = self.getExampleCategoryName(entity2, entity1, token2, token1, sentenceGraph, goldGraph, entityToGold, True, structureAnalyzer=structureAnalyzer)
            # make reverse
            self.exampleStats.beginExample(categoryName)
            reverseExample = None
            if self.keepExample(entity2, entity1, categoryName, True, structureAnalyzer):
                reverseExample = self.buildExample(token2, token1, paths, sentenceGraph, categoryName, entity2, entity1, structureAnalyzer, isDirected)
            self.exampleStats.endExample()
            return filter(None, [forwardExample, reverseExample])
        elif self.styles["se10t8_undirected"]: # undirected example with a directed type
            self.exampleStats.endExample()
            return [forwardExample]
        elif forwardExample != None: # merge features from the reverse example to the forward one
            reverseExample = self.buildExample(token2, token1, paths, sentenceGraph, categoryName, entity2, entity1, structureAnalyzer, isDirected)
            forwardExample[1].update(reverseExample[1])
            self.exampleStats.endExample() # end merged example
            return [forwardExample]
        else: # undirected example that was filtered
            self.exampleStats.endExample() # end merged example
            return []
    
    def buildExample(self, token1, token2, paths, sentenceGraph, categoryName, entity1=None, entity2=None, structureAnalyzer=None, isDirected=True):
        """
        Build a single directed example for the potential edge between token1 and token2
        """
        # define features
        if not self.styles["no_path"]:
            path = paths.getPaths(token1, token2)
            if len(path) > 0:
                path = path[0]
                #pathExists = True
            else:
                path = [token1, token2]
                #pathExists = False
        else:
            path = [token1, token2]
            #pathExists = False
        
        features = {}
        if not self.styles["no_features"]:
            features = self.buildFeatures(sentenceGraph, entity1, entity2, token1, token2, path)
        
        # define extra attributes
        if int(path[0].get("charOffset").split("-")[0]) < int(path[-1].get("charOffset").split("-")[0]):
            extra = {"xtype":"edge","type":"i","t1":path[0].get("id"),"t2":path[-1].get("id")}
            extra["deprev"] = False
        else:
            extra = {"xtype":"edge","type":"i","t1":path[-1].get("id"),"t2":path[0].get("id")}
            extra["deprev"] = True
        if entity1 != None:
            extra["e1"] = entity1.get("id")
            if sentenceGraph.mergedEntityToDuplicates != None:
                extra["e1DuplicateIds"] = ",".join([x.get("id") for x in sentenceGraph.mergedEntityToDuplicates[entity1]])
        if entity2 != None:
            extra["e2"] = entity2.get("id")
            if sentenceGraph.mergedEntityToDuplicates != None:
                extra["e2DuplicateIds"] = ",".join([x.get("id") for x in sentenceGraph.mergedEntityToDuplicates[entity2]])
        extra["categoryName"] = categoryName
        if self.styles["bacteria_renaming"]:
            if entity1.get("text") != None and entity1.get("text") != "":
                extra["e1t"] = entity1.get("text").replace(" ", "---").replace(":","-COL-")
            if entity2.get("text") != None and entity2.get("text") != "":
                extra["e2t"] = entity2.get("text").replace(" ", "---").replace(":","-COL-")
        if self.styles["doc_extra"]:
            if hasattr(sentenceGraph, "documentElement") and sentenceGraph.documentElement.get("origId") != None:
                extra["DOID"] = sentenceGraph.documentElement.get("origId")
        if self.styles["entity_extra"]:
            if entity1.get("origId") != None: extra["e1OID"] = entity1.get("origId")
            if entity2.get("origId") != None: extra["e2OID"] = entity2.get("origId")
        sentenceOrigId = sentenceGraph.sentenceElement.get("origId")
        if sentenceOrigId != None:
            extra["SOID"] = sentenceOrigId 
        extra["directed"] = str(isDirected)
        if self.styles["sdb_merge"]:
            extra["sdb_merge"] = "True"
            #print extra
        
        return (categoryName, features, extra)
        
    
    def buildFeatures(self, sentenceGraph, entity1, entity2, token1, token2, path):
        features = {} 
        if not self.styles["no_trigger_features"]: # F 85.52 -> 85.55
            self.triggerFeatureBuilder.setFeatureVector(features)
            self.triggerFeatureBuilder.tag = "trg1_"
            self.triggerFeatureBuilder.buildFeatures(token1)
            self.triggerFeatureBuilder.tag = "trg2_"
            self.triggerFeatureBuilder.buildFeatures(token2)
            self.triggerFeatureBuilder.setFeatureVector(None)
        # REL features
        if self.styles["rel_features"] and not self.styles["no_task"]:
            self.relFeatureBuilder.setFeatureVector(features)
            self.relFeatureBuilder.tag = "rel1_"
            self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token1))
            self.relFeatureBuilder.tag = "rel2_"
            self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token2))
            self.relFeatureBuilder.setFeatureVector(None)
        if self.styles["bacteria_renaming"] and not self.styles["no_task"]:
            self.bacteriaRenamingFeatureBuilder.setFeatureVector(features)
            self.bacteriaRenamingFeatureBuilder.buildPairFeatures(entity1, entity2)
            #self.bacteriaRenamingFeatureBuilder.buildSubstringFeatures(entity1, entity2) # decreases perf. 74.76 -> 72.41
            self.bacteriaRenamingFeatureBuilder.setFeatureVector(None)
        if self.styles["co_features"] and not self.styles["no_task"]:
            e1Offset = Range.charOffsetToSingleTuple(entity1.get("charOffset"))
            e2Offset = Range.charOffsetToSingleTuple(entity2.get("charOffset"))
            if Range.contains(e1Offset, e2Offset):
                features[self.featureSet.getId("e1_contains_e2")] = 1
                if entity2.get("given") == "True":
                    features[self.featureSet.getId("e1_contains_e2name")] = 1
            if Range.contains(e2Offset, e1Offset):
                features[self.featureSet.getId("e2_contains_e1")] = 1
                if entity1.get("given") == "True":
                    features[self.featureSet.getId("e2_contains_e1name")] = 1
        if self.styles["drugbank_features"]:
            self.drugFeatureBuilder.setFeatureVector(features)
            self.drugFeatureBuilder.tag = "ddi_"
            self.drugFeatureBuilder.buildPairFeatures(entity1, entity2)  
            if self.styles["ddi_mtmx"]:
                self.drugFeatureBuilder.buildMTMXFeatures(entity1, entity2)
            self.drugFeatureBuilder.setFeatureVector(None)
        if self.styles["graph_kernel"]:
            self.graphKernelFeatureBuilder.setFeatureVector(features, entity1, entity2)
            self.graphKernelFeatureBuilder.buildGraphKernelFeatures(sentenceGraph, path)
            self.graphKernelFeatureBuilder.setFeatureVector(None)
        if self.styles["entity_type"]:
            e1Type = self.multiEdgeFeatureBuilder.getEntityType(entity1)
            e2Type = self.multiEdgeFeatureBuilder.getEntityType(entity2)
            features[self.featureSet.getId("e1_"+e1Type)] = 1
            features[self.featureSet.getId("e2_"+e2Type)] = 1
            features[self.featureSet.getId("distance_"+str(len(path)))] = 1
        if not self.styles["no_dependency"]:
            #print "Dep features"
            self.multiEdgeFeatureBuilder.setFeatureVector(features, entity1, entity2)
            #self.multiEdgeFeatureBuilder.buildStructureFeatures(sentenceGraph, paths) # remove for fast
            if not self.styles["disable_entity_features"]:
                self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph)
            self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path)
            if not self.styles["disable_terminus_features"]:
                self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast
            if not self.styles["disable_single_element_features"]:
                self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, sentenceGraph)
            if not self.styles["disable_ngram_features"]:
                #print "NGrams"
                self.multiEdgeFeatureBuilder.buildPathGrams(2, path, sentenceGraph) # remove for fast
                self.multiEdgeFeatureBuilder.buildPathGrams(3, path, sentenceGraph) # remove for fast
                self.multiEdgeFeatureBuilder.buildPathGrams(4, path, sentenceGraph) # remove for fast
            #self.buildEdgeCombinations(path, edges, sentenceGraph, features) # remove for fast
            #if edges != None:
            #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[0], edges[0][1]+edges[1][0], "t1", sentenceGraph) # remove for fast
            #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[-1], edges[len(path)-1][len(path)-2]+edges[len(path)-2][len(path)-1], "t2", sentenceGraph) # remove for fast
            if not self.styles["disable_path_edge_features"]:
                self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, sentenceGraph)
            self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph)
            self.multiEdgeFeatureBuilder.setFeatureVector(None)
        if self.styles["nodalida"]:
            self.nodalidaFeatureBuilder.setFeatureVector(features, entity1, entity2)
            shortestPaths = self.nodalidaFeatureBuilder.buildShortestPaths(sentenceGraph.dependencyGraph, path)
            print shortestPaths
            if len(shortestPaths) > 0:
                self.nodalidaFeatureBuilder.buildNGrams(shortestPaths, sentenceGraph)
            self.nodalidaFeatureBuilder.setFeatureVector(None)
        if self.styles["linear_features"]:
            self.tokenFeatureBuilder.setFeatureVector(features)
            for i in range(len(sentenceGraph.tokens)):
                if sentenceGraph.tokens[i] == token1:
                    token1Index = i
                if sentenceGraph.tokens[i] == token2:
                    token2Index = i
            linearPreTag = "linfw_"
            if token1Index > token2Index: 
                token1Index, token2Index = token2Index, token1Index
                linearPreTag = "linrv_"
            self.tokenFeatureBuilder.buildLinearOrderFeatures(token1Index, sentenceGraph, 2, 2, preTag="linTok1")
            self.tokenFeatureBuilder.buildLinearOrderFeatures(token2Index, sentenceGraph, 2, 2, preTag="linTok2")
            # Before, middle, after
#                self.tokenFeatureBuilder.buildTokenGrams(0, token1Index-1, sentenceGraph, "bf")
#                self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, "bw")
#                self.tokenFeatureBuilder.buildTokenGrams(token2Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, "af")
            # before-middle, middle, middle-after
#                    self.tokenFeatureBuilder.buildTokenGrams(0, token2Index-1, sentenceGraph, linearPreTag+"bf", max=2)
#                    self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, linearPreTag+"bw", max=2)
#                    self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, linearPreTag+"af", max=2)
            self.tokenFeatureBuilder.setFeatureVector(None)
        if self.styles["random"]:
            self.randomFeatureBuilder.setFeatureVector(features)
            self.randomFeatureBuilder.buildRandomFeatures(100, 0.01)
            self.randomFeatureBuilder.setFeatureVector(None)
        if self.styles["genia_features"] and not self.styles["no_task"]:
            e1Type = entity1.get("type")
            e2Type = entity2.get("type")
            assert(entity1.get("given") in (None, "False"))
            if entity2.get("given") == "True":
                features[self.featureSet.getId("GENIA_target_protein")] = 1
            else:
                features[self.featureSet.getId("GENIA_nested_event")] = 1
            if e1Type.find("egulation") != -1: # leave r out to avoid problems with capitalization
                if entity2.get("given") == "True":
                    features[self.featureSet.getId("GENIA_regulation_of_protein")] = 1
                else:
                    features[self.featureSet.getId("GENIA_regulation_of_event")] = 1
        if self.styles["bi_features"]:
            # Make features based on entity types
            e1Type = entity1.get("type")
            e2Type = entity2.get("type")
            e1SuperType = str(self.getBISuperType(e1Type))
            e2SuperType = str(self.getBISuperType(e2Type))
            features[self.featureSet.getId("BI_e1_"+e1Type)] = 1
            features[self.featureSet.getId("BI_e2_"+e2Type)] = 1
            features[self.featureSet.getId("BI_e1sup_"+e1SuperType)] = 1
            features[self.featureSet.getId("BI_e2sup_"+e2SuperType)] = 1
            features[self.featureSet.getId("BI_e1e2_"+e1Type+"_"+e2Type)] = 1
            features[self.featureSet.getId("BI_e1e2sup_"+e1SuperType+"_"+e2SuperType)] = 1
        if self.styles["sdb_features"]:
            e1Type = entity1.get("type")
            e2Type = entity2.get("type")
            features[self.featureSet.getId("SDB_e1_"+e1Type)] = 1
            features[self.featureSet.getId("SDB_e2_"+e2Type)] = 1
            features[self.featureSet.getId("SDB_e1e2_"+e1Type+"_"+e2Type)] = 1
            if e1Type == e2Type:
                features[self.featureSet.getId("SDB_e1e2_equal")] = 1
                features[self.featureSet.getId("SDB_e1e2_equal_" + e1Type)] = 1
            e1SuperTypes = str(self.getSeeDevSuperTypes(e1Type))
            e2SuperTypes = str(self.getSeeDevSuperTypes(e2Type))
            for e1SuperType in e1SuperTypes:
                for e2SuperType in e2SuperTypes:
                    features[self.featureSet.getId("SDB_e1sup_"+e1SuperType)] = 1
                    features[self.featureSet.getId("SDB_e2sup_"+e2SuperType)] = 1
                    features[self.featureSet.getId("SDB_e1e2sup_"+e1SuperType+"_"+e2SuperType)] = 1
                    if e1SuperType == e2SuperType:
                        features[self.featureSet.getId("SDB_e1e2sup_equal")] = 1
                        features[self.featureSet.getId("SDB_e1e2sup_equal_" + e1SuperType)] = 1
        if self.styles["ontobiotope_features"]:
            self.ontobiotopeFeatureBuilder.setFeatureVector(features)
            self.ontobiotopeFeatureBuilder.buildOBOFeaturesForEntityPair(entity1, entity2)
            self.ontobiotopeFeatureBuilder.setFeatureVector(None)
        if self.styles["full_entities"]:
            e1Text = entity1.get("text").lower()
            e2Text = entity2.get("text").lower()
            features[self.featureSet.getId("FULL_e1_"+e1Text)] = 1
            features[self.featureSet.getId("FULL_e2_"+e2Text)] = 1
            for ep1 in e1Text.split():
                for ep2 in e2Text.split():
                    features[self.featureSet.getId("FULL_e1_"+ep1)] = 1
                    features[self.featureSet.getId("FULL_e2_"+ep2)] = 1
                    features[self.featureSet.getId("FULL_e1e2_"+ep1+"_"+ep2)] = 1
        if self.styles["evex"]:
            self.evexFeatureBuilder.setFeatureVector(features, entity1, entity2)
            self.evexFeatureBuilder.buildEdgeFeatures(entity1, entity2, token1, token2, path, sentenceGraph)
            self.evexFeatureBuilder.setFeatureVector(None)
        if self.styles["wordnet"]:
            self.wordNetFeatureBuilder.setFeatureVector(features, entity1, entity2)
            self.wordNetFeatureBuilder.buildFeaturesForEntityPair(token1, token2)
            self.wordNetFeatureBuilder.buildLinearFeatures(token1, sentenceGraph.tokens, tag="t1_")
            self.wordNetFeatureBuilder.buildLinearFeatures(token2, sentenceGraph.tokens, tag="t2_")
            self.wordNetFeatureBuilder.buildPathFeatures(path)
            self.wordNetFeatureBuilder.setFeatureVector(None)
        if self.styles["wordvector"]:
            self.wordVectorFeatureBuilder.setFeatureVector(features, entity1, entity2)
            self.wordVectorFeatureBuilder.buildFeatures(token1, "t1_")
            self.wordVectorFeatureBuilder.buildFeatures(token2, "t2_")
            self.wordVectorFeatureBuilder.buildLinearFeatures(token1, sentenceGraph.tokens, tag="t1_")
            self.wordVectorFeatureBuilder.buildLinearFeatures(token2, sentenceGraph.tokens, tag="t2_")
            self.wordVectorFeatureBuilder.buildPathFeatures(path)
            self.wordVectorFeatureBuilder.buildFBAFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token1), sentenceGraph.tokens.index(token2))
            self.wordVectorFeatureBuilder.setFeatureVector(None)
        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder.setFeatureVector(features, entity1, entity2)
            self.giulianoFeatureBuilder.buildEdgeFeatures(entity1, entity2, token1, token2, path, sentenceGraph)
            self.giulianoFeatureBuilder.setFeatureVector(None)
        
        return features
예제 #5
0
class Round2TriggerExampleBuilder(ExampleBuilder):
    def nxMultiDiGraphToUndirected(self, graph):
        """
        Build an undirected NX10.MultiGraph from the given graph.

        The result carries the same name, the same nodes and one edge for
        every item yielded by graph.edges_iter(). Only what edges_iter()
        yields is passed on; nothing else is copied explicitly.
        """
        multiGraph = NX10.MultiGraph(name=graph.name)
        multiGraph.add_nodes_from(graph)
        edgeIterator = graph.edges_iter()
        multiGraph.add_edges_from(edgeIterator)
        return multiGraph

    def getPredictionStrength(self, element):
        """
        Return the prediction strength recorded for an element's own type.

        The "predictions" attribute is read as a comma-separated list of
        "class:strength" entries. The strength of the first entry whose
        class equals the element's "type" attribute is returned as a float.

        Returns 0 when the attribute is missing or empty, or when no entry
        matches the element's type. An entry without a ":" separator still
        raises ValueError, so malformed data is not silently accepted.
        """
        eType = element.get("type")
        predictions = element.get("predictions")
        if not predictions:  # attribute missing (None) or empty string
            return 0
        for prediction in predictions.split(","):
            if not prediction:
                # Tolerate stray or trailing commas
                continue
            # Split on the last colon so that a class name containing ':'
            # is still separated correctly from its numeric strength.
            predClass, predStrength = prediction.rsplit(":", 1)
            if predClass == eType:
                return float(predStrength)
        return 0

    def getInteractionEdgeLengths(self, sentenceGraph, paths):
        """
        Return dependency and linear length of all interaction edges
        (measured between the head tokens of the two entities).

        paths is a two-level mapping paths[t1][t2] -> path sequence.

        Returns a dictionary mapping each interaction to the tuple
        (interaction, pathLength, linLength, t2Pos) where
          pathLength is len(paths[t1][t2]), or 999999 when both entities
                     share a head token or no path is stored,
          linLength  is the absolute difference of the two token positions,
          t2Pos      is the position of the e2 head token in
                     sentenceGraph.tokens (-1 if it was not found).
        """
        tokens = sentenceGraph.tokens
        interactionLengths = {}
        for interaction in sentenceGraph.interactions:
            # Resolve both ends of the interaction to their head tokens
            e1 = sentenceGraph.entitiesById[interaction.get("e1")]
            e2 = sentenceGraph.entitiesById[interaction.get("e2")]
            t1 = sentenceGraph.entityHeadTokenByEntity[e1]
            t2 = sentenceGraph.entityHeadTokenByEntity[e2]
            # Dependency path length
            if t1 != t2 and t1 in paths and t2 in paths[t1]:
                pathLength = len(paths[t1][t2])
            else:  # no dependencyPath
                pathLength = 999999  # more than any real path
            # Linear distance: scan the tokens until both positions are known
            t1Pos = -1
            t2Pos = -1
            for i, token in enumerate(tokens):
                if token == t1:
                    t1Pos = i
                    if t2Pos != -1:
                        break
                if token == t2:
                    t2Pos = i
                    if t1Pos != -1:
                        break
            linLength = abs(t1Pos - t2Pos)
            interactionLengths[interaction] = (interaction, pathLength,
                                               linLength, t2Pos)
        return interactionLengths

    def __init__(self,
                 style=None,
                 classSet=None,
                 featureSet=None,
                 gazetteerFileName=None,
                 skiplist=None):
        """
        Set up the id sets, the optional gazetteer, the sentence skiplist
        and the feature builders.

        style             -- accepted for interface compatibility.
                             NOTE(review): the value is currently ignored,
                             a hard-coded style list is always used (see
                             below). Confirm whether this is intentional.
        classSet          -- class IdSet; a new IdSet(1) is created when
                             None. The id of "neg" must be 1.
        featureSet        -- feature IdSet; a new IdSet() when None.
        gazetteerFileName -- file passed to Gazetteer.loadGztr, or None
                             for no gazetteer.
        skiplist          -- name of a text file with one entry per line;
                             the stripped lines are stored in self.skiplist.
        """
        if classSet is None:
            classSet = IdSet(1)
        assert (classSet.getId("neg") == 1)
        if featureSet is None:
            featureSet = IdSet()

        ExampleBuilder.__init__(self, classSet, featureSet)
        #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
        if gazetteerFileName is not None:
            self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
            print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
        else:
            print >> sys.stderr, "No gazetteer loaded"
            self.gazetteer = None

        self.skiplist = set()
        if skiplist is not None:
            f = open(skiplist, "rt")
            # try/finally guarantees the handle is released even if reading
            # the file fails part-way through.
            try:
                for line in f:
                    self.skiplist.add(line.strip())
            finally:
                f.close()

        # The style list is fixed here; the 'style' argument does not
        # influence it.
        self.styles = [
            "trigger_features", "typed", "directed", "no_linear", "entities",
            "genia_limits", "noMasking", "maxFeatures"
        ]
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        if "graph_kernel" in self.styles:
            from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
            self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(
                self.featureSet)
        if "noAnnType" in self.styles:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True

        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)

    @classmethod
    def run(cls,
            input,
            gold,
            output,
            parse,
            tokenization,
            style,
            idFileTag=None,
            append=False):
        """
        An interface for running the example builder without needing to
        create an instance by hand.

        Id sets are obtained with cls.getIdSets(idFileTag), a builder is
        constructed (the style argument is only forwarded when it is not
        None), the input sentences and, when gold is given, the gold
        sentences are loaded with cls.getSentences, and the examples are
        written with buildExamplesForSentences.
        """
        classSet, featureSet = cls.getIdSets(idFileTag)
        builderArguments = {"classSet": classSet, "featureSet": featureSet}
        if style != None:
            builderArguments["style"] = style
        builder = Round2TriggerExampleBuilder(**builderArguments)

        sentences = cls.getSentences(input, parse, tokenization)
        goldSentences = None
        if gold != None:
            goldSentences = cls.getSentences(gold, parse, tokenization)
        builder.buildExamplesForSentences(sentences,
                                          goldSentences,
                                          output,
                                          idFileTag,
                                          append=append)

    def buildExamplesForSentences(self,
                                  sentences,
                                  goldSentences,
                                  output,
                                  idFileTag=None,
                                  append=False):
        """
        Build examples for every sentence and write them to a file.

        sentences     -- sequence whose items hold the sentence graph at
                         index 0.
        goldSentences -- parallel sequence of gold items (index 0 is the
                         gold graph), or None when no gold data is used.
        output        -- name of the example file. It is opened in append
                         mode when 'append' is true, otherwise truncated.
        idFileTag     -- when not None, class and feature names are saved
                         to idFileTag + ".class_names" / ".feature_names".
        append        -- see 'output'; also forwarded to buildExamples.
        """
        counter = ProgressCounter(len(sentences), "Build examples")

        if append:
            outfile = open(output, "at")
        else:
            outfile = open(output, "wt")
        exampleCount = 0
        # try/finally makes sure the output file is closed even when
        # building or writing the examples of some sentence fails.
        try:
            for i, sentence in enumerate(sentences):
                goldSentence = [None]
                if goldSentences is not None:
                    goldSentence = goldSentences[i]
                counter.update(
                    1,
                    "Building examples (" + sentence[0].getSentenceId() + "): ")
                examples = self.buildExamples(sentence[0],
                                              goldSentence[0],
                                              append=append)
                exampleCount += len(examples)
                examples = self.preProcessExamples(examples)
                ExampleUtils.appendExamples(examples, outfile)
        finally:
            outfile.close()

        print >> sys.stderr, "Examples built:", exampleCount
        print >> sys.stderr, "Features:", len(self.featureSet.getNames())
        #IF LOCAL
        if self.exampleStats.getExampleCount() > 0:
            self.exampleStats.printStats()
        #ENDIF
        # Save Ids
        if idFileTag is not None:
            print >> sys.stderr, "Saving class names to", idFileTag + ".class_names"
            self.classSet.write(idFileTag + ".class_names")
            print >> sys.stderr, "Saving feature names to", idFileTag + ".feature_names"
            self.featureSet.write(idFileTag + ".feature_names")

    def preProcessExamples(self, allExamples):
        if "normalize" in self.styles:
            print >> sys.stderr, " Normalizing feature vectors"
            ExampleUtils.normalizeFeatureVectors(allExamples)
        return allExamples

    def getMergedEntityType(self, entities):
        """
        If a single token belongs to multiple entities of different types,
        a new, composite type is defined. This type is the alphabetically
        ordered types of these entities joined with '---'.
        """
        types = set()
        for entity in entities:
            types.add(entity.get("type"))
        types = list(types)
        types.sort()
        typeString = ""
        for type in types:
            if type == "Protein" and "all_tokens" in self.styles:
                continue
            if typeString != "":
                typeString += "---"
            typeString += type

        if typeString == "":
            return "neg"

        if "limit_merged_types" in self.styles:
            if typeString.find("---") != -1:
                if typeString == "Gene_expression---Positive_regulation":
                    return typeString
                else:
                    return typeString.split("---")[0]
            else:
                return typeString
        return typeString

    def getTokenFeatures(self, token, sentenceGraph):
        """
        Returns a list of features based on the attributes of a token.
        These can be used to define more complex features.
        """
        # These features are cached when this method is first called
        # for a token.
        if self.tokenFeatures.has_key(token):
            return self.tokenFeatures[token]
        tokTxt = sentenceGraph.getTokenText(token)
        features = {}
        features["_txt_" + tokTxt] = 1

        # F 69.35 -> 68.22
        #normalizedText = tokTxt.replace("-","").replace("/","").replace(",","").replace("\\","").replace(" ","").lower()
        #features["_norTxt_"+normalizedText]=1
        #features["_norStem_" + PorterStemmer.stem(normalizedText)]=1

        features["_POS_" + token.get("POS")] = 1
        if sentenceGraph.tokenIsName[token]:
            features["_isName"] = 1
            for entity in sentenceGraph.tokenIsEntityHead[token]:
                if entity.get("isName") == "True":
                    features["_annType_" + entity.get("type")] = 1
        # Filip's gazetteer based features (can be used separately from exclude_gazetteer)
        if "gazetteer_features" in self.styles:
            tokTxtLower = tokTxt.lower()
            if "stem_gazetteer" in self.styles:
                tokTxtLower = PorterStemmer.stem(tokTxtLower)
            if self.gazetteer and tokTxtLower in self.gazetteer:
                for label, weight in self.gazetteer[tokTxtLower].items():
                    features["_knownLabel_" +
                             label] = weight  # 1 performs slightly worse
        self.tokenFeatures[token] = features
        return features

    def buildLinearOrderFeatures(self, sentenceGraph, index, tag, features):
        """
        Linear features are built by marking token features with a tag
        that defines their relative position in the linear order.
        """
        tag = "linear_" + tag
        for tokenFeature, w in self.getTokenFeatures(
                sentenceGraph.tokens[index], sentenceGraph).iteritems():
            features[self.featureSet.getId(tag + tokenFeature)] = w

    def buildExamples(self, sentenceGraph, goldGraph, append=False):
        examples = self.buildExamplesInner(sentenceGraph, goldGraph)

        entityCounts = {}
        exampleCounts = {}
        for entity in sentenceGraph.entities:
            eType = entity.get("type")
            if eType == "Protein":
                continue
            if not entityCounts.has_key(eType):
                entityCounts[eType] = 0
                exampleCounts[eType] = 0
            entityCounts[eType] += 1

        for example in examples:
            eTypes = self.classSet.getName(example[1]).split("---")
            for eType in eTypes:
                if not exampleCounts.has_key(eType):
                    exampleCounts[eType] = 0
                exampleCounts[eType] += 1
        #for key in sorted(entityCounts.keys()):
        #    if entityCounts[key] != exampleCounts[key]:
        #        print >> sys.stderr, "Warning, sentence", sentenceGraph.getSentenceId(), "example", key, "diff", entityCounts[key] - exampleCounts[key]

        return examples

    def buildExamplesInner(self, sentenceGraph, goldGraph):
        """
        Build one example for each token of the sentence

        Sentences whose "origId" is listed in self.skiplist produce no
        examples. For every other sentence the method
          1. resets the per-sentence state of the feature builders,
          2. computes shortest paths on the undirected dependency graph,
          3. sorts the interaction edges and groups them by the head token
             of their e1 entity,
          4. collects the gold entities by head offset, and
          5. emits one example per token (name tokens are skipped unless
             the "names" or "all_tokens" style is active).

        The class of an example comes from the gold entities located at the
        token's charOffset; without such entities (or without a gold graph)
        the class id is 1.

        Returns a list of (exampleId, category, features, extra) tuples.
        """
        if sentenceGraph.sentenceElement.get("origId") in self.skiplist:
            print >> sys.stderr, "Skipping sentence", sentenceGraph.sentenceElement.get(
                "origId")
            return []

        self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
        self.triggerFeatureBuilder.initSentence(sentenceGraph)

        undirected = self.nxMultiDiGraphToUndirected(
            sentenceGraph.dependencyGraph)
        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
        # Get argument order
        # (compareInteractionPrecedence is defined outside this method)
        self.interactionLengths = self.getInteractionEdgeLengths(
            sentenceGraph, paths)
        self.interactionLengths = self.interactionLengths.values()
        self.interactionLengths.sort(compareInteractionPrecedence)
        # Map tokens to entities
        # NOTE(review): tokenByOffset is filled here but never read in this
        # method; the loop also checks that gold and predicted tokens align.
        tokenByOffset = {}
        for i in range(len(sentenceGraph.tokens)):
            token = sentenceGraph.tokens[i]
            if goldGraph != None:
                goldToken = goldGraph.tokens[i]
                assert token.get("id") == goldToken.get("id") and token.get(
                    "charOffset") == goldToken.get("charOffset")
            tokenByOffset[token.get("charOffset")] = token.get("id")
        # Map gold entities to their head offsets
        goldEntitiesByOffset = {}
        for token in sentenceGraph.tokens:
            goldEntitiesByOffset[token.get("charOffset")] = []
        # NOTE(review): entityToGold is filled below but not read again in
        # this method.
        entityToGold = {}
        for entity in sentenceGraph.entities:
            entityToGold[entity] = []
        if goldGraph != None:
            for entity in goldGraph.entities:
                offset = entity.get("headOffset")
                assert offset != None
                # Raises KeyError if the head offset matches no token offset
                goldEntitiesByOffset[offset].append(entity)
            # Map predicted entities to gold entities
            for entity in sentenceGraph.entities:
                eType = entity.get("type")
                eOffset = entity.get("headOffset")
                for goldEntity in goldEntitiesByOffset[eOffset]:
                    if goldEntity.get("type") == eType:
                        entityToGold[entity].append(goldEntity)
        # Map entities to interactions
        #interactionsByEntityId = {}
        #for entity in sentenceGraph.entities:
        #    interactionsByEntityId[entity.get("id")] = []
        # Map tokens to interactions
        # (non-"neg" interactions grouped by the head token of their e1 entity)
        interactionsByToken = {}
        for token in sentenceGraph.tokens:
            interactionsByToken[token] = []
        for interactionTuple in self.interactionLengths:
            interaction = interactionTuple[0]
            if interaction.get("type") == "neg":
                continue
            e1Id = interaction.get("e1")
            token = sentenceGraph.entityHeadTokenByEntity[
                sentenceGraph.entitiesById[e1Id]]
            interactionsByToken[token].append(interaction)

        examples = []
        exampleIndex = 0

        # Per-sentence cache used by getTokenFeatures
        self.tokenFeatures = {}

        #namedEntityNorStrings = set()
        namedEntityHeadTokens = []
        if not "names" in self.styles:
            namedEntityCount = 0
            for entity in sentenceGraph.entities:
                if entity.get(
                        "isName"
                ) == "True":  # known data which can be used for features
                    namedEntityCount += 1
                    #namedEntityNorStrings.add( entity.get("text").replace("-","").replace("/","").replace(",","").replace("\\","").replace(" ","").lower() )
            namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
            #if namedEntityCount == 0: # no names, no need for triggers
            #    return []

            if "pos_pairs" in self.styles:
                namedEntityHeadTokens = self.getNamedEntityHeadTokens(
                    sentenceGraph)

        #neFeatures = {} # F: 69.35 -> 69.14
        #for norString in namedEntityNorStrings:
        #    neFeatures[self.featureSet.getId("norNE_" + norString)] = 1

        # Sentence-level bag of words; name tokens are additionally counted
        # under an "ne_" prefixed key. Computed once and shared by all examples.
        bagOfWords = {}
        for token in sentenceGraph.tokens:
            text = "bow_" + token.get("text")
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
            if sentenceGraph.tokenIsName[token]:
                text = "ne_" + text
                if not bagOfWords.has_key(text):
                    bagOfWords[text] = 0
                bagOfWords[text] += 1
        bowFeatures = {}
        for k, v in bagOfWords.iteritems():
            bowFeatures[self.featureSet.getId(k)] = v

        # Pre-compute, for every token, its incoming and outgoing dependency
        # edges as (source, target, element) triples sorted with
        # compareDependencyEdgesById; buildChains reads these tables.
        self.inEdgesByToken = {}
        self.outEdgesByToken = {}
        self.edgeSetByToken = {}
        for token in sentenceGraph.tokens:
            inEdges = sentenceGraph.dependencyGraph.in_edges(token, data=True)
            fixedInEdges = []
            for edge in inEdges:
                fixedInEdges.append((edge[0], edge[1], edge[2]["element"]))
            inEdges = fixedInEdges
            inEdges.sort(compareDependencyEdgesById)
            self.inEdgesByToken[token] = inEdges
            outEdges = sentenceGraph.dependencyGraph.out_edges(token,
                                                               data=True)
            fixedOutEdges = []
            for edge in outEdges:
                fixedOutEdges.append((edge[0], edge[1], edge[2]["element"]))
            outEdges = fixedOutEdges
            outEdges.sort(compareDependencyEdgesById)
            self.outEdgesByToken[token] = outEdges
            self.edgeSetByToken[token] = set(inEdges + outEdges)

        for i in range(len(sentenceGraph.tokens)):
            token = sentenceGraph.tokens[i]
            # Recognize only non-named entities (i.e. interaction words)
            if sentenceGraph.tokenIsName[
                    token] and not "names" in self.styles and not "all_tokens" in self.styles:
                continue

            # CLASS
            #if len(sentenceGraph.tokenIsEntityHead[token]) > 0:
            #    category = self.classSet.getId(self.getMergedEntityType(sentenceGraph.tokenIsEntityHead[token]))
            #else:
            #    category = 1
            # The class comes from the gold entities at this token's offset;
            # 1 is the id asserted for "neg" in __init__.
            offset = token.get("charOffset")
            if len(goldEntitiesByOffset[offset]) > 0:
                category = self.classSet.getId(
                    self.getMergedEntityType(goldEntitiesByOffset[offset]))
            else:
                category = 1

            # Tokens missing from the gazetteer get a minimal example that
            # carries only the "exclude_gazetteer" feature.
            tokenText = token.get("text").lower()
            if "stem_gazetteer" in self.styles:
                tokenText = PorterStemmer.stem(tokenText)
            if ("exclude_gazetteer" in self.styles
                ) and self.gazetteer and tokenText not in self.gazetteer:
                features = {}
                features[self.featureSet.getId("exclude_gazetteer")] = 1
                extra = {
                    "xtype": "token",
                    "t": token.get("id"),
                    "excluded": "True"
                }
                examples.append(
                    (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex),
                     category, features, extra))
                exampleIndex += 1
                continue

            # FEATURES
            features = {}
            # Also exposed as self.features, which buildPredictionFeatures
            # passes on to the feature builders.
            self.features = features

            if not "names" in self.styles:
                features[self.featureSet.getId(namedEntityCountFeature)] = 1
            #for k,v in bagOfWords.iteritems():
            #    features[self.featureSet.getId(k)] = v
            # pre-calculate bow _features_
            features.update(bowFeatures)
            #features.update(neFeatures)

            #            for j in range(len(sentenceGraph.tokens)):
            #                text = "bow_" + sentenceGraph.tokens[j].get("text")
            #                if j < i:
            #                    features[self.featureSet.getId("bf_" + text)] = 1
            #                elif j > i:
            #                    features[self.featureSet.getId("af_" + text)] = 1

            # Main features
            text = token.get("text")
            features[self.featureSet.getId("txt_" + text)] = 1
            features[self.featureSet.getId("POS_" + token.get("POS"))] = 1
            stem = PorterStemmer.stem(text)
            features[self.featureSet.getId("stem_" + stem)] = 1
            features[self.featureSet.getId("nonstem_" + text[len(stem):])] = 1

            # Normalized versions of the string (if same as non-normalized, overlap without effect)
            normalizedText = text.replace("-", "").replace("/", "").replace(
                ",", "").replace("\\", "").replace(" ", "").lower()
            if normalizedText == "bound":  # should be for all irregular verbs
                normalizedText = "bind"
            features[self.featureSet.getId("txt_" + normalizedText)] = 1
            norStem = PorterStemmer.stem(normalizedText)
            features[self.featureSet.getId("stem_" + norStem)] = 1
            features[self.featureSet.getId("nonstem_" +
                                           normalizedText[len(norStem):])] = 1

            if "gazetteer_features_maintoken" in self.styles:
                tokTxtLower = text.lower()
                if "stem_gazetteer" in self.styles:
                    tokTxtLower = PorterStemmer.stem(tokTxtLower)
                if self.gazetteer and tokTxtLower in self.gazetteer:
                    for label, weight in self.gazetteer[tokTxtLower].items():
                        features[self.featureSet.getId(
                            "gaz_knownLabel_" +
                            label)] = weight  # 1 performs slightly worse

            # Linear order features
            #for index in [-3,-2,-1,1,2,3,4,5]: # 69.35 -> 68.97
            # NOTE(review): "i + index > 0" leaves out the token at position
            # 0 as a linear neighbour; possibly meant ">= 0". Changing it
            # would alter the generated features, so confirm before touching.
            for index in [-3, -2, -1, 1, 2, 3]:
                if i + index > 0 and i + index < len(sentenceGraph.tokens):
                    self.buildLinearOrderFeatures(sentenceGraph, i + index,
                                                  str(index), features)

            # Content
            # (the first token of the sentence never gets "upper_case_start")
            if i > 0 and text[0].isalpha() and text[0].isupper():
                features[self.featureSet.getId("upper_case_start")] = 1
            for j in range(len(text)):
                if j > 0 and text[j].isalpha() and text[j].isupper():
                    features[self.featureSet.getId("upper_case_middle")] = 1
                # numbers and special characters
                if text[j].isdigit():
                    features[self.featureSet.getId("has_digits")] = 1
                    if j > 0 and text[j - 1] == "-":
                        features[self.featureSet.getId(
                            "has_hyphenated_digit")] = 1
                elif text[j] == "-":
                    features[self.featureSet.getId("has_hyphen")] = 1
                elif text[j] == "/":
                    features[self.featureSet.getId("has_fslash")] = 1
                elif text[j] == "\\":
                    features[self.featureSet.getId("has_bslash")] = 1
                # duplets
                if j > 0:
                    features[self.featureSet.getId("dt_" +
                                                   text[j - 1:j +
                                                        1].lower())] = 1
                # triplets
                if j > 1:
                    features[self.featureSet.getId("tt_" +
                                                   text[j - 2:j +
                                                        1].lower())] = 1

            # Attached edges (Hanging in and out edges)
            t1InEdges = self.inEdgesByToken[token]
            for edge in t1InEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HIn_" + edgeType)] = 1
                features[self.featureSet.getId("t1HIn_" +
                                               edge[0].get("POS"))] = 1
                features[self.featureSet.getId("t1HIn_" + edgeType + "_" +
                                               edge[0].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[0])
                features[self.featureSet.getId("t1HIn_" + tokenText)] = 1
                features[self.featureSet.getId("t1HIn_" + edgeType + "_" +
                                               tokenText)] = 1
            t1OutEdges = self.outEdgesByToken[token]
            for edge in t1OutEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HOut_" + edgeType)] = 1
                features[self.featureSet.getId("t1HOut_" +
                                               edge[1].get("POS"))] = 1
                features[self.featureSet.getId("t1HOut_" + edgeType + "_" +
                                               edge[1].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[1])
                features[self.featureSet.getId("t1HOut_" + tokenText)] = 1
                features[self.featureSet.getId("t1HOut_" + edgeType + "_" +
                                               tokenText)] = 1

            # The example holds a reference to the features dictionary, so
            # everything added to it below still ends up in this example.
            extra = {"xtype": "token", "t": token.get("id")}
            examples.append(
                (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex),
                 category, features, extra))
            exampleIndex += 1

            # chains
            self.buildChains(token, sentenceGraph, features)

            if "pos_pairs" in self.styles:
                self.buildPOSPairs(token, namedEntityHeadTokens, features)

            self.buildPredictionFeatures(sentenceGraph, paths, token,
                                         interactionsByToken[token])
        return examples

    def buildChains(self,
                    token,
                    sentenceGraph,
                    features,
                    depthLeft=3,
                    chain="",
                    visited=None):
        if depthLeft == 0:
            return
        strDepthLeft = "dist_" + str(depthLeft)

        if visited == None:
            visited = set()

        inEdges = self.inEdgesByToken[token]
        outEdges = self.outEdgesByToken[token]
        edgeSet = visited.union(self.edgeSetByToken[token])
        for edge in inEdges:
            if not edge in visited:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("dep_" + strDepthLeft +
                                               edgeType)] = 1

                nextToken = edge[0]
                for tokenFeature, w in self.getTokenFeatures(
                        nextToken, sentenceGraph).iteritems():
                    features[self.featureSet.getId(strDepthLeft +
                                                   tokenFeature)] = w
#                for entity in sentenceGraph.tokenIsEntityHead[nextToken]:
#                    if entity.get("isName") == "True":
#                        features[self.featureSet.getId("name_dist_"+strDepthLeft)] = 1
#                        features[self.featureSet.getId("name_dist_"+strDepthLeft+entity.get("type"))] = 1
#                features[self.featureSet.getId("POS_dist_"+strDepthLeft+nextToken.get("POS"))] = 1
#                tokenText = sentenceGraph.getTokenText(nextToken)
#                features[self.featureSet.getId("text_dist_"+strDepthLeft+tokenText)] = 1

                if sentenceGraph.tokenIsName[nextToken]:
                    features[self.featureSet.getId("name_chain_dist_" +
                                                   strDepthLeft + chain +
                                                   "-frw_" + edgeType)] = 1
                features[self.featureSet.getId("chain_dist_" + strDepthLeft +
                                               chain + "-frw_" + edgeType)] = 1
                self.buildChains(nextToken, sentenceGraph, features,
                                 depthLeft - 1, chain + "-frw_" + edgeType,
                                 edgeSet)

        for edge in outEdges:
            if not edge in visited:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("dep_dist_" + strDepthLeft +
                                               edgeType)] = 1

                nextToken = edge[1]
                for tokenFeature, w in self.getTokenFeatures(
                        nextToken, sentenceGraph).iteritems():
                    features[self.featureSet.getId(strDepthLeft +
                                                   tokenFeature)] = w
#                for entity in sentenceGraph.tokenIsEntityHead[nextToken]:
#                    if entity.get("isName") == "True":
#                        features[self.featureSet.getId("name_dist_"+strDepthLeft)] = 1
#                        features[self.featureSet.getId("name_dist_"+strDepthLeft+entity.get("type"))] = 1
#                features[self.featureSet.getId("POS_dist_"+strDepthLeft+nextToken.get("POS"))] = 1
#                tokenText = sentenceGraph.getTokenText(nextToken)
#                features[self.featureSet.getId("text_dist_"+strDepthLeft+tokenText)] = 1

                if sentenceGraph.tokenIsName[nextToken]:
                    features[self.featureSet.getId("name_chain_dist_" +
                                                   strDepthLeft + chain +
                                                   "-rev_" + edgeType)] = 1
                features[self.featureSet.getId("chain_dist_" + strDepthLeft +
                                               chain + "-rev_" + edgeType)] = 1
                self.buildChains(nextToken, sentenceGraph, features,
                                 depthLeft - 1, chain + "-rev_" + edgeType,
                                 edgeSet)

    def getNamedEntityHeadTokens(self, sentenceGraph):
        """
        Return the head tokens of all entities whose "isName" attribute is
        "True", in the order of sentenceGraph.entities. A token appears once
        for every named entity it heads.
        """
        # Named entities are known data which can be used for features
        return [
            sentenceGraph.entityHeadTokenByEntity[namedEntity]
            for namedEntity in sentenceGraph.entities
            if namedEntity.get("isName") == "True"
        ]

    def buildPOSPairs(self, token, namedEntityHeadTokens, features):
        tokenPOS = token.get("POS")
        assert tokenPOS != None
        for headToken in namedEntityHeadTokens:
            headPOS = headToken.get("POS")
            features[self.featureSet.getId("POS_pair_NE_" + tokenPOS + "-" +
                                           headPOS)] = 1

    ######################################################
    # Unmerging-style features
    ######################################################

    def buildPredictionFeatures(
            self, sentenceGraph, paths, token,
            interactions):  #themeEntities, causeEntities=None):
        """
        Build the unmerging-style features for one candidate event.

        The features are written through self.setFeature, the trigger
        feature builder and self.buildArgumentFeatures.

        @param sentenceGraph: graph of the sentence; its entitiesByToken
            mapping is read here
        @param paths: token-to-token paths, passed on to
            buildArgumentFeatures
        @param token: head token of the candidate event
        @param interactions: the argument elements of the candidate. An
            element whose "type" is "Theme" is counted as a theme, every
            other element is counted as a cause.
        """
        # NOTE!!!! TODO
        # add also features for arguments present, but not in this combination

        self.buildInterArgumentBagOfWords(interactions, sentenceGraph)

        # "in" instead of dict.has_key(), which is deprecated and no longer
        # exists in Python 3
        if token in sentenceGraph.entitiesByToken:
            for eventEntity in sentenceGraph.entitiesByToken[token]:
                eventEntityType = eventEntity.get("type")
                self.setFeature("rootType_" + eventEntityType, 1)
                self.setFeature("predStrength" + eventEntityType,
                                self.getPredictionStrength(eventEntity))
                self.triggerFeatureBuilder.setFeatureVector(self.features)
                self.triggerFeatureBuilder.tag = "trg" + eventEntityType + "_"
                self.triggerFeatureBuilder.buildFeatures(token)
                self.triggerFeatureBuilder.tag = None

        argThemeCount = 0
        argCauseCount = 0
        # Current example's edge combination
        for i, arg in enumerate(interactions):
            if arg.get("type") == "Theme":
                argThemeCount += 1
                role = "argTheme"
            else:  # Cause
                argCauseCount += 1
                role = "argCause"
            # Each argument is described twice: once under the plain role tag
            # and once under a tag that also carries its position in the list
            self.buildArgumentFeatures(sentenceGraph, paths, self.features,
                                       token, arg, role)
            self.buildArgumentFeatures(sentenceGraph, paths, self.features,
                                       token, arg, role + str(i))

        self.setFeature("argCount", len(interactions))
        self.setFeature("argCount_" + str(len(interactions)), 1)

        self.setFeature("argThemeCount", argThemeCount)
        self.setFeature("argThemeCount_" + str(argThemeCount), 1)
        self.setFeature("argCauseCount", argCauseCount)
        self.setFeature("argCauseCount_" + str(argCauseCount), 1)

        # Leave the shared trigger feature builder detached from this example
        self.triggerFeatureBuilder.tag = ""
        self.triggerFeatureBuilder.setFeatureVector(None)

    def buildArgumentFeatures(self, sentenceGraph, paths, features, eventToken,
                              arg, tag):
        """
        Describe one argument of the candidate event.

        The argument's target entity is looked up through the "e2" attribute
        of arg. Edge features between eventToken and the target's head token
        are built first, then trigger features for that head token, and
        finally features for the kind and type of the target entity. Every
        feature name is prefixed with tag.
        """
        targetEntity = sentenceGraph.entitiesById[arg.get("e2")]
        targetToken = sentenceGraph.entityHeadTokenByEntity[targetEntity]

        self.buildEdgeFeatures(sentenceGraph, paths, features, eventToken,
                               targetToken, tag)

        triggerBuilder = self.triggerFeatureBuilder
        triggerBuilder.tag = tag + "trg_"
        triggerBuilder.buildFeatures(targetToken)

        # A target that is not a named entity is treated as a nested event
        if targetEntity.get("isName") != "True":
            self.setFeature(tag + "Event", 1)
            self.setFeature("nestingEvent", 1)
        else:
            self.setFeature(tag + "Protein", 1)
        self.setFeature(tag + "_" + targetEntity.get("type"), 1)

    def buildEdgeFeatures(self, sentenceGraph, paths, features, eventToken,
                          argToken, tag):
        """
        Build the path features between an event token and an argument token.

        The multi-edge feature builder is pointed at features and tagged with
        tag + "_" for the duration of the call, then detached again. The
        "<tag>_present" feature is written through self.setFeature.

        @param paths: nested mapping paths[token][token] -> path. When no
            entry exists for the pair, or both tokens are the same, the
            two-element path [eventToken, argToken] is used and no edges are
            looked up.
        """
        builder = self.multiEdgeFeatureBuilder
        builder.tag = tag + "_"
        builder.setFeatureVector(features, None, None, False)

        self.setFeature(tag + "_present", 1)

        # "in" instead of dict.has_key(), which is deprecated and no longer
        # exists in Python 3
        if (eventToken != argToken and eventToken in paths
                and argToken in paths[eventToken]):
            path = paths[eventToken][argToken]
            edges = builder.getEdges(sentenceGraph.dependencyGraph, path)
        else:
            path = [eventToken, argToken]
            edges = None

        # Each feature group can be switched off with a "disable_*" style
        styles = self.styles
        if "disable_entity_features" not in styles:
            builder.buildEntityFeatures(sentenceGraph)
        builder.buildPathLengthFeatures(path)
        #if not "disable_terminus_features" in self.styles:
        #    self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast
        if "disable_single_element_features" not in styles:
            builder.buildSingleElementFeatures(path, edges, sentenceGraph)
        if "disable_ngram_features" not in styles:
            for gramLength in (2, 3, 4):  # remove for fast
                builder.buildPathGrams(gramLength, path, edges, sentenceGraph)
        if "disable_path_edge_features" not in styles:
            builder.buildPathEdgeFeatures(path, edges, sentenceGraph)
        #self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph)
        builder.setFeatureVector(None, None, None, False)
        builder.tag = ""

    def buildInterArgumentBagOfWords(self, arguments, sentenceGraph):
        """
        Add bag-of-words features for the tokens lying between the arguments.

        Nothing is done for fewer than two arguments. Otherwise the span
        between the first and the last argument head token (in sentence
        order) is measured, and the texts of the tokens strictly inside that
        span which are neither entity heads nor names are added as features,
        in sorted order. A span holding exactly one such word gets an extra
        "argBoWonly_" feature.
        """
        if len(arguments) < 2:
            return

        tokens = sentenceGraph.tokens
        positionOf = dict((tok, pos) for pos, tok in enumerate(tokens))

        headPositions = set()
        for argument in arguments:
            entity = sentenceGraph.entitiesById[argument.get("e2")]
            headToken = sentenceGraph.entityHeadTokenByEntity[entity]
            headPositions.add(positionOf[headToken])
        first = min(headPositions)
        last = max(headPositions)
        span = last - first
        self.setFeature("argBoWRange", span)
        self.setFeature("argBoWRange_" + str(span), 1)

        words = set()
        for between in tokens[first + 1:last]:
            if len(sentenceGraph.tokenIsEntityHead[between]) != 0:
                continue
            if sentenceGraph.tokenIsName[between]:
                continue
            words.add(between.get("text"))

        separators = ("/", "-")
        ordered = sorted(words)
        for word in ordered:
            self.setFeature("argBoW_" + word, 1)
            if word in separators:
                self.setFeature("argBoW_slashOrHyphen", 1)
        if len(ordered) == 1:
            onlyWord = ordered[0]
            self.setFeature("argBoWonly_" + onlyWord, 1)
            if onlyWord in separators:
                self.setFeature("argBoWonly_slashOrHyphen", 1)
예제 #6
0
class AsymmetricEventExampleBuilder(ExampleBuilder):
    def __init__(self,
                 style=["typed", "directed"],
                 length=None,
                 types=[],
                 featureSet=None,
                 classSet=None):
        """
        Set up the feature builders selected by the style flags.

        @param style: either a sequence of style flags or one string of
            comma-separated flags. Two flags carry a value after an
            underscore: "negFrac_<float>" enables random downsampling of
            negative examples and "posPairGaz_<path>" loads a POS pair
            gazetteer from <path>.
        @param length: must be None (asserted below)
        @param types: stored as self.types
        @param featureSet: IdSet for feature names, created when None
        @param classSet: IdSet for class names, created when None; the class
            "neg" must have id 1
        """
        if featureSet is None:
            featureSet = IdSet()
        if classSet is None:
            classSet = IdSet(1)
        # The lookup is kept outside the assert so that it is still performed
        # when asserts are stripped (python -O).
        # NOTE(review): presumably getId also registers "neg" in a fresh
        # IdSet -- confirm against IdSet.
        negId = classSet.getId("neg")
        assert negId == 1

        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        # Only strings can be split. The default value is a list, which has
        # no find()/split(), so the string form is detected explicitly. A
        # single flag given as a plain string becomes a one-element list, so
        # that the loop below sees whole flags rather than characters.
        if hasattr(style, "split"):
            style = style.split(",")
        # Copy, so that the shared default list is never aliased
        self.styles = list(style)

        self.negFrac = None
        self.posPairGaz = POSPairGazetteer()
        for s in self.styles:
            if "negFrac" in s:
                self.negFrac = float(s.split("_")[-1])
                sys.stderr.write("Downsampling negatives to " +
                                 str(self.negFrac) + "\n")
                # Fixed seed: the downsampling is repeatable between runs
                self.negRand = random.Random(15)
            elif "posPairGaz" in s:
                self.posPairGaz = POSPairGazetteer(
                    loadFrom=s.split("_", 1)[-1])

        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
        if "graph_kernel" in self.styles:
            from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
            self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(
                self.featureSet)
        if "noAnnType" in self.styles:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True
        self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        if "ontology" in self.styles:
            self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(
                self.featureSet)
        if "nodalida" in self.styles:
            self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(
                self.featureSet)
        #IF LOCAL
        if "bioinfer_limits" in self.styles:
            self.bioinferOntologies = OntologyUtils.getBioInferTempOntology()
            #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName)
        #ENDIF
        self.pathLengths = length
        assert self.pathLengths is None
        self.types = types
        if "random" in self.styles:
            from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
            self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)

        #self.outFile = open("exampleTempFile.txt","wt")

    @classmethod
    def run(cls, input, output, parse, tokenization, style, idFileTag=None):
        """
        Build examples for all sentences of input and write them to output.

        The id sets are obtained from cls.getIdSets(idFileTag). When style is
        None the builder is created with its own default style. With the
        "printClassIds" style the class ids are printed to stderr afterwards.
        """
        classSet, featureSet = cls.getIdSets(idFileTag)
        builderArguments = {"classSet": classSet, "featureSet": featureSet}
        if style is not None:
            builderArguments["style"] = style
        builder = cls(**builderArguments)
        sentences = cls.getSentences(input, parse, tokenization)
        builder.buildExamplesForSentences(sentences, output, idFileTag)
        if "printClassIds" in builder.styles:
            print >> sys.stderr, builder.classSet.Ids

    def definePredictedValueRange(self, sentences, elementName):
        """
        Forward the call to the multi-edge feature builder.
        """
        edgeBuilder = self.multiEdgeFeatureBuilder
        edgeBuilder.definePredictedValueRange(sentences, elementName)

    def getPredictedValueRange(self):
        """
        Return the predictedRange attribute of the multi-edge feature builder.
        """
        edgeBuilder = self.multiEdgeFeatureBuilder
        return edgeBuilder.predictedRange

    def filterEdgesByType(self, edges, typesToInclude):
        """
        Return the edges whose "type" attribute is listed in typesToInclude.

        With an empty typesToInclude nothing is filtered and the edges
        argument itself is returned; otherwise a new list is returned.
        """
        if len(typesToInclude) < 1:
            return edges
        return [edge for edge in edges if edge.get("type") in typesToInclude]

    def getCategoryNameFromTokens(self, sentenceGraph, t1, t2, directed=True):
        """
        Return the class name for the interaction edges going from t1 to t2.

        When at least one of the edges has the type "Theme", the name is made
        of the types of the "e1" entities of those Theme edges. Otherwise it
        is made of the types of the edges themselves. Names are the sorted,
        unique types joined with "---". Without any edge the name is "neg".

        The directed argument is accepted but not used: only the edges from
        t1 to t2 are considered.
        """
        interactionElements = []
        if sentenceGraph.interactionGraph.has_edge(t1, t2):
            edgeData = sentenceGraph.interactionGraph.get_edge_data(
                t1, t2, default={})
            # The edge data is iterated over its values instead of being
            # indexed with 0..n-1, so edge keys do not have to be contiguous
            # integers. The order does not matter, as only sets are built.
            interactionElements = [
                attributes["element"] for attributes in edgeData.values()
            ]

        edgeTypes = set()
        themeE1Types = set()
        for intElement in interactionElements:
            intType = intElement.get("type")
            edgeTypes.add(intType)
            if intType == "Theme":
                e1Entity = sentenceGraph.entitiesById[intElement.get("e1")]
                themeE1Types.add(e1Entity.get("type"))

        if len(themeE1Types) != 0:
            return "---".join(sorted(themeE1Types))
        # An empty name (no edges at all) stands for the negative class
        return "---".join(sorted(edgeTypes)) or "neg"

    def getCategoryName(self, sentenceGraph, e1, e2, directed=True):
        """
        Return the class name for the interactions between two entities.

        The name consists of the sorted, unique "type" attributes of the
        interactions from e1 to e2, joined with "---". When directed is
        false, the interactions from e2 to e1 are included as well. Without
        any interaction the name is "neg".
        """
        # Work on a copy: the list returned by getInteractions may belong to
        # the sentence graph, and extending it in place would add the reverse
        # interactions to it.
        interactions = list(sentenceGraph.getInteractions(e1, e2))
        if not directed:
            interactions.extend(sentenceGraph.getInteractions(e2, e1))

        typeNames = set()
        for interaction in interactions:
            typeNames.add(interaction.attrib["type"])
        # An empty name (no interactions) stands for the negative class
        return "---".join(sorted(typeNames)) or "neg"

    def preProcessExamples(self, allExamples):
        """
        Normalize the feature vectors when the "normalize" style is set.

        The examples are returned in both cases. Duplicates are not removed
        here, as they should only be removed from the training set; that is
        done in the classifier.
        """
        if "normalize" not in self.styles:
            return allExamples
        print >> sys.stderr, " Normalizing feature vectors"
        ExampleUtils.normalizeFeatureVectors(allExamples)
        return allExamples

    def isPotentialGeniaInteraction(self, e1, e2):
        """
        Return False when e1 is a named entity (isName == "True"), else True.

        Only e1 is inspected; e2 is not used.
        """
        return e1.get("isName") != "True"

    #IF LOCAL
    def getBioInferParentType(self, eType):
        """
        Map a BioInfer type to "Physical", "Property" or "Process".

        The type is "Physical" or "Property" when it is, or has as parent,
        "Physical_entity" or "Property_entity" respectively, and "Process"
        when it has the parent "Relationship". The checks are made in that
        order. Any other type fails an assertion.
        """
        entityRoots = (("Physical_entity", "Physical"),
                       ("Property_entity", "Property"))
        for rootName, parentType in entityRoots:
            if eType == rootName or OntologyUtils.hasParent(
                    eType, rootName, self.bioinferOntologies):
                return parentType
        if OntologyUtils.hasParent(eType, "Relationship",
                                   self.bioinferOntologies):
            return "Process"
        assert False, eType

#        if self.bioinferOntologies["Entity"].has_key(eType):
#            if OntologyUtils.hasParent(eType, "Physical_entity", self.bioinferOntologies):
#                assert not OntologyUtils.hasParent(eType, "Property_entity", self.bioinferOntologies), eType
#                return "Physical"
#            else:
#                assert OntologyUtils.hasParent(eType, "Property_entity", self.bioinferOntologies), eType
#                return "Property"
#
#        else:
#            assert self.bioinferOntologies.has_key(eType), eType
#            #assert OntologyUtils.hasParent(eType, "Process_entity", self.bioinferOntologies["Relationship"]), eType
#            return "Process"

    def isPotentialBioInferInteraction(self, e1, e2, categoryName):
        """
        Tell whether an interaction from e1 to e2 is possible in BioInfer.

        The decision is made on the parent types of the two entities. For a
        combination that is not allowed, categoryName is asserted to be
        "neg" before False is returned.
        """
        sourceKind = self.getBioInferParentType(e1.get("type"))
        targetKind = self.getBioInferParentType(e2.get("type"))
        if sourceKind in ("Process", "Property"):
            return True
        # Physical -> Process is allowed as a hack
        if sourceKind == "Physical" and targetKind in ("Physical", "Process"):
            return True
        assert categoryName == "neg", (categoryName + " category for " +
                                       sourceKind + " and " + targetKind)
        return False

    #ENDIF

    def nxMultiDiGraphToUndirected(self, graph):
        """
        Return a new NX10.MultiGraph with the name, nodes and edges of graph.
        """
        result = NX10.MultiGraph(name=graph.name)
        result.add_nodes_from(graph)
        edgeIterator = graph.edges_iter()
        result.add_edges_from(edgeIterator)
        return result

    def buildExamples(self, sentenceGraph):
        """
        Build the directed examples of one sentence.

        Every pair (i, j) with i <= j of entities (with the "entities" style)
        or of tokens (otherwise) is visited, and for each pair one example is
        attempted in the forward and one in the reverse direction. Examples
        are only built with the "directed" style; without it the returned
        list is empty.

        @param sentenceGraph: the graph of the sentence
        @return: list of the examples returned by buildExample
        """
        examples = []
        exampleIndex = 0

        clearGraph = sentenceGraph.getCleared()

        #undirected = sentenceGraph.getUndirectedDependencyGraph()
        undirected = self.nxMultiDiGraphToUndirected(
            sentenceGraph.dependencyGraph)
        ##undirected = sentenceGraph.dependencyGraph.to_undirected()
        ###undirected = NX10.MultiGraph(sentenceGraph.dependencyGraph) This didn't work
        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)

        self.triggerFeatureBuilder.initSentence(clearGraph)

        # Generate examples based on interactions between entities or interactions between tokens
        useEntities = "entities" in self.styles
        if useEntities:
            loopRange = len(sentenceGraph.entities)
        else:
            loopRange = len(sentenceGraph.tokens)
        for i in range(loopRange):  # allow self-interactions
            # NOTE(review): for j == i the forward and the reverse call below
            # receive the same pair, so a self-interaction is processed twice
            # -- confirm that this is intended.
            for j in range(i, loopRange):  # allow self-interactions
                eI = None
                eJ = None
                if useEntities:
                    eI = sentenceGraph.entities[i]
                    eJ = sentenceGraph.entities[j]
                    tI = sentenceGraph.entityHeadTokenByEntity[eI]
                    tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                    if eI.get("type") == "neg" or eJ.get("type") == "neg":
                        continue
                else:
                    tI = sentenceGraph.tokens[i]
                    tJ = sentenceGraph.tokens[j]

                if "directed" in self.styles:
                    # define forward
                    # NOTE(review): the "bioinfer_limits" filter has only ever
                    # been applied to the reverse direction. This is kept as
                    # it was -- confirm whether the forward direction should
                    # be filtered too.
                    exampleIndex = self._buildDirectedExample(
                        sentenceGraph, clearGraph, paths, tI, tJ, eI, eJ,
                        False, examples, exampleIndex)
                    # define reverse
                    exampleIndex = self._buildDirectedExample(
                        sentenceGraph, clearGraph, paths, tJ, tI, eJ, eI,
                        True, examples, exampleIndex)

        return examples

    def _buildDirectedExample(self, sentenceGraph, clearGraph, paths, t1, t2,
                              e1, e2, applyBioInferLimits, examples,
                              exampleIndex):
        """
        Process one direction (t1 -> t2) of a pair for buildExamples.

        The example is reported to self.exampleStats, run through the
        negative downsampling and the style filters and, when it passes,
        built with buildExample and appended to examples.

        @param applyBioInferLimits: whether the "bioinfer_limits" filter is
            evaluated for this direction
        @return: the index for the next example, i.e. exampleIndex + 1 when
            an example was appended, else exampleIndex unchanged
        """
        if "entities" in self.styles:
            categoryName = self.getCategoryName(sentenceGraph, e1, e2, True)
        else:
            categoryName = self.getCategoryNameFromTokens(
                sentenceGraph, t1, t2, True)
        self.exampleStats.beginExample(categoryName)
        # The random number is only drawn for negatives, and only when
        # downsampling is enabled
        if (self.negFrac is None or categoryName != "neg"
                or self.negRand.random() < self.negFrac):
            makeExample = True
            if ("genia_limits" in self.styles
                    and not self.isPotentialGeniaInteraction(e1, e2)):
                makeExample = False
                self.exampleStats.filter("genia_limits")
            if (applyBioInferLimits and "bioinfer_limits" in self.styles
                    and not self.isPotentialBioInferInteraction(
                        e1, e2, categoryName)):
                makeExample = False
                self.exampleStats.filter("bioinfer_limits")
            if self.posPairGaz.getNegFrac(
                (t1.get("POS"), t2.get("POS"))) == 1.0:
                makeExample = False
                self.exampleStats.filter("pos_pair")
            if makeExample:
                # No example starts from a token that is a name
                if not sentenceGraph.tokenIsName[t1]:
                    examples.append(
                        self.buildExample(t1, t2, paths, clearGraph,
                                          categoryName, exampleIndex, e1, e2))
                    exampleIndex += 1
                else:
                    self.exampleStats.filter("genia_token_limits")
        else:
            self.exampleStats.filter("neg_frac")
        self.exampleStats.endExample()
        return exampleIndex

    def buildExample(self,
                     token1,
                     token2,
                     paths,
                     sentenceGraph,
                     categoryName,
                     exampleIndex,
                     entity1=None,
                     entity2=None):
        """
        Build one example for the ordered pair (token1, token2).

        Which feature groups are built depends on self.styles.

        Trigger features: the original guard (`if not "no_trigger":`) could
        never be true and its body referred to undefined names, so trigger
        features were never built. They are now built for token1 and token2,
        but only when the "trigger_features" style is given (and "no_trigger"
        is not), so that the feature vectors of existing configurations stay
        as they were.

        @param paths: nested mapping paths[token][token] -> path. Without an
            entry for the pair, or when both tokens are the same, the
            two-element path [token1, token2] is used and there are no edges.
        @param sentenceGraph: the sentence graph the features are read from
        @param categoryName: name of the class of the example
        @param exampleIndex: running number used in the example id
        @param entity1: entity of token1, or None
        @param entity2: entity of token2, or None
        @return: tuple (example id, class, feature dictionary, dictionary of
            extra attributes)
        """
        # define features
        features = {}
        # "in" instead of dict.has_key(), which is deprecated and no longer
        # exists in Python 3
        hasPath = (token1 != token2 and token1 in paths
                   and token2 in paths[token1])
        if hasPath:
            path = paths[token1][token2]
        else:
            path = [token1, token2]
        assert self.pathLengths is None
        if self.pathLengths is None or len(path) - 1 in self.pathLengths:
            if ("trigger_features" in self.styles
                    and "no_trigger" not in self.styles):
                self.triggerFeatureBuilder.setFeatureVector(features)
                self.triggerFeatureBuilder.tag = "trg_t1_"
                self.triggerFeatureBuilder.buildFeatures(token1)
                self.triggerFeatureBuilder.tag = "trg_t2_"
                self.triggerFeatureBuilder.buildFeatures(token2)
            # The edges are needed by the graph kernel and by the dependency
            # features
            if ("graph_kernel" in self.styles
                    or "no_dependency" not in self.styles):
                if hasPath:
                    edges = self.multiEdgeFeatureBuilder.getEdges(
                        sentenceGraph.dependencyGraph, path)
                else:
                    edges = None
            if "graph_kernel" in self.styles:
                self.graphKernelFeatureBuilder.setFeatureVector(
                    features, entity1, entity2)
                self.graphKernelFeatureBuilder.buildGraphKernelFeatures(
                    sentenceGraph, path, edges)
                self.graphKernelFeatureBuilder.setFeatureVector(None)
            if "entity_type" in self.styles:
                features[self.featureSet.getId("e1_" +
                                               entity1.attrib["type"])] = 1
                features[self.featureSet.getId("e2_" +
                                               entity2.attrib["type"])] = 1
                features[self.featureSet.getId("distance_" +
                                               str(len(path)))] = 1
            if "no_dependency" not in self.styles:
                if token1 == token2:
                    features[self.featureSet.getId("tokenSelfLoop")] = 1

                builder = self.multiEdgeFeatureBuilder
                builder.setFeatureVector(features, entity1, entity2)
                if "disable_entity_features" not in self.styles:
                    builder.buildEntityFeatures(sentenceGraph)
                builder.buildPathLengthFeatures(path)
                if "disable_terminus_features" not in self.styles:
                    builder.buildTerminusTokenFeatures(
                        path, sentenceGraph)  # remove for fast
                if "disable_single_element_features" not in self.styles:
                    builder.buildSingleElementFeatures(path, edges,
                                                       sentenceGraph)
                if "disable_ngram_features" not in self.styles:
                    for gramLength in (2, 3, 4):  # remove for fast
                        builder.buildPathGrams(gramLength, path, edges,
                                               sentenceGraph)
                if "disable_path_edge_features" not in self.styles:
                    builder.buildPathEdgeFeatures(path, edges, sentenceGraph)
                builder.buildSentenceFeatures(sentenceGraph)
                builder.setFeatureVector(None)
            if "nodalida" in self.styles:
                self.nodalidaFeatureBuilder.setFeatureVector(
                    features, entity1, entity2)
                shortestPaths = self.nodalidaFeatureBuilder.buildShortestPaths(
                    sentenceGraph.dependencyGraph, path)
                # Same output as the former "print shortestPaths" statement,
                # in a form that is also valid syntax for Python 3
                sys.stdout.write(str(shortestPaths) + "\n")
                if len(shortestPaths) > 0:
                    self.nodalidaFeatureBuilder.buildNGrams(
                        shortestPaths, sentenceGraph)
                self.nodalidaFeatureBuilder.setFeatureVector(None)
            if "no_linear" not in self.styles:
                self.tokenFeatureBuilder.setFeatureVector(features)
                # No break: the last matching position is the one used
                for i, token in enumerate(sentenceGraph.tokens):
                    if token == token1:
                        token1Index = i
                    if token == token2:
                        token2Index = i
                # "linTok1" always describes the token that comes first in
                # the sentence, "linTok2" the later one
                if token1Index > token2Index:
                    token1Index, token2Index = token2Index, token1Index
                self.tokenFeatureBuilder.buildLinearOrderFeatures(
                    token1Index, sentenceGraph, 2, 2, preTag="linTok1")
                self.tokenFeatureBuilder.buildLinearOrderFeatures(
                    token2Index, sentenceGraph, 2, 2, preTag="linTok2")
                self.tokenFeatureBuilder.setFeatureVector(None)
            if "random" in self.styles:
                self.randomFeatureBuilder.setFeatureVector(features)
                self.randomFeatureBuilder.buildRandomFeatures(100, 0.01)
                self.randomFeatureBuilder.setFeatureVector(None)
            if "genia_limits" in self.styles:
                e1Type = entity1.get("type")
                assert (entity1.get("isName") == "False")
                targetIsName = entity2.get("isName") == "True"
                if targetIsName:
                    features[self.featureSet.getId(
                        "GENIA_target_protein")] = 1
                else:
                    features[self.featureSet.getId("GENIA_nested_event")] = 1
                # leave r out to avoid problems with capitalization
                if e1Type.find("egulation") != -1:
                    if targetIsName:
                        features[self.featureSet.getId(
                            "GENIA_regulation_of_protein")] = 1
                    else:
                        features[self.featureSet.getId(
                            "GENIA_regulation_of_event")] = 1
        else:
            features[self.featureSet.getId("always_negative")] = 1
            if "subset" in self.styles:
                features[self.featureSet.getId("out_of_scope")] = 1

        # Leave the shared trigger feature builder detached from this example
        self.triggerFeatureBuilder.tag = ""
        self.triggerFeatureBuilder.setFeatureVector(None)

        # define extra attributes
        extra = {
            "xtype": "asym",
            "type": "i",
            "t1": token1.get("id"),
            "t2": token2.get("id")
        }
        if entity1 is not None:
            extra["e1"] = entity1.get("id")
        if entity2 is not None:
            extra["e2"] = entity2.get("id")
        extra["categoryName"] = categoryName
        sentenceOrigId = sentenceGraph.sentenceElement.get("origId")
        if sentenceOrigId is not None:
            extra["SOID"] = sentenceOrigId
        # make example
        if "binary" in self.styles:
            if categoryName != "neg":
                category = 1
            else:
                category = -1
        else:
            category = self.classSet.getId(categoryName)

        return (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex),
                category, features, extra)
예제 #7
0
class MultiEdgeExampleBuilder(ExampleBuilder):
    """
    This example builder makes edge examples, i.e. examples describing
    the event arguments.
    """
    def __init__(self,
                 style=None,
                 length=None,
                 types=None,
                 featureSet=None,
                 classSet=None):
        """
        Set up the example builder and the feature builders selected by style.

        style -- style definition handed to self.getParameters together with
                 the list of style names this builder understands. When it is
                 None, the "typed", "directed" and "headsOnly" styles are
                 switched on as defaults.
        length -- must be None (asserted below); stored as self.pathLengths.
        types -- stored as self.types; None (the default) gives each
                 instance its own new empty list.
        featureSet -- feature id set; a new IdSet() is created when None.
        classSet -- class id set; a new IdSet(1) is created when None. The
                    class "neg" must map to id 1, or to -1 in a two-class set.
        """
        if featureSet is None:
            featureSet = IdSet()
        if classSet is None:
            classSet = IdSet(1)
        if types is None:
            # A fresh list per instance; a shared default list would leak
            # between builders if anyone mutated self.types.
            types = []
        assert (classSet.getId("neg") == 1
                or (len(classSet.Ids) == 2 and classSet.getId("neg") == -1))

        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)

        # NOTE(review): "headsOnly", "graph_kernel" and "trigger_features"
        # appear twice in this list; it is kept verbatim because the handling
        # of duplicates inside getParameters is not visible from here.
        self.styles = self.getParameters(style, [
            "typed", "directed", "headsOnly", "graph_kernel", "noAnnType",
            "noMasking", "maxFeatures", "genia_limits", "epi_limits",
            "id_limits", "rel_limits", "bb_limits", "bi_limits", "co_limits",
            "genia_task1", "ontology", "nodalida", "bacteria_renaming",
            "trigger_features", "rel_features", "ddi_features", "evex",
            "giuliano", "random", "themeOnly", "causeOnly", "no_path",
            "entities", "skip_extra_triggers", "headsOnly", "graph_kernel",
            "trigger_features", "no_task", "no_dependency",
            "disable_entity_features", "disable_terminus_features",
            "disable_single_element_features", "disable_ngram_features",
            "disable_path_edge_features", "no_linear", "subset", "binary",
            "pos_only", "entity_type"
        ])
        if style is None:  # no parameters given
            # The defaults have to be written to the parsed style mapping;
            # "style" itself is None here and cannot be subscripted.
            self.styles["typed"] = True
            self.styles["directed"] = True
            self.styles["headsOnly"] = True

        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        # NOTE Temporarily re-enabling predicted range
        #self.multiEdgeFeatureBuilder.definePredictedValueRange([], None)
        if self.styles["graph_kernel"]:
            from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
            self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(
                self.featureSet)
        if self.styles["noAnnType"]:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if self.styles["noMasking"]:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if self.styles["maxFeatures"]:
            self.multiEdgeFeatureBuilder.maximum = True
        if self.styles["genia_task1"]:
            self.multiEdgeFeatureBuilder.filterAnnTypes.add("Entity")
        self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        if self.styles["ontology"]:
            self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(
                self.featureSet)
        if self.styles["nodalida"]:
            self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(
                self.featureSet)
        if self.styles["bacteria_renaming"]:
            self.bacteriaRenamingFeatureBuilder = BacteriaRenamingFeatureBuilder(
                self.featureSet)
        if self.styles["trigger_features"]:
            self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
            self.triggerFeatureBuilder.useNonNameEntities = True
            if self.styles["genia_task1"]:
                self.triggerFeatureBuilder.filterAnnTypes.add("Entity")
        if self.styles["rel_features"]:
            self.relFeatureBuilder = RELFeatureBuilder(featureSet)
        if self.styles["ddi_features"]:
            self.drugFeatureBuilder = DrugFeatureBuilder(featureSet)
        if self.styles["evex"]:
            self.evexFeatureBuilder = EVEXFeatureBuilder(featureSet)
        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
        self.pathLengths = length
        # Path length filtering is not supported by this builder.
        assert (self.pathLengths is None)
        self.types = types
        if self.styles["random"]:
            from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
            self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)

    def definePredictedValueRange(self, sentences, elementName):
        """
        Forward the predicted value range definition to the multi-edge
        feature builder, passing both arguments through unchanged.
        """
        featureBuilder = self.multiEdgeFeatureBuilder
        featureBuilder.definePredictedValueRange(sentences, elementName)

    def getPredictedValueRange(self):
        """
        Return the ``predictedRange`` attribute of the multi-edge feature
        builder.
        """
        featureBuilder = self.multiEdgeFeatureBuilder
        return featureBuilder.predictedRange

    def filterEdgesByType(self, edges, typesToInclude):
        """
        Keep only the edges whose "type" attribute is in typesToInclude.

        An empty typesToInclude disables filtering: the edges argument itself
        is returned. Otherwise a new list is built, preserving edge order.
        """
        if len(typesToInclude) == 0:
            return edges
        return [edge for edge in edges if edge.get("type") in typesToInclude]

    def getCategoryNameFromTokens(self, sentenceGraph, t1, t2, directed=True):
        """
        Return the example class name for the token pair (t1, t2).

        The types of all interaction edges from t1 to t2 are collected; when
        directed is false, the edges from t2 to t1 are included as well. The
        distinct type names are sorted and joined with "---" into a merged
        class name. "neg" is returned when no type name is found.
        """
        graph = sentenceGraph.interactionGraph
        edges = graph.getEdges(t1, t2)
        if not directed:
            edges = edges + graph.getEdges(t2, t1)
        # The edge element carrying the "type" attribute is at index 2.
        typeNames = sorted(set(edge[2].get("type") for edge in edges))
        # An empty type name never contributes to the merged name.
        merged = "---".join([name for name in typeNames if name != ""])
        if merged == "":
            return "neg"
        return merged

    def getCategoryName(self,
                        sentenceGraph,
                        e1,
                        e2,
                        directed=True,
                        duplicateEntities=None):
        """
        Return the example class name for the entity pair (e1, e2).

        The interactions are read with sentenceGraph.getInteractions(e1, e2,
        True); their distinct type names are sorted and joined with "---"
        into a merged class name. With the "causeOnly" style only "Cause" is
        kept, and with "themeOnly" only "Theme". "neg" is returned when no
        type name remains.

        The directed and duplicateEntities arguments are accepted but are not
        used by this implementation.
        """
        interactions = sentenceGraph.getInteractions(e1, e2, True)
        # The interaction element carrying the "type" attribute is at index 2.
        typeNames = sorted(set(item[2].get("type") for item in interactions))

        selected = []
        for typeName in typeNames:
            if self.styles["causeOnly"] and typeName != "Cause":
                continue
            if self.styles["themeOnly"] and typeName != "Theme":
                continue
            # An empty type name never contributes to the merged name.
            if typeName != "":
                selected.append(typeName)

        merged = "---".join(selected)
        if merged == "":
            return "neg"
        return merged

    def isPotentialRELInteraction(self, e1, e2):
        """
        Return True only for a "Protein" e1 paired with an "Entity" e2.
        """
        if e1.get("type") != "Protein":
            return False
        if e2.get("type") != "Entity":
            return False
        return True

    def isPotentialBBInteraction(self, e1, e2, sentenceGraph):
        """
        Return True when the (e1, e2) entity types form an allowed BB pair.

        Allowed pairs are a "Bacterium" e1 with a location-like e2 type, or a
        "Host" e1 with a "HostPart" e2. The sentenceGraph argument is not
        used.
        """
        firstType = e1.get("type")
        if firstType == "Bacterium":
            # Note: "Environment" type is misspelled as "Environmental" in the BB-task documentation
            return e2.get("type") in [
                "Host", "HostPart", "Geographical", "Environment", "Food",
                "Medical", "Soil", "Water"
            ]
        if firstType == "Host":
            return e2.get("type") == "HostPart"
        return False

    def getBISuperType(self, eType):
        """
        Map a BI entity type to "ProteinEntity" or "GeneEntity".

        Returns None for any type that belongs to neither group.
        """
        superTypeGroups = [
            ("ProteinEntity",
             ["GeneProduct", "Protein", "ProteinFamily", "PolymeraseComplex"]),
            ("GeneEntity",
             ["Gene", "GeneFamily", "GeneComplex", "Regulon", "Site",
              "Promoter"]),
        ]
        for superType, members in superTypeGroups:
            if eType in members:
                return superType
        return None

    def isPotentialBIInteraction(self, e1, e2, sentenceGraph, stats):
        """
        Return True when the (e1, e2) entity types form an allowed BI pair.

        e1, e2 -- entities whose "type" attribute is read with get().
        sentenceGraph -- not used.
        stats -- when the pair is rejected, stats.filter("bi_limits") is
                 called before False is returned.

        An entity without a "type" attribute matches none of the rules and
        is therefore rejected like any other disallowed pair.
        """
        e1Type = e1.get("type")
        e2Type = e2.get("type")
        e1SuperType = self.getBISuperType(e1Type)
        e2SuperType = self.getBISuperType(e2Type)
        geneOrProtein = ["GeneEntity", "ProteinEntity"]

        if e1Type == "Regulon" and e2SuperType in geneOrProtein:
            return True
        if e1SuperType == "ProteinEntity" and e2Type in [
                "Site", "Promoter", "Gene", "GeneComplex"
        ]:
            return True
        # These e1 types are accepted with any e2.
        if e1Type in ["Action", "Transcription", "Expression"]:
            return True
        if e1Type == "Site" and e2SuperType == "GeneEntity":
            return True
        if e1Type == "Promoter" and e2SuperType in geneOrProtein:
            return True
        # General rule: any gene/protein entity with any gene/protein entity.
        if e1SuperType in geneOrProtein and e2SuperType in geneOrProtein:
            return True
        stats.filter("bi_limits")
        return False

    def isPotentialEPIInteraction(self, e1, e2, sentenceGraph):
        """
        Return True when the (e1, e2) entity types form an allowed EPI pair.

        A "Catalysis" e1 accepts any e2 that is not an "Entity". A "Protein"
        or "Entity" e1 accepts nothing. Every other e1 type accepts only a
        "Protein" or "Entity" e2. The sentenceGraph argument is not used.
        """
        e1Type = e1.get("type")
        if e1Type == "Catalysis":
            return e2.get("type") != "Entity"
        if e1Type in ["Protein", "Entity"]:
            return False
        return e2.get("type") in ["Protein", "Entity"]

    def isPotentialIDInteraction(self, e1, e2, sentenceGraph):
        """
        Return True when the (e1, e2) entity types form an allowed ID pair.

        The decision depends on the e1 type: core entity types never start
        an interaction, and each event type accepts its own set of e2 types.
        An e1 type not covered by any rule trips the assertion at the end.
        The sentenceGraph argument is not used.
        """
        coreTypes = [
            "Protein", "Regulon-operon", "Two-component-system", "Chemical",
            "Organism"
        ]
        e1Type = e1.get("type")
        e2Type = e2.get("type")
        e2IsCore = e2Type in coreTypes

        if e1Type in coreTypes:
            return False
        if e1Type in ["Gene_expression", "Transcription"]:
            return e2Type in ["Protein", "Regulon-operon"]
        if e1Type in ["Protein_catabolism", "Phosphorylation"]:
            return e2Type == "Protein"
        if e1Type == "Localization":
            return e2IsCore or e2Type == "Entity"
        if e1Type in ["Binding", "Process"]:
            return e2IsCore
        # Substring test: matches every type name containing "egulation".
        if "egulation" in e1Type:
            return e2Type != "Entity"
        if e1Type == "Entity":
            return e2IsCore
        assert False, (e1Type, e2Type)

    def isPotentialCOInteraction(self, e1, e2, sentenceGraph):
        """
        Return True when the (e1, e2) entity types form an allowed CO pair.

        Only an "Exp" e1 can start an interaction. An "Exp"/"Protein" pair
        is always accepted. An "Exp"/"Exp" pair is accepted only when the
        head token of e2 occurs in sentenceGraph.tokens no later than the
        head token of e1.
        """
        if e1.get("type") != "Exp":
            return False
        secondType = e2.get("type")
        if secondType == "Protein":
            return True
        if secondType != "Exp":
            return False

        anaphoraTok = sentenceGraph.entityHeadTokenByEntity[e1]
        antecedentTok = sentenceGraph.entityHeadTokenByEntity[e2]
        antecedentSeen = False
        for token in sentenceGraph.tokens:
            if token == antecedentTok:
                antecedentSeen = True
            # Tested after the antecedent check so that two entities sharing
            # one head token are accepted.
            if token == anaphoraTok:
                if antecedentSeen:
                    return True
                return False
        # The head token of e1 was not among the sentence tokens.
        assert False

    def isPotentialGeniaInteraction(self, e1, e2):
        """
        Return True when the (e1, e2) entity types form an allowed GENIA pair.

        A "Protein" e1 never starts an interaction; each other e1 type
        accepts its own set of e2 types. An e1 type not covered by any rule
        trips the assertion at the end.
        """
        e1Type = e1.get("type")
        e2Type = e2.get("type")
        if e1Type == "Protein":
            return False
        if e1Type in [
                "Entity", "Gene_expression", "Transcription",
                "Protein_catabolism", "Phosphorylation", "Binding"
        ]:
            return e2Type == "Protein"
        if e1Type == "Localization":
            return e2Type in ["Protein", "Entity"]
        # Substring test: matches every type name containing "egulation".
        if "egulation" in e1Type:
            return e2Type != "Entity"
        assert False, (e1Type, e2Type)

    def getGoldCategoryName(self,
                            goldGraph,
                            entityToGold,
                            e1,
                            e2,
                            directed=True):
        """
        Return the class name of the (e1, e2) pair as found in the gold graph.

        entityToGold maps each entity to a sequence of gold entities. When
        either entity has no gold counterpart the result is "neg"; otherwise
        the first gold counterpart of each entity is passed to
        getCategoryName together with goldGraph.
        """
        goldForE1 = entityToGold[e1]
        if not len(goldForE1) > 0:
            return "neg"
        goldForE2 = entityToGold[e2]
        if not len(goldForE2) > 0:
            return "neg"
        return self.getCategoryName(goldGraph,
                                    goldForE1[0],
                                    goldForE2[0],
                                    directed=directed)

    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
        """
        Build examples for a single sentence and append them to outfile.

        One candidate is generated per pair of merged entities ("entities"
        style) or per pair of tokens. With the "directed" style each pair
        yields a forward and a reverse candidate, each of which may be
        dropped by the active limit styles; otherwise a single example is
        written per pair.

        sentenceGraph -- graph of the sentence to process.
        outfile -- target handed to ExampleUtils.appendExamples.
        goldGraph -- optional gold graph; when given, directed entity
                     examples take their class from it.

        Returns the number of examples written.
        See Core/ExampleUtils for example format.
        """
        exampleIndex = 0

        if self.styles["trigger_features"]:
            self.triggerFeatureBuilder.initSentence(sentenceGraph)
        if self.styles["evex"]:
            self.evexFeatureBuilder.initSentence(sentenceGraph)

        # Work on the merged entities; report how many duplicates were dropped
        sentenceGraph.mergeInteractionGraph(True)
        entities = sentenceGraph.mergedEntities
        self.exampleStats.addValue("Duplicate entities skipped",
                                   len(sentenceGraph.entities) - len(entities))

        # Connect to optional gold graph
        entityToGold = None
        if goldGraph is not None:
            entityToGold = EvaluateInteractionXML.mapEntities(
                entities, goldGraph.entities)

        paths = None
        if not self.styles["no_path"]:
            paths = sentenceGraph.dependencyGraph.toUndirected()

        # Generate examples based on interactions between entities or interactions between tokens
        if self.styles["entities"]:
            loopRange = len(entities)
        else:
            loopRange = len(sentenceGraph.tokens)
        for i in range(loopRange - 1):
            for j in range(i + 1, loopRange):
                eI = None
                eJ = None
                if self.styles["entities"]:
                    eI = entities[i]
                    eJ = entities[j]
                    tI = sentenceGraph.entityHeadTokenByEntity[eI]
                    tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                    if eI.get("type") == "neg" or eJ.get("type") == "neg":
                        continue
                    if self.styles["skip_extra_triggers"]:
                        if (eI.get("source") is not None
                                or eJ.get("source") is not None):
                            continue
                else:
                    tI = sentenceGraph.tokens[i]
                    tJ = sentenceGraph.tokens[j]
                # only consider paths between entities (NOTE! entities, not only named entities)
                if self.styles["headsOnly"]:
                    if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(
                            sentenceGraph.tokenIsEntityHead[tJ]) == 0):
                        continue

                if self.styles["directed"]:
                    # forward example, then reverse example
                    exampleIndex = self._buildDirectedExample(
                        sentenceGraph, outfile, goldGraph, entityToGold,
                        paths, tI, tJ, eI, eJ, exampleIndex)
                    exampleIndex = self._buildDirectedExample(
                        sentenceGraph, outfile, goldGraph, entityToGold,
                        paths, tJ, tI, eJ, eI, exampleIndex)
                else:
                    if self.styles["entities"]:
                        categoryName = self.getCategoryName(
                            sentenceGraph, eI, eJ, False)
                    else:
                        categoryName = self.getCategoryNameFromTokens(
                            sentenceGraph, tI, tJ, False)
                    self.exampleStats.beginExample(categoryName)
                    forwardExample = self.buildExample(tI, tJ, paths,
                                                       sentenceGraph,
                                                       categoryName,
                                                       exampleIndex, eI, eJ)
                    if not self.styles["graph_kernel"]:
                        # Merge the features of both directions into one example
                        reverseExample = self.buildExample(
                            tJ, tI, paths, sentenceGraph, categoryName,
                            exampleIndex, eJ, eI)
                        forwardExample[2].update(reverseExample[2])
                    ExampleUtils.appendExamples([forwardExample], outfile)
                    exampleIndex += 1
                    self.exampleStats.endExample()

        return exampleIndex

    def _buildDirectedExample(self, sentenceGraph, outfile, goldGraph,
                              entityToGold, paths, t1, t2, e1, e2,
                              exampleIndex):
        """
        Write the directed example t1 -> t2 unless a limit style rejects it.

        Returns the example index to use next: exampleIndex + 1 when the
        example was written, exampleIndex unchanged when it was filtered.
        """
        if self.styles["entities"]:
            categoryName = self.getCategoryName(sentenceGraph, e1, e2, True)
            if goldGraph is not None:
                # The class from the gold graph replaces the one above
                categoryName = self.getGoldCategoryName(
                    goldGraph, entityToGold, e1, e2, True)
        else:
            categoryName = self.getCategoryNameFromTokens(
                sentenceGraph, t1, t2, True)

        self.exampleStats.beginExample(categoryName)
        if self._passesLimits(e1, e2, sentenceGraph, categoryName):
            ExampleUtils.appendExamples([
                self.buildExample(t1, t2, paths, sentenceGraph, categoryName,
                                  exampleIndex, e1, e2)
            ], outfile)
            exampleIndex += 1
        self.exampleStats.endExample()
        return exampleIndex

    def _passesLimits(self, e1, e2, sentenceGraph, categoryName):
        """
        Apply every active limit style to the directed pair e1 -> e2.

        All active limits are evaluated (no short-circuit) so that each
        rejection is recorded in self.exampleStats. Returns True when no
        limit rejected the pair.
        """
        makeExample = True
        if self.styles["genia_limits"] and not self.isPotentialGeniaInteraction(
                e1, e2):
            makeExample = False
            self.exampleStats.filter("genia_limits")
        if self.styles["genia_task1"] and (e1.get("type") == "Entity"
                                           or e2.get("type") == "Entity"):
            makeExample = False
            self.exampleStats.filter("genia_task1")
        if self.styles["rel_limits"] and not self.isPotentialRELInteraction(
                e1, e2):
            makeExample = False
            self.exampleStats.filter("rel_limits")
        if self.styles["co_limits"] and not self.isPotentialCOInteraction(
                e1, e2, sentenceGraph):
            makeExample = False
            self.exampleStats.filter("co_limits")
        if self.styles["bb_limits"] and not self.isPotentialBBInteraction(
                e1, e2, sentenceGraph):
            makeExample = False
            self.exampleStats.filter("bb_limits")
            if categoryName != "neg":
                # Extra counter for positive examples lost to the limit
                self.exampleStats.filter("bb_limits(" + categoryName + ":" +
                                         e1.get("type") + "/" +
                                         e2.get("type") + ")")
        # isPotentialBIInteraction records its own "bi_limits" statistic
        if self.styles["bi_limits"] and not self.isPotentialBIInteraction(
                e1, e2, sentenceGraph, self.exampleStats):
            makeExample = False
        if self.styles["epi_limits"] and not self.isPotentialEPIInteraction(
                e1, e2, sentenceGraph):
            makeExample = False
            self.exampleStats.filter("epi_limits")
        if self.styles["id_limits"] and not self.isPotentialIDInteraction(
                e1, e2, sentenceGraph):
            makeExample = False
            self.exampleStats.filter("id_limits")
        if self.styles["pos_only"] and categoryName == "neg":
            makeExample = False
            self.exampleStats.filter("pos_only")
        return makeExample

    def buildExample(self,
                     token1,
                     token2,
                     paths,
                     sentenceGraph,
                     categoryName,
                     exampleIndex,
                     entity1=None,
                     entity2=None):
        """
        Build a single directed example for the potential edge between token1 and token2
        """
        # dummy return for speed testing
        #return (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),1,{},{})

        # define features
        features = {}
        if True:  #token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2):
            #if token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2):
            #    path = paths[token1][token2]
            #else:
            #    path = [token1, token2]
            if not self.styles["no_path"]:
                # directedPath reduces performance by 0.01 pp
                #directedPath = sentenceGraph.dependencyGraph.getPaths(token1, token2)
                #if len(directedPath) == 0:
                #    directedPath = sentenceGraph.dependencyGraph.getPaths(token2, token1)
                #    for dp in directedPath:
                #        dp.reverse()
                #if len(directedPath) == 0:
                #    path = paths.getPaths(token1, token2)
                #else:
                #    path = directedPath

                path = paths.getPaths(token1, token2)
                if len(path) > 0:
                    #if len(path) > 1:
                    #    print len(path)
                    path = path[0]
                    pathExists = True
                else:
                    path = [token1, token2]
                    pathExists = False
            else:
                path = [token1, token2]
                pathExists = False
            #print token1.get("id"), token2.get("id")
            assert (self.pathLengths == None)
            if self.pathLengths == None or len(path) - 1 in self.pathLengths:
                #                if not "no_ontology" in self.styles:
                #                    self.ontologyFeatureBuilder.setFeatureVector(features)
                #                    self.ontologyFeatureBuilder.buildOntologyFeaturesForPath(sentenceGraph, path)
                #                    self.ontologyFeatureBuilder.setFeatureVector(None)
                if self.styles["trigger_features"]:  # F 85.52 -> 85.55
                    self.triggerFeatureBuilder.setFeatureVector(features)
                    self.triggerFeatureBuilder.tag = "trg1_"
                    self.triggerFeatureBuilder.buildFeatures(token1)
                    self.triggerFeatureBuilder.tag = "trg2_"
                    self.triggerFeatureBuilder.buildFeatures(token2)
                    self.triggerFeatureBuilder.setFeatureVector(None)
                # REL features
                if self.styles["rel_features"] and not self.styles["no_task"]:
                    self.relFeatureBuilder.setFeatureVector(features)
                    self.relFeatureBuilder.tag = "rel1_"
                    self.relFeatureBuilder.buildAllFeatures(
                        sentenceGraph.tokens,
                        sentenceGraph.tokens.index(token1))
                    self.relFeatureBuilder.tag = "rel2_"
                    self.relFeatureBuilder.buildAllFeatures(
                        sentenceGraph.tokens,
                        sentenceGraph.tokens.index(token2))
                    self.relFeatureBuilder.setFeatureVector(None)
                if self.styles[
                        "bacteria_renaming"] and not self.styles["no_task"]:
                    self.bacteriaRenamingFeatureBuilder.setFeatureVector(
                        features)
                    self.bacteriaRenamingFeatureBuilder.buildPairFeatures(
                        entity1, entity2)
                    #self.bacteriaRenamingFeatureBuilder.buildSubstringFeatures(entity1, entity2) # decreases perf. 74.76 -> 72.41
                    self.bacteriaRenamingFeatureBuilder.setFeatureVector(None)
                if self.styles["co_limits"] and not self.styles["no_task"]:
                    e1Offset = Range.charOffsetToSingleTuple(
                        entity1.get("charOffset"))
                    e2Offset = Range.charOffsetToSingleTuple(
                        entity2.get("charOffset"))
                    if Range.contains(e1Offset, e2Offset):
                        features[self.featureSet.getId("e1_contains_e2")] = 1
                        if entity2.get("isName") == "True":
                            features[self.featureSet.getId(
                                "e1_contains_e2name")] = 1
                    if Range.contains(e2Offset, e1Offset):
                        features[self.featureSet.getId("e2_contains_e1")] = 1
                        if entity1.get("isName") == "True":
                            features[self.featureSet.getId(
                                "e2_contains_e1name")] = 1
                if self.styles["ddi_features"]:
                    self.drugFeatureBuilder.setFeatureVector(features)
                    self.drugFeatureBuilder.tag = "ddi_"
                    self.drugFeatureBuilder.buildPairFeatures(entity1, entity2)
                    if self.styles["ddi_mtmx"]:
                        self.drugFeatureBuilder.buildMTMXFeatures(
                            entity1, entity2)
                    self.drugFeatureBuilder.setFeatureVector(None)
                #if "graph_kernel" in self.styles or not "no_dependency" in self.styles:
                #    #print "Getting edges"
                #    if token1 != token2 and pathExists:
                #        #print "g1"
                #        edges = self.multiEdgeFeatureBuilder.getEdges(sentenceGraph.dependencyGraph, path)
                #        #print "g2"
                #    else:
                #        edges = None
                if self.styles["graph_kernel"]:
                    self.graphKernelFeatureBuilder.setFeatureVector(
                        features, entity1, entity2)
                    self.graphKernelFeatureBuilder.buildGraphKernelFeatures(
                        sentenceGraph, path)
                    self.graphKernelFeatureBuilder.setFeatureVector(None)
                if self.styles["entity_type"]:
                    features[self.featureSet.getId("e1_" +
                                                   entity1.get("type"))] = 1
                    features[self.featureSet.getId("e2_" +
                                                   entity2.get("type"))] = 1
                    features[self.featureSet.getId("distance_" +
                                                   str(len(path)))] = 1
                if not self.styles["no_dependency"]:
                    #print "Dep features"
                    self.multiEdgeFeatureBuilder.setFeatureVector(
                        features, entity1, entity2)
                    #self.multiEdgeFeatureBuilder.buildStructureFeatures(sentenceGraph, paths) # remove for fast
                    if not self.styles["disable_entity_features"]:
                        self.multiEdgeFeatureBuilder.buildEntityFeatures(
                            sentenceGraph)
                    self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path)
                    if not self.styles["disable_terminus_features"]:
                        self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(
                            path, sentenceGraph)  # remove for fast
                    if not self.styles["disable_single_element_features"]:
                        self.multiEdgeFeatureBuilder.buildSingleElementFeatures(
                            path, sentenceGraph)
                    if not self.styles["disable_ngram_features"]:
                        #print "NGrams"
                        self.multiEdgeFeatureBuilder.buildPathGrams(
                            2, path, sentenceGraph)  # remove for fast
                        self.multiEdgeFeatureBuilder.buildPathGrams(
                            3, path, sentenceGraph)  # remove for fast
                        self.multiEdgeFeatureBuilder.buildPathGrams(
                            4, path, sentenceGraph)  # remove for fast
                    #self.buildEdgeCombinations(path, edges, sentenceGraph, features) # remove for fast
                    #if edges != None:
                    #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[0], edges[0][1]+edges[1][0], "t1", sentenceGraph) # remove for fast
                    #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[-1], edges[len(path)-1][len(path)-2]+edges[len(path)-2][len(path)-1], "t2", sentenceGraph) # remove for fast
                    if not self.styles["disable_path_edge_features"]:
                        self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(
                            path, sentenceGraph)
                    self.multiEdgeFeatureBuilder.buildSentenceFeatures(
                        sentenceGraph)
                    self.multiEdgeFeatureBuilder.setFeatureVector(None)
                if self.styles["nodalida"]:
                    self.nodalidaFeatureBuilder.setFeatureVector(
                        features, entity1, entity2)
                    shortestPaths = self.nodalidaFeatureBuilder.buildShortestPaths(
                        sentenceGraph.dependencyGraph, path)
                    print shortestPaths
                    if len(shortestPaths) > 0:
                        self.nodalidaFeatureBuilder.buildNGrams(
                            shortestPaths, sentenceGraph)
                    self.nodalidaFeatureBuilder.setFeatureVector(None)
                if not self.styles["no_linear"]:
                    self.tokenFeatureBuilder.setFeatureVector(features)
                    for i in range(len(sentenceGraph.tokens)):
                        if sentenceGraph.tokens[i] == token1:
                            token1Index = i
                        if sentenceGraph.tokens[i] == token2:
                            token2Index = i
                    linearPreTag = "linfw_"
                    if token1Index > token2Index:
                        token1Index, token2Index = token2Index, token1Index
                        linearPreTag = "linrv_"
                    self.tokenFeatureBuilder.buildLinearOrderFeatures(
                        token1Index, sentenceGraph, 2, 2, preTag="linTok1")
                    self.tokenFeatureBuilder.buildLinearOrderFeatures(
                        token2Index, sentenceGraph, 2, 2, preTag="linTok2")
                    # Before, middle, after
                    #                self.tokenFeatureBuilder.buildTokenGrams(0, token1Index-1, sentenceGraph, "bf")
                    #                self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, "bw")
                    #                self.tokenFeatureBuilder.buildTokenGrams(token2Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, "af")
                    # before-middle, middle, middle-after
                    #                    self.tokenFeatureBuilder.buildTokenGrams(0, token2Index-1, sentenceGraph, linearPreTag+"bf", max=2)
                    #                    self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, linearPreTag+"bw", max=2)
                    #                    self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, linearPreTag+"af", max=2)
                    self.tokenFeatureBuilder.setFeatureVector(None)
                if self.styles["random"]:
                    self.randomFeatureBuilder.setFeatureVector(features)
                    self.randomFeatureBuilder.buildRandomFeatures(100, 0.01)
                    self.randomFeatureBuilder.setFeatureVector(None)
                if self.styles["genia_limits"] and not self.styles["no_task"]:
                    e1Type = entity1.get("type")
                    e2Type = entity2.get("type")
                    assert (entity1.get("isName") == "False")
                    if entity2.get("isName") == "True":
                        features[self.featureSet.getId(
                            "GENIA_target_protein")] = 1
                    else:
                        features[self.featureSet.getId(
                            "GENIA_nested_event")] = 1
                    if e1Type.find(
                            "egulation"
                    ) != -1:  # leave r out to avoid problems with capitalization
                        if entity2.get("isName") == "True":
                            features[self.featureSet.getId(
                                "GENIA_regulation_of_protein")] = 1
                        else:
                            features[self.featureSet.getId(
                                "GENIA_regulation_of_event")] = 1
                if self.styles["bi_limits"]:
                    # Make features based on entity types
                    e1Type = entity1.get("type")
                    e2Type = entity2.get("type")
                    e1SuperType = str(self.getBISuperType(e1Type))
                    e2SuperType = str(self.getBISuperType(e2Type))
                    features[self.featureSet.getId("BI_e1_" + e1Type)] = 1
                    features[self.featureSet.getId("BI_e2_" + e2Type)] = 1
                    features[self.featureSet.getId("BI_e1sup_" +
                                                   e1SuperType)] = 1
                    features[self.featureSet.getId("BI_e2sup_" +
                                                   e2SuperType)] = 1
                    features[self.featureSet.getId("BI_e1e2_" + e1Type + "_" +
                                                   e2Type)] = 1
                    features[self.featureSet.getId("BI_e1e2sup_" +
                                                   e1SuperType + "_" +
                                                   e2SuperType)] = 1
                if self.styles["evex"]:
                    self.evexFeatureBuilder.setFeatureVector(
                        features, entity1, entity2)
                    self.evexFeatureBuilder.buildEdgeFeatures(
                        entity1, entity2, token1, token2, path, sentenceGraph)
                    self.evexFeatureBuilder.setFeatureVector(None)
                if self.styles["giuliano"]:
                    self.giulianoFeatureBuilder.setFeatureVector(
                        features, entity1, entity2)
                    self.giulianoFeatureBuilder.buildEdgeFeatures(
                        entity1, entity2, token1, token2, path, sentenceGraph)
                    self.giulianoFeatureBuilder.setFeatureVector(None)
            else:
                features[self.featureSet.getId("always_negative")] = 1
                if self.styles["subset"]:
                    features[self.featureSet.getId("out_of_scope")] = 1
        else:
            features[self.featureSet.getId("always_negative")] = 1
            if self.styles["subset"]:
                features[self.featureSet.getId("out_of_scope")] = 1
            path = [token1, token2]
        # define extra attributes
        #if int(path[0].get("id").split("_")[-1]) < int(path[-1].get("id").split("_")[-1]):
        if int(path[0].get("charOffset").split("-")[0]) < int(
                path[-1].get("charOffset").split("-")[0]):
            #extra = {"xtype":"edge","type":"i","t1":path[0],"t2":path[-1]}
            extra = {
                "xtype": "edge",
                "type": "i",
                "t1": path[0].get("id"),
                "t2": path[-1].get("id")
            }
            extra["deprev"] = False
        else:
            #extra = {"xtype":"edge","type":"i","t1":path[-1],"t2":path[0]}
            extra = {
                "xtype": "edge",
                "type": "i",
                "t1": path[-1].get("id"),
                "t2": path[0].get("id")
            }
            extra["deprev"] = True
        if entity1 != None:
            #extra["e1"] = entity1
            extra["e1"] = entity1.get("id")
            if sentenceGraph.mergedEntityToDuplicates != None:
                #extra["e1GoldIds"] = mergedEntityIds[entity1]
                extra["e1DuplicateIds"] = ",".join([
                    x.get("id")
                    for x in sentenceGraph.mergedEntityToDuplicates[entity1]
                ])
        if entity2 != None:
            #extra["e2"] = entity2
            extra["e2"] = entity2.get("id")
            if sentenceGraph.mergedEntityToDuplicates != None:
                extra["e2DuplicateIds"] = ",".join([
                    x.get("id")
                    for x in sentenceGraph.mergedEntityToDuplicates[entity2]
                ])
                #extra["e2GoldIds"] = mergedEntityIds[entity2]
        extra["categoryName"] = categoryName
        if self.styles["bacteria_renaming"]:
            if entity1.get("text") != None and entity1.get("text") != "":
                extra["e1t"] = entity1.get("text").replace(" ", "---").replace(
                    ":", "-COL-")
            if entity2.get("text") != None and entity2.get("text") != "":
                extra["e2t"] = entity2.get("text").replace(" ", "---").replace(
                    ":", "-COL-")
        sentenceOrigId = sentenceGraph.sentenceElement.get("origId")
        if sentenceOrigId != None:
            extra["SOID"] = sentenceOrigId
        # make example
        if self.styles["binary"]:
            if categoryName != "neg":
                category = 1
            else:
                category = -1
            categoryName = "i"
        else:
            category = self.classSet.getId(categoryName)

        # NOTE: temporarily disable for replicating 110310 experiment
        #features[self.featureSet.getId("extra_constant")] = 1
        return (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex),
                category, features, extra)
class AsymmetricEventExampleBuilder(ExampleBuilder):
    def __init__(self, style=["typed","directed"], length=None, types=[], featureSet=None, classSet=None):
        """
        Resolve the id sets, parse the style options and create the feature
        builders that those options ask for.

        style      -- style names, given either as a list or as a single
                      comma-separated string. A name containing "negFrac"
                      enables downsampling of negatives (the text after the
                      last "_" is parsed as the fraction); a name containing
                      "posPairGaz" loads a POSPairGazetteer (the text after
                      the first "_" is passed as loadFrom).
        length     -- stored as self.pathLengths; must be None (asserted).
        types      -- stored as self.types.
        featureSet -- feature id set; a new IdSet() is created when None.
        classSet   -- class id set; a new IdSet(1) is created when None.
                      The id of the class "neg" must be 1 (asserted).
        """
        if featureSet is None:
            featureSet = IdSet()
        if classSet is None:
            classSet = IdSet(1)
        assert( classSet.getId("neg") == 1 )
        
        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        # A style given as text is always turned into a list of names, also
        # when it holds a single name. Otherwise the membership tests below
        # would be substring tests and the loop would walk over characters.
        # Lists (including the default) have no split() and are used as they are.
        if hasattr(style, "split"):
            style = style.split(",")
        self.styles = style
        
        self.negFrac = None
        self.posPairGaz = POSPairGazetteer()
        for s in style:
            if s.find("negFrac") != -1:      
                self.negFrac = float(s.split("_")[-1])
                print >> sys.stderr, "Downsampling negatives to", self.negFrac
                # Fixed seed, so the same negatives are drawn on every run
                self.negRand = random.Random(15)
            elif s.find("posPairGaz") != -1:
                self.posPairGaz = POSPairGazetteer(loadFrom=s.split("_", 1)[-1])
        
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
        if "graph_kernel" in self.styles:
            # Imported only on demand, like the random feature builder below
            from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
            self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
        # Style switches that are forwarded to the multi-edge feature builder
        if "noAnnType" in self.styles:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True
        self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        if "ontology" in self.styles:
            self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
        if "nodalida" in self.styles:
            self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(self.featureSet)
        #IF LOCAL
        if "bioinfer_limits" in self.styles:
            self.bioinferOntologies = OntologyUtils.getBioInferTempOntology()
            #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName)
        #ENDIF
        self.pathLengths = length
        # Path length filtering is not supported by this builder
        assert(self.pathLengths == None)
        self.types = types
        if "random" in self.styles:
            from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
            self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)

        #self.outFile = open("exampleTempFile.txt","wt")

    @classmethod
    def run(cls, input, output, parse, tokenization, style, idFileTag=None):
        """
        Convenience entry point: create a builder of this class and build the
        examples for the given input.

        The id sets come from cls.getIdSets(idFileTag) and the sentences from
        cls.getSentences(input, parse, tokenization). When style is None the
        builder is created with its own default style. If the resulting
        styles contain "printClassIds", the class ids are printed to stderr
        after the examples have been built.
        """
        classSet, featureSet = cls.getIdSets(idFileTag)
        builderArguments = {"classSet": classSet, "featureSet": featureSet}
        if style != None:
            # Only pass the style on when the caller actually gave one
            builderArguments["style"] = style
        builder = cls(**builderArguments)
        builder.buildExamplesForSentences(cls.getSentences(input, parse, tokenization), output, idFileTag)
        if "printClassIds" in builder.styles:
            print >> sys.stderr, builder.classSet.Ids
  
    def definePredictedValueRange(self, sentences, elementName):
        """
        Pass the request straight on to the multi-edge feature builder.
        Nothing is returned.
        """
        edgeFeatureBuilder = self.multiEdgeFeatureBuilder
        edgeFeatureBuilder.definePredictedValueRange(sentences, elementName)
    
    def getPredictedValueRange(self):
        """
        Return the predictedRange attribute of the multi-edge feature builder.
        """
        edgeFeatureBuilder = self.multiEdgeFeatureBuilder
        return edgeFeatureBuilder.predictedRange
    
    def filterEdgesByType(self, edges, typesToInclude):
        """
        Keep the edges whose "type" attribute is listed in typesToInclude.

        An empty typesToInclude means "no restriction": the edges argument is
        then returned as it is, not as a copy. Otherwise a new list is built,
        with the kept edges in their original order.
        """
        if len(typesToInclude) == 0:
            return edges
        return [candidate for candidate in edges if candidate.get("type") in typesToInclude]
    
    def getCategoryNameFromTokens(self, sentenceGraph, t1, t2, directed=True):
        """
        Derive the class name of the token pair (t1, t2) from the interaction
        edges that lead from t1 to t2 in sentenceGraph.interactionGraph.

        If at least one of these edges has the type "Theme", the name is made
        of the types of the e1 entities of the Theme edges (looked up in
        sentenceGraph.entitiesById). Otherwise it is made of the types of all
        the edges. In both cases the type names are de-duplicated, sorted and
        joined with "---". Without any edge the result is "neg".

        directed -- kept for interface compatibility. It is not used: only
                    the t1 -> t2 direction is examined.
        """
        def joinNames(names):
            # An empty name contributes nothing, not even a separator
            return "---".join([name for name in sorted(names) if name != ""])

        interactionGraph = sentenceGraph.interactionGraph
        edgeData = {}
        if interactionGraph.has_edge(t1, t2):
            edgeData = interactionGraph.get_edge_data(t1, t2, default={})
        # The edge data maps edge keys to attribute dictionaries. Going over
        # the values works for any set of keys, whereas indexing with
        # 0..len-1 fails as soon as the keys are not consecutive integers.
        # A plain sequence of attribute dictionaries is accepted as well.
        if hasattr(edgeData, "values"):
            edgeAttributes = list(edgeData.values())
        else:
            edgeAttributes = list(edgeData)

        edgeTypes = set()
        themeE1Types = set()
        for attributes in edgeAttributes:
            intElement = attributes["element"]
            intType = intElement.get("type")
            edgeTypes.add(intType)
            if intType == "Theme":
                e1Entity = sentenceGraph.entitiesById[intElement.get("e1")]
                themeE1Types.add(e1Entity.get("type"))

        # Theme edges take precedence over the plain edge types
        if len(themeE1Types) != 0:
            return joinNames(themeE1Types)
        categoryName = joinNames(edgeTypes)
        if categoryName != "":
            return categoryName
        return "neg"
        
    def getCategoryName(self, sentenceGraph, e1, e2, directed=True):
        """
        Derive the class name of the entity pair (e1, e2) from the
        interactions that sentenceGraph.getInteractions() reports for it.

        With directed set to False the interactions of the opposite direction
        are appended to the list returned for (e1, e2). The distinct
        interaction types are sorted and joined with "---"; without any
        interaction the result is "neg".
        """
        found = sentenceGraph.getInteractions(e1, e2)
        if not directed:
            found.extend(sentenceGraph.getInteractions(e2, e1))

        typeNames = sorted(set([item.attrib["type"] for item in found]))
        # An empty type name contributes nothing, not even a separator
        label = "---".join([name for name in typeNames if name != ""])
        if label == "":
            return "neg"
        return label
    
    def preProcessExamples(self, allExamples):
        """
        Post-process the complete list of examples and return it.

        With the "normalize" style the list is handed to
        ExampleUtils.normalizeFeatureVectors(). In every case the object that
        was passed in is the one that is returned.
        """
        # Duplicates are not removed here on purpose: they should only be
        # removed from the training set, and that is done in the classifier.
        if not "normalize" in self.styles:
            return allExamples
        print >> sys.stderr, " Normalizing feature vectors"
        ExampleUtils.normalizeFeatureVectors(allExamples)
        return allExamples
    
    def isPotentialGeniaInteraction(self, e1, e2):
        """
        Tell whether e1 may be the source of an interaction: it may, unless
        its "isName" attribute is the string "True". e2 is not examined.
        """
        return e1.get("isName") != "True"
    
    #IF LOCAL
    def getBioInferParentType(self, eType):
        """
        Map a BioInfer type name to one of the groups "Physical", "Property"
        or "Process".

        "Physical_entity" and "Property_entity" themselves, as well as types
        for which OntologyUtils.hasParent() reports them as a parent in
        self.bioinferOntologies, give "Physical" and "Property". Types with
        the parent "Relationship" give "Process".

        Raises AssertionError (carrying eType) for a type that belongs to
        none of the groups.
        """
        if eType == "Physical_entity" or OntologyUtils.hasParent(eType, "Physical_entity", self.bioinferOntologies):
            return "Physical"
        if eType == "Property_entity" or OntologyUtils.hasParent(eType, "Property_entity", self.bioinferOntologies):
            return "Property"
        if OntologyUtils.hasParent(eType, "Relationship", self.bioinferOntologies):
            return "Process"
        # Raised explicitly instead of "assert False": an assert statement is
        # removed when Python runs with -O, and the method would then return
        # None for an unknown type without any error.
        raise AssertionError(eType)
        
#        if self.bioinferOntologies["Entity"].has_key(eType):
#            if OntologyUtils.hasParent(eType, "Physical_entity", self.bioinferOntologies):
#                assert not OntologyUtils.hasParent(eType, "Property_entity", self.bioinferOntologies), eType
#                return "Physical"
#            else:
#                assert OntologyUtils.hasParent(eType, "Property_entity", self.bioinferOntologies), eType
#                return "Property"
#                
#        else:
#            assert self.bioinferOntologies.has_key(eType), eType
#            #assert OntologyUtils.hasParent(eType, "Process_entity", self.bioinferOntologies["Relationship"]), eType
#            return "Process"
    
    def isPotentialBioInferInteraction(self, e1, e2, categoryName):
        """
        Tell whether an interaction from e1 to e2 is possible, judging by the
        groups that getBioInferParentType() gives for the two entity types.

        Possible are: any pair whose e1 is a "Process" or a "Property", and
        the pairs Physical -> Physical and Physical -> Process (the latter is
        marked as a hack in the original code). Every other pair is not
        possible; for such a pair categoryName is asserted to be "neg".
        """
        parent1 = self.getBioInferParentType(e1.get("type"))
        parent2 = self.getBioInferParentType(e2.get("type"))
        if parent1 in ("Process", "Property"):
            return True
        if parent1 == "Physical" and parent2 in ("Physical", "Process"):
            return True
        # A pair that is ruled out must not carry a positive class
        assert(categoryName == "neg"), categoryName + " category for " + parent1 + " and " + parent2
        return False
    #ENDIF
    
    def nxMultiDiGraphToUndirected(self, graph):
        """
        Build an NX10.MultiGraph that has the name, the nodes and the edges
        of the given graph, and return it.

        The edges are taken from graph.edges_iter() called without arguments.
        """
        result = NX10.MultiGraph(name=graph.name)
        result.add_nodes_from(graph)
        result.add_edges_from(graph.edges_iter())
        return result
            
    def buildExamples(self, sentenceGraph):
        """
        Build the examples of one sentence and return them as a list.

        Candidate pairs are pairs of entities with the "entities" style and
        pairs of tokens otherwise; a pair of an item with itself is included.
        Examples are only generated with the "directed" style: each pair is
        then processed twice, first as (I, J) and then as (J, I). A candidate
        can be dropped by negative downsampling ("neg_frac"), by the
        "genia_limits" / "bioinfer_limits" checks, by the POS pair gazetteer
        ("pos_pair") or because its first token is marked in
        sentenceGraph.tokenIsName ("genia_token_limits"). Every candidate is
        reported to self.exampleStats, together with the filter that dropped it.
        """
        examples = []
        exampleIndex = 0
        
        # Passed to the trigger feature builder and to buildExample() in place of sentenceGraph
        clearGraph = sentenceGraph.getCleared()
        
        #undirected = sentenceGraph.getUndirectedDependencyGraph()
        undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
        ##undirected = sentenceGraph.dependencyGraph.to_undirected()
        ###undirected = NX10.MultiGraph(sentenceGraph.dependencyGraph) This didn't work
        # Shortest paths for all node pairs of the undirected dependency graph;
        # buildExample() reads them as paths[token1][token2]
        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
        
        self.triggerFeatureBuilder.initSentence(clearGraph)
        
        # Generate examples based on interactions between entities or interactions between tokens
        if "entities" in self.styles:
            loopRange = len(sentenceGraph.entities)
        else:
            loopRange = len(sentenceGraph.tokens)
        #for i in range(loopRange-1):
        for i in range(loopRange): # allow self-interactions
            #for j in range(i+1,loopRange):
            for j in range(i,loopRange): # allow self-interactions
                # The entities stay None when the pairs are made of tokens
                eI = None
                eJ = None
                if "entities" in self.styles:
                    eI = sentenceGraph.entities[i]
                    eJ = sentenceGraph.entities[j]
                    tI = sentenceGraph.entityHeadTokenByEntity[eI]
                    tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                    #if "no_ne_interactions" in self.styles and eI.get("isName") == "True" and eJ.get("isName") == "True":
                    #    continue
                    # Entities of the type "neg" never take part in an example
                    if eI.get("type") == "neg" or eJ.get("type") == "neg":
                        continue
                else:
                    tI = sentenceGraph.tokens[i]
                    tJ = sentenceGraph.tokens[j]
#                # only consider paths between entities (NOTE! entities, not only named entities)
#                if "headsOnly" in self.styles:
#                    if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0):
#                        continue
                
                # Without the "directed" style nothing is generated: the
                # undirected branch further down is commented out.
                # NOTE(review): for i == j the forward and the reverse pass
                # handle the same pair, so it is processed twice - confirm
                # that this is intended.
                if "directed" in self.styles:
                    # define forward
                    if "entities" in self.styles:
                        categoryName = self.getCategoryName(sentenceGraph, eI, eJ, True)
                    else:
                        categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, True)
                    self.exampleStats.beginExample(categoryName)
                    # Negative downsampling: without negFrac everything is kept;
                    # with it, a "neg" candidate is kept only if the next number
                    # drawn from negRand is below negFrac. A number is drawn
                    # only for "neg" candidates.
                    if self.negFrac == None or categoryName != "neg" or (categoryName == "neg" and self.negRand.random() < self.negFrac):
                        makeExample = True
                        # NOTE(review): eI and eJ are None when the pairs are made
                        # of tokens, and isPotentialGeniaInteraction() calls
                        # e1.get() - "genia_limits" seems to need "entities"; confirm.
                        if ("genia_limits" in self.styles) and not self.isPotentialGeniaInteraction(eI, eJ):
                            makeExample = False
                            self.exampleStats.filter("genia_limits")
                        # NOTE(review): there is no "bioinfer_limits" check in the
                        # forward pass, while the reverse pass has one - confirm
                        # that the difference is intended.
                        # A POS pair whose negative fraction in the gazetteer is 1.0 is dropped
                        if self.posPairGaz.getNegFrac((tI.get("POS"), tJ.get("POS"))) == 1.0:
                            makeExample = False
                            self.exampleStats.filter("pos_pair")
                        if makeExample:
                            # No example whose first token is marked in tokenIsName
                            if not sentenceGraph.tokenIsName[tI]:
                                examples.append( self.buildExample(tI, tJ, paths, clearGraph, categoryName, exampleIndex, eI, eJ) )
                                exampleIndex += 1
                            else:
                                self.exampleStats.filter("genia_token_limits")
                    else:
                        self.exampleStats.filter("neg_frac")
                    self.exampleStats.endExample()
                    
                    # define reverse
                    # Same steps as above with I and J swapped, plus the "bioinfer_limits" check
                    if "entities" in self.styles:
                        categoryName = self.getCategoryName(sentenceGraph, eJ, eI, True)
                    else:
                        categoryName = self.getCategoryNameFromTokens(sentenceGraph, tJ, tI, True)
                    self.exampleStats.beginExample(categoryName)
                    if self.negFrac == None or categoryName != "neg" or (categoryName == "neg" and self.negRand.random() < self.negFrac):
                        makeExample = True
                        if ("genia_limits" in self.styles) and not self.isPotentialGeniaInteraction(eJ, eI):
                            makeExample = False
                            self.exampleStats.filter("genia_limits")
                        if ("bioinfer_limits" in self.styles) and not self.isPotentialBioInferInteraction(eJ, eI, categoryName):
                            makeExample = False
                            self.exampleStats.filter("bioinfer_limits")
                        if self.posPairGaz.getNegFrac((tJ.get("POS"), tI.get("POS"))) == 1.0:
                            makeExample = False
                            self.exampleStats.filter("pos_pair")
                        if makeExample:
                            if not sentenceGraph.tokenIsName[tJ]:
                                examples.append( self.buildExample(tJ, tI, paths, clearGraph, categoryName, exampleIndex, eJ, eI) )
                                exampleIndex += 1
                            else:
                                self.exampleStats.filter("genia_token_limits")
                    else:
                        self.exampleStats.filter("neg_frac")
                    self.exampleStats.endExample()
#                else:
#                    if "entities" in self.styles:
#                        categoryName = self.getCategoryName(sentenceGraph, eI, eJ, False)
#                    else:
#                        categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, False)
#                    forwardExample = self.buildExample(tI, tJ, paths, clearGraph, categoryName, exampleIndex, eI, eJ)
#                    if not "graph_kernel" in self.styles:
#                        reverseExample = self.buildExample(tJ, tI, paths, clearGraph, categoryName, exampleIndex, eJ, eI)
#                        forwardExample[2].update(reverseExample[2])
#                    examples.append(forwardExample)
#                    exampleIndex += 1
        
        return examples
    
    def buildExample(self, token1, token2, paths, sentenceGraph, categoryName, exampleIndex, entity1=None, entity2=None):
        """
        Build a single directed example for the token pair (token1, token2).

        token1, token2 -- head tokens of the candidate pair
        paths -- nested mapping, paths[a][b] is the token path from a to b
        sentenceGraph -- sentence whose tokens and dependencyGraph are used
        categoryName -- class name of the example ("neg" marks a negative)
        exampleIndex -- running number appended to the sentence id
        entity1, entity2 -- entity elements behind the tokens; they are
            dereferenced by the "entity_type" and "genia_limits" styles

        Returns the tuple (exampleId, category, features, extra).
        """
        features = {}
        # A stored path is only used for two distinct tokens that are
        # connected in paths; otherwise the pair itself acts as the path.
        hasPath = token1 != token2 and token1 in paths and token2 in paths[token1]
        if hasPath:
            path = paths[token1][token2]
        else:
            path = [token1, token2]
        assert self.pathLengths is None
        if self.pathLengths is None or len(path) - 1 in self.pathLengths:
            # Trigger features for both ends of the pair, unless switched off
            # with the "no_trigger" style.
            # NOTE(review): assumes the trigger feature builder has been
            # prepared for this sentence by the caller - confirm.
            if "no_trigger" not in self.styles:
                self.triggerFeatureBuilder.setFeatureVector(features)
                self.triggerFeatureBuilder.tag = "trg_t1_"
                self.triggerFeatureBuilder.buildFeatures(token1)
                self.triggerFeatureBuilder.tag = "trg_t2_"
                self.triggerFeatureBuilder.buildFeatures(token2)
            # Edges are only needed by the graph kernel and dependency features
            edges = None
            if hasPath and ("graph_kernel" in self.styles or "no_dependency" not in self.styles):
                edges = self.multiEdgeFeatureBuilder.getEdges(sentenceGraph.dependencyGraph, path)
            if "graph_kernel" in self.styles:
                self.graphKernelFeatureBuilder.setFeatureVector(features, entity1, entity2)
                self.graphKernelFeatureBuilder.buildGraphKernelFeatures(sentenceGraph, path, edges)
                self.graphKernelFeatureBuilder.setFeatureVector(None)
            if "entity_type" in self.styles:
                features[self.featureSet.getId("e1_" + entity1.attrib["type"])] = 1
                features[self.featureSet.getId("e2_" + entity2.attrib["type"])] = 1
                features[self.featureSet.getId("distance_" + str(len(path)))] = 1
            if "no_dependency" not in self.styles:
                if token1 == token2:
                    features[self.featureSet.getId("tokenSelfLoop")] = 1

                self.multiEdgeFeatureBuilder.setFeatureVector(features, entity1, entity2)
                if "disable_entity_features" not in self.styles:
                    self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph)
                self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path)
                if "disable_terminus_features" not in self.styles:
                    self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph)
                if "disable_single_element_features" not in self.styles:
                    self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, edges, sentenceGraph)
                if "disable_ngram_features" not in self.styles:
                    for gramLength in (2, 3, 4):
                        self.multiEdgeFeatureBuilder.buildPathGrams(gramLength, path, edges, sentenceGraph)
                if "disable_path_edge_features" not in self.styles:
                    self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, edges, sentenceGraph)
                self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph)
                self.multiEdgeFeatureBuilder.setFeatureVector(None)
            if "nodalida" in self.styles:
                self.nodalidaFeatureBuilder.setFeatureVector(features, entity1, entity2)
                shortestPaths = self.nodalidaFeatureBuilder.buildShortestPaths(sentenceGraph.dependencyGraph, path)
                if len(shortestPaths) > 0:
                    self.nodalidaFeatureBuilder.buildNGrams(shortestPaths, sentenceGraph)
                self.nodalidaFeatureBuilder.setFeatureVector(None)
            if "no_linear" not in self.styles:
                self.tokenFeatureBuilder.setFeatureVector(features)
                for index, token in enumerate(sentenceGraph.tokens):
                    if token == token1:
                        token1Index = index
                    if token == token2:
                        token2Index = index
                # Linear features are always built in sentence order
                if token1Index > token2Index:
                    token1Index, token2Index = token2Index, token1Index
                self.tokenFeatureBuilder.buildLinearOrderFeatures(token1Index, sentenceGraph, 2, 2, preTag="linTok1")
                self.tokenFeatureBuilder.buildLinearOrderFeatures(token2Index, sentenceGraph, 2, 2, preTag="linTok2")
                self.tokenFeatureBuilder.setFeatureVector(None)
            if "random" in self.styles:
                self.randomFeatureBuilder.setFeatureVector(features)
                self.randomFeatureBuilder.buildRandomFeatures(100, 0.01)
                self.randomFeatureBuilder.setFeatureVector(None)
            if "genia_limits" in self.styles:
                e1Type = entity1.get("type")
                assert entity1.get("isName") == "False"
                targetIsName = entity2.get("isName") == "True"
                if targetIsName:
                    features[self.featureSet.getId("GENIA_target_protein")] = 1
                else:
                    features[self.featureSet.getId("GENIA_nested_event")] = 1
                if e1Type.find("egulation") != -1: # leave r out to avoid problems with capitalization
                    if targetIsName:
                        features[self.featureSet.getId("GENIA_regulation_of_protein")] = 1
                    else:
                        features[self.featureSet.getId("GENIA_regulation_of_event")] = 1
        else:
            features[self.featureSet.getId("always_negative")] = 1
            if "subset" in self.styles:
                features[self.featureSet.getId("out_of_scope")] = 1

        # Leave the shared trigger feature builder in a neutral state
        self.triggerFeatureBuilder.tag = ""
        self.triggerFeatureBuilder.setFeatureVector(None)

        # define extra attributes
        extra = {"xtype":"asym","type":"i","t1":token1.get("id"),"t2":token2.get("id")}
        if entity1 is not None:
            extra["e1"] = entity1.get("id")
        if entity2 is not None:
            extra["e2"] = entity2.get("id")
        extra["categoryName"] = categoryName
        sentenceOrigId = sentenceGraph.sentenceElement.get("origId")
        if sentenceOrigId is not None:
            extra["SOID"] = sentenceOrigId
        # make example
        if "binary" in self.styles:
            if categoryName != "neg":
                category = 1
            else:
                category = -1
        else:
            category = self.classSet.getId(categoryName)

        return (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)
예제 #9
0
class UnmergedEdgeExampleBuilder(ExampleBuilder):
    def __init__(self,
                 style=None,
                 length=None,
                 types=None,
                 featureSet=None,
                 classSet=None):
        """
        Set up the feature builders used for building examples.

        style -- list of style names; defaults to
            ["typed", "directed", "headsOnly"]
        length -- allowed path lengths; only None is supported
        types -- list stored in self.types; defaults to an empty list
        featureSet -- IdSet for feature names, created when not given
        classSet -- IdSet for class names, created when not given; the
            class "neg" must have id 1
        """
        # Fresh lists per instance, so instances never share one mutable
        # default object through self.styles / self.types
        if style is None:
            style = ["typed", "directed", "headsOnly"]
        if types is None:
            types = []
        if featureSet is None:
            featureSet = IdSet()
        if classSet is None:
            classSet = IdSet(1)
        # Looked up outside the assert so the call also happens under
        # "python -O" (presumably getId registers unknown names - confirm
        # against IdSet)
        negId = classSet.getId("neg")
        assert negId == 1

        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        self.styles = style

        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        if "noAnnType" in self.styles:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True
        self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        self.pathLengths = length
        assert self.pathLengths is None
        self.types = types
        if "random" in self.styles:
            # Imported lazily: only needed for the "random" style
            from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
            self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)

        #self.outFile = open("exampleTempFile.txt","wt")

    @classmethod
    def run(cls, input, output, parse, tokenization, style, idFileTag=None):
        """
        Build examples for input and write them to output without the
        caller having to create the builder object.

        A style of None leaves the builder on its default styles. The ids
        of the class set are printed once the examples have been built.
        """
        classSet, featureSet = cls.getIdSets(idFileTag)
        builderArguments = {"classSet": classSet, "featureSet": featureSet}
        if style != None:
            builderArguments["style"] = style
        builder = UnmergedEdgeExampleBuilder(**builderArguments)
        sentences = cls.getSentences(input, parse, tokenization)
        builder.buildExamplesForSentences(sentences, output, idFileTag)
        print(builder.classSet.Ids)

    def definePredictedValueRange(self, sentences, elementName):
        """
        Forward sentences and elementName to definePredictedValueRange of
        the MultiEdgeFeatureBuilder.
        """
        self.multiEdgeFeatureBuilder.definePredictedValueRange(
            sentences, elementName)

    def getPredictedValueRange(self):
        """
        Return the predictedRange attribute of the MultiEdgeFeatureBuilder.
        """
        return self.multiEdgeFeatureBuilder.predictedRange

    def filterEdgesByType(self, edges, typesToInclude):
        """
        Return the edges whose "type" attribute is in typesToInclude.

        An empty typesToInclude disables filtering: the edges argument is
        then returned as it is, not copied.
        """
        if not len(typesToInclude):
            return edges
        return [candidate for candidate in edges
                if candidate.get("type") in typesToInclude]

    def getCategoryName(self, sentenceGraph, e1, e2, directed=True):
        """
        Return the class name for the pair of entity slots e1 and e2.

        e1, e2 -- slot tuples; index 0 is the entity, index 2 the dummy flag
        directed -- when False, interactions from e2 to e1 count as well

        The types of all interactions between the two entities are sorted
        and joined with "---". Pairs without interactions are "neg".
        """
        # A dummy slot stands for a potential entity that does not exist in
        # the training data, so a pair with a dummy on either side is
        # always a negative
        if e1[2] or e2[2]:
            return "neg"

        govEntity, depEntity = e1[0], e2[0]
        found = sentenceGraph.getInteractions(govEntity, depEntity)
        if not directed:
            found.extend(sentenceGraph.getInteractions(depEntity, govEntity))

        typeNames = sorted(set([item.attrib["type"] for item in found]))
        # Empty names contribute neither text nor a separator
        merged = "---".join([name for name in typeNames if name != ""])
        if merged == "":
            return "neg"
        return merged

    def preProcessExamples(self, allExamples):
        """
        Normalize the feature vectors of allExamples when the "normalize"
        style is set. The same allExamples object is returned either way.
        """
        if "normalize" not in self.styles:
            return allExamples
        print >> sys.stderr, " Normalizing feature vectors"
        ExampleUtils.normalizeFeatureVectors(allExamples)
        return allExamples

    def isPotentialGeniaInteraction(self, e1, e2):
        """
        Return False when e1 has isName == "True", otherwise True.

        e2 is accepted but does not affect the result.
        """
        return e1.get("isName") != "True"

    def nxMultiDiGraphToUndirected(self, graph):
        """
        Copy the nodes and edges of graph into a new NX10.MultiGraph.

        graph -- source graph; its name, nodes and edges_iter() are read

        Returns the new MultiGraph, which gets the same name as graph.
        """
        undirected = NX10.MultiGraph(name=graph.name)
        undirected.add_nodes_from(graph)
        # NOTE(review): edges_iter() is called without arguments, so it
        # presumably yields plain (u, v) pairs and edge keys/data are not
        # carried over - confirm against the NX10 API
        undirected.add_edges_from(graph.edges_iter())
        return undirected

    def getInteractionEdgeLengths(self, sentenceGraph, paths):
        """
        Return dependency and linear length of all interaction edges
        (measured between the two tokens).

        The result maps each interaction to the tuple
        (pathLength, linLength). pathLength is the number of tokens on the
        stored path between the two head tokens, or 999999 when the tokens
        are the same or no path is stored. linLength is the absolute
        difference of the token positions in the sentence.
        """
        noPathLength = 999999  # more than any real path
        lengthsByInteraction = {}
        for interaction in sentenceGraph.interactions:
            govEntity = sentenceGraph.entitiesById[interaction.get("e1")]
            depEntity = sentenceGraph.entitiesById[interaction.get("e2")]
            govToken = sentenceGraph.entityHeadTokenByEntity[govEntity]
            depToken = sentenceGraph.entityHeadTokenByEntity[depEntity]
            # Dependency distance
            depLength = noPathLength
            if govToken != depToken and govToken in paths and depToken in paths[govToken]:
                depLength = len(paths[govToken][depToken])
            # Linear distance; scanning stops once both tokens are located
            govPos = -1
            depPos = -1
            for position, token in enumerate(sentenceGraph.tokens):
                if token == govToken:
                    govPos = position
                    if depPos != -1:
                        break
                if token == depToken:
                    depPos = position
                    if govPos != -1:
                        break
            lengthsByInteraction[interaction] = (depLength, abs(govPos - depPos))
        return lengthsByInteraction

    def getPrecedenceLevels(self, sentenceGraph, paths):
        """
        Get overlapping entity precedence

        Entities that share a head token and a type are ordered with
        compareEntityPrecedence and numbered 0, 1, 2, ... in that order.
        Entities with isName == "True" always get level 0; entities of
        type "neg" get no level at all.

        sentenceGraph -- sentence providing entities, interactions, tokens
            and tokenIsEntityHead
        paths -- nested token path mapping, handed on to
            getInteractionEdgeLengths

        Returns a dictionary mapping each entity to its level.
        """
        interactionLengths = self.getInteractionEdgeLengths(sentenceGraph, paths)

        # Precedence values per entity:
        # (num args, sum of dep lengths, sum of lin lengths, entity)
        entityPrecedenceValues = {}
        for entity in sentenceGraph.entities:
            eId = entity.get("id")
            numArgs = 0  # Interactions that have this entity as their e1
            argDepDist = 0  # Sum of lengths of shortest paths
            argLinDist = 0  # Sum of linear distances
            for interaction in sentenceGraph.interactions:
                if interaction.get("e1") == eId:
                    numArgs += 1
                    argDepDist += interactionLengths[interaction][0]
                    argLinDist += interactionLengths[interaction][1]
            entityPrecedenceValues[entity] = (numArgs, argDepDist, argLinDist, entity)

        # Determine level (slot number) of entity from precedence values.
        # There is one slot group per token, per type
        levelByEntity = {}
        for token in sentenceGraph.tokens:
            entitiesByType = {}
            for entity in sentenceGraph.tokenIsEntityHead[token]:
                if entity.get("isName") == "True":  # Names can never have duplicates
                    assert entity not in levelByEntity
                    levelByEntity[entity] = 0
                    continue
                eType = entity.get("type")
                if eType == "neg":
                    continue
                entitiesByType.setdefault(eType, []).append(entity)
            for eType in sorted(entitiesByType.keys()):
                # Slot ordering by precedence
                sortedEntities = [entityPrecedenceValues[entity]
                                  for entity in entitiesByType[eType]]
                sortedEntities.sort(compareEntityPrecedence)
                for level, precedenceTuple in enumerate(sortedEntities):
                    entity = precedenceTuple[3]
                    assert entity not in levelByEntity
                    levelByEntity[entity] = level
        return levelByEntity

    def buildExamples(self, sentenceGraph):
        """
        Build a forward and a reverse example for every pair of entity
        slots in the sentence.

        Named entities (isName == "True") each form one slot. For the other
        entities, every (head token, type) combination gets four slots:
        existing entities are placed by their precedence level and the
        unfilled slots become dummies based on the first entity of that
        type. Entities of type "neg" are left out.

        Returns the list of examples.
        """
        examples = []
        exampleIndex = 0

        undirected = self.nxMultiDiGraphToUndirected(
            sentenceGraph.dependencyGraph)
        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)

        # Determine overlapping entity precedence
        levelByEntity = self.getPrecedenceLevels(sentenceGraph, paths)

        # Collect the slots as tuples (entity, level, isDummy)
        entities = []
        for token in sentenceGraph.tokens:
            entitiesByType = {}
            for entity in sentenceGraph.tokenIsEntityHead[token]:
                if entity.get("isName") == "True":
                    # Names can never have duplicates
                    entities.append((entity, 0, False))
                    continue
                eType = entity.get("type")
                if eType == "neg":
                    continue
                entitiesByType.setdefault(eType, []).append(entity)
            for eType in sorted(entitiesByType.keys()):
                members = entitiesByType[eType]
                # The first entity of a type stands in for unfilled slots
                dummyEntity = members[0]
                slots = [None, None, None, None]
                for entity in members:
                    if entity in levelByEntity:
                        level = levelByEntity[entity]
                        if level < len(slots):
                            slots[level] = (entity, level, False)
                for slotIndex, slot in enumerate(slots):
                    if slot == None:
                        slot = (dummyEntity, slotIndex, True)
                    entities.append(slot)

        # Generate examples based on interactions between entities
        headTokens = sentenceGraph.entityHeadTokenByEntity
        for i in range(len(entities) - 1):
            for j in range(i + 1, len(entities)):
                slotI = entities[i]
                slotJ = entities[j]
                tokenI = headTokens[slotI[0]]
                tokenJ = headTokens[slotJ[0]]
                # Forward direction first, then the reverse one
                directions = ((slotI, slotJ, tokenI, tokenJ),
                              (slotJ, slotI, tokenJ, tokenI))
                for govSlot, depSlot, govToken, depToken in directions:
                    categoryName = self.getCategoryName(
                        sentenceGraph, govSlot, depSlot, True)
                    if "genia_limits" not in self.styles or \
                            self.isPotentialGeniaInteraction(govSlot[0], depSlot[0]):
                        examples.append(
                            self.buildExample(govToken, depToken, paths,
                                              sentenceGraph, categoryName,
                                              exampleIndex, govSlot, depSlot))
                        exampleIndex += 1

        return examples

    def buildExample(self,
                     token1,
                     token2,
                     paths,
                     sentenceGraph,
                     categoryName,
                     exampleIndex,
                     e1=None,
                     e2=None):
        """
        Build one directed example for a pair of entity slots.

        token1, token2 -- head tokens of the governor and dependent slot
        paths -- nested mapping, paths[a][b] is the token path from a to b
        sentenceGraph -- sentence whose tokens and dependencyGraph are used
        categoryName -- class name of the example ("neg" marks a negative)
        exampleIndex -- running number appended to the sentence id
        e1, e2 -- slot tuples (entity, level, isDummy). Both are indexed
            unconditionally, so despite the None defaults they must
            always be given.

        Returns the tuple (exampleId, category, features, extra).
        """
        entity1 = e1[0]
        entity2 = e2[0]
        # define features
        features = {}
        features[self.featureSet.getId("gov_level")] = e1[1]
        features[self.featureSet.getId("gov_level_" + str(e1[1]))] = 1
        features[self.featureSet.getId("dep_level")] = e2[1]
        features[self.featureSet.getId("dep_level_" + str(e2[1]))] = 1
        features[self.featureSet.getId("level_pair_" + str(e1[1]) + "_" +
                                       str(e2[1]))] = 1
        # A stored path is only used for two distinct tokens that are
        # connected in paths; otherwise the pair itself acts as the path.
        hasPath = token1 != token2 and token1 in paths and token2 in paths[token1]
        if hasPath:
            path = paths[token1][token2]
        else:
            path = [token1, token2]
        assert self.pathLengths is None
        if self.pathLengths is None or len(path) - 1 in self.pathLengths:
            buildDependencyFeatures = "no_dependency" not in self.styles
            edges = None
            if buildDependencyFeatures and hasPath:
                edges = self.multiEdgeFeatureBuilder.getEdges(
                    sentenceGraph.dependencyGraph, path)
            if "entity_type" in self.styles:
                features[self.featureSet.getId("e1_" +
                                               entity1.attrib["type"])] = 1
                features[self.featureSet.getId("e2_" +
                                               entity2.attrib["type"])] = 1
                features[self.featureSet.getId("distance_" +
                                               str(len(path)))] = 1
            if buildDependencyFeatures:
                self.multiEdgeFeatureBuilder.setFeatureVector(
                    features, entity1, entity2)
                if "disable_entity_features" not in self.styles:
                    self.multiEdgeFeatureBuilder.buildEntityFeatures(
                        sentenceGraph)
                self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path)
                if "disable_terminus_features" not in self.styles:
                    self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(
                        path, sentenceGraph)
                if "disable_single_element_features" not in self.styles:
                    self.multiEdgeFeatureBuilder.buildSingleElementFeatures(
                        path, edges, sentenceGraph)
                if "disable_ngram_features" not in self.styles:
                    for gramLength in (2, 3, 4):
                        self.multiEdgeFeatureBuilder.buildPathGrams(
                            gramLength, path, edges, sentenceGraph)
                if "disable_path_edge_features" not in self.styles:
                    self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(
                        path, edges, sentenceGraph)
                self.multiEdgeFeatureBuilder.buildSentenceFeatures(
                    sentenceGraph)
                self.multiEdgeFeatureBuilder.setFeatureVector(None)
            if "no_linear" not in self.styles:
                self.tokenFeatureBuilder.setFeatureVector(features)
                for index, token in enumerate(sentenceGraph.tokens):
                    if token == token1:
                        token1Index = index
                    if token == token2:
                        token2Index = index
                # Linear features are always built in sentence order
                if token1Index > token2Index:
                    token1Index, token2Index = token2Index, token1Index
                self.tokenFeatureBuilder.buildLinearOrderFeatures(
                    token1Index, sentenceGraph, 2, 2, preTag="linTok1")
                self.tokenFeatureBuilder.buildLinearOrderFeatures(
                    token2Index, sentenceGraph, 2, 2, preTag="linTok2")
                self.tokenFeatureBuilder.setFeatureVector(None)
            if "random" in self.styles:
                self.randomFeatureBuilder.setFeatureVector(features)
                self.randomFeatureBuilder.buildRandomFeatures(100, 0.01)
                self.randomFeatureBuilder.setFeatureVector(None)
            if "genia_limits" in self.styles:
                e1Type = entity1.get("type")
                assert entity1.get("isName") == "False"
                targetIsName = entity2.get("isName") == "True"
                if targetIsName:
                    features[self.featureSet.getId(
                        "GENIA_target_protein")] = 1
                else:
                    features[self.featureSet.getId(
                        "GENIA_nested_event")] = 1
                # leave r out to avoid problems with capitalization
                if e1Type.find("egulation") != -1:
                    if targetIsName:
                        features[self.featureSet.getId(
                            "GENIA_regulation_of_protein")] = 1
                    else:
                        features[self.featureSet.getId(
                            "GENIA_regulation_of_event")] = 1
        else:
            features[self.featureSet.getId("always_negative")] = 1
            if "subset" in self.styles:
                features[self.featureSet.getId("out_of_scope")] = 1
        # define extra attributes; t1 is always the path end whose token
        # id has the smaller trailing number, deprev tells if that meant
        # swapping the two ends
        firstNumber = int(path[0].attrib["id"].split("_")[-1])
        lastNumber = int(path[-1].attrib["id"].split("_")[-1])
        if firstNumber < lastNumber:
            lowToken, highToken, reversedPath = path[0], path[-1], False
        else:
            lowToken, highToken, reversedPath = path[-1], path[0], True
        extra = {
            "xtype": "ue",
            "type": "i",
            "t1": lowToken.get("id"),
            "t2": highToken.get("id"),
            "deprev": reversedPath
        }
        if entity1 is not None:
            extra["e1"] = entity1.get("id")
            extra["l1"] = str(e1[1])
            # is a dummy node (an entity not in existing triggers)
            extra["d1"] = str(e1[2])[0]
        if entity2 is not None:
            extra["e2"] = entity2.get("id")
            extra["l2"] = str(e2[1])
            # is a dummy node (an entity not in existing triggers)
            extra["d2"] = str(e2[2])[0]
        extra["categoryName"] = categoryName
        sentenceOrigId = sentenceGraph.sentenceElement.get("origId")
        if sentenceOrigId is not None:
            extra["SOID"] = sentenceOrigId
        # make example
        if "binary" in self.styles:
            if categoryName != "neg":
                category = 1
            else:
                category = -1
        else:
            category = self.classSet.getId(categoryName)

        return (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex),
                category, features, extra)
class EntityRelationExampleBuilder(ExampleBuilder):
    """
    BioNLP'11 REL subtask examples.

    One example is built for every pair of a named entity (an entity whose
    "isName" attribute is "True") and a candidate token (a token whose POS
    tag is one of CD, JJ, NN, NNS or RB).
    """
    def __init__(self, style=None, featureSet=None, classSet=None):
        """
        style: collection of style flags, tested with "in"; defaults to
               ["typed", "directed", "headsOnly"] when None
        featureSet: IdSet for feature names (a new one is created if None)
        classSet: IdSet for class names (a new one is created if None);
                  "neg" must map to id 1
        """
        if style is None:
            # A fresh list per instance instead of a shared mutable default
            style = ["typed", "directed", "headsOnly"]
        if featureSet is None:
            featureSet = IdSet()
        if classSet is None:
            classSet = IdSet(1)
        assert classSet.getId("neg") == 1

        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        self.styles = style

        # These three settings are applied unconditionally, the style
        # flags (noAnnType, noMasking, maxFeatures) are not consulted here.
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        self.multiEdgeFeatureBuilder.noAnnType = True
        self.multiEdgeFeatureBuilder.maskNamedEntities = False
        self.multiEdgeFeatureBuilder.maximum = True
        # Created on first use by buildExample (only needed when the
        # "no_linear" style is not set).
        self.tokenFeatureBuilder = None
        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
        self.triggerFeatureBuilder.useNonNameEntities = False

    @classmethod
    def run(cls, input, output, parse, tokenization, style, idFileTag=None):
        """
        An interface for running the example builder without needing to create a class.
        A style of None selects the builder's default style.
        """
        classSet, featureSet = cls.getIdSets(idFileTag)
        e = EntityRelationExampleBuilder(style=style, classSet=classSet, featureSet=featureSet)
        sentences = cls.getSentences(input, parse, tokenization)
        e.buildExamplesForSentences(sentences, output, idFileTag)

    def getCategoryNameFromTokens(self, sentenceGraph, t1, t2, directed=True):
        """
        Example class. Multiple overlapping edges create a merged type.

        The distinct "type" values of the interaction edges from t1 to t2
        (and from t2 to t1 if directed is False) are sorted and joined
        with "---". Returns "neg" when the resulting name is empty.
        """
        intEdges = sentenceGraph.interactionGraph.getEdges(t1, t2)
        if not directed:
            intEdges = intEdges + sentenceGraph.interactionGraph.getEdges(t2, t1)
        types = sorted(set(intEdge[2].get("type") for intEdge in intEdges))
        # An empty type string contributes nothing to the merged name
        categoryName = "---".join(name for name in types if name != "")
        if categoryName != "":
            return categoryName
        return "neg"

    def buildExamples(self, sentenceGraph):
        """
        Build examples for a single sentence. Returns a list of examples.
        See Core/ExampleUtils for example format.
        """
        examples = []
        exampleIndex = 0

        if "trigger_features" in self.styles:
            self.triggerFeatureBuilder.initSentence(sentenceGraph)

        undirectedDepGraph = sentenceGraph.dependencyGraph.toUndirected()

        namedEntities = [entity for entity in sentenceGraph.entities
                         if entity.get("isName") == "True"]

        # Indices of the tokens that may be the target of a relation
        candidateIndices = [i for i, token in enumerate(sentenceGraph.tokens)
                            if token.get("POS") in ["CD", "JJ", "NN", "NNS", "RB"]]

        for namedEntity in namedEntities:
            for i in candidateIndices:
                namedEntityToken = sentenceGraph.entityHeadTokenByEntity[namedEntity]
                token = sentenceGraph.tokens[i]
                categoryName = self.getCategoryNameFromTokens(sentenceGraph, namedEntityToken, token, True)
                # The example must be built for the entity of this iteration
                # (not for a variable left over from the filtering above).
                examples.append(self.buildExample(namedEntity, i, undirectedDepGraph, sentenceGraph, categoryName, exampleIndex))
                exampleIndex += 1

        return examples

    def buildExample(self, namedEntity, tokenIndex, undirectedDepGraph, sentenceGraph, categoryName, exampleIndex):
        """
        Build a single directed example for the potential edge between the
        head token of namedEntity and the token at tokenIndex.

        Returns a tuple (example id, class id, feature dict, extra dict).
        """
        namedEntityToken = sentenceGraph.entityHeadTokenByEntity[namedEntity]
        token = sentenceGraph.tokens[tokenIndex]
        # define features
        features = {}
        paths = undirectedDepGraph.getPaths(namedEntityToken, token)
        if len(paths) > 0:
            path = paths[0]
        else:
            # no dependency path: fall back to the two end tokens
            path = [namedEntityToken, token]
        if "trigger_features" in self.styles:
            self.triggerFeatureBuilder.setFeatureVector(features)
            self.triggerFeatureBuilder.tag = "trg1_"
            self.triggerFeatureBuilder.buildFeatures(namedEntityToken)
            self.triggerFeatureBuilder.tag = "trg2_"
            self.triggerFeatureBuilder.buildFeatures(token)
            self.triggerFeatureBuilder.setFeatureVector(None)
        if "no_dependency" not in self.styles:
            edgeBuilder = self.multiEdgeFeatureBuilder
            edgeBuilder.setFeatureVector(features, None, None)
            if "disable_entity_features" not in self.styles:
                edgeBuilder.buildEntityFeatures(sentenceGraph)
            edgeBuilder.buildPathLengthFeatures(path)
            if "disable_terminus_features" not in self.styles:
                edgeBuilder.buildTerminusTokenFeatures(path, sentenceGraph)
            if "disable_single_element_features" not in self.styles:
                edgeBuilder.buildSingleElementFeatures(path, sentenceGraph)
            if "disable_ngram_features" not in self.styles:
                edgeBuilder.buildPathGrams(2, path, sentenceGraph)
                edgeBuilder.buildPathGrams(3, path, sentenceGraph)
                edgeBuilder.buildPathGrams(4, path, sentenceGraph)
            if "disable_path_edge_features" not in self.styles:
                edgeBuilder.buildPathEdgeFeatures(path, sentenceGraph)
            edgeBuilder.buildSentenceFeatures(sentenceGraph)
            edgeBuilder.setFeatureVector(None)
        if "no_linear" not in self.styles:
            if self.tokenFeatureBuilder is None:
                self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
            self.tokenFeatureBuilder.setFeatureVector(features)
            # Sentence positions of the two end tokens
            token1Index = None
            for i, sentenceToken in enumerate(sentenceGraph.tokens):
                if sentenceToken == namedEntityToken:
                    token1Index = i
                    break
            assert token1Index is not None, namedEntityToken.get("id")
            token2Index = tokenIndex
            # "linTok1" always describes the token that comes first in the sentence
            if token1Index > token2Index:
                token1Index, token2Index = token2Index, token1Index
            self.tokenFeatureBuilder.buildLinearOrderFeatures(token1Index, sentenceGraph, 2, 2, preTag="linTok1")
            self.tokenFeatureBuilder.buildLinearOrderFeatures(token2Index, sentenceGraph, 2, 2, preTag="linTok2")
            self.tokenFeatureBuilder.setFeatureVector(None)
        # define extra attributes
        extra = {"xtype": "entRel", "type": "i", "t1": namedEntityToken.get("id"), "t2": token.get("id")}
        extra["e1"] = namedEntity.get("id")
        # list gold entities in extra, if present
        e2s = set()
        for entity in sentenceGraph.tokenIsEntityHead[token]:
            e2s.add(entity.get("id"))
        if len(e2s) != 0:
            extra["e2"] = ",".join(sorted(e2s))
        else:
            extra["e2"] = "None"
        extra["categoryName"] = categoryName
        sentenceOrigId = sentenceGraph.sentenceElement.get("origId")
        if sentenceOrigId is not None:
            extra["SOID"] = sentenceOrigId
        # make example
        category = self.classSet.getId(categoryName)

        return (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra)
예제 #11
0
class UnmergingExampleBuilder(ExampleBuilder):
    """
    This example builder makes unmerging examples, i.e. examples describing
    potential events.
    """
    def __init__(
            self,
            style="trigger_features:typed:directed:no_linear:entities:genia_limits:noMasking:maxFeatures",
            length=None,
            types=None,
            featureSet=None,
            classSet=None):
        """
        style: accepted for interface compatibility, but the value is
               ignored: the style string is always reset below
        length: must be None (asserted)
        types: stored as self.types; a new empty list is used when None
        featureSet: IdSet for feature names (a new one is created if None)
        classSet: IdSet for class names (a new one is created if None);
                  "neg" must map to id 1
        """
        # reset style regardless of input
        style = "trigger_features:typed:directed:no_linear:entities:genia_limits:noMasking:maxFeatures"
        if featureSet is None:
            featureSet = IdSet()
        if classSet is None:
            classSet = IdSet(1)
        assert classSet.getId("neg") == 1

        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)

        self.styles = self.getParameters(style, [
            "trigger_features", "typed", "directed", "no_linear", "entities",
            "genia_limits", "noAnnType", "noMasking", "maxFeatures",
            "no_merge", "disable_entity_features",
            "disable_single_element_features", "disable_ngram_features",
            "disable_path_edge_features"
        ])
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        self.multiEdgeFeatureBuilder.noAnnType = self.styles["noAnnType"]
        self.multiEdgeFeatureBuilder.maskNamedEntities = not self.styles[
            "noMasking"]
        self.multiEdgeFeatureBuilder.maximum = self.styles["maxFeatures"]
        self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        self.pathLengths = length
        assert self.pathLengths is None
        # A fresh list per instance instead of a shared mutable default
        self.types = [] if types is None else types

        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
        self.triggerFeatureBuilder.useNonNameEntities = True

    def getInteractionEdgeLengths(self, sentenceGraph, paths):
        """
        Measure every interaction edge of the sentence between the head
        tokens of its two entities.

        Returns a dictionary that maps each interaction to the tuple
        (interaction, dependency path length, linear distance, position
        of the e2 head token). The dependency length is 999999 when both
        entities share a head token or when no path is found. A token
        that is not found in the sentence has position -1.
        """
        lengthsByEdge = {}
        for edge in sentenceGraph.interactions:
            sourceEntity = sentenceGraph.entitiesById[edge.get("e1")]
            targetEntity = sentenceGraph.entitiesById[edge.get("e2")]
            sourceToken = sentenceGraph.entityHeadTokenByEntity[sourceEntity]
            targetToken = sentenceGraph.entityHeadTokenByEntity[targetEntity]

            # Dependency distance: the shortest of the available paths
            depLength = 999999  # more than any real path
            if sourceToken != targetToken:
                candidates = paths.getPaths(sourceToken, targetToken)
                if len(candidates) > 0:
                    depLength = min(len(candidate) for candidate in candidates)

            # Linear distance: scan the sentence until both tokens are seen
            sourcePos = -1
            targetPos = -1
            for position, sentenceToken in enumerate(sentenceGraph.tokens):
                if sentenceToken == sourceToken:
                    sourcePos = position
                    if targetPos != -1:
                        break
                if sentenceToken == targetToken:
                    targetPos = position
                    if sourcePos != -1:
                        break

            lengthsByEdge[edge] = (edge, depLength,
                                   abs(sourcePos - targetPos), targetPos)
        return lengthsByEdge

    def eventIsGold(self, entity, arguments, sentenceGraph, goldGraph,
                    goldEntitiesByOffset):
        """
        Return True if the event formed by entity and arguments matches
        an event of the gold graph, False otherwise.

        entity: the event's trigger entity (read: "headOffset", "type")
        arguments: the interactions that form the event's arguments
        sentenceGraph: graph whose entitiesById resolves the arguments' "e2"
        goldGraph: graph providing the gold interactions and entitiesById
        goldEntitiesByOffset: dictionary from head offset to a list of
                              gold entities

        A gold entity at the same head offset matches when
        - its type equals the type of entity,
        - it has as many outgoing interactions as there are arguments,
        - the number of interactions per type is the same, and
        - every argument has a gold interaction of the same type whose
          target entity has the same head offset and type.
        """
        offset = entity.get("headOffset")
        if offset not in goldEntitiesByOffset:
            return False
        eType = entity.get("type")

        # count number of edges per type (the same for every gold candidate)
        argTypeCounts = {}
        for argument in arguments:
            argType = argument.get("type")
            argTypeCounts[argType] = argTypeCounts.get(argType, 0) + 1

        # Check all gold entities for a match
        for goldEntity in goldEntitiesByOffset[offset]:
            # The entity type must match
            if goldEntity.get("type") != eType:
                continue
            goldEntityId = goldEntity.get("id")

            # Collect the gold interactions
            goldInteractions = [
                goldInteraction for goldInteraction in goldGraph.interactions
                if goldInteraction.get("e1") == goldEntityId
            ]

            # Argument count rules
            if len(goldInteractions) != len(arguments):
                continue  # total number of edges differs
            goldTypeCounts = {}
            for goldInteraction in goldInteractions:
                goldType = goldInteraction.get("type")
                goldTypeCounts[goldType] = goldTypeCounts.get(goldType, 0) + 1
            if argTypeCounts != goldTypeCounts:
                continue  # argument edge counts per type must match

            # Exact argument matching
            for argument in arguments:  # check all edges
                e2Entity = sentenceGraph.entitiesById[argument.get("e2")]
                e2Offset = e2Entity.get("headOffset")
                e2Type = e2Entity.get("type")
                argType = argument.get("type")

                found = False
                for goldInteraction in goldInteractions:
                    if goldInteraction.get("type") != argType:
                        continue
                    goldE2Entity = goldGraph.entitiesById[
                        goldInteraction.get("e2")]
                    if (goldE2Entity.get("headOffset") == e2Offset
                            and goldE2Entity.get("type") == e2Type):
                        found = True
                        break
                if not found:  # this edge did not have a corresponding gold edge
                    break
            else:
                # Every argument was matched: the event is in gold
                return True

        # No gold entity matched (also covers an empty candidate list)
        return False

    def getArgumentCombinations(self, eType, interactions, entityId=None):
        """
        List the argument combinations to turn into examples for an event
        of type eType whose outgoing edges are interactions.

        - "Binding": every subset of the Theme edges with 1 to 10 members
          (other edge types are skipped), as produced by combinations().
        - "Process" (ID-task): the empty combination, followed by one
          single-edge combination per Participant edge.
        - any other type: the concatenated results of combine.combine()
          for themes paired with the argument groups the event type may
          have, followed by each theme alone.

        entityId is not used.
        """
        if eType == "Binding":
            # Only all-together/all-separate cases would not be enough:
            # even gold data has overlapping bindings with different
            # numbers of arguments. Causes are skipped.
            bindingThemes = [
                edge for edge in interactions if edge.get("type") == "Theme"
            ]
            themeSubsets = []
            # a2-normalize.pl allows at most 6 themes and training+devel
            # data has at most four; subset sizes are capped at 10 here
            largestSubset = min(len(bindingThemes), 10)
            for subsetSize in range(1, largestSubset + 1):
                themeSubsets.extend(combinations(bindingThemes, subsetSize))
            return themeSubsets

        if eType == "Process":  # For ID-task
            # a process can have 0 interactions
            return [[]] + [[edge] for edge in interactions
                           if edge.get("type") == "Participant"]

        # one of the regulation-types, or one of the simple types
        edgesByType = {
            "Theme": [],
            "Cause": [],
            "SiteArg": [],
            "Contextgene": [],
            "Sidechain": []
        }
        for edge in interactions:
            edgeType = edge.get("type")
            if edgeType in edgesByType:  # all other edge types are ignored
                edgesByType[edgeType].append(edge)
        themes = edgesByType["Theme"]
        # AtLoc/ToLoc edges are among the ignored types, so no location
        # targets are ever collected
        locTargets = []

        # Limit arguments to event types that can have them
        if eType.find("egulation") != -1 or eType == "Catalysis":
            causes = edgesByType["Cause"]
        else:
            causes = []
        if eType == "Glycosylation":
            sideChains = edgesByType["Sidechain"]
        else:
            sideChains = []
        if eType in ["Acetylation", "Methylation"]:
            contextGenes = edgesByType["Contextgene"]
        else:
            contextGenes = []
        if eType == "Catalysis":
            siteArgs = []
        else:
            siteArgs = edgesByType["SiteArg"]

        # Themes can always appear alone
        themeAloneCombinations = [[theme] for theme in themes]
        return combine.combine(themes, causes) \
               + combine.combine(themes, siteArgs) \
               + combine.combine(themes, sideChains) \
               + combine.combine(themes, contextGenes) \
               + combine.combine(themes, siteArgs, sideChains) \
               + combine.combine(themes, siteArgs, contextGenes) \
               + combine.combine(themes, locTargets) \
               + themeAloneCombinations

    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
        """
        Build the unmerging examples for a single sentence and append
        them to outfile. Returns the number of examples written.
        See Core/ExampleUtils for example format.

        sentenceGraph: the sentence to build examples for
        outfile: passed to ExampleUtils.appendExamples for each example
        goldGraph: if given, its tokenization must match sentenceGraph
                   (asserted) and it is used to decide which argument
                   combinations are gold events; without it every
                   example gets the class "neg"
        """
        self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
        self.triggerFeatureBuilder.initSentence(sentenceGraph)

        # The undirected dependency graph itself is used for path queries
        paths = sentenceGraph.dependencyGraph.toUndirected()

        # Get argument order (buildExample reads this attribute by this name)
        self.interactionLenghts = self.getInteractionEdgeLengths(
            sentenceGraph, paths)

        goldEntitiesByOffset = {}
        if goldGraph is not None:
            # check that the tokenizations match
            for i, token in enumerate(sentenceGraph.tokens):
                goldToken = goldGraph.tokens[i]
                assert token.get("id") == goldToken.get("id") and token.get(
                    "charOffset") == goldToken.get("charOffset")
            # Map gold entities to their head offsets
            for entity in goldGraph.entities:
                offset = entity.get("headOffset")
                assert offset is not None
                goldEntitiesByOffset.setdefault(offset, []).append(entity)

        # Generate examples based on interactions between entities
        if self.styles["no_merge"]:
            mergeInput = False
            entities = sentenceGraph.entities
        else:
            mergeInput = True
            sentenceGraph.mergeInteractionGraph(True)
            entities = sentenceGraph.mergedEntities

        exampleIndex = 0
        for entity in entities:
            eType = entity.get("type")
            assert eType is not None, entity.attrib
            eType = str(eType)

            interactions = [
                x[2]
                for x in sentenceGraph.getOutInteractions(entity, mergeInput)
            ]
            argCombinations = self.getArgumentCombinations(
                eType, interactions, entity.get("id"))
            assert argCombinations is not None, (entity.get("id"),
                                                 entity.get("type"))
            for argCombination in argCombinations:
                if eType != "Process":
                    assert len(argCombination
                               ) > 0, eType + ": " + str(argCombinations)
                # Originally binary classification
                if goldGraph is not None:
                    isGoldEvent = self.eventIsGold(entity, argCombination,
                                                   sentenceGraph, goldGraph,
                                                   goldEntitiesByOffset)
                else:
                    isGoldEvent = False
                # Named (multi-)class
                if isGoldEvent:
                    category = eType
                    if category.find("egulation") != -1:
                        category = "All_regulation"
                    elif category != "Binding":
                        category = "Other"
                else:
                    category = "neg"

                extra = {
                    "xtype": "um",
                    "e": entity.get("id"),
                    "i": ",".join(arg.get("id") for arg in argCombination),
                    "etype": eType,
                    "class": category
                }
                assert isinstance(extra["etype"], str), extra
                self.exampleStats.addExample(category)
                example = self.buildExample(sentenceGraph, paths, entity,
                                            argCombination, interactions)
                # Fill in the id, class and extra fields of the example
                example[0] = sentenceGraph.getSentenceId() + ".x" + str(
                    exampleIndex)
                example[1] = self.classSet.getId(category)
                example[3] = extra
                ExampleUtils.appendExamples([example], outfile)
                exampleIndex += 1

        return exampleIndex

    def buildExample(self, sentenceGraph, paths, eventEntity, argCombination,
                     allInteractions):  #themeEntities, causeEntities=None):
        # NOTE!!!! TODO
        # add also features for arguments present, but not in this combination

        features = {}
        self.features = features

        self.buildInterArgumentBagOfWords(argCombination, sentenceGraph)

        eventEntityType = eventEntity.get("type")
        if eventEntityType == "Binding":
            interactionIndex = {}
            groupInteractionLengths = []
            for interaction in allInteractions:
                groupInteractionLengths.append(
                    self.interactionLenghts[interaction])
            groupInteractionLengths.sort(compareInteractionPrecedence)
            #print groupInteractionLengths
            for i in range(len(groupInteractionLengths)):
                interactionIndex[groupInteractionLengths[i][0]] = i

        eventToken = sentenceGraph.entityHeadTokenByEntity[eventEntity]
        self.triggerFeatureBuilder.setFeatureVector(self.features)
        self.triggerFeatureBuilder.tag = "trg_"
        self.triggerFeatureBuilder.buildFeatures(eventToken)
        self.triggerFeatureBuilder.tag = None

        #self.setFeature("rootType_"+eventEntity.get("type"), 1)

        #argThemeCount = 0
        #argCauseCount = 0
        argCounts = {}
        # Current example's edge combination
        for arg in argCombination:
            argType = arg.get("type")
            if argType not in argCounts:
                argCounts[argType] = 0
            argCounts[argType] += 1
            tag = "arg" + argType
            if eventEntityType == "Binding" and argType == "Theme":
                tag += str(interactionIndex[arg])
            self.buildArgumentFeatures(sentenceGraph, paths, features,
                                       eventToken, arg, tag)
##            if arg.get("type") == "Theme":
##                #argThemeCount += 1
##                tag = "argTheme"
##                self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, arg, tag)
#            #elif arg.get("type") == "Cause": # Cause
#            #    #argCauseCount += 1
#            #    self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, arg, "argCause")
#            else:
#                self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, arg, "arg"+argType)

# Edge group context
#contextThemeCount = 0
#contextCauseCount = 0
        contextCounts = {}
        for interaction in allInteractions:
            if interaction in argCombination:  # Already part of current example's combination
                continue
            contextArgType = interaction.get("type")
            if contextArgType not in contextCounts:
                contextCounts[contextArgType] = 0
            contextCounts[contextArgType] += 1
            tag = "conArg" + contextArgType
            if eventEntityType == "Binding" and contextArgType == "Theme":
                tag += str(interactionIndex[interaction])
            self.buildArgumentFeatures(sentenceGraph, paths, features,
                                       eventToken, interaction, tag)
#            if interaction.get("type") == "Theme":
#                contextThemeCount += 1
#                tag = "conTheme"
#                self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, interaction, tag)
#                if eventEntityType == "Binding":
#                    tag += str(interactionIndex[interaction])
#                    self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, interaction, tag)
#            else: # Cause
#                contextCauseCount += 1
#                self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, interaction, "conCause")

        self.setFeature("argCount", len(argCombination))
        self.setFeature("argCount_" + str(len(argCombination)), 1)
        self.setFeature("interactionCount", len(allInteractions))
        self.setFeature("interactionCount_" + str(len(allInteractions)), 1)

        #self.setFeature("argThemeCount", argThemeCount)
        #self.setFeature("argThemeCount_" + str(argThemeCount), 1)
        #self.setFeature("argCauseCount", argCauseCount)
        #self.setFeature("argCauseCount_" + str(argCauseCount), 1)
        for key in sorted(argCounts.keys()):
            self.setFeature("arg" + key + "Count", argCounts[key])
            self.setFeature("arg" + key + "Count_" + str(argCounts[key]), 1)

        #self.setFeature("interactionThemeCount", contextThemeCount)
        #self.setFeature("interactionThemeCount_" + str(contextThemeCount), 1)
        #self.setFeature("interactionCauseCount", contextCauseCount)
        #self.setFeature("interactionCauseCount_" + str(contextCauseCount), 1)
        for key in sorted(contextCounts.keys()):
            self.setFeature("contextArg" + key + "Count", contextCounts[key])
            self.setFeature(
                "contextArg" + key + "Count_" + str(contextCounts[key]), 1)

        self.triggerFeatureBuilder.tag = ""
        self.triggerFeatureBuilder.setFeatureVector(None)

        # Common features
        #        if e1Type.find("egulation") != -1: # leave r out to avoid problems with capitalization
        #            if entity2.get("isName") == "True":
        #                features[self.featureSet.getId("GENIA_regulation_of_protein")] = 1
        #            else:
        #                features[self.featureSet.getId("GENIA_regulation_of_event")] = 1

        # define extra attributes
        return [None, None, features, None]

    def buildArgumentFeatures(self, sentenceGraph, paths, features, eventToken,
                              arg, tag):
        """
        Add the features that describe one argument edge of an event.

        The edge's target entity is looked up through its "e2" attribute and
        resolved to its head token. Edge features between eventToken and
        that token are built first, then trigger features for the target
        token (tagged with tag + "trg_"), and finally flags for the kind and
        the type of the target entity.
        """
        targetEntity = sentenceGraph.entitiesById[arg.get("e2")]
        targetToken = sentenceGraph.entityHeadTokenByEntity[targetEntity]
        self.buildEdgeFeatures(sentenceGraph, paths, features, eventToken,
                               targetToken, tag)

        triggerBuilder = self.triggerFeatureBuilder
        triggerBuilder.tag = tag + "trg_"
        triggerBuilder.buildFeatures(targetToken)

        # Entities with isName == "True" are flagged as proteins, all other
        # targets as (nested) events
        if targetEntity.get("isName") == "True":
            flagNames = [tag + "Protein"]
        else:
            flagNames = [tag + "Event", "nestingEvent"]
        for flagName in flagNames:
            self.setFeature(flagName, 1)
        self.setFeature(tag + "_" + targetEntity.get("type"), 1)

    def buildEdgeFeatures(self, sentenceGraph, paths, features, eventToken,
                          argToken, tag):
        """
        Build the path features between an event token and an argument token.

        The multi-edge feature builder is pointed at 'features' with the
        prefix tag + "_" for the duration of the call and detached again at
        the end. The first path returned by paths.getPaths() is used; when
        both tokens are the same or no path is returned, the two-token list
        [eventToken, argToken] is used instead. The "disable_*" entries of
        self.styles switch individual feature groups off.
        """
        edgeBuilder = self.multiEdgeFeatureBuilder
        edgeBuilder.tag = tag + "_"
        edgeBuilder.setFeatureVector(features, None, None, False)

        self.setFeature(tag + "_present", 1)

        foundPaths = paths.getPaths(eventToken, argToken)
        hasPath = eventToken != argToken and len(foundPaths) > 0
        path = foundPaths[0] if hasPath else [eventToken, argToken]

        styles = self.styles
        if not styles["disable_entity_features"]:
            edgeBuilder.buildEntityFeatures(sentenceGraph)
        # Path length features are always built
        edgeBuilder.buildPathLengthFeatures(path)
        if not styles["disable_single_element_features"]:
            edgeBuilder.buildSingleElementFeatures(path, sentenceGraph)
        if not styles["disable_ngram_features"]:
            # remove for fast
            for gramLength in (2, 3, 4):
                edgeBuilder.buildPathGrams(gramLength, path, sentenceGraph)
        if not styles["disable_path_edge_features"]:
            edgeBuilder.buildPathEdgeFeatures(path, sentenceGraph)

        # Detach the builder from this example's feature vector
        edgeBuilder.setFeatureVector(None, None, None, False)
        edgeBuilder.tag = ""

    def buildInterArgumentBagOfWords(self, arguments, sentenceGraph):
        """
        Add bag-of-words features for the tokens between the arguments.

        Nothing is done for fewer than two arguments. Otherwise the head
        tokens of the argument targets ("e2") define a token range; its
        width is stored, and the texts of the tokens strictly inside the
        range that are neither entity heads nor names become "argBoW_"
        features. A single in-between word also gets an "argBoWonly_"
        feature, and "/" or "-" are additionally flagged as slashOrHyphen.
        """
        if len(arguments) < 2:
            return

        tokens = sentenceGraph.tokens
        positionOf = {}
        for position, token in enumerate(tokens):
            positionOf[token] = position

        headPositions = set()
        for argument in arguments:
            targetEntity = sentenceGraph.entitiesById[argument.get("e2")]
            headToken = sentenceGraph.entityHeadTokenByEntity[targetEntity]
            headPositions.add(positionOf[headToken])
        firstPosition = min(headPositions)
        lastPosition = max(headPositions)
        width = lastPosition - firstPosition
        self.setFeature("argBoWRange", width)
        self.setFeature("argBoWRange_" + str(width), 1)

        # Collect the distinct texts of the plain tokens inside the range
        words = set()
        for position in range(firstPosition + 1, lastPosition):
            token = tokens[position]
            if len(sentenceGraph.tokenIsEntityHead[token]) != 0 or sentenceGraph.tokenIsName[token]:
                continue
            words.add(token.get("text"))
        orderedWords = sorted(words)

        separators = ["/", "-"]
        for word in orderedWords:
            self.setFeature("argBoW_" + word, 1)
            if word in separators:
                self.setFeature("argBoW_slashOrHyphen", 1)
        if len(orderedWords) == 1:
            onlyWord = orderedWords[0]
            self.setFeature("argBoWonly_" + onlyWord, 1)
            if onlyWord in separators:
                self.setFeature("argBoWonly_slashOrHyphen", 1)
예제 #12
0
class UnmergingExampleBuilder(ExampleBuilder):
    """
    This example builder makes unmerging examples, i.e. examples describing
    potential events.
    """
    def __init__(self, style="trigger_features:typed:directed:no_linear:entities:genia_limits:noMasking:maxFeatures", length=None, types=None, featureSet=None, classSet=None):
        """
        Set up the id sets, the style options and the feature builders.

        style -- kept for interface compatibility; the value passed in is
                 ignored because the style string is always reset to the
                 fixed default below
        length -- stored as self.pathLengths; must be None (asserted)
        types -- stored as self.types; a new empty list is used when omitted
        featureSet -- feature id set; IdSet() is created when omitted
        classSet -- class id set; IdSet(1) is created when omitted. Its id
                    for "neg" must be 1 (asserted)
        """
        # reset style regardless of input
        style = "trigger_features:typed:directed:no_linear:entities:genia_limits:noMasking:maxFeatures"
        if featureSet is None:
            featureSet = IdSet()
        if classSet is None:
            classSet = IdSet(1)
        assert classSet.getId("neg") == 1
        if types is None:
            # Create the list here: a list literal used as the parameter
            # default would be one object shared by every instance
            types = []

        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)

        self.styles = self.getParameters(style, ["trigger_features","typed","directed","no_linear","entities","genia_limits",
            "noAnnType", "noMasking", "maxFeatures", "no_merge", "disable_entity_features",
            "disable_single_element_features", "disable_ngram_features", "disable_path_edge_features"])
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        self.multiEdgeFeatureBuilder.noAnnType = self.styles["noAnnType"]
        self.multiEdgeFeatureBuilder.maskNamedEntities = not self.styles["noMasking"]
        self.multiEdgeFeatureBuilder.maximum = self.styles["maxFeatures"]
        self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        self.pathLengths = length
        assert self.pathLengths is None
        self.types = types

        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
        self.triggerFeatureBuilder.useNonNameEntities = True
    
    def getInteractionEdgeLengths(self, sentenceGraph, paths):
        """
        Measure every interaction edge of the sentence between the head
        tokens of its two entities.

        Returns a dictionary mapping each interaction to the tuple
        (interaction, pathLength, linLength, t2Pos): pathLength is the
        length of the shortest path returned by paths.getPaths() (999999
        when the tokens are the same or no path is returned), linLength is
        the distance between the two token positions in the sentence and
        t2Pos is the position of the "e2" head token. A token that is not
        found in the sentence has position -1.
        """
        tokens = sentenceGraph.tokens
        headTokenOf = sentenceGraph.entityHeadTokenByEntity
        lengths = {}
        for interaction in sentenceGraph.interactions:
            sourceEntity = sentenceGraph.entitiesById[interaction.get("e1")]
            targetEntity = sentenceGraph.entitiesById[interaction.get("e2")]
            sourceToken = headTokenOf[sourceEntity]
            targetToken = headTokenOf[targetEntity]

            # Dependency path length; the default is more than any real path
            pathLength = 999999
            if sourceToken != targetToken:
                candidatePaths = paths.getPaths(sourceToken, targetToken)
                if len(candidatePaths) > 0:
                    pathLength = min(len(x) for x in candidatePaths)

            # Linear distance; scanning stops once both tokens are located
            sourcePos = -1
            targetPos = -1
            for position, token in enumerate(tokens):
                if token == sourceToken:
                    sourcePos = position
                    if targetPos != -1:
                        break
                if token == targetToken:
                    targetPos = position
                    if sourcePos != -1:
                        break

            lengths[interaction] = (interaction, pathLength,
                                    abs(sourcePos - targetPos), targetPos)
        return lengths
    
    def eventIsGold(self, entity, arguments, sentenceGraph, goldGraph, goldEntitiesByOffset):
        """
        Tell whether a candidate event (a trigger entity plus a set of
        argument edges) has a counterpart in the gold graph.

        A gold entity matches when it has the candidate's head offset and
        type, the same number of outgoing edges (gold interactions whose
        "e1" is the gold entity) with the same per-type counts, and every
        candidate argument has some gold edge of the same type whose target
        entity has the same head offset and type as the argument's target.

        entity -- candidate trigger entity
        arguments -- candidate argument edges of the entity
        sentenceGraph -- graph used to resolve the candidates' "e2" targets
        goldGraph -- graph providing the gold interactions and entities
        goldEntitiesByOffset -- maps a head offset to a list of gold entities

        Returns True if at least one gold entity at the candidate's head
        offset matches, otherwise False.
        """
        offset = entity.get("headOffset")
        if offset not in goldEntitiesByOffset:
            return False
        eType = entity.get("type")

        # The candidate's edge counts per type do not depend on the gold
        # entity, so they are computed once
        argTypeCounts = {}
        for argument in arguments:
            argType = argument.get("type")
            argTypeCounts[argType] = argTypeCounts.get(argType, 0) + 1

        # Check all gold entities for a match
        for goldEntity in goldEntitiesByOffset[offset]:
            # The entity type must match
            if goldEntity.get("type") != eType:
                continue
            goldEntityId = goldEntity.get("id")

            # Collect the gold interactions
            goldInteractions = [x for x in goldGraph.interactions
                                if x.get("e1") == goldEntityId]

            # Argument count rules
            if len(goldInteractions) != len(arguments): # total number of edges differs
                continue
            goldTypeCounts = {}
            for goldInteraction in goldInteractions:
                goldType = goldInteraction.get("type")
                goldTypeCounts[goldType] = goldTypeCounts.get(goldType, 0) + 1
            # argument edge counts per type must match
            if argTypeCounts != goldTypeCounts:
                continue

            # Exact argument matching
            allFound = True
            for argument in arguments: # check all edges
                e2Entity = sentenceGraph.entitiesById[argument.get("e2")]
                e2Offset = e2Entity.get("headOffset")
                e2Type = e2Entity.get("type")
                argType = argument.get("type")

                found = False
                for goldInteraction in goldInteractions:
                    if goldInteraction.get("type") != argType:
                        continue
                    goldE2Entity = goldGraph.entitiesById[goldInteraction.get("e2")]
                    if goldE2Entity.get("headOffset") == e2Offset and goldE2Entity.get("type") == e2Type:
                        found = True
                        break
                if not found: # this edge did not have a corresponding gold edge
                    allFound = False
                    break

            # Event is in gold
            if allFound:
                return True

        # No gold entity matched (this includes an empty gold entity list)
        return False
    
    def getArgumentCombinations(self, eType, interactions, entityId=None):
        """
        List the candidate argument combinations for one event entity.

        eType -- type of the event entity
        interactions -- outgoing argument edges of the entity
        entityId -- not used by the current implementation

        Returns a list of combinations:
        - "Binding": every combination of 1 to 10 "Theme" edges, as tuples
          (other edge types are skipped)
        - "Process": the empty combination followed by one single-edge list
          per "Participant" edge
        - any other type: the results of combine.combine() for themes joined
          with the argument groups that the event type may take, followed by
          each theme alone
        """
        if eType == "Binding":
            # Making examples for only all-together/all-separate cases
            # doesn't work, since even gold data has several cases of
            # overlapping bindings with different numbers of arguments

            # Skip causes
            themes = [x for x in interactions if x.get("type") == "Theme"]
            combs = []
            # Looking at a2-normalize.pl reveals that there can be max 6 themes
            # Based on training+devel data, four is maximum
            # Combination sizes are capped at 10 here
            for size in range(1, min(len(themes), 10) + 1):
                combs.extend(combinations(themes, size))
            return combs

        if eType == "Process": # For ID-task
            # process can have 0 interactions
            return [[]] + [[x] for x in interactions if x.get("type") == "Participant"]

        # one of the regulation-types, or one of the simple types
        themes = []
        causes = []
        siteArgs = []
        contextGenes = []
        sideChains = []
        locTargets = []
        for interaction in interactions:
            iType = interaction.get("type")
            # "AtLoc" and "ToLoc" are left out of this list, so location
            # arguments are currently skipped and locTargets stays empty
            if iType not in ["Theme", "Cause", "SiteArg", "Contextgene", "Sidechain"]:
                continue
            if iType == "Theme":
                themes.append(interaction)
            elif iType == "Cause":
                causes.append(interaction)
            elif iType == "SiteArg":
                siteArgs.append(interaction)
            elif iType == "Contextgene":
                contextGenes.append(interaction)
            elif iType == "Sidechain":
                sideChains.append(interaction)
            elif iType in ["AtLoc", "ToLoc"]:
                # Not reachable while the filter above excludes these types.
                # The edge itself is collected (not its type string), so the
                # combinations hold edges like all the other groups.
                locTargets.append(interaction)
            else:
                assert False, (iType, interaction.get("id"))

        # Limit arguments to event types that can have them
        if "egulation" not in eType and eType != "Catalysis":
            causes = []
        if eType != "Glycosylation":
            sideChains = []
        if eType not in ["Acetylation", "Methylation"]:
            contextGenes = []
        if eType == "Catalysis":
            siteArgs = []

        # Themes can always appear alone
        themeAloneCombinations = [[theme] for theme in themes]
        return (combine.combine(themes, causes)
                + combine.combine(themes, siteArgs)
                + combine.combine(themes, sideChains)
                + combine.combine(themes, contextGenes)
                + combine.combine(themes, siteArgs, sideChains)
                + combine.combine(themes, siteArgs, contextGenes)
                + combine.combine(themes, locTargets)
                + themeAloneCombinations)
            
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
        """
        Build the unmerging examples of a single sentence and append them to
        outfile. See Core/ExampleUtils for example format.

        One example is made for every argument combination (see
        getArgumentCombinations) of every entity. When a gold graph is given,
        an example whose event is found in gold (see eventIsGold) gets the
        class "All_regulation", "Binding" or "Other" depending on the event
        type; every other example, and every example when there is no gold
        graph, gets the class "neg".

        sentenceGraph -- the sentence to build examples for
        outfile -- passed to ExampleUtils.appendExamples for each example
        goldGraph -- optional gold standard graph; its tokens must have the
                     same ids and character offsets as the sentence's tokens
                     (asserted)

        Returns the number of examples written.
        """
        self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
        self.triggerFeatureBuilder.initSentence(sentenceGraph)

        undirected = sentenceGraph.dependencyGraph.toUndirected()
        paths = undirected

        # Get argument order
        self.interactionLenghts = self.getInteractionEdgeLengths(sentenceGraph, paths)

        goldEntitiesByOffset = {}
        if goldGraph is not None:
            # check that the tokenizations match
            for i, token in enumerate(sentenceGraph.tokens):
                goldToken = goldGraph.tokens[i]
                assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset")
            # Map gold entities to their head offsets
            for goldEntity in goldGraph.entities:
                offset = goldEntity.get("headOffset")
                assert offset is not None
                goldEntitiesByOffset.setdefault(offset, []).append(goldEntity)

        # Generate examples from either the original or the merged entities
        mergeInput = not self.styles["no_merge"]
        if mergeInput:
            sentenceGraph.mergeInteractionGraph(True)
            entities = sentenceGraph.mergedEntities
        else:
            entities = sentenceGraph.entities

        exampleIndex = 0
        for entity in entities:
            eType = entity.get("type")
            assert eType is not None, entity.attrib
            eType = str(eType)

            interactions = [x[2] for x in sentenceGraph.getOutInteractions(entity, mergeInput)]
            argCombinations = self.getArgumentCombinations(eType, interactions, entity.get("id"))
            assert argCombinations is not None, (entity.get("id"), entity.get("type"))
            for argCombination in argCombinations:
                if eType != "Process":
                    assert len(argCombination) > 0, eType + ": " + str(argCombinations)
                # Originally binary classification
                if goldGraph is not None:
                    isGoldEvent = self.eventIsGold(entity, argCombination, sentenceGraph, goldGraph, goldEntitiesByOffset)
                else:
                    isGoldEvent = False
                # Named (multi-)class
                if not isGoldEvent:
                    category = "neg"
                elif "egulation" in eType:
                    category = "All_regulation"
                elif eType == "Binding":
                    category = "Binding"
                else:
                    category = "Other"

                argIds = ",".join(arg.get("id") for arg in argCombination)
                extra = {"xtype":"um","e":entity.get("id"),"i":argIds,"etype":eType,"class":category}
                assert isinstance(extra["etype"], str), extra
                self.exampleStats.addExample(category)
                example = self.buildExample(sentenceGraph, paths, entity, argCombination, interactions)
                example[0] = sentenceGraph.getSentenceId()+".x"+str(exampleIndex)
                example[1] = self.classSet.getId(category)
                example[3] = extra
                ExampleUtils.appendExamples([example], outfile)
                exampleIndex += 1

        return exampleIndex
    
    def buildExample(self, sentenceGraph, paths, eventEntity, argCombination, allInteractions):
        """
        Build the feature vector of one candidate event.

        eventEntity -- the entity of the candidate event
        argCombination -- the argument edges that belong to this candidate
        allInteractions -- all argument edges of the entity; those outside
                           argCombination are described as context
                           ("conArg") features

        A new feature dictionary is created and kept in self.features while
        the features are built. For "Binding" events the tags of "Theme"
        edges get a numeric suffix: the position of the edge after sorting
        the entity's edges with compareInteractionPrecedence.

        Returns the list [None, None, features, None].
        """
        features = {}
        self.features = features

        self.buildInterArgumentBagOfWords(argCombination, sentenceGraph)

        eventEntityType = eventEntity.get("type")
        isBinding = eventEntityType == "Binding"
        if isBinding:
            # Rank the entity's edges by their precomputed length tuples;
            # the first item of each tuple is the edge itself
            rankedLengths = [self.interactionLenghts[x] for x in allInteractions]
            rankedLengths.sort(compareInteractionPrecedence)
            interactionIndex = {}
            for rank, lengthTuple in enumerate(rankedLengths):
                interactionIndex[lengthTuple[0]] = rank

        # Features of the event's own head token
        eventToken = sentenceGraph.entityHeadTokenByEntity[eventEntity]
        triggerBuilder = self.triggerFeatureBuilder
        triggerBuilder.setFeatureVector(self.features)
        triggerBuilder.tag = "trg_"
        triggerBuilder.buildFeatures(eventToken)
        triggerBuilder.tag = None

        # Current example's edge combination
        argCounts = {}
        for arg in argCombination:
            argType = arg.get("type")
            argCounts[argType] = argCounts.get(argType, 0) + 1
            tag = "arg" + argType
            if isBinding and argType == "Theme":
                tag += str(interactionIndex[arg])
            self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, arg, tag)

        # Edge group context: the entity's edges left out of this example
        contextCounts = {}
        for interaction in allInteractions:
            if interaction in argCombination: # Already part of current example's combination
                continue
            contextArgType = interaction.get("type")
            contextCounts[contextArgType] = contextCounts.get(contextArgType, 0) + 1
            tag = "conArg" + contextArgType
            if isBinding and contextArgType == "Theme":
                tag += str(interactionIndex[interaction])
            self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, interaction, tag)

        # Edge counts, both as a value and as a count-specific flag
        argTotal = len(argCombination)
        interactionTotal = len(allInteractions)
        self.setFeature("argCount", argTotal)
        self.setFeature("argCount_" + str(argTotal), 1)
        self.setFeature("interactionCount", interactionTotal)
        self.setFeature("interactionCount_" + str(interactionTotal), 1)

        for key in sorted(argCounts):
            count = argCounts[key]
            self.setFeature("arg" + key + "Count", count)
            self.setFeature("arg" + key + "Count_" + str(count), 1)

        for key in sorted(contextCounts):
            count = contextCounts[key]
            self.setFeature("contextArg" + key + "Count", count)
            self.setFeature("contextArg" + key + "Count_" + str(count), 1)

        triggerBuilder.tag = ""
        triggerBuilder.setFeatureVector(None)

        # The remaining items of the example are left as None here
        return [None, None, features, None]

    def buildArgumentFeatures(self, sentenceGraph, paths, features, eventToken, arg, tag):
        """
        Add the features that describe one argument edge of an event.

        The edge's target entity is looked up through its "e2" attribute and
        resolved to its head token. Edge features between eventToken and
        that token are built first, then trigger features for the target
        token (tagged with tag + "trg_"), and finally flags for the kind and
        the type of the target entity.
        """
        targetEntity = sentenceGraph.entitiesById[arg.get("e2")]
        targetToken = sentenceGraph.entityHeadTokenByEntity[targetEntity]
        self.buildEdgeFeatures(sentenceGraph, paths, features, eventToken, targetToken, tag)

        triggerBuilder = self.triggerFeatureBuilder
        triggerBuilder.tag = tag + "trg_"
        triggerBuilder.buildFeatures(targetToken)

        # Entities with isName == "True" are flagged as proteins, all other
        # targets as (nested) events
        if targetEntity.get("isName") == "True":
            flagNames = [tag + "Protein"]
        else:
            flagNames = [tag + "Event", "nestingEvent"]
        for flagName in flagNames:
            self.setFeature(flagName, 1)
        self.setFeature(tag + "_" + targetEntity.get("type"), 1)
    
    def buildEdgeFeatures(self, sentenceGraph, paths, features, eventToken, argToken, tag):
        """
        Build the path features between an event token and an argument token.

        The multi-edge feature builder is pointed at 'features' with the
        prefix tag + "_" for the duration of the call and detached again at
        the end. The first path returned by paths.getPaths() is used; when
        both tokens are the same or no path is returned, the two-token list
        [eventToken, argToken] is used instead. The "disable_*" entries of
        self.styles switch individual feature groups off.
        """
        edgeBuilder = self.multiEdgeFeatureBuilder
        edgeBuilder.tag = tag + "_"
        edgeBuilder.setFeatureVector(features, None, None, False)

        self.setFeature(tag + "_present", 1)

        foundPaths = paths.getPaths(eventToken, argToken)
        hasPath = eventToken != argToken and len(foundPaths) > 0
        path = foundPaths[0] if hasPath else [eventToken, argToken]

        styles = self.styles
        if not styles["disable_entity_features"]:
            edgeBuilder.buildEntityFeatures(sentenceGraph)
        # Path length features are always built
        edgeBuilder.buildPathLengthFeatures(path)
        if not styles["disable_single_element_features"]:
            edgeBuilder.buildSingleElementFeatures(path, sentenceGraph)
        if not styles["disable_ngram_features"]:
            # remove for fast
            for gramLength in (2, 3, 4):
                edgeBuilder.buildPathGrams(gramLength, path, sentenceGraph)
        if not styles["disable_path_edge_features"]:
            edgeBuilder.buildPathEdgeFeatures(path, sentenceGraph)

        # Detach the builder from this example's feature vector
        edgeBuilder.setFeatureVector(None, None, None, False)
        edgeBuilder.tag = ""
    
    def buildInterArgumentBagOfWords(self, arguments, sentenceGraph):
        """
        Add bag-of-words features for the tokens between the arguments.

        Nothing is done for fewer than two arguments. Otherwise the head
        tokens of the argument targets ("e2") define a token range; its
        width is stored, and the texts of the tokens strictly inside the
        range that are neither entity heads nor names become "argBoW_"
        features. A single in-between word also gets an "argBoWonly_"
        feature, and "/" or "-" are additionally flagged as slashOrHyphen.
        """
        if len(arguments) < 2:
            return

        tokens = sentenceGraph.tokens
        positionOf = {}
        for position, token in enumerate(tokens):
            positionOf[token] = position

        headPositions = set()
        for argument in arguments:
            targetEntity = sentenceGraph.entitiesById[argument.get("e2")]
            headToken = sentenceGraph.entityHeadTokenByEntity[targetEntity]
            headPositions.add(positionOf[headToken])
        firstPosition = min(headPositions)
        lastPosition = max(headPositions)
        width = lastPosition - firstPosition
        self.setFeature("argBoWRange", width)
        self.setFeature("argBoWRange_" + str(width), 1)

        # Collect the distinct texts of the plain tokens inside the range
        words = set()
        for position in range(firstPosition + 1, lastPosition):
            token = tokens[position]
            if len(sentenceGraph.tokenIsEntityHead[token]) != 0 or sentenceGraph.tokenIsName[token]:
                continue
            words.add(token.get("text"))
        orderedWords = sorted(words)

        separators = ["/", "-"]
        for word in orderedWords:
            self.setFeature("argBoW_" + word, 1)
            if word in separators:
                self.setFeature("argBoW_slashOrHyphen", 1)
        if len(orderedWords) == 1:
            onlyWord = orderedWords[0]
            self.setFeature("argBoWonly_" + onlyWord, 1)
            if onlyWord in separators:
                self.setFeature("argBoWonly_slashOrHyphen", 1)
예제 #13
0
class UnmergingExampleBuilder(ExampleBuilder):
    """
    This example builder makes unmerging examples, i.e. examples describing
    potential events.
    """
    #def __init__(self, style="trigger_features:typed:directed:no_linear:entities:genia_limits:noMasking:maxFeatures", length=None, types=[], featureSet=None, classSet=None):
    def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
        """
        Set up style parameters and feature builders for unmerging examples.

        style: style definition passed to self.getParameters(); the accepted
            parameter names and their defaults are listed below.
        length: stored as self.pathLengths; must be None (asserted).
        types: stored as self.types; a new empty list is used when None.
        featureSet: feature id set; a new IdSet() is created when None.
        classSet: class id set in which "neg" must have id 1; a new IdSet(1)
            is created when None.
        """
        if featureSet is None:
            featureSet = IdSet()
        if classSet is None:
            classSet = IdSet(1)
        if types is None:
            # A fresh list per instance instead of one shared default object
            types = []
        # The lookup is made outside of the assert so that it is still executed
        # under "python -O" (presumably getId also registers "neg" in a newly
        # created IdSet -- TODO confirm).
        negClassId = classSet.getId("neg")
        assert negClassId == 1

        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)

        # Style parameters that default to None (i.e. are off unless given)
        defaultNone = ["binary", "trigger_features","typed","directed","no_linear","entities","genia_limits",
            "noAnnType", "noMasking", "maxFeatures", "no_merge", "disable_entity_features",
            "disable_single_element_features", "disable_ngram_features", "disable_path_edge_features"]
        defaultParameters = {}
        for name in defaultNone:
            defaultParameters[name] = None
        defaultParameters["keep_intersentence"] = False
        defaultParameters["keep_intersentence_gold"] = True
        defaultParameters["no_arg_count_upper_limit"] = False
        # NOTE(review): the first assignment is replaced right away; _setDefaultParameters
        # is presumably called for registering the defaults that getParameters uses -- confirm.
        self.styles = self._setDefaultParameters(defaultParameters)
        self.styles = self.getParameters(style)

        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        self.multiEdgeFeatureBuilder.noAnnType = self.styles["noAnnType"]
        self.multiEdgeFeatureBuilder.maskNamedEntities = not self.styles["noMasking"]
        self.multiEdgeFeatureBuilder.maximum = self.styles["maxFeatures"]
        self.pathLengths = length
        assert self.pathLengths is None
        self.types = types

        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
        self.triggerFeatureBuilder.useNonNameEntities = True
    
    def getInteractionEdgeLengths(self, sentenceGraph, paths):
        """
        Return dependency and linear length of all interaction edges
        (measured between the two tokens).

        sentenceGraph: provides tokens, interactions, entitiesById and
            entityHeadTokenByEntity.
        paths: object whose getPaths(t1, t2) returns a sequence of paths
            between two tokens (empty when there is none).

        Returns a dictionary that maps every interaction to the tuple
        (interaction, pathLength, linLength, t2Pos):
          - pathLength is the length of the shortest path returned by
            paths.getPaths, or 999999 if both head tokens are the same or
            no path exists
          - linLength is the absolute difference of the head token positions
          - t2Pos is the position of the "e2" head token (-1 if not found)
        For an interaction whose "e2" entity is not in this sentence all
        three numbers are -count, where count is the number of in-sentence
        interactions processed so far.
        """
        # Token positions are looked up once here instead of scanning the token
        # list for every interaction. Tokens are assumed to be distinct objects;
        # setdefault keeps the first position should one ever be repeated.
        tokenPositions = {}
        for position, token in enumerate(sentenceGraph.tokens):
            tokenPositions.setdefault(token, position)

        interactionLengths = {}
        count = 0
        for interaction in sentenceGraph.interactions:
            e1Id = interaction.get("e1")
            e2Id = interaction.get("e2")
            if e2Id not in sentenceGraph.entitiesById: # intersentence interaction
                interactionLengths[interaction] = (interaction, -count, -count, -count)
                continue
            e1 = sentenceGraph.entitiesById[e1Id]
            e2 = sentenceGraph.entitiesById[e2Id]
            t1 = sentenceGraph.entityHeadTokenByEntity[e1]
            t2 = sentenceGraph.entityHeadTokenByEntity[e2]

            # Dependency path length
            pathLength = 999999 # more than any real path
            if t1 != t2:
                path = paths.getPaths(t1, t2)
                if len(path) > 0:
                    pathLength = min(len(x) for x in path)

            # Linear distance
            t1Pos = tokenPositions.get(t1, -1)
            t2Pos = tokenPositions.get(t2, -1)
            linLength = abs(t1Pos - t2Pos)

            interactionLengths[interaction] = (interaction, pathLength, linLength, t2Pos)
            count += 1
        return interactionLengths
    
    def eventIsGold(self, entity, arguments, sentenceGraph, goldGraph, goldEntitiesByOffset, allGoldInteractions):
        """
        Return True if the candidate event matches an event of the gold data.

        entity: the trigger entity of the candidate event
        arguments: the candidate's argument interactions
        sentenceGraph: graph in which the arguments' "e2" entities are looked up
        goldGraph: graph in which the gold arguments' "e2" entities are looked up
        goldEntitiesByOffset: dictionary from "headOffset" to a list of gold entities
        allGoldInteractions: the gold interactions to search for arguments

        A gold entity with the same head offset matches when
          1) its type equals the type of the candidate entity,
          2) it has as many event arguments (event == "True") as the candidate,
          3) the number of arguments of each type is the same, and
          4) for every candidate argument there is a gold argument of the same
             type whose target entity has the same head offset and type.
        Candidate arguments that point outside the sentence are assumed to be
        correct, gold arguments that point outside the sentence never match.
        False is returned if no gold entity matches, including the case of an
        empty gold entity list.
        """
        offset = entity.get("headOffset")
        if offset not in goldEntitiesByOffset:
            return False
        eType = entity.get("type")

        # The candidate's arguments per type do not depend on the gold entity
        argTypeCounts = {}
        for argument in arguments:
            argType = argument.get("type")
            argTypeCounts[argType] = argTypeCounts.get(argType, 0) + 1

        # Check all gold entities for a match
        for goldEntity in goldEntitiesByOffset[offset]:
            # The entity type must match
            if goldEntity.get("type") != eType:
                continue
            goldEntityId = goldEntity.get("id")

            # Collect the gold interactions
            goldInteractions = []
            for goldInteraction in allGoldInteractions:
                if goldInteraction.get("e1") == goldEntityId and goldInteraction.get("event") == "True":
                    goldInteractions.append(goldInteraction)

            # Argument count rules
            if len(goldInteractions) != len(arguments): # total number of edges differs
                continue
            goldTypeCounts = {}
            for goldInteraction in goldInteractions:
                argType = goldInteraction.get("type")
                goldTypeCounts[argType] = goldTypeCounts.get(argType, 0) + 1
            if argTypeCounts != goldTypeCounts: # edge counts per type must match
                continue

            # Exact argument matching
            allFound = True
            for argument in arguments:
                e2 = argument.get("e2")
                if e2 not in sentenceGraph.entitiesById: # intersentence argument, assumed to be correct
                    continue
                e2Entity = sentenceGraph.entitiesById[e2]
                e2Offset = e2Entity.get("headOffset")
                e2Type = e2Entity.get("type")
                argType = argument.get("type")

                found = False
                for goldInteraction in goldInteractions:
                    if goldInteraction.get("type") != argType:
                        continue
                    goldE2Id = goldInteraction.get("e2")
                    if goldE2Id not in goldGraph.entitiesById: # intersentence gold interaction
                        continue
                    goldE2Entity = goldGraph.entitiesById[goldE2Id]
                    if goldE2Entity.get("headOffset") == e2Offset and goldE2Entity.get("type") == e2Type:
                        found = True
                        break
                if not found: # this edge did not have a corresponding gold edge
                    allFound = False
                    break

            # Event is in gold
            if allFound:
                return True

        return False
    
    def sortInteractionsById(self, interactions):
        """
        Return the interactions as a new list ordered by the number that
        follows the last ".i" of their "id" attribute.

        The sort is stable: interactions with the same number keep their
        input order, and the interaction objects themselves are never
        compared with each other.
        """
        # The order of the interactions affects the order of the unmerging examples, and this
        # affects performance. It's not clear whether this is what really happens, or whether
        # the order of the interactions has some effect on the consistency of the unmerging
        # features (it shouldn't). However, in case it does, this function is left here for now,
        # although it shouldn't be needed at all. In any case the impact is minimal, for GE
        # 53.22 vs 53.28 on the development set.
        def idNumber(interaction):
            return int(interaction.get("id").split(".i")[-1])
        return sorted(interactions, key=idNumber)
    
    def processDocument(self, sentences, goldSentences, outfile, structureAnalyzer=None):
        """
        Process all sentences of one document.

        First self.documentEntitiesById is rebuilt so that it maps the "id"
        of every entity of the document to the entity (ids must be unique).
        After that each sentence is passed to self.processSentence together
        with the gold sentence of the same position, or None if goldSentences
        is None.
        """
        entityIndex = {}
        self.documentEntitiesById = entityIndex
        for sentence in sentences:
            for entity in sentence.entities:
                entityId = entity.get("id")
                assert entityId not in entityIndex
                entityIndex[entityId] = entity

        hasGold = goldSentences is not None
        for position, sentence in enumerate(sentences):
            if hasGold:
                goldSentence = goldSentences[position]
            else:
                goldSentence = None
            self.progress.update(1, "Building examples ("+sentence.sentence.get("id")+"): ")
            self.processSentence(sentence, outfile, goldSentence, structureAnalyzer=structureAnalyzer)
    
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
        """
        Build unmerging examples for a single sentence and write them to outfile.

        For each entity of the sentence, all combinations of its outgoing event
        arguments are generated. A combination that passes the structureAnalyzer
        checks becomes one example, which is appended to outfile immediately
        with ExampleUtils.appendExamples. Rejected combinations are only
        recorded in self.exampleStats.
        See Core/ExampleUtils for example format.

        sentenceGraph: the sentence to build examples for
        outfile: passed on to ExampleUtils.appendExamples
        goldGraph: the same sentence with gold annotation, used for deciding
            the class of each example; without it every example is "neg"
        structureAnalyzer: provides getValidEdgeTypes, getArgLimits,
            isValidEntity and isValidEvent.
            NOTE(review): the default is None but the object is used
            unconditionally, so callers apparently always have to pass it.

        Returns the number of examples written (not a list of examples).
        """
        self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
        self.triggerFeatureBuilder.initSentence(sentenceGraph)        
        
        exampleIndex = 0
        # Paths between tokens are queried (getPaths) from the undirected dependency graph
        undirected = sentenceGraph.dependencyGraph.toUndirected()
        paths = undirected
        
        # Get argument order
        # (the attribute name is misspelled, but buildExample reads it with the same spelling)
        self.interactionLenghts = self.getInteractionEdgeLengths(sentenceGraph, paths)
        
        # Map tokens to character offsets
        # NOTE(review): tokenByOffset is not read again in this method; the visible
        # effect of this loop is the tokenization check against the gold graph.
        tokenByOffset = {}
        for i in range(len(sentenceGraph.tokens)):
            token = sentenceGraph.tokens[i]
            if goldGraph != None: # check that the tokenizations match
                goldToken = goldGraph.tokens[i]
                assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset")
            tokenByOffset[token.get("charOffset")] = token.get("id")
        
        # Map gold entities to their head offsets
        goldEntitiesByOffset = {}
        if goldGraph != None:
            for entity in goldGraph.entities:
                offset = entity.get("headOffset")
                assert offset != None
                if not goldEntitiesByOffset.has_key(offset):
                    goldEntitiesByOffset[offset] = []
                goldEntitiesByOffset[offset].append(entity)
        
        # Choose between the original entities and the merged ones. The same
        # choice (mergeInput) is passed to getOutInteractions below.
        if self.styles["no_merge"]:
            mergeInput = False
            entities = sentenceGraph.entities
        else:
            mergeInput = True
            sentenceGraph.mergeInteractionGraph(True)
            entities = sentenceGraph.mergedEntities
            self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))
        
        exampleIndex = 0
        for entity in entities: # sentenceGraph.entities:
            # NOTE(review): types.StringTypes exists only in the Python 2 "types" module
            # (assuming "types" is that module here)
            if type(entity) in types.StringTypes: # dummy entity for intersentence interactions
                continue
            
            eType = entity.get("type")
            assert eType != None, entity.attrib
            eType = str(eType)
            
            interactions = [x[2] for x in sentenceGraph.getOutInteractions(entity, mergeInput)]
            interactions = self.sortInteractionsById(interactions)
            # interactionCounts: number of event arguments per type, valid or not
            # validInteractionsByType: the arguments that can be used in combinations
            interactionCounts = defaultdict(int)
            validInteractionsByType = defaultdict(list)
            for interaction in interactions:
                if interaction.get("event") != "True":
                    continue
                e1 = sentenceGraph.entitiesById[interaction.get("e1")]
                if interaction.get("e2") in sentenceGraph.entitiesById:
                    # in-sentence argument: kept only if the structure analyzer
                    # allows this edge type between the two entity types
                    e2 = sentenceGraph.entitiesById[interaction.get("e2")]
                    if interaction.get("type") in structureAnalyzer.getValidEdgeTypes(e1.get("type"), e2.get("type")):
                        validInteractionsByType[interaction.get("type")].append(interaction)
                else: # intersentence
                    # the target entity is not available, so the edge type is not checked
                    validInteractionsByType[interaction.get("type")].append(interaction)
                interactionCounts[interaction.get("type")] += 1
            # e.g. "Cause=1,Theme=2", stored in the extra attributes of every example of this entity
            interactionCountString = ",".join([key + "=" + str(interactionCounts[key]) for key in sorted(interactionCounts.keys())])
            #argCombinations = self.getArgumentCombinations(eType, interactions, entity.get("id"))
            intCombinations = []
            validIntTypeCount = 0
            maxArgCount = 0
            if self.debug:
                print >> sys.stderr, entity.get("id"), entity.get("type"), "int:" + interactionCountString, "validInt:" + str(validInteractionsByType)
            for intType in sorted(validInteractionsByType.keys()): # for each argument type the event can have
                validIntTypeCount += 1
                intCombinations.append([])
                minArgs, maxArgs = structureAnalyzer.getArgLimits(entity.get("type"), intType)
                if maxArgs > maxArgCount:
                    maxArgCount = maxArgs
                #if maxArgs > 1: # allow any number of arguments for cases like Binding
                #    maxArgs = len(validInteractionsByType[intType])
                for combLen in range(minArgs, maxArgs+1): # for each valid argument count, get all possible combinations. note that there may be zero-length combination
                    for singleTypeArgCombination in combinations(validInteractionsByType[intType], combLen):
                        intCombinations[-1].append(singleTypeArgCombination)
                # e.g. theme:[a,b], cause:[d] = [[
            # intCombinations now contains a list of lists, each of which has a tuple for each valid combination
            # of one argument type. Next, we'll make all valid combinations of multiple argument types
            if self.debug:
                print >> sys.stderr, " ", "intCombinations", intCombinations
            # presumably one entry per way of picking one tuple from each per-type list -- TODO confirm
            argCombinations = combine.combine(*intCombinations)
            if self.debug:
                print >> sys.stderr, " ", "argCombinations", argCombinations
            # Join the per-type tuples of each combination into a single flat tuple of interactions
            for i in range(len(argCombinations)):
                argCombinations[i] = sum(argCombinations[i], ())
            #sum(argCombinations, []) # flatten nested list
            if self.debug:
                print >> sys.stderr, " ", "argCombinations flat", argCombinations
            
            for argCombination in argCombinations:
                # Originally binary classification
                if goldGraph != None:
                    isGoldEvent = self.eventIsGold(entity, argCombination, sentenceGraph, goldGraph, goldEntitiesByOffset, goldGraph.interactions)
                    #if eType == "Binding":
                    #    print argCombination[0].get("e1"), len(argCombination), isGoldEvent
                else:
                    isGoldEvent = False
                # Named (multi-)class
                if isGoldEvent:
#                    category = "zeroArg"
#                    if validIntTypeCount == 1:
#                        category = "singleArg" # event has 0-1 arguments (old simple6)
#                    if validIntTypeCount > 1:
#                        category = "multiType" # event has arguments of several types, 0-1 of each (old Regulation)
#                    if maxArgCount > 1:
#                        category = "multiArg" # event can have 2-n of at least one argument type (old Binding)
                    # positive class name: "pos" in binary mode, otherwise the entity type
                    if self.styles["binary"]:
                        category = "pos"
                    else:
                        category = entity.get("type")
                        
                    assert category != None
                else:
                    category = "neg"
                self.exampleStats.beginExample(category)
                
                # filled by isValidEvent with the reasons for rejecting a combination
                issues = defaultdict(int)
                # early out for proteins etc.
                if validIntTypeCount == 0 and entity.get("given") == "True":
                    self.exampleStats.filter("given-leaf:" + entity.get("type"))
                    if self.debug:
                        print >> sys.stderr, " ", category +"("+eType+")", "arg combination", argCombination, "LEAF"
                elif structureAnalyzer.isValidEntity(entity) or structureAnalyzer.isValidEvent(entity, argCombination, self.documentEntitiesById, noUpperLimitBeyondOne=self.styles["no_arg_count_upper_limit"], issues=issues):
                    if self.debug:
                        print >> sys.stderr, " ", category, "arg combination", argCombination, "VALID"
                    # "i" lists the arguments as "type=id" pairs separated by commas
                    argString = ""
                    for arg in argCombination:
                        argString += "," + arg.get("type") + "=" + arg.get("id")
                    extra = {"xtype":"um","e":entity.get("id"),"i":argString[1:],"etype":eType,"class":category}
                    extra["allInt"] = interactionCountString
                    assert type(extra["etype"]) in types.StringTypes, extra
                    assert type(extra["class"]) in types.StringTypes, category
                    assert type(extra["i"]) in types.StringTypes, argString
                    # buildExample returns [None, None, features, None]; id, class and extra are filled in here
                    example = self.buildExample(sentenceGraph, paths, entity, argCombination, interactions)
                    example[0] = sentenceGraph.getSentenceId()+".x"+str(exampleIndex)
                    example[1] = self.classSet.getId(category)
                    example[3] = extra
                    #examples.append( example )
                    ExampleUtils.appendExamples([example], outfile)
                    exampleIndex += 1
                else: # not a valid event or valid entity
                    if len(issues) == 0: # must be > 0 so that it gets filtered
                        if not structureAnalyzer.isValidEntity(entity):
                            issues["INVALID_ENTITY:"+eType] += 1
                        else:
                            issues["UNKNOWN_ISSUE_FOR:"+eType] += 1
                    for key in issues:
                        self.exampleStats.filter(key)
                    if self.debug:
                        print >> sys.stderr, " ", category, "arg combination", argCombination, "INVALID", issues
                self.exampleStats.endExample()
            
        #return examples
        return exampleIndex
    
    def buildExample(self, sentenceGraph, paths, eventEntity, argCombination, allInteractions):
        """
        Build the features of one event candidate.

        sentenceGraph: the sentence of the event
        paths: passed on to buildArgumentFeatures
        eventEntity: the trigger entity of the candidate event
        argCombination: the interactions used as arguments in this candidate
        allInteractions: all outgoing interactions of the entity; those not in
            argCombination are described with "con..." (context) features

        Returns the list [None, None, features, None]. The feature dictionary
        is also stored as self.features while the example is being built.
        """
        # NOTE!!!! TODO
        # add also features for arguments present, but not in this combination

        features = {}
        self.features = features

        self.buildInterArgumentBagOfWords(argCombination, sentenceGraph)

        isBinding = eventEntity.get("type") == "Binding"
        if isBinding:
            # Rank the interactions of the entity; the rank becomes a part of the feature tag
            interactionIndex = {}
            rankedLengths = []
            for candidate in allInteractions:
                rankedLengths.append(self.interactionLenghts[candidate])
            rankedLengths.sort(compareInteractionPrecedence)
            for rank, lengths in enumerate(rankedLengths):
                interactionIndex[lengths[0]] = rank

        # Features of the trigger token
        eventToken = sentenceGraph.entityHeadTokenByEntity[eventEntity]
        self.triggerFeatureBuilder.setFeatureVector(self.features)
        self.triggerFeatureBuilder.tag = "trg_"
        self.triggerFeatureBuilder.buildFeatures(eventToken)
        self.triggerFeatureBuilder.tag = None

        # Current example's edge combination
        argThemeCount = 0
        argCauseCount = 0
        argCounts = {}
        for arg in argCombination:
            argType = arg.get("type")
            if argType == "Theme":
                argThemeCount += 1
                self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, arg, "argTheme")
                if isBinding:
                    self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, arg, "argTheme" + str(interactionIndex[arg]))
            elif argType == "Cause":
                argCauseCount += 1
                self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, arg, "argCause")
            else:
                self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, arg, "arg"+argType)
                argCounts[argType] = argCounts.get(argType, 0) + 1

        # Edge group context: interactions of the entity that are not part of this
        # combination. Everything except Theme is counted and tagged as Cause.
        contextThemeCount = 0
        contextCauseCount = 0
        for interaction in allInteractions:
            if interaction in argCombination:
                continue
            if interaction.get("type") == "Theme":
                contextThemeCount += 1
                self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, interaction, "conTheme")
                if isBinding:
                    self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, interaction, "conTheme" + str(interactionIndex[interaction]))
            else:
                contextCauseCount += 1
                self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, interaction, "conCause")

        def setCount(name, value):
            # every count is stored as a numeric feature and as a "name_value" flag
            self.setFeature(name, value)
            self.setFeature(name + "_" + str(value), 1)

        setCount("argCount", len(argCombination))
        setCount("interactionCount", len(allInteractions))
        setCount("argThemeCount", argThemeCount)
        setCount("argCauseCount", argCauseCount)
        for key in sorted(argCounts.keys()):
            setCount("arg" + key + "Count", argCounts[key])
        setCount("interactionThemeCount", contextThemeCount)
        setCount("interactionCauseCount", contextCauseCount)

        # Detach the trigger feature builder from this example
        self.triggerFeatureBuilder.tag = ""
        self.triggerFeatureBuilder.setFeatureVector(None)

        # id, class and extra attributes are left empty here
        return [None, None, features, None]

    def buildArgumentFeatures(self, sentenceGraph, paths, features, eventToken, arg, tag):
        """
        Add the features of one argument edge, all prefixed with tag.

        The features consist of the edge features between eventToken and the
        head token of the argument's "e2" entity, the trigger features of
        that head token and flags for the kind and type of the entity.
        Nothing is added for an argument whose "e2" entity is not in
        sentenceGraph.entitiesById.
        """
        targetId = arg.get("e2")
        if targetId not in sentenceGraph.entitiesById: # intersentence argument
            return

        targetEntity = sentenceGraph.entitiesById[targetId]
        targetToken = sentenceGraph.entityHeadTokenByEntity[targetEntity]

        self.buildEdgeFeatures(sentenceGraph, paths, features, eventToken, targetToken, tag)
        self.triggerFeatureBuilder.tag = tag + "trg_"
        self.triggerFeatureBuilder.buildFeatures(targetToken)

        # Given entities are flagged as proteins, all others as (nested) events
        if argEntityIsGiven(targetEntity) if False else targetEntity.get("given") == "True":
            kindFeatures = [tag+"Protein"]
        else:
            kindFeatures = [tag+"Event", "nestingEvent"]
        for featureName in kindFeatures:
            self.setFeature(featureName, 1)
        self.setFeature(tag+"_"+targetEntity.get("type"), 1)
    
    def buildEdgeFeatures(self, sentenceGraph, paths, features, eventToken, argToken, tag):
        """
        Add the features of the path between the event and argument tokens.

        The first path returned by paths.getPaths is used. If both tokens are
        the same or no path exists, the two tokens themselves form the path.
        All features generated by the edge feature builder get the prefix
        tag + "_", and the feature groups can be switched off one by one with
        the "disable_..." styles. The builder is detached from the feature
        vector before returning.
        """
        edgeBuilder = self.multiEdgeFeatureBuilder
        edgeBuilder.tag = tag + "_"
        edgeBuilder.setFeatureVector(features, None, None, False)

        self.setFeature(tag+"_present", 1)

        foundPaths = paths.getPaths(eventToken, argToken)
        if eventToken != argToken and len(foundPaths) > 0:
            path = foundPaths[0]
        else:
            path = [eventToken, argToken]

        styles = self.styles
        if not styles["disable_entity_features"]:
            edgeBuilder.buildEntityFeatures(sentenceGraph)
        edgeBuilder.buildPathLengthFeatures(path)
        if not styles["disable_single_element_features"]:
            edgeBuilder.buildSingleElementFeatures(path, sentenceGraph)
        if not styles["disable_ngram_features"]:
            for gramLength in (2, 3, 4): # remove for fast
                edgeBuilder.buildPathGrams(gramLength, path, sentenceGraph)
        if not styles["disable_path_edge_features"]:
            edgeBuilder.buildPathEdgeFeatures(path, sentenceGraph)

        edgeBuilder.setFeatureVector(None, None, None, False)
        edgeBuilder.tag = ""
    
    def buildInterArgumentBagOfWords(self, arguments, sentenceGraph):
        """
        Add bag-of-words features for the tokens between the arguments.

        The head tokens of the arguments' "e2" entities define a range of
        token positions. "argBoWRange" features describe its width, and each
        token strictly inside the range that is neither an entity head nor a
        name adds an "argBoW_<text>" feature. A single such word is also
        marked with an "argBoWonly_<text>" feature.

        Nothing is generated for fewer than two arguments, or when none of
        the arguments points to an entity of this sentence.
        """
        if len(arguments) < 2:
            return

        tokens = sentenceGraph.tokens
        positionByToken = dict((tokens[position], position) for position in range(len(tokens)))

        headPositions = set()
        for arg in arguments:
            targetId = arg.get("e2")
            if targetId not in sentenceGraph.entitiesById: # skip intersentence interactions
                continue
            targetEntity = sentenceGraph.entitiesById[targetId]
            headToken = sentenceGraph.entityHeadTokenByEntity[targetEntity]
            headPositions.add(positionByToken[headToken])
        if len(headPositions) < 1:
            return

        firstPosition = min(headPositions)
        lastPosition = max(headPositions)
        width = lastPosition - firstPosition
        self.setFeature("argBoWRange", width)
        self.setFeature("argBoWRange_" + str(width), 1)

        # Distinct texts of the plain tokens between the outermost argument heads
        words = set()
        for position in range(firstPosition + 1, lastPosition):
            token = tokens[position]
            if len(sentenceGraph.tokenIsEntityHead[token]) != 0 or sentenceGraph.tokenIsName[token]:
                continue
            words.add(token.get("text"))
        words = sorted(words)

        separators = ["/", "-"]
        for word in words:
            self.setFeature("argBoW_"+word, 1)
            if word in separators:
                self.setFeature("argBoW_slashOrHyphen", 1)
        if len(words) == 1:
            onlyWord = words[0]
            self.setFeature("argBoWonly_"+onlyWord, 1)
            if onlyWord in separators:
                self.setFeature("argBoWonly_slashOrHyphen", 1)
class UnmergingExampleBuilder(ExampleBuilder):
    """
    This example builder makes unmerging examples, i.e. examples describing
    potential events.
    """
    #def __init__(self, style="trigger_features:typed:directed:no_linear:entities:genia_limits:noMasking:maxFeatures", length=None, types=[], featureSet=None, classSet=None):
    def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
        """
        Set up the id sets, the style parameters and the feature builders.

        style -- style definition, handed to getParameters
        length -- stored as self.pathLengths; must be None (asserted)
        types -- stored as self.types; a new empty list is used when omitted,
                 so instances never share one default list
        featureSet -- id set for feature names, IdSet() when None
        classSet -- id set for class names, IdSet(1) when None; the class
                    "neg" must map to id 1 (asserted)
        """
        if featureSet is None:
            featureSet = IdSet()
        if classSet is None:
            classSet = IdSet(1)
        assert classSet.getId("neg") == 1

        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)

        # Style flags that default to None
        defaultNone = ["binary", "trigger_features","typed","directed","no_linear","entities","genia_limits",
            "noAnnType", "noMasking", "maxFeatures", "no_merge", "disable_entity_features", 
            "disable_single_element_features", "disable_ngram_features", "disable_path_edge_features"]
        defaultParameters = dict.fromkeys(defaultNone)
        defaultParameters["keep_intersentence"] = False
        defaultParameters["keep_intersentence_gold"] = True
        # NOTE(review): the value assigned here is replaced on the very next
        # line; the call is kept because _setDefaultParameters presumably
        # registers the defaults that getParameters relies on -- confirm.
        self.styles = self._setDefaultParameters(defaultParameters)
        self.styles = self.getParameters(style)

        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        self.multiEdgeFeatureBuilder.noAnnType = self.styles["noAnnType"]
        self.multiEdgeFeatureBuilder.maskNamedEntities = not self.styles["noMasking"]
        self.multiEdgeFeatureBuilder.maximum = self.styles["maxFeatures"]

        self.pathLengths = length
        assert self.pathLengths is None
        # A mutable default argument would be one list shared by every
        # instance, so the default is created per instance instead.
        self.types = [] if types is None else types

        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
        self.triggerFeatureBuilder.useNonNameEntities = True
    
    def getInteractionEdgeLengths(self, sentenceGraph, paths):
        """
        Return dependency and linear length of all interaction edges
        (measured between the two tokens).

        The result maps each interaction of sentenceGraph.interactions to a
        tuple (interaction, pathLength, linLength, t2Pos):

        pathLength -- length of the shortest path returned by
                      paths.getPaths for the two head tokens, or 999999 when
                      both entities share a head token or no path exists
        linLength -- absolute difference of the two token positions
        t2Pos -- position of the "e2" head token in sentenceGraph.tokens
                 (-1 if the token is not in that list)

        An interaction whose "e2" entity is not in sentenceGraph.entitiesById
        (intersentence interaction) gets -count for all three numbers, where
        count is the number of in-sentence interactions processed so far.
        """
        # Token positions are looked up for every interaction, so index the
        # sentence once instead of rescanning it each time. Tokens are already
        # used as dictionary keys elsewhere in this class; each token object
        # is assumed to occur once in sentenceGraph.tokens.
        tokenPositions = {}
        for position, token in enumerate(sentenceGraph.tokens):
            tokenPositions[token] = position

        interactionLengths = {}
        count = 0
        for interaction in sentenceGraph.interactions:
            e1Id = interaction.get("e1")
            e2Id = interaction.get("e2")
            if e2Id not in sentenceGraph.entitiesById: # intersentence interaction
                interactionLengths[interaction] = (interaction, -count, -count, -count)
                continue
            e1 = sentenceGraph.entitiesById[e1Id]
            e2 = sentenceGraph.entitiesById[e2Id]
            t1 = sentenceGraph.entityHeadTokenByEntity[e1]
            t2 = sentenceGraph.entityHeadTokenByEntity[e2]
            # Dependency path length
            pathLength = 999999 # no dependency path: more than any real path
            if t1 != t2:
                tokenPaths = paths.getPaths(t1, t2)
                if len(tokenPaths) > 0:
                    pathLength = min(len(x) for x in tokenPaths)
            # Linear distance
            t1Pos = tokenPositions.get(t1, -1)
            t2Pos = tokenPositions.get(t2, -1)
            linLength = abs(t1Pos - t2Pos)
            interactionLengths[interaction] = (interaction, pathLength, linLength, t2Pos)
            count += 1
        return interactionLengths
    
    def eventIsGold(self, entity, arguments, sentenceGraph, goldGraph, goldEntitiesByOffset, allGoldInteractions):
        """
        Tell whether a candidate event corresponds to an event in the gold data.

        entity -- the candidate event's trigger entity
        arguments -- the candidate's argument interactions
        sentenceGraph -- graph whose entitiesById resolves the candidate
                         arguments' "e2" ids
        goldGraph -- graph whose entitiesById resolves the gold "e2" ids
        goldEntitiesByOffset -- maps a "headOffset" to the gold entities there
        allGoldInteractions -- the gold interactions to search

        Returns True if some gold entity at the same head offset has the same
        type, the same number of event arguments per argument type, and for
        every in-sentence candidate argument a gold argument of the same type
        whose target has the same head offset and type. Candidate arguments
        pointing outside the sentence are assumed to be correct. Returns
        False otherwise, including when no gold entity shares the offset.

        NOTE(review): several candidate arguments may be matched against the
        same gold argument; the matching is not one-to-one.
        """
        offset = entity.get("headOffset")
        if offset not in goldEntitiesByOffset:
            return False
        eType = entity.get("type")

        # Number of candidate edges per type (the same for every gold entity)
        argTypeCounts = {}
        for argument in arguments:
            argType = argument.get("type")
            argTypeCounts[argType] = argTypeCounts.get(argType, 0) + 1

        # Check all gold entities for a match
        for goldEntity in goldEntitiesByOffset[offset]:
            # The entity type must match
            if goldEntity.get("type") != eType:
                continue
            goldEntityId = goldEntity.get("id")

            # Collect the gold event arguments leaving this entity
            goldInteractions = []
            for goldInteraction in allGoldInteractions:
                if goldInteraction.get("e1") == goldEntityId and goldInteraction.get("event") == "True":
                    goldInteractions.append(goldInteraction)

            # Argument count rules: total and per-type edge counts must agree
            if len(goldInteractions) != len(arguments):
                continue
            goldTypeCounts = {}
            for goldInteraction in goldInteractions:
                argType = goldInteraction.get("type")
                goldTypeCounts[argType] = goldTypeCounts.get(argType, 0) + 1
            if argTypeCounts != goldTypeCounts:
                continue

            # Exact argument matching
            for argument in arguments:
                e2 = argument.get("e2")
                if e2 not in sentenceGraph.entitiesById: # intersentence argument, assumed to be correct
                    continue
                e2Entity = sentenceGraph.entitiesById[e2]
                e2Offset = e2Entity.get("headOffset")
                e2Type = e2Entity.get("type")
                argType = argument.get("type")

                found = False
                for goldInteraction in goldInteractions:
                    if goldInteraction.get("type") != argType:
                        continue
                    goldE2 = goldInteraction.get("e2")
                    if goldE2 not in goldGraph.entitiesById: # assumed to be an intersentence gold interaction
                        continue
                    goldE2Entity = goldGraph.entitiesById[goldE2]
                    if goldE2Entity.get("headOffset") == e2Offset and goldE2Entity.get("type") == e2Type:
                        found = True
                        break
                if not found: # this edge did not have a corresponding gold edge
                    break
            else:
                # Every argument was matched: the event is in gold
                return True

        return False
    
    def sortInteractionsById(self, interactions):
        """
        Return the interactions as a list ordered by the number that follows
        the last ".i" in their "id" attribute.

        The sort is stable and keyed on that number only, so interactions
        with equal numbers keep their input order and the interaction
        objects themselves are never compared with each other.
        """
        # The order of the interactions affects the order of the unmerging examples, and this 
        # affects performance. It's not clear whether this is what really happens, or whether
        # the order of the interactions has some effect on the consistency of the unmerging
        # features (it shouldn't). However, in case it does, this function is left here for now,
        # although it shouldn't be needed at all. In any case the impact is minimal, for GE
        # 53.22 vs 53.28 on the development set.
        def interactionNumber(interaction):
            return int(interaction.get("id").split(".i")[-1])

        return sorted(interactions, key=interactionNumber)
    
    def processDocument(self, sentences, goldSentences, outfile, structureAnalyzer=None):
        """
        Index the document's entities by id, then process every sentence.

        self.documentEntitiesById is rebuilt from the entities of all
        sentences; a repeated entity id trips an assertion. Each sentence is
        then passed to processSentence with the gold sentence at the same
        index (None when goldSentences is None), after a progress update.
        """
        entitiesById = {}
        self.documentEntitiesById = entitiesById
        for sentence in sentences:
            for entity in sentence.entities:
                entityId = entity.get("id")
                assert entityId not in entitiesById
                entitiesById[entityId] = entity

        for index, sentence in enumerate(sentences):
            if goldSentences is None:
                goldSentence = None
            else:
                goldSentence = goldSentences[index]
            message = "Building examples (" + sentence.sentence.get("id") + "): "
            self.progress.update(1, message)
            self.processSentence(sentence, outfile, goldSentence, structureAnalyzer=structureAnalyzer)
    
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
        """
        Build unmerging examples for a single sentence and write them to outfile.

        For every entity, each combination of its outgoing event-argument
        interactions that structureAnalyzer allows is a candidate event.
        Candidates that pass structureAnalyzer.isValidEvent are written with
        ExampleUtils.appendExamples (see Core/ExampleUtils for the example
        format).

        sentenceGraph -- the sentence to build examples for
        outfile -- handed to ExampleUtils.appendExamples
        goldGraph -- gold standard sentence; when None every example gets
                     the class "neg"
        structureAnalyzer -- used without a None check for the valid edge
                             types, the argument limits and event validation

        Returns the number of examples written (not a list of examples).
        """
        self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
        self.triggerFeatureBuilder.initSentence(sentenceGraph)        
        
        exampleIndex = 0
        undirected = sentenceGraph.dependencyGraph.toUndirected()
        paths = undirected
        
        # Get argument order (read later by buildExample for Binding events)
        self.interactionLenghts = self.getInteractionEdgeLengths(sentenceGraph, paths)
        
        # Map tokens to character offsets
        # NOTE(review): tokenByOffset is not read again in this method; the loop
        # still matters because of the tokenization assert it contains.
        tokenByOffset = {}
        for i in range(len(sentenceGraph.tokens)):
            token = sentenceGraph.tokens[i]
            if goldGraph != None: # check that the tokenizations match
                goldToken = goldGraph.tokens[i]
                assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset")
            tokenByOffset[token.get("charOffset")] = token.get("id")
        
        # Map gold entities to their head offsets
        goldEntitiesByOffset = {}
        if goldGraph != None:
            for entity in goldGraph.entities:
                offset = entity.get("headOffset")
                assert offset != None
                if not goldEntitiesByOffset.has_key(offset):
                    goldEntitiesByOffset[offset] = []
                goldEntitiesByOffset[offset].append(entity)
        
        # Choose between the sentence's own entities and the merged ones
        if self.styles["no_merge"]:
            mergeInput = False
            entities = sentenceGraph.entities
        else:
            mergeInput = True
            sentenceGraph.mergeInteractionGraph(True)
            entities = sentenceGraph.mergedEntities
            self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))
        
        exampleIndex = 0
        for entity in entities: # sentenceGraph.entities:
            if type(entity) in types.StringTypes: # dummy entity for intersentence interactions
                continue
            
            eType = entity.get("type")
            assert eType != None, entity.attrib
            eType = str(eType)
            
            # The third item of each getOutInteractions result is used as the interaction
            interactions = [x[2] for x in sentenceGraph.getOutInteractions(entity, mergeInput)]
            interactions = self.sortInteractionsById(interactions)
            interactionCounts = defaultdict(int)
            validInteractionsByType = defaultdict(list)
            for interaction in interactions:
                # Only interactions marked as event arguments are considered
                if interaction.get("event") != "True":
                    continue
                e1 = sentenceGraph.entitiesById[interaction.get("e1")]
                if interaction.get("e2") in sentenceGraph.entitiesById:
                    e2 = sentenceGraph.entitiesById[interaction.get("e2")]
                    if interaction.get("type") in structureAnalyzer.getValidEdgeTypes(e1.get("type"), e2.get("type")):
                        validInteractionsByType[interaction.get("type")].append(interaction)
                else: # intersentence: the target cannot be checked, so it is kept
                    validInteractionsByType[interaction.get("type")].append(interaction)
                # Counted whether or not the edge type was valid
                interactionCounts[interaction.get("type")] += 1
            interactionCountString = ",".join([key + "=" + str(interactionCounts[key]) for key in sorted(interactionCounts.keys())])
            #argCombinations = self.getArgumentCombinations(eType, interactions, entity.get("id"))
            intCombinations = []
            validIntTypeCount = 0
            maxArgCount = 0
            if self.debug:
                print >> sys.stderr, entity.get("id"), entity.get("type"), "int:" + interactionCountString, "validInt:" + str(validInteractionsByType)
            for intType in sorted(validInteractionsByType.keys()): # for each argument type the event can have
                validIntTypeCount += 1
                intCombinations.append([])
                minArgs, maxArgs = structureAnalyzer.getArgLimits(entity.get("type"), intType)
                if maxArgs > maxArgCount:
                    maxArgCount = maxArgs
                #if maxArgs > 1: # allow any number of arguments for cases like Binding
                #    maxArgs = len(validInteractionsByType[intType])
                for combLen in range(minArgs, maxArgs+1): # for each valid argument count, get all possible combinations. note that there may be a zero-length combination
                    for singleTypeArgCombination in combinations(validInteractionsByType[intType], combLen):
                        intCombinations[-1].append(singleTypeArgCombination)
                # e.g. theme:[a,b], cause:[d] = [[
            # intCombinations now contains a list of lists, each of which has a tuple for each valid combination
            # of one argument type. Next, we'll make all valid combinations of multiple argument types
            if self.debug:
                print >> sys.stderr, " ", "intCombinations", intCombinations
            # presumably one choice per argument type, combined in every possible way -- confirm in combine.combine
            argCombinations = combine.combine(*intCombinations)
            if self.debug:
                print >> sys.stderr, " ", "argCombinations", argCombinations
            # Join the per-type tuples of each combination into one flat tuple
            for i in range(len(argCombinations)):
                argCombinations[i] = sum(argCombinations[i], ())
            #sum(argCombinations, []) # flatten nested list
            if self.debug:
                print >> sys.stderr, " ", "argCombinations flat", argCombinations
            
            for argCombination in argCombinations:
                # Originally binary classification
                if goldGraph != None:
                    isGoldEvent = self.eventIsGold(entity, argCombination, sentenceGraph, goldGraph, goldEntitiesByOffset, goldGraph.interactions)
                    #if eType == "Binding":
                    #    print argCombination[0].get("e1"), len(argCombination), isGoldEvent
                else:
                    isGoldEvent = False
                # Named (multi-)class
                if isGoldEvent:
#                    category = "zeroArg"
#                    if validIntTypeCount == 1:
#                        category = "singleArg" # event has 0-1 arguments (old simple6)
#                    if validIntTypeCount > 1:
#                        category = "multiType" # event has arguments of several types, 0-1 of each (old Regulation)
#                    if maxArgCount > 1:
#                        category = "multiArg" # event can have 2-n of at least one argument type (old Binding)
                    if self.styles["binary"]:
                        category = "pos"
                    else:
                        category = entity.get("type")
                        
                    assert category != None
                else:
                    category = "neg"
                self.exampleStats.beginExample(category)
                
                issues = defaultdict(int)
                # early out for proteins etc.
                if validIntTypeCount == 0 and entity.get("given") == "True":
                    self.exampleStats.filter("given-leaf:" + entity.get("type"))
                    if self.debug:
                        print >> sys.stderr, " ", category +"("+eType+")", "arg combination", argCombination, "LEAF"
                elif not structureAnalyzer.isValidEvent(entity, argCombination, self.documentEntitiesById, issues=issues):
                    # Record every reason reported for rejecting the candidate
                    for key in issues:
                        self.exampleStats.filter(key)
                    if self.debug:
                        print >> sys.stderr, " ", category, "arg combination", argCombination, "INVALID", issues
                else:
                    if self.debug:
                        print >> sys.stderr, " ", category, "arg combination", argCombination, "VALID"                
                    # NOTE(review): this local is never read; the example's features come from buildExample
                    features = {}
                    argString = ""
                    for arg in argCombination:
                        argString += "," + arg.get("type") + "=" + arg.get("id")
                    # argString[1:] drops the leading comma
                    extra = {"xtype":"um","e":entity.get("id"),"i":argString[1:],"etype":eType,"class":category}
                    extra["allInt"] = interactionCountString
                    assert type(extra["etype"]) in types.StringTypes, extra
                    assert type(extra["class"]) in types.StringTypes, category
                    assert type(extra["i"]) in types.StringTypes, argString
                    # buildExample leaves slots 0, 1 and 3 as None; they are filled in here
                    example = self.buildExample(sentenceGraph, paths, entity, argCombination, interactions)
                    example[0] = sentenceGraph.getSentenceId()+".x"+str(exampleIndex)
                    example[1] = self.classSet.getId(category)
                    example[3] = extra
                    #examples.append( example )
                    ExampleUtils.appendExamples([example], outfile)
                    exampleIndex += 1
                self.exampleStats.endExample()
            
        #return examples
        return exampleIndex
    
    def buildExample(self, sentenceGraph, paths, eventEntity, argCombination, allInteractions):
        """
        Build the feature vector for one candidate event.

        eventEntity is the trigger, argCombination the interactions proposed
        as the event's arguments and allInteractions all of the trigger's
        interactions; those not in argCombination are described as context.

        Returns [None, None, features, None]; the None slots are left for the
        caller to fill in.
        """
        # TODO (from the original author): add also features for arguments
        # present, but not in this combination
        features = {}
        self.features = features

        def setCountFeatures(name, count):
            # The count as a numeric feature, plus an indicator for its exact value
            self.setFeature(name, count)
            self.setFeature(name + "_" + str(count), 1)

        self.buildInterArgumentBagOfWords(argCombination, sentenceGraph)

        eventEntityType = eventEntity.get("type")
        isBinding = eventEntityType == "Binding"
        # For Binding events every interaction also gets a rank-specific tag
        interactionIndex = {}
        if isBinding:
            ranked = [self.interactionLenghts[interaction] for interaction in allInteractions]
            ranked.sort(compareInteractionPrecedence)
            for rank, lengths in enumerate(ranked):
                interactionIndex[lengths[0]] = rank

        # Features of the trigger token itself
        eventToken = sentenceGraph.entityHeadTokenByEntity[eventEntity]
        triggerBuilder = self.triggerFeatureBuilder
        triggerBuilder.setFeatureVector(self.features)
        triggerBuilder.tag = "trg_"
        triggerBuilder.buildFeatures(eventToken)
        triggerBuilder.tag = None

        # Current example's edge combination
        argThemeCount = 0
        argCauseCount = 0
        argCounts = {}
        for arg in argCombination:
            argType = arg.get("type")
            if argType == "Theme":
                argThemeCount += 1
                self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, arg, "argTheme")
                if isBinding:
                    rankedTag = "argTheme" + str(interactionIndex[arg])
                    self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, arg, rankedTag)
            elif argType == "Cause":
                argCauseCount += 1
                self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, arg, "argCause")
            else:
                if argType not in argCounts:
                    argCounts[argType] = 0
                self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, arg, "arg" + argType)
                argCounts[argType] += 1

        # Edge group context: interactions outside the current combination.
        # Everything that is not a Theme is counted and tagged as a Cause.
        contextThemeCount = 0
        contextCauseCount = 0
        for interaction in allInteractions:
            if interaction in argCombination:
                continue
            if interaction.get("type") == "Theme":
                contextThemeCount += 1
                self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, interaction, "conTheme")
                if isBinding:
                    rankedTag = "conTheme" + str(interactionIndex[interaction])
                    self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, interaction, rankedTag)
            else:
                contextCauseCount += 1
                self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, interaction, "conCause")

        setCountFeatures("argCount", len(argCombination))
        setCountFeatures("interactionCount", len(allInteractions))
        setCountFeatures("argThemeCount", argThemeCount)
        setCountFeatures("argCauseCount", argCauseCount)
        for key in sorted(argCounts.keys()):
            setCountFeatures("arg" + key + "Count", argCounts[key])
        setCountFeatures("interactionThemeCount", contextThemeCount)
        setCountFeatures("interactionCauseCount", contextCauseCount)

        # Detach the trigger feature builder from this example's vector
        triggerBuilder.tag = ""
        triggerBuilder.setFeatureVector(None)

        return [None, None, features, None]

    def buildArgumentFeatures(self, sentenceGraph, paths, features, eventToken, arg, tag):
        """
        Add the features describing one argument edge, all prefixed with tag.

        Covers the edge between eventToken and the argument's head token, the
        trigger features of that head token and the argument entity's type.
        Does nothing when the argument's "e2" entity is not in
        sentenceGraph.entitiesById (intersentence argument).
        """
        targetId = arg.get("e2")
        if targetId not in sentenceGraph.entitiesById: # intersentence argument
            return
        targetEntity = sentenceGraph.entitiesById[targetId]
        targetToken = sentenceGraph.entityHeadTokenByEntity[targetEntity]

        self.buildEdgeFeatures(sentenceGraph, paths, features, eventToken, targetToken, tag)

        triggerBuilder = self.triggerFeatureBuilder
        triggerBuilder.tag = tag + "trg_"
        triggerBuilder.buildFeatures(targetToken)

        # Entities marked as given are tagged "Protein"; any other argument
        # is tagged "Event", which makes this a nesting event
        if targetEntity.get("given") == "True":
            entityKind = "Protein"
        else:
            entityKind = "Event"
        self.setFeature(tag + entityKind, 1)
        if entityKind == "Event":
            self.setFeature("nestingEvent", 1)
        self.setFeature(tag + "_" + targetEntity.get("type"), 1)
    def buildEdgeFeatures(self, sentenceGraph, paths, features, eventToken, argToken, tag):
        """
        Add the path features between the event token and an argument token.

        The multi-edge feature builder writes into features with the prefix
        tag + "_". The first path returned by paths.getPaths is used; when
        the two tokens are the same or there is no path, the two tokens
        themselves stand in for the path. The "disable_*" styles switch off
        individual feature groups. The builder's vector and tag are reset
        before returning.
        """
        edgeBuilder = self.multiEdgeFeatureBuilder
        edgeBuilder.tag = tag + "_"
        edgeBuilder.setFeatureVector(features, None, None, False)

        self.setFeature(tag + "_present", 1)

        candidates = paths.getPaths(eventToken, argToken)
        path = [eventToken, argToken]
        if eventToken != argToken and len(candidates) > 0:
            path = candidates[0]

        if not self.styles["disable_entity_features"]:
            edgeBuilder.buildEntityFeatures(sentenceGraph)
        edgeBuilder.buildPathLengthFeatures(path)
        if not self.styles["disable_single_element_features"]:
            edgeBuilder.buildSingleElementFeatures(path, sentenceGraph)
        if not self.styles["disable_ngram_features"]:
            for gramLength in (2, 3, 4): # remove for fast
                edgeBuilder.buildPathGrams(gramLength, path, sentenceGraph)
        if not self.styles["disable_path_edge_features"]:
            edgeBuilder.buildPathEdgeFeatures(path, sentenceGraph)

        edgeBuilder.setFeatureVector(None, None, None, False)
        edgeBuilder.tag = ""
    
    def buildInterArgumentBagOfWords(self, arguments, sentenceGraph):
        """
        Set bag-of-words features for the tokens lying between the arguments.

        The span runs from the leftmost to the rightmost head token of those
        arguments whose "e2" entity is found in sentenceGraph.entitiesById.
        Tokens strictly inside the span contribute their "text" unless
        sentenceGraph.tokenIsEntityHead lists something for them or
        sentenceGraph.tokenIsName flags them. Nothing is set when fewer than
        two arguments are given or when no argument points into the sentence.
        """
        if len(arguments) < 2:
            return

        # Position of each token object within the sentence
        positionOf = dict((token, position) for position, token in enumerate(sentenceGraph.tokens))

        headPositions = set()
        for argument in arguments:
            targetId = argument.get("e2")
            if targetId not in sentenceGraph.entitiesById:
                continue # skip intersentence interactions
            targetEntity = sentenceGraph.entitiesById[targetId]
            headToken = sentenceGraph.entityHeadTokenByEntity[targetEntity]
            headPositions.add(positionOf[headToken])
        if len(headPositions) < 1:
            return

        first = min(headPositions)
        last = max(headPositions)
        span = last - first
        self.setFeature("argBoWRange", span)
        self.setFeature("argBoWRange_" + str(span), 1)

        # Collect the distinct texts of the plain tokens inside the span
        between = set()
        for position in range(first + 1, last):
            token = sentenceGraph.tokens[position]
            if len(sentenceGraph.tokenIsEntityHead[token]) != 0:
                continue
            if sentenceGraph.tokenIsName[token]:
                continue
            between.add(token.get("text"))
        words = sorted(between)

        separators = ["/", "-"]
        for word in words:
            self.setFeature("argBoW_" + word, 1)
            if word in separators:
                self.setFeature("argBoW_slashOrHyphen", 1)
        # A single word between the arguments gets its own, stronger feature
        if len(words) == 1:
            onlyWord = words[0]
            self.setFeature("argBoWonly_" + onlyWord, 1)
            if onlyWord in separators:
                self.setFeature("argBoWonly_slashOrHyphen", 1)
class IntersentenceEdgeExampleBuilder(ExampleBuilder):
    """
    This example builder makes edge examples, i.e. examples describing
    the event arguments.
    """
    def __init__(self,
                 style=None,
                 length=None,
                 types=None,
                 featureSet=None,
                 classSet=None):
        """
        Create the example builder and its feature builders.

        style: collection of style names; defaults to
               ["typed", "directed", "headsOnly"]
        length: accepted for signature compatibility, not used here
        types: stored as self.types; defaults to an empty list
        featureSet: IdSet for feature names (a new one is made if None)
        classSet: IdSet for class names (a new one starting from 1 is
                  made if None); "neg" must map to class id 1
        """
        # Build the defaults per call so that instances never share one list
        if style is None:
            style = ["typed", "directed", "headsOnly"]
        if types is None:
            types = []
        if featureSet is None:
            featureSet = IdSet()
        if classSet is None:
            classSet = IdSet(1)
        # The lookup is done outside the assert so that it is still executed
        # when assertions are disabled (python -O).
        # NOTE(review): presumably getId assigns an id on first lookup, which
        # is what reserves id 1 for "neg" -- confirm against IdSet.
        negId = classSet.getId("neg")
        assert negId == 1

        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        self.styles = style

        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        if "noAnnType" in self.styles:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True
        self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        self.types = types

#    @classmethod
#    def run(cls, input, output, parse, tokenization, style, idFileTag=None):
#        """
#        An interface for running the example builder without needing to create a class
#        """
#        classSet, featureSet = cls.getIdSets(idFileTag)
#        if style != None:
#            e = MultiEdgeExampleBuilder(style=style, classSet=classSet, featureSet=featureSet)
#        else:
#            e = MultiEdgeExampleBuilder(classSet=classSet, featureSet=featureSet)
#        sentences = cls.getSentences(input, parse, tokenization)
#        e.buildExamplesForSentences(sentences, output, idFileTag)

    @classmethod
    def run(cls, input, output, parse, tokenization, style, idFileTag=None):
        """
        An interface for running the example builder without needing to create a class

        input: a corpus that SentenceGraph.loadCorpus can load, or a list
               that is passed on as the documents to build examples for
        output: name of the example file to write
        parse, tokenization: passed to SentenceGraph.loadCorpus
        style: style names for the builder, or None for the defaults
        idFileTag: passed to getIdSets and used as the prefix of the saved
                   class and feature name files
        """
        classSet, featureSet = cls.getIdSets(idFileTag)
        if style != None:
            e = IntersentenceEdgeExampleBuilder(style=style,
                                                classSet=classSet,
                                                featureSet=featureSet)
        else:
            e = IntersentenceEdgeExampleBuilder(classSet=classSet,
                                                featureSet=featureSet)
        # Load documents
        if isinstance(input, list):
            # The list is used as it is. This branch used to stop on an
            # undefined name and never reached the example builder.
            # NOTE(review): assumes the list has the same structure as
            # corpusElements.documentSentences -- confirm with callers.
            documentSentences = input
        else:
            # Load corpus and make sentence graphs
            corpusElements = SentenceGraph.loadCorpus(input, parse,
                                                      tokenization, False,
                                                      True)
            documentSentences = corpusElements.documentSentences
        # run examplebuilder
        e.buildExamplesForDocuments(documentSentences, output, idFileTag)

    def buildExamplesForDocuments(self,
                                  documentSentences,
                                  output,
                                  idFileTag=None):
        """
        Build the examples of every document and append them to one file.

        documentSentences: list of documents, each a list of sentence
                           objects; the first sentence of a document is
                           used for the progress message
        output: name of the file the examples are written to
        idFileTag: if not None, the class and feature names are saved to
                   idFileTag + ".class_names" and idFileTag + ".feature_names"
        """
        counter = ProgressCounter(len(documentSentences), "Build examples")

        exampleCount = 0
        outfile = open(output, "wt")
        try:
            for document in documentSentences:
                counter.update(
                    1,
                    "Building examples (" + document[0].sentence.get("id") + "): ")
                examples = self.buildExamples(document)
                exampleCount += len(examples)
                ExampleUtils.appendExamples(examples, outfile)
        finally:
            # Close the file also when building the examples fails
            outfile.close()

        print >> sys.stderr, "Examples built:", exampleCount
        print >> sys.stderr, "Features:", len(self.featureSet.getNames())
        #IF LOCAL
        if self.exampleStats.getExampleCount() > 0:
            self.exampleStats.printStats()
        #ENDIF
        # Save Ids
        if idFileTag != None:
            print >> sys.stderr, "Saving class names to", idFileTag + ".class_names"
            self.classSet.write(idFileTag + ".class_names")
            print >> sys.stderr, "Saving feature names to", idFileTag + ".feature_names"
            self.featureSet.write(idFileTag + ".feature_names")

    def getCategoryName(self, sentence1, sentence2, e1, e2, directed=True):
        """
        Example class. Multiple overlapping edges create a merged type.

        The intersentence interactions of both sentences that lead from e1
        to e2 are collected, and their distinct types are joined in sorted
        order with "---". Returns "neg" when there is no such interaction.
        The 'directed' argument is not used.
        """
        sourceId = e1.get("id")
        targetId = e2.get("id")
        candidates = sentence1.interSentenceInteractions + sentence2.interSentenceInteractions
        typeNames = set()
        for candidate in candidates:
            if candidate.get("e1") != sourceId:
                continue
            if candidate.get("e2") != targetId:
                continue
            typeNames.add(candidate.get("type"))
        # Empty names add nothing to the label, so they get no separator either
        merged = "---".join([name for name in sorted(typeNames) if name != ""])
        if merged == "":
            return "neg"
        return merged

    def isPotentialCOInteraction(self, e1, e2):
        """
        Return True only when the type of both entities is "Exp".
        """
        for entity in (e1, e2):
            if entity.get("type") != "Exp":
                return False
        return True

    def buildExamples(self, documentSentences):
        """
        Build examples for the sentences of a single document. Returns a list of examples.
        See Core/ExampleUtils for example format.

        Candidate edges are generated in both directions between the
        entities of a sentence and the entities of the sentence following
        it (maxDistance = 1). Sentences without a sentence graph are skipped.

        NOTE(review): only the combination of the "entities" and "directed"
        styles can run to completion. The token-based and undirected
        branches still use names that are not defined in this method
        (sentenceGraph, paths) and call helpers with other argument lists.
        Their intended behaviour cannot be determined here, so they are left
        unchanged and marked below.
        """
        examples = []
        exampleIndex = 0

        # Prepare the per-sentence helpers that buildExample relies on
        for documentSentence in documentSentences:
            if documentSentence.sentenceGraph != None:
                documentSentence.sentenceGraph.undirected = documentSentence.sentenceGraph.dependencyGraph.toUndirected(
                )
                documentSentence.triggerFeatureBuilder = TriggerFeatureBuilder(
                    self.featureSet)
                documentSentence.triggerFeatureBuilder.useNonNameEntities = True
                documentSentence.triggerFeatureBuilder.initSentence(
                    documentSentence.sentenceGraph)

        # Generate examples based on interactions between entities or interactions between tokens
        maxDistance = 1
        for sentence1Index in range(len(documentSentences)):
            sentence1 = documentSentences[sentence1Index]
            if sentence1.sentenceGraph == None:
                continue
            for sentence2Index in range(
                    sentence1Index + 1,
                    min(sentence1Index + 1 + maxDistance,
                        len(documentSentences))):
                sentence2 = documentSentences[sentence2Index]
                if sentence2.sentenceGraph == None:
                    continue
                if "entities" in self.styles:
                    loopRange1 = len(sentence1.sentenceGraph.entities)
                    loopRange2 = len(sentence2.sentenceGraph.entities)
                else:
                    # NOTE(review): token-based mode; sentenceGraph is undefined here
                    loopRange = len(sentenceGraph.tokens)
                for i in range(loopRange1):
                    for j in range(loopRange2):
                        eI = None
                        eJ = None
                        if "entities" in self.styles:
                            eI = sentence1.sentenceGraph.entities[i]
                            eJ = sentence2.sentenceGraph.entities[j]
                            tI = sentence1.sentenceGraph.entityHeadTokenByEntity[
                                eI]
                            tJ = sentence2.sentenceGraph.entityHeadTokenByEntity[
                                eJ]
                            #if "no_ne_interactions" in self.styles and eI.get("isName") == "True" and eJ.get("isName") == "True":
                            #    continue
                            if eI.get("type") == "neg" or eJ.get(
                                    "type") == "neg":
                                continue
                        else:
                            # NOTE(review): token-based mode; sentenceGraph is undefined here
                            tI = sentenceGraph.tokens[i]
                            tJ = sentenceGraph.tokens[j]
                        # only consider paths between entities (NOTE! entities, not only named entities)
                        if "headsOnly" in self.styles:
                            # Each token is looked up in the graph of its own sentence
                            if (len(sentence1.sentenceGraph.tokenIsEntityHead[tI])
                                    == 0) or (len(
                                        sentence2.sentenceGraph.tokenIsEntityHead[tJ])
                                              == 0):
                                continue

                        if "directed" in self.styles:
                            # define forward
                            if "entities" in self.styles:
                                categoryName = self.getCategoryName(
                                    sentence1, sentence2, eI, eJ, True)
                            else:
                                # NOTE(review): token-based mode, see docstring
                                categoryName = self.getCategoryNameFromTokens(
                                    sentenceGraph, tI, tJ, True)
                            # make forward
                            self.exampleStats.beginExample(categoryName)
                            makeExample = True
                            if ("co_limits" in self.styles
                                ) and not self.isPotentialCOInteraction(
                                    eI, eJ):
                                makeExample = False
                                self.exampleStats.filter("co_limits")
                            if makeExample:
                                examples.append(
                                    self.buildExample(sentence1, sentence2,
                                                      categoryName,
                                                      exampleIndex, eI, eJ))
                                exampleIndex += 1
                            self.exampleStats.endExample()

                            # define reverse
                            if "entities" in self.styles:
                                categoryName = self.getCategoryName(
                                    sentence2, sentence1, eJ, eI, True)
                            else:
                                # NOTE(review): token-based mode, see docstring
                                categoryName = self.getCategoryNameFromTokens(
                                    sentenceGraph, tJ, tI, True)
                            # make reverse
                            self.exampleStats.beginExample(categoryName)
                            makeExample = True
                            if ("co_limits" in self.styles
                                ) and not self.isPotentialCOInteraction(
                                    eJ, eI):
                                makeExample = False
                                self.exampleStats.filter("co_limits")
                            if makeExample:
                                examples.append(
                                    self.buildExample(sentence2, sentence1,
                                                      categoryName,
                                                      exampleIndex, eJ, eI))
                                exampleIndex += 1
                            self.exampleStats.endExample()
                        else:
                            # NOTE(review): undirected mode; the calls below do not
                            # match getCategoryName/buildExample of this class and
                            # use the undefined names sentenceGraph and paths
                            if "entities" in self.styles:
                                categoryName = self.getCategoryName(
                                    sentenceGraph, eI, eJ, False)
                            else:
                                categoryName = self.getCategoryNameFromTokens(
                                    sentenceGraph, tI, tJ, False)
                            self.exampleStats.beginExample(categoryName)
                            forwardExample = self.buildExample(
                                tI, tJ, paths, sentenceGraph, categoryName,
                                exampleIndex, eI, eJ)
                            if not "graph_kernel" in self.styles:
                                reverseExample = self.buildExample(
                                    tJ, tI, paths, sentenceGraph, categoryName,
                                    exampleIndex, eJ, eI)
                                forwardExample[2].update(reverseExample[2])
                            examples.append(forwardExample)
                            exampleIndex += 1
                            self.exampleStats.endExample()

        return examples

    def getRootToken(self, sentenceGraph, token, visited=None, level=0):
        """
        Follow the incoming dependency edges upwards from 'token' and
        return a (token, level) tuple for the deepest point reached, where
        level is the number of edges followed plus the starting 'level'.

        visited: set of edges already followed; shared by the whole
                 recursion so that no edge is followed twice
        """
        if visited is None:
            visited = set()
        best = None
        for edge in sentenceGraph.dependencyGraph.getInEdges(token):
            if edge in visited:
                continue
            visited.add(edge)
            candidate = self.getRootToken(sentenceGraph, edge[0], visited,
                                          level + 1)
            # The first candidate wins a tie
            if best is None or candidate[1] > best[1]:
                best = candidate
        if best is None:
            # No unvisited incoming edge: this token ends the walk
            return (token, level)
        return best

    def buildExample(self,
                     sentence1,
                     sentence2,
                     categoryName,
                     exampleIndex,
                     entity1=None,
                     entity2=None):
        """
        Build a single directed example for the potential edge from
        entity1 (in sentence1) to entity2 (in sentence2).

        The features describe, for both entities, the path between the
        head token of the entity and the root token found by getRootToken
        in the entity's own sentence. Which feature groups are built
        depends on self.styles.

        Returns a tuple (example id, category, features, extra), where the
        example id is the id of sentence1 + ".x" + exampleIndex.
        """
        graph1 = sentence1.sentenceGraph
        graph2 = sentence2.sentenceGraph
        # define features
        features = {}
        headToken1 = graph1.entityHeadTokenByEntity[entity1]
        headToken2 = graph2.entityHeadTokenByEntity[entity2]
        rootToken1 = self.getRootToken(graph1, headToken1)[0]
        rootToken2 = self.getRootToken(graph2, headToken2)[0]
        pathsToRoot = graph1.undirected.getPaths(headToken1, rootToken1)
        pathsFromRoot = graph2.undirected.getPaths(rootToken2, headToken2)
        # Use the first path found, or just the two end points if there is none
        if len(pathsToRoot) > 0:
            path1 = pathsToRoot[0]
        else:
            path1 = [headToken1, rootToken1]
        if len(pathsFromRoot) > 0:
            path2 = pathsFromRoot[0]
        else:
            path2 = [rootToken2, headToken2]

        # build features
        if "trigger_features" in self.styles:  # F 85.52 -> 85.55
            for sentence, tag, headToken in ((sentence1, "trg1_", headToken1),
                                             (sentence2, "trg2_", headToken2)):
                triggerBuilder = sentence.triggerFeatureBuilder
                triggerBuilder.setFeatureVector(features)
                triggerBuilder.tag = tag
                triggerBuilder.buildFeatures(headToken)
                triggerBuilder.setFeatureVector(None)
        if "entity_type" in self.styles:
            features[self.featureSet.getId("e1_" + entity1.get("type"))] = 1
            features[self.featureSet.getId("e2_" + entity2.get("type"))] = 1
            distanceName = "distance_" + str(len(path1) + len(path2))
            features[self.featureSet.getId(distanceName)] = 1
        if "no_dependency" not in self.styles:
            edgeBuilder = self.multiEdgeFeatureBuilder
            pathSetups = ((path1, "e1Edge_", entity1, None, sentence1),
                          (path2, "e2Edge_", None, entity2, sentence2))
            for path, tag, sourceEntity, targetEntity, sentence in pathSetups:
                graph = sentence.sentenceGraph
                edgeBuilder.tag = tag
                edgeBuilder.setFeatureVector(features, sourceEntity,
                                             targetEntity)
                if "disable_entity_features" not in self.styles:
                    edgeBuilder.buildEntityFeatures(graph)
                edgeBuilder.buildPathLengthFeatures(path)
                if "disable_terminus_features" not in self.styles:
                    edgeBuilder.buildTerminusTokenFeatures(path, graph)
                if "disable_single_element_features" not in self.styles:
                    edgeBuilder.buildSingleElementFeatures(path, graph)
                if "disable_ngram_features" not in self.styles:
                    for gramLength in (2, 3, 4):
                        edgeBuilder.buildPathGrams(gramLength, path, graph)
                if "disable_path_edge_features" not in self.styles:
                    edgeBuilder.buildPathEdgeFeatures(path, graph)
                edgeBuilder.buildSentenceFeatures(graph)
                edgeBuilder.setFeatureVector(None)

        # define extra attributes
        extra = {}
        extra["xtype"] = "edge"
        extra["type"] = "i"
        extra["t1"] = headToken1.get("id")
        extra["t2"] = headToken2.get("id")
        if entity1 is not None:
            extra["e1"] = entity1.get("id")
        if entity2 is not None:
            extra["e2"] = entity2.get("id")
        extra["categoryName"] = categoryName

        # make example
        if "binary" not in self.styles:
            category = self.classSet.getId(categoryName)
        elif categoryName != "neg":
            category = 1
        else:
            category = -1

        exampleId = sentence1.sentence.get("id") + ".x" + str(exampleIndex)
        return (exampleId, category, features, extra)
예제 #16
0
class UnmergedEdgeExampleBuilder(ExampleBuilder):
    def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
        """
        Create the example builder and its feature builders.

        style: collection of style names; defaults to
               ["typed", "directed", "headsOnly"]
        length: stored as self.pathLengths; must be None
        types: stored as self.types; defaults to an empty list
        featureSet: IdSet for feature names (a new one is made if None)
        classSet: IdSet for class names (a new one starting from 1 is
                  made if None); "neg" must map to class id 1
        """
        # Build the defaults per call so that instances never share one list
        if style is None:
            style = ["typed", "directed", "headsOnly"]
        if types is None:
            types = []
        if featureSet is None:
            featureSet = IdSet()
        if classSet is None:
            classSet = IdSet(1)
        # The lookup is done outside the assert so that it is still executed
        # when assertions are disabled (python -O).
        # NOTE(review): presumably getId assigns an id on first lookup, which
        # is what reserves id 1 for "neg" -- confirm against IdSet.
        negId = classSet.getId("neg")
        assert( negId == 1 )

        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        self.styles = style

        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        if "noAnnType" in self.styles:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True
        self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        self.pathLengths = length
        assert(self.pathLengths == None)
        self.types = types
        if "random" in self.styles:
            from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
            self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)
        
        #self.outFile = open("exampleTempFile.txt","wt")

    @classmethod
    def run(cls, input, output, parse, tokenization, style, idFileTag=None):
        """
        Build the examples for 'input' and write them to 'output' without
        the caller having to create a builder.

        A style of None selects the default style of the builder. The class
        ids of the builder are printed once the examples have been written.
        """
        classSet, featureSet = cls.getIdSets(idFileTag)
        builderArguments = {"classSet": classSet, "featureSet": featureSet}
        if style != None:
            builderArguments["style"] = style
        e = UnmergedEdgeExampleBuilder(**builderArguments)
        sentences = cls.getSentences(input, parse, tokenization)
        e.buildExamplesForSentences(sentences, output, idFileTag)
        print(e.classSet.Ids)
    
    def definePredictedValueRange(self, sentences, elementName):
        """
        Pass the sentences and the element name on to the
        definePredictedValueRange method of the multi-edge feature builder.
        """
        edgeFeatureBuilder = self.multiEdgeFeatureBuilder
        edgeFeatureBuilder.definePredictedValueRange(sentences, elementName)
    
    def getPredictedValueRange(self):
        """
        Return the predictedRange attribute of the multi-edge feature builder.
        """
        predictedRange = self.multiEdgeFeatureBuilder.predictedRange
        return predictedRange
    
    def filterEdgesByType(self, edges, typesToInclude):
        """
        Return the edges whose "type" attribute is in typesToInclude.

        With an empty typesToInclude nothing is filtered and the 'edges'
        object itself is returned; otherwise the result is a new list.
        """
        if len(typesToInclude) > 0:
            return [edge for edge in edges if edge.get("type") in typesToInclude]
        return edges
        
    def getCategoryName(self, sentenceGraph, e1, e2, directed=True):
        """
        Return the class name of the potential edge from e1 to e2.

        e1, e2: (entity, level, isDummy) tuples
        directed: if False, interactions from e2 to e1 are included as well

        The distinct types of the matching interactions are joined in
        sorted order with "---". Returns "neg" if there is no interaction
        or if either of the two is a dummy.
        """
        # Dummies are potential entities that do not exist in the
        # training data. If either entity of an interaction is a dummy
        # it can't exist in the training data and is therefore a negative
        if e1[2] or e2[2]:
            return "neg"

        entity1 = e1[0]
        entity2 = e2[0]

        # Work on a copy, so that extending it below can never change a
        # list that belongs to the sentence graph
        interactions = list(sentenceGraph.getInteractions(entity1, entity2))
        if not directed:
            interactions.extend(sentenceGraph.getInteractions(entity2, entity1))

        typeNames = set()
        for interaction in interactions:
            typeNames.add(interaction.attrib["type"])
        # Empty names add nothing to the label, so they get no separator either
        categoryName = "---".join([name for name in sorted(typeNames) if name != ""])
        if categoryName != "":
            return categoryName
        else:
            return "neg"
    
    def preProcessExamples(self, allExamples):
        """
        Normalize the feature vectors of the examples if the "normalize"
        style is in use. Returns the 'allExamples' object it was given.
        """
        if "normalize" not in self.styles:
            return allExamples
        print >> sys.stderr, " Normalizing feature vectors"
        ExampleUtils.normalizeFeatureVectors(allExamples)
        return allExamples
    
    def isPotentialGeniaInteraction(self, e1, e2):
        """
        Return False if the "isName" attribute of e1 is "True", and True
        otherwise. The second entity is not examined.
        """
        return not (e1.get("isName") == "True")
    
    def nxMultiDiGraphToUndirected(self, graph):
        """
        Return a new NX10.MultiGraph that has the name, the nodes and the
        edges of 'graph'.
        """
        multiGraph = NX10.MultiGraph(name=graph.name)
        multiGraph.add_nodes_from(graph)
        edgeIterator = graph.edges_iter()
        multiGraph.add_edges_from(edgeIterator)
        return multiGraph
    
    def getInteractionEdgeLengths(self, sentenceGraph, paths):
        """
        Return dependency and linear length of all interaction edges
        (measured between the two tokens).

        paths: nested mapping, paths[token1][token2] is the path between
               the tokens (presumably a list of tokens -- verify against
               the caller)

        Returns a dictionary interaction -> (pathLength, linLength).
        pathLength is len() of the path, or 999999 if the head tokens are
        the same token or no path is known. linLength is the difference of
        the positions of the head tokens in sentenceGraph.tokens.
        """
        # The positions are the same for every interaction, so they are
        # looked up once; setdefault keeps the first position of a token
        tokenPositions = {}
        for position, token in enumerate(sentenceGraph.tokens):
            tokenPositions.setdefault(token, position)

        interactionLengths = {}
        for interaction in sentenceGraph.interactions:
            # Calculated interaction edge dep and lin length
            e1 = sentenceGraph.entitiesById[interaction.get("e1")]
            e2 = sentenceGraph.entitiesById[interaction.get("e2")]
            t1 = sentenceGraph.entityHeadTokenByEntity[e1]
            t2 = sentenceGraph.entityHeadTokenByEntity[e2]
            # Get dep path length
            if t1 != t2 and t1 in paths and t2 in paths[t1]:
                pathLength = len(paths[t1][t2])
            else: # no dependencyPath
                pathLength = 999999 # more than any real path
            # Linear distance; a token that is not in the sentence counts as position -1
            linLength = abs(tokenPositions.get(t1, -1) - tokenPositions.get(t2, -1))
            interactionLengths[interaction] = (pathLength, linLength)
        return interactionLengths
        
    def getPrecedenceLevels(self, sentenceGraph, paths):
        """
        Get overlapping entity precedence

        Entities that share a head token and a type are sorted with
        compareEntityPrecedence and numbered 0, 1, 2, ... in that order.
        Entities whose "isName" is "True" always get level 0 and entities
        of type "neg" get no level.

        Returns a dictionary entity -> level.
        """
        interactionLengths = self.getInteractionEdgeLengths(sentenceGraph, paths)

        entityPrecedenceValues = {}
        for entity in sentenceGraph.entities:
            eId = entity.get("id")
            numArgs = 0 # Number of interactions that start from the entity
            argDepDist = 0 # Sum of lengths of shortest paths
            argLinDist = 0 # Sum of linear distances
            for interaction in sentenceGraph.interactions:
                if interaction.get("e1") == eId: # An argument of the entity defined by the node
                    numArgs += 1
                    argDepDist += interactionLengths[interaction][0]
                    argLinDist += interactionLengths[interaction][1]
            # Store precedence counts (num args, sum of dep lengths, sum of lin lengths).
            # The first value is the argument count of this entity; it used
            # to be the number of entities processed so far.
            entityPrecedenceValues[entity] = (numArgs, argDepDist, argLinDist, entity)

        # Determine level of entity from precedence counts
        levelByEntity = {} # slot number
        # There is one slot group per token, per type
        for token in sentenceGraph.tokens: # per token
            entitiesByType = {}
            for entity in sentenceGraph.tokenIsEntityHead[token]: # per type
                if entity.get("isName") == "True": # Names can never have duplicates
                    assert entity not in levelByEntity
                    levelByEntity[entity] = 0
                    continue
                eType = entity.get("type")
                if eType == "neg":
                    continue
                if eType not in entitiesByType:
                    entitiesByType[eType] = []
                entitiesByType[eType].append(entity)
            for eType in sorted(entitiesByType.keys()):
                # Slot ordering by precedence
                sortedEntities = []
                for entity in entitiesByType[eType]:
                    sortedEntities.append(entityPrecedenceValues[entity])
                sortedEntities.sort(compareEntityPrecedence)
                level = 0
                for precedenceTuple in sortedEntities:
                    entity = precedenceTuple[3]
                    assert entity not in levelByEntity
                    levelByEntity[entity] = level
                    level += 1
        return levelByEntity
            
    def buildExamples(self, sentenceGraph):
        """
        Build the edge examples of one sentence. Returns a list of examples.

        Every head token gets, for each entity type found on it, a group of
        four entity slots. Existing entities are placed into the slot of
        their precedence level, and the remaining slots are filled with
        dummies made from the first entity of the type. Named entities are
        added as such. A forward and a reverse example is then built for
        every pair of the resulting (entity, level, isDummy) tuples, unless
        the "genia_limits" style filters the direction out.
        """
        examples = []
        exampleIndex = 0

        undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)

        # Determine overlapping entity precedence
        levelByEntity = self.getPrecedenceLevels(sentenceGraph, paths)

        entities = []
        # There is one entity group for each token, for each type of entity
        for token in sentenceGraph.tokens: # per token
            entitiesByType = {}
            for entity in sentenceGraph.tokenIsEntityHead[token]: # per type
                if entity.get("isName") == "True": # Names can never have duplicates
                    entities.append( (entity, 0, False) )
                    continue
                eType = entity.get("type")
                if eType == "neg":
                    continue
                entitiesByType.setdefault(eType, []).append(entity)
            # Create slot groups and insert GS data there
            # (nothing happens for tokens without any entity)
            for eType in sorted(entitiesByType.keys()):
                # Use first entity of a type as the dummy entity for unfilled slots
                dummyEntity = entitiesByType[eType][0]
                # Define entity slots
                entityGroup = [None, None, None, None]
                # Insert existing entities into slots; entities whose level
                # does not fit into the group are left out
                for entity in entitiesByType[eType]:
                    if entity in levelByEntity:
                        level = levelByEntity[entity]
                        if level < len(entityGroup):
                            entityGroup[level] = (entity, level, False)
                # Create dummies for potential entities
                for i in range(len(entityGroup)):
                    if entityGroup[i] is None:
                        entityGroup[i] = (dummyEntity, i, True)
                # Put all slots into one potential entity list
                entities.extend(entityGroup)

        # Generate examples based on interactions between entities
        for i in range(len(entities)-1):
            for j in range(i+1,len(entities)):
                eI = entities[i][0]
                eJ = entities[j][0]
                tI = sentenceGraph.entityHeadTokenByEntity[eI]
                tJ = sentenceGraph.entityHeadTokenByEntity[eJ]

                # define forward example
                categoryName = self.getCategoryName(sentenceGraph, entities[i], entities[j], True)
                if (not "genia_limits" in self.styles) or self.isPotentialGeniaInteraction(eI, eJ):
                    examples.append( self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, entities[i], entities[j]) )
                    exampleIndex += 1

                # define reverse
                categoryName = self.getCategoryName(sentenceGraph, entities[j], entities[i], True)
                if (not "genia_limits" in self.styles) or self.isPotentialGeniaInteraction(eJ, eI):
                    examples.append( self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, entities[j], entities[i]) )
                    exampleIndex += 1

        return examples
    
    def buildExample(self, token1, token2, paths, sentenceGraph, categoryName, exampleIndex, e1=None, e2=None):
        """
        Build a single example for the ordered pair (e1, e2).

        token1, token2 -- head tokens of the two entities
        paths -- nested mapping paths[tokenA][tokenB] -> token path
        sentenceGraph -- graph object of the sentence being processed
        categoryName -- class name of the example ("neg" for negatives)
        exampleIndex -- running index used in the example id
        e1, e2 -- tuples read as [0] entity element, [1] level and
                  [2] dummy flag. NOTE(review): both default to None but
                  are subscripted on the first lines, so omitting them
                  raises TypeError -- they are effectively required.

        Returns the tuple (example id, category, features, extra), where the
        id is sentenceGraph.getSentenceId() + ".x" + str(exampleIndex).
        """
        entity1=e1[0]
        entity2=e2[0]
        # define features
        features = {}
        # Level of each argument, both as a numeric value and as indicator features
        features[self.featureSet.getId("gov_level")] = e1[1]
        features[self.featureSet.getId("gov_level_"+str(e1[1]))] = 1
        features[self.featureSet.getId("dep_level")] = e2[1]
        features[self.featureSet.getId("dep_level_"+str(e2[1]))] = 1
        features[self.featureSet.getId("level_pair_"+str(e1[1])+"_"+str(e2[1]))] = 1
        # NOTE(review): the condition is hard-coded to True, so the matching
        # "else" near the end of this block can never run.
        if True: #token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2):
            # Use the known path when there is one, otherwise a two-token stand-in
            if token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2):
                path = paths[token1][token2]
            else:
                path = [token1, token2]
            # With asserts enabled pathLengths is always None here, which makes
            # the following condition always true.
            assert(self.pathLengths == None)
            if self.pathLengths == None or len(path)-1 in self.pathLengths:
                if not "no_dependency" in self.styles:
                    # edges stays None when no real dependency path exists
                    if token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2):
                        edges = self.multiEdgeFeatureBuilder.getEdges(sentenceGraph.dependencyGraph, path)
                    else:
                        edges = None
                if "entity_type" in self.styles:
                    features[self.featureSet.getId("e1_"+entity1.attrib["type"])] = 1
                    features[self.featureSet.getId("e2_"+entity2.attrib["type"])] = 1
                    features[self.featureSet.getId("distance_"+str(len(path)))] = 1
                if not "no_dependency" in self.styles:
                    # The builder writes into "features" until it is detached
                    # again with setFeatureVector(None) below.
                    self.multiEdgeFeatureBuilder.setFeatureVector(features, entity1, entity2)
                    #self.multiEdgeFeatureBuilder.buildStructureFeatures(sentenceGraph, paths) # remove for fast
                    if not "disable_entity_features" in self.styles:
                        self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph)
                    self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path)
                    if not "disable_terminus_features" in self.styles:
                        self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast
                    if not "disable_single_element_features" in self.styles:
                        self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, edges, sentenceGraph)
                    if not "disable_ngram_features" in self.styles:
                        self.multiEdgeFeatureBuilder.buildPathGrams(2, path, edges, sentenceGraph) # remove for fast
                        self.multiEdgeFeatureBuilder.buildPathGrams(3, path, edges, sentenceGraph) # remove for fast
                        self.multiEdgeFeatureBuilder.buildPathGrams(4, path, edges, sentenceGraph) # remove for fast
                    #self.buildEdgeCombinations(path, edges, sentenceGraph, features) # remove for fast
                    #if edges != None:
                    #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[0], edges[0][1]+edges[1][0], "t1", sentenceGraph) # remove for fast
                    #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[-1], edges[len(path)-1][len(path)-2]+edges[len(path)-2][len(path)-1], "t2", sentenceGraph) # remove for fast
                    if not "disable_path_edge_features" in self.styles:
                        self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, edges, sentenceGraph)
                    self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph)
                    self.multiEdgeFeatureBuilder.setFeatureVector(None)
                if not "no_linear" in self.styles:
                    self.tokenFeatureBuilder.setFeatureVector(features)
                    # Find the positions of both tokens in the sentence.
                    # NOTE(review): token1Index / token2Index stay unbound if a
                    # token is not found in sentenceGraph.tokens.
                    for i in range(len(sentenceGraph.tokens)):
                        if sentenceGraph.tokens[i] == token1:
                            token1Index = i
                        if sentenceGraph.tokens[i] == token2:
                            token2Index = i
                    # Order the indices so that token1Index <= token2Index;
                    # linearPreTag is only referenced by the disabled code below.
                    linearPreTag = "linfw_"
                    if token1Index > token2Index: 
                        token1Index, token2Index = token2Index, token1Index
                        linearPreTag = "linrv_"
                    self.tokenFeatureBuilder.buildLinearOrderFeatures(token1Index, sentenceGraph, 2, 2, preTag="linTok1")
                    self.tokenFeatureBuilder.buildLinearOrderFeatures(token2Index, sentenceGraph, 2, 2, preTag="linTok2")
                    # Before, middle, after
    #                self.tokenFeatureBuilder.buildTokenGrams(0, token1Index-1, sentenceGraph, "bf")
    #                self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, "bw")
    #                self.tokenFeatureBuilder.buildTokenGrams(token2Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, "af")
                    # before-middle, middle, middle-after
#                    self.tokenFeatureBuilder.buildTokenGrams(0, token2Index-1, sentenceGraph, linearPreTag+"bf", max=2)
#                    self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, linearPreTag+"bw", max=2)
#                    self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, linearPreTag+"af", max=2)
                    self.tokenFeatureBuilder.setFeatureVector(None)
                if "random" in self.styles:
                    self.randomFeatureBuilder.setFeatureVector(features)
                    self.randomFeatureBuilder.buildRandomFeatures(100, 0.01)
                    self.randomFeatureBuilder.setFeatureVector(None)
                if "genia_limits" in self.styles:
                    e1Type = entity1.get("type")
                    e2Type = entity2.get("type")
                    # The first argument must not be a name in this style
                    assert(entity1.get("isName") == "False")
                    if entity2.get("isName") == "True":
                        features[self.featureSet.getId("GENIA_target_protein")] = 1
                    else:
                        features[self.featureSet.getId("GENIA_nested_event")] = 1
                    if e1Type.find("egulation") != -1: # leave r out to avoid problems with capitalization
                        if entity2.get("isName") == "True":
                            features[self.featureSet.getId("GENIA_regulation_of_protein")] = 1
                        else:
                            features[self.featureSet.getId("GENIA_regulation_of_event")] = 1
            else:
                # Only reachable when asserts are disabled and the path length
                # is not listed in self.pathLengths
                features[self.featureSet.getId("always_negative")] = 1
                if "subset" in self.styles:
                    features[self.featureSet.getId("out_of_scope")] = 1
        else:
            # Unreachable: belongs to the "if True" above
            features[self.featureSet.getId("always_negative")] = 1
            if "subset" in self.styles:
                features[self.featureSet.getId("out_of_scope")] = 1
            path = [token1, token2]
        # define extra attributes
        # t1/t2 are ordered by the number after the last "_" of the token ids;
        # "deprev" records whether the path ends had to be swapped for that.
        if int(path[0].attrib["id"].split("_")[-1]) < int(path[-1].attrib["id"].split("_")[-1]):
            #extra = {"xtype":"edge","type":"i","t1":path[0],"t2":path[-1]}
            extra = {"xtype":"ue","type":"i","t1":path[0].get("id"),"t2":path[-1].get("id")}
            extra["deprev"] = False
        else:
            #extra = {"xtype":"edge","type":"i","t1":path[-1],"t2":path[0]}
            extra = {"xtype":"ue","type":"i","t1":path[-1].get("id"),"t2":path[0].get("id")}
            extra["deprev"] = True
        # l1/l2 hold the level as a string, d1/d2 the first character of
        # str(dummy flag) (e.g. "T" or "F" for a boolean flag)
        if entity1 != None:
            extra["e1"] = entity1.get("id")
            extra["l1"] = str(e1[1])
            extra["d1"] = str(e1[2])[0] # is a dummy node (an entity not in existing triggers)
        if entity2 != None:
            extra["e2"] = entity2.get("id")
            extra["l2"] = str(e2[1])
            extra["d2"] = str(e2[2])[0] # is a dummy node (an entity not in existing triggers)
        extra["categoryName"] = categoryName
        sentenceOrigId = sentenceGraph.sentenceElement.get("origId")
        if sentenceOrigId != None:
            extra["SOID"] = sentenceOrigId       
        # make example
        if "binary" in self.styles:
            # Binary mode: +1 for any non-"neg" class, -1 otherwise
            if categoryName != "neg":
                category = 1
            else:
                category = -1
            # Local rebinding only; extra["categoryName"] keeps the original name
            categoryName = "i"
        else:
            category = self.classSet.getId(categoryName)
        
        return (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)
예제 #17
0
class Round2TriggerExampleBuilder(ExampleBuilder):
    def nxMultiDiGraphToUndirected(self, graph):
        """
        Return a new NX10.MultiGraph that has the name, the nodes and the
        edges (as yielded by graph.edges_iter()) of the given graph.
        """
        result = NX10.MultiGraph(name=graph.name)
        result.add_nodes_from(graph)
        edgeSource = graph.edges_iter()
        result.add_edges_from(edgeSource)
        return result

    def getPredictionStrength(self, element):
        """
        Return the prediction strength stored for the element's own type.

        The "predictions" attribute is read as a comma-separated list of
        "class:strength" items. The strength of the first item whose class
        equals the element's "type" attribute is returned as a float.

        Returns 0 when the attribute is missing or empty, or when no item
        matches the type. Items without a ':' separator (for example the
        empty item left by a trailing comma) are skipped instead of raising.
        A matching item whose strength is not a number still raises
        ValueError.
        """
        eType = element.get("type")
        predictions = element.get("predictions")
        if not predictions:
            return 0
        for prediction in predictions.split(","):
            # Split at the LAST ':' so that a class name containing ':' is
            # still separated correctly from its strength.
            predClass, separator, predStrength = prediction.rpartition(":")
            if separator == "":
                continue
            if predClass == eType:
                return float(predStrength)
        return 0

    def getInteractionEdgeLengths(self, sentenceGraph, paths):
        """
        Measure each interaction of the sentence between its two head tokens.

        Returns a dictionary keyed by interaction. Each value is the tuple
        (interaction, dependency length, linear length, position of the
        "e2" head token). The dependency length is len(paths[t1][t2]), or
        999999 when both heads are the same token or no path is stored.
        A head token that is not found among sentenceGraph.tokens keeps
        the position -1.
        """
        lengthsByInteraction = {}
        for interaction in sentenceGraph.interactions:
            firstEntity = sentenceGraph.entitiesById[interaction.get("e1")]
            secondEntity = sentenceGraph.entitiesById[interaction.get("e2")]
            headA = sentenceGraph.entityHeadTokenByEntity[firstEntity]
            headB = sentenceGraph.entityHeadTokenByEntity[secondEntity]

            # Dependency distance; the default is larger than any real path
            depLength = 999999
            if headA != headB and paths.has_key(headA) and paths[headA].has_key(headB):
                depLength = len(paths[headA][headB])

            # Linear positions; scanning stops as soon as both heads are seen
            posA = posB = -1
            for position, candidate in enumerate(sentenceGraph.tokens):
                if candidate == headA:
                    posA = position
                    if posB != -1:
                        break
                if candidate == headB:
                    posB = position
                    if posA != -1:
                        break

            lengthsByInteraction[interaction] = (interaction, depLength, abs(posA - posB), posB)
        return lengthsByInteraction

    def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None, skiplist=None):
        """
        Set up the id sets, the optional gazetteer, the skiplist and the
        feature builders.

        style -- accepted for interface compatibility only.
                 NOTE(review): the value has no effect, because self.styles
                 is always set to the fixed list below; confirm whether this
                 override is intentional before relying on the argument.
        classSet -- class id set; a new IdSet(1) is created when None.
                    Its id for "neg" must be 1 (asserted).
        featureSet -- feature id set; a new IdSet() is created when None.
        gazetteerFileName -- when given, the gazetteer is loaded from it with
                             Gazetteer.loadGztr; otherwise self.gazetteer
                             is None.
        skiplist -- optional path of a text file; every line, stripped of
                    surrounding whitespace, is added to self.skiplist.
        """
        if classSet is None:
            classSet = IdSet(1)
        assert classSet.getId("neg") == 1
        if featureSet is None:
            featureSet = IdSet()

        ExampleBuilder.__init__(self, classSet, featureSet)
        if gazetteerFileName is not None:
            self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
            print >>sys.stderr, "Loaded gazetteer from", gazetteerFileName
        else:
            print >>sys.stderr, "No gazetteer loaded"
            self.gazetteer = None

        self.skiplist = set()
        if skiplist is not None:
            f = open(skiplist, "rt")
            # try/finally guarantees the handle is released even if reading fails
            try:
                for line in f:
                    self.skiplist.add(line.strip())
            finally:
                f.close()

        # Fixed style set used by this builder (see the note on "style" above)
        self.styles = [
            "trigger_features",
            "typed",
            "directed",
            "no_linear",
            "entities",
            "genia_limits",
            "noMasking",
            "maxFeatures",
        ]
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        if "graph_kernel" in self.styles:
            from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder

            self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
        if "noAnnType" in self.styles:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True

        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)

    @classmethod
    def run(cls, input, gold, output, parse, tokenization, style, idFileTag=None, append=False):
        """
        Build examples in a single call, without the caller having to
        create a builder instance.

        The id sets come from cls.getIdSets(idFileTag); the sentences of
        input, and of gold when it is given, come from cls.getSentences.
        Everything is then handed to buildExamplesForSentences.
        """
        classSet, featureSet = cls.getIdSets(idFileTag)
        # style=None is the constructor's default, so it can be passed as is
        builder = Round2TriggerExampleBuilder(style=style, classSet=classSet, featureSet=featureSet)
        sentences = cls.getSentences(input, parse, tokenization)
        goldSentences = None
        if gold != None:
            goldSentences = cls.getSentences(gold, parse, tokenization)
        builder.buildExamplesForSentences(sentences, goldSentences, output, idFileTag, append=append)

    def buildExamplesForSentences(self, sentences, goldSentences, output, idFileTag=None, append=False):
        """
        Build the examples of all sentences and write them to a file.

        sentences -- sequence whose items carry the sentence graph at index 0
        goldSentences -- parallel sequence of gold items (graph at index 0),
                         or None when there is no gold data
        output -- path of the example file
        idFileTag -- when not None, class and feature names are saved to
                     idFileTag + ".class_names" / ".feature_names"
        append -- open the output file in append mode instead of truncating
                  it; the flag is also passed on to buildExamples

        The output file is closed even when building an example raises.
        """
        counter = ProgressCounter(len(sentences), "Build examples")

        if append:
            mode = "at"
        else:
            mode = "wt"
        outfile = open(output, mode)
        exampleCount = 0
        try:
            for i, sentence in enumerate(sentences):
                goldGraph = None
                if goldSentences is not None:
                    goldGraph = goldSentences[i][0]
                counter.update(1, "Building examples (" + sentence[0].getSentenceId() + "): ")
                examples = self.buildExamples(sentence[0], goldGraph, append=append)
                exampleCount += len(examples)
                examples = self.preProcessExamples(examples)
                ExampleUtils.appendExamples(examples, outfile)
        finally:
            outfile.close()

        print >>sys.stderr, "Examples built:", exampleCount
        print >>sys.stderr, "Features:", len(self.featureSet.getNames())
        # IF LOCAL
        if self.exampleStats.getExampleCount() > 0:
            self.exampleStats.printStats()
        # ENDIF
        # Save Ids
        if idFileTag is not None:
            print >>sys.stderr, "Saving class names to", idFileTag + ".class_names"
            self.classSet.write(idFileTag + ".class_names")
            print >>sys.stderr, "Saving feature names to", idFileTag + ".feature_names"
            self.featureSet.write(idFileTag + ".feature_names")

    def preProcessExamples(self, allExamples):
        if "normalize" in self.styles:
            print >>sys.stderr, " Normalizing feature vectors"
            ExampleUtils.normalizeFeatureVectors(allExamples)
        return allExamples

    def getMergedEntityType(self, entities):
        """
        If a single token belongs to multiple entities of different types,
        a new, composite type is defined. This type is the alphabetically
        ordered types of these entities joined with '---'.
        """
        types = set()
        for entity in entities:
            types.add(entity.get("type"))
        types = list(types)
        types.sort()
        typeString = ""
        for type in types:
            if type == "Protein" and "all_tokens" in self.styles:
                continue
            if typeString != "":
                typeString += "---"
            typeString += type

        if typeString == "":
            return "neg"

        if "limit_merged_types" in self.styles:
            if typeString.find("---") != -1:
                if typeString == "Gene_expression---Positive_regulation":
                    return typeString
                else:
                    return typeString.split("---")[0]
            else:
                return typeString
        return typeString

    def getTokenFeatures(self, token, sentenceGraph):
        """
        Return a dictionary of named features describing a single token.

        The result is meant as raw material for more complex features. It
        is stored in self.tokenFeatures on the first call for a token and
        served from there on later calls.
        """
        try:
            return self.tokenFeatures[token]
        except KeyError:
            pass  # first request for this token, build the features below

        tokenText = sentenceGraph.getTokenText(token)
        described = {}
        described["_txt_" + tokenText] = 1
        described["_POS_" + token.get("POS")] = 1

        if sentenceGraph.tokenIsName[token]:
            described["_isName"] = 1
            # Annotated types of the name entities headed by this token
            for headed in sentenceGraph.tokenIsEntityHead[token]:
                if headed.get("isName") == "True":
                    described["_annType_" + headed.get("type")] = 1

        # Gazetteer based features (independent of exclude_gazetteer)
        if "gazetteer_features" in self.styles:
            lookupKey = tokenText.lower()
            if "stem_gazetteer" in self.styles:
                lookupKey = PorterStemmer.stem(lookupKey)
            if self.gazetteer and lookupKey in self.gazetteer:
                for label, weight in self.gazetteer[lookupKey].items():
                    described["_knownLabel_" + label] = weight  # 1 performs slightly worse

        self.tokenFeatures[token] = described
        return described

    def buildLinearOrderFeatures(self, sentenceGraph, index, tag, features):
        """
        Add the token features of sentenceGraph.tokens[index] to features.

        Every feature name is prefixed with "linear_" + tag, where tag
        identifies the token's position relative to the current token.
        """
        prefix = "linear_" + tag
        neighbour = sentenceGraph.tokens[index]
        neighbourFeatures = self.getTokenFeatures(neighbour, sentenceGraph)
        for name, weight in neighbourFeatures.iteritems():
            featureId = self.featureSet.getId(prefix + name)
            features[featureId] = weight

    def buildExamples(self, sentenceGraph, goldGraph, append=False):
        """
        Build the examples of one sentence.

        sentenceGraph -- graph of the sentence to process
        goldGraph -- gold graph handed on to buildExamplesInner (may be None)
        append -- not used here; kept so that existing callers keep working

        Returns the example list produced by buildExamplesInner.
        """
        # No per-type entity/example tallies are kept here: nothing reads them,
        # so computing them for every sentence would only cost time.
        return self.buildExamplesInner(sentenceGraph, goldGraph)

    def buildExamplesInner(self, sentenceGraph, goldGraph):
        """
        Build one example for each token of the sentence

        sentenceGraph -- graph of the sentence whose tokens are classified
        goldGraph -- graph with the gold entities, or None. The class of a
                     token is the merged type of the gold entities whose
                     "headOffset" equals the token's "charOffset"; without
                     gold entities at that offset the class is 1.

        Returns a list of (id, category, features, extra) tuples. The list is
        empty when the sentence's "origId" is in self.skiplist. Name tokens
        are skipped unless the "names" or "all_tokens" style is active.
        """
        if sentenceGraph.sentenceElement.get("origId") in self.skiplist:
            print >>sys.stderr, "Skipping sentence", sentenceGraph.sentenceElement.get("origId")
            return []

        self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
        self.triggerFeatureBuilder.initSentence(sentenceGraph)

        undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
        # Get argument order
        # (compareInteractionPrecedence is a cmp-style comparator defined
        # outside this method)
        self.interactionLengths = self.getInteractionEdgeLengths(sentenceGraph, paths)
        self.interactionLengths = self.interactionLengths.values()
        self.interactionLengths.sort(compareInteractionPrecedence)
        # Map tokens to entities
        # With gold data, also check that both graphs have identical tokens.
        # NOTE(review): tokenByOffset is filled here but not read again in
        # this method.
        tokenByOffset = {}
        for i in range(len(sentenceGraph.tokens)):
            token = sentenceGraph.tokens[i]
            if goldGraph != None:
                goldToken = goldGraph.tokens[i]
                assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset")
            tokenByOffset[token.get("charOffset")] = token.get("id")
        # Map gold entities to their head offsets
        # Only token charOffsets exist as keys, so an entity whose headOffset
        # matches no token raises KeyError in the lookups below.
        goldEntitiesByOffset = {}
        for token in sentenceGraph.tokens:
            goldEntitiesByOffset[token.get("charOffset")] = []
        # NOTE(review): entityToGold is filled but not read again in this method.
        entityToGold = {}
        for entity in sentenceGraph.entities:
            entityToGold[entity] = []
        if goldGraph != None:
            for entity in goldGraph.entities:
                offset = entity.get("headOffset")
                assert offset != None
                goldEntitiesByOffset[offset].append(entity)
            # Map predicted entities to gold entities
            for entity in sentenceGraph.entities:
                eType = entity.get("type")
                eOffset = entity.get("headOffset")
                for goldEntity in goldEntitiesByOffset[eOffset]:
                    if goldEntity.get("type") == eType:
                        entityToGold[entity].append(goldEntity)
        # Map entities to interactions
        # interactionsByEntityId = {}
        # for entity in sentenceGraph.entities:
        #    interactionsByEntityId[entity.get("id")] = []
        # Map tokens to interactions
        # Every non-"neg" interaction is filed under the head token of its
        # "e1" entity, in the sorted order established above.
        interactionsByToken = {}
        for token in sentenceGraph.tokens:
            interactionsByToken[token] = []
        for interactionTuple in self.interactionLengths:
            interaction = interactionTuple[0]
            if interaction.get("type") == "neg":
                continue
            e1Id = interaction.get("e1")
            token = sentenceGraph.entityHeadTokenByEntity[sentenceGraph.entitiesById[e1Id]]
            interactionsByToken[token].append(interaction)

        examples = []
        exampleIndex = 0

        # Per-sentence cache used by getTokenFeatures
        self.tokenFeatures = {}

        # namedEntityNorStrings = set()
        namedEntityHeadTokens = []
        if not "names" in self.styles:
            namedEntityCount = 0
            for entity in sentenceGraph.entities:
                if entity.get("isName") == "True":  # known data which can be used for features
                    namedEntityCount += 1
                    # namedEntityNorStrings.add( entity.get("text").replace("-","").replace("/","").replace(",","").replace("\\","").replace(" ","").lower() )
            namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
            # if namedEntityCount == 0: # no names, no need for triggers
            #    return []

            if "pos_pairs" in self.styles:
                namedEntityHeadTokens = self.getNamedEntityHeadTokens(sentenceGraph)

        # neFeatures = {} # F: 69.35 -> 69.14
        # for norString in namedEntityNorStrings:
        #    neFeatures[self.featureSet.getId("norNE_" + norString)] = 1

        # Bag of words of the whole sentence: count of every token text, plus
        # a separate "ne_"-prefixed count for name tokens. Converted to
        # feature ids once and copied into every example below.
        bagOfWords = {}
        for token in sentenceGraph.tokens:
            text = "bow_" + token.get("text")
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
            if sentenceGraph.tokenIsName[token]:
                text = "ne_" + text
                if not bagOfWords.has_key(text):
                    bagOfWords[text] = 0
                bagOfWords[text] += 1
        bowFeatures = {}
        for k, v in bagOfWords.iteritems():
            bowFeatures[self.featureSet.getId(k)] = v

        # Cache the dependency edges of every token as (source, target,
        # edge element) tuples, sorted with compareDependencyEdgesById
        # (defined outside this method).
        self.inEdgesByToken = {}
        self.outEdgesByToken = {}
        self.edgeSetByToken = {}
        for token in sentenceGraph.tokens:
            inEdges = sentenceGraph.dependencyGraph.in_edges(token, data=True)
            fixedInEdges = []
            for edge in inEdges:
                fixedInEdges.append((edge[0], edge[1], edge[2]["element"]))
            inEdges = fixedInEdges
            inEdges.sort(compareDependencyEdgesById)
            self.inEdgesByToken[token] = inEdges
            outEdges = sentenceGraph.dependencyGraph.out_edges(token, data=True)
            fixedOutEdges = []
            for edge in outEdges:
                fixedOutEdges.append((edge[0], edge[1], edge[2]["element"]))
            outEdges = fixedOutEdges
            outEdges.sort(compareDependencyEdgesById)
            self.outEdgesByToken[token] = outEdges
            self.edgeSetByToken[token] = set(inEdges + outEdges)

        for i in range(len(sentenceGraph.tokens)):
            token = sentenceGraph.tokens[i]
            # Recognize only non-named entities (i.e. interaction words)
            if sentenceGraph.tokenIsName[token] and not "names" in self.styles and not "all_tokens" in self.styles:
                continue

            # CLASS
            # if len(sentenceGraph.tokenIsEntityHead[token]) > 0:
            #    category = self.classSet.getId(self.getMergedEntityType(sentenceGraph.tokenIsEntityHead[token]))
            # else:
            #    category = 1
            offset = token.get("charOffset")
            if len(goldEntitiesByOffset[offset]) > 0:
                category = self.classSet.getId(self.getMergedEntityType(goldEntitiesByOffset[offset]))
            else:
                category = 1

            # Tokens missing from the gazetteer get a minimal example that
            # carries only the "exclude_gazetteer" feature.
            tokenText = token.get("text").lower()
            if "stem_gazetteer" in self.styles:
                tokenText = PorterStemmer.stem(tokenText)
            if ("exclude_gazetteer" in self.styles) and self.gazetteer and tokenText not in self.gazetteer:
                features = {}
                features[self.featureSet.getId("exclude_gazetteer")] = 1
                extra = {"xtype": "token", "t": token.get("id"), "excluded": "True"}
                examples.append((sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra))
                exampleIndex += 1
                continue

            # FEATURES
            features = {}
            # Also exposed on self; presumably read by buildPredictionFeatures,
            # which is not passed the dictionary -- verify against that method.
            self.features = features

            if not "names" in self.styles:
                features[self.featureSet.getId(namedEntityCountFeature)] = 1
            # for k,v in bagOfWords.iteritems():
            #    features[self.featureSet.getId(k)] = v
            # pre-calculate bow _features_
            features.update(bowFeatures)
            # features.update(neFeatures)

            #            for j in range(len(sentenceGraph.tokens)):
            #                text = "bow_" + sentenceGraph.tokens[j].get("text")
            #                if j < i:
            #                    features[self.featureSet.getId("bf_" + text)] = 1
            #                elif j > i:
            #                    features[self.featureSet.getId("af_" + text)] = 1

            # Main features
            text = token.get("text")
            features[self.featureSet.getId("txt_" + text)] = 1
            features[self.featureSet.getId("POS_" + token.get("POS"))] = 1
            stem = PorterStemmer.stem(text)
            features[self.featureSet.getId("stem_" + stem)] = 1
            # Whatever follows the first len(stem) characters of the text
            features[self.featureSet.getId("nonstem_" + text[len(stem) :])] = 1

            # Normalized versions of the string (if same as non-normalized, overlap without effect)
            normalizedText = (
                text.replace("-", "").replace("/", "").replace(",", "").replace("\\", "").replace(" ", "").lower()
            )
            if normalizedText == "bound":  # should be for all irregular verbs
                normalizedText = "bind"
            features[self.featureSet.getId("txt_" + normalizedText)] = 1
            norStem = PorterStemmer.stem(normalizedText)
            features[self.featureSet.getId("stem_" + norStem)] = 1
            features[self.featureSet.getId("nonstem_" + normalizedText[len(norStem) :])] = 1

            if "gazetteer_features_maintoken" in self.styles:
                tokTxtLower = text.lower()
                if "stem_gazetteer" in self.styles:
                    tokTxtLower = PorterStemmer.stem(tokTxtLower)
                if self.gazetteer and tokTxtLower in self.gazetteer:
                    for label, weight in self.gazetteer[tokTxtLower].items():
                        features[self.featureSet.getId("gaz_knownLabel_" + label)] = weight  # 1 performs slightly worse

            # Linear order features
            # for index in [-3,-2,-1,1,2,3,4,5]: # 69.35 -> 68.97
            # NOTE(review): "i + index > 0" never selects the token at position
            # 0 as context; ">= 0" may have been intended. Changing it alters
            # the generated feature vectors, so confirm before touching it.
            for index in [-3, -2, -1, 1, 2, 3]:
                if i + index > 0 and i + index < len(sentenceGraph.tokens):
                    self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)

            # Content
            # (the capital-start feature is not produced for the first token;
            # text[0] assumes a non-empty token text)
            if i > 0 and text[0].isalpha() and text[0].isupper():
                features[self.featureSet.getId("upper_case_start")] = 1
            for j in range(len(text)):
                if j > 0 and text[j].isalpha() and text[j].isupper():
                    features[self.featureSet.getId("upper_case_middle")] = 1
                # numbers and special characters
                if text[j].isdigit():
                    features[self.featureSet.getId("has_digits")] = 1
                    if j > 0 and text[j - 1] == "-":
                        features[self.featureSet.getId("has_hyphenated_digit")] = 1
                elif text[j] == "-":
                    features[self.featureSet.getId("has_hyphen")] = 1
                elif text[j] == "/":
                    features[self.featureSet.getId("has_fslash")] = 1
                elif text[j] == "\\":
                    features[self.featureSet.getId("has_bslash")] = 1
                # duplets
                if j > 0:
                    features[self.featureSet.getId("dt_" + text[j - 1 : j + 1].lower())] = 1
                # triplets
                if j > 1:
                    features[self.featureSet.getId("tt_" + text[j - 2 : j + 1].lower())] = 1

            # Attached edges (Hanging in and out edges)
            # tokenText is reused below for the neighbouring token's text; its
            # earlier gazetteer value is no longer needed at this point.
            t1InEdges = self.inEdgesByToken[token]
            for edge in t1InEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HIn_" + edgeType)] = 1
                features[self.featureSet.getId("t1HIn_" + edge[0].get("POS"))] = 1
                features[self.featureSet.getId("t1HIn_" + edgeType + "_" + edge[0].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[0])
                features[self.featureSet.getId("t1HIn_" + tokenText)] = 1
                features[self.featureSet.getId("t1HIn_" + edgeType + "_" + tokenText)] = 1
            t1OutEdges = self.outEdgesByToken[token]
            for edge in t1OutEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HOut_" + edgeType)] = 1
                features[self.featureSet.getId("t1HOut_" + edge[1].get("POS"))] = 1
                features[self.featureSet.getId("t1HOut_" + edgeType + "_" + edge[1].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[1])
                features[self.featureSet.getId("t1HOut_" + tokenText)] = 1
                features[self.featureSet.getId("t1HOut_" + edgeType + "_" + tokenText)] = 1

            # The example holds a reference to the features dictionary, so
            # entries written into that dictionary by the calls below still
            # become part of this example.
            extra = {"xtype": "token", "t": token.get("id")}
            examples.append((sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra))
            exampleIndex += 1

            # chains
            self.buildChains(token, sentenceGraph, features)

            if "pos_pairs" in self.styles:
                self.buildPOSPairs(token, namedEntityHeadTokens, features)

            self.buildPredictionFeatures(sentenceGraph, paths, token, interactionsByToken[token])
        return examples

    def buildChains(self, token, sentenceGraph, features, depthLeft=3, chain="", visited=None):
        """
        Recursively add dependency-chain features around ``token``.

        Every not-yet-visited edge attached to ``token`` contributes:
          * a dependency-type feature tagged with the remaining depth,
          * the features returned by ``self.getTokenFeatures`` for the token
            at the other end of the edge, prefixed with the remaining depth,
          * a ``chain_dist_...`` feature naming the sequence of edge types
            followed so far (plus a ``name_chain_dist_...`` twin when
            ``sentenceGraph.tokenIsName`` is true for that token).
        The walk then continues from that token with ``depthLeft - 1``.

        Edges are indexable triples: positions 0 and 1 hold the two tokens,
        position 2 an object whose ``get("type")`` gives the edge type.

        Parameters:
          token         -- token the walk starts from
          sentenceGraph -- graph providing ``tokenIsName``
          features      -- dict (feature id -> value) that is updated in place
          depthLeft     -- remaining recursion depth; 0 stops the walk
          chain         -- edge-type chain accumulated by the callers
          visited       -- set of edges the callers already expanded
        """
        if depthLeft == 0:
            return
        if visited is None:
            visited = set()
        distTag = "dist_" + str(depthLeft)

        incoming = self.inEdgesByToken[token]
        outgoing = self.outEdgesByToken[token]
        # Recursive calls must skip every edge touching this token, but the
        # loops below filter only on what the *callers* had visited.
        visitedBelow = visited.union(self.edgeSetByToken[token])

        # (edges, index of the far-end token, dependency prefix, chain link).
        # The two prefixes differ ("dep_" vs "dep_dist_"); they are runtime
        # feature names and are reproduced exactly as they have always been.
        directions = (
            (incoming, 0, "dep_", "-frw_"),
            (outgoing, 1, "dep_dist_", "-rev_"),
        )
        for edges, farEnd, depPrefix, link in directions:
            for edge in edges:
                if edge in visited:
                    continue
                depType = edge[2].get("type")
                features[self.featureSet.getId(depPrefix + distTag + depType)] = 1

                neighbor = edge[farEnd]
                for name, weight in self.getTokenFeatures(neighbor, sentenceGraph).iteritems():
                    features[self.featureSet.getId(distTag + name)] = weight

                longerChain = chain + link + depType
                if sentenceGraph.tokenIsName[neighbor]:
                    features[self.featureSet.getId("name_chain_dist_" + distTag + longerChain)] = 1
                features[self.featureSet.getId("chain_dist_" + distTag + longerChain)] = 1
                self.buildChains(neighbor, sentenceGraph, features, depthLeft - 1, longerChain, visitedBelow)

    def getNamedEntityHeadTokens(self, sentenceGraph):
        """
        Return the head tokens of all named entities in the sentence.

        An entity counts as named when its ``isName`` attribute equals the
        string "True"; such entities are known data that can be used for
        features. The result follows the order of ``sentenceGraph.entities``
        and maps each entity through
        ``sentenceGraph.entityHeadTokenByEntity``.
        """
        return [
            sentenceGraph.entityHeadTokenByEntity[namedEntity]
            for namedEntity in sentenceGraph.entities
            if namedEntity.get("isName") == "True"
        ]

    def buildPOSPairs(self, token, namedEntityHeadTokens, features):
        """
        Add one ``POS_pair_NE_<token POS>-<head POS>`` feature per head token.

        Parameters:
          token                 -- token whose "POS" attribute forms the left
                                   half of each pair; it must have one
          namedEntityHeadTokens -- tokens whose "POS" attribute forms the
                                   right half
          features              -- dict (feature id -> value) updated in place
        """
        ownPOS = token.get("POS")
        assert ownPOS is not None
        for neHead in namedEntityHeadTokens:
            pairName = "POS_pair_NE_" + ownPOS + "-" + neHead.get("POS")
            features[self.featureSet.getId(pairName)] = 1

    ######################################################
    # Unmerging-style features
    ######################################################

    def buildPredictionFeatures(self, sentenceGraph, paths, token, interactions):  # themeEntities, causeEntities=None):
        """
        Build the features describing one trigger token and its arguments.

        Steps, in order:
          1. bag-of-words features for the text between the arguments
             (``buildInterArgumentBagOfWords``),
          2. for every entity listed for ``token`` in
             ``sentenceGraph.entitiesByToken``: its type, its prediction
             strength and trigger features tagged ``trg<type>_``,
          3. for every interaction: argument features, once under the generic
             tag (``argTheme`` / ``argCause``) and once under the same tag
             suffixed with the interaction's index,
          4. total, Theme and Cause argument counts, each both as a numeric
             feature and as a ``<name>_<count>`` indicator.

        Any interaction whose "type" is not "Theme" is counted as a Cause.
        On exit the trigger feature builder is reset (empty tag, no feature
        vector).

        Parameters:
          sentenceGraph -- graph providing ``entitiesByToken``
          paths         -- passed through to ``buildArgumentFeatures``
          token         -- the trigger token
          interactions  -- sequence of interaction elements of this token
        """
        # TODO: also add features for arguments that are present, but not in
        # this combination.

        self.buildInterArgumentBagOfWords(interactions, sentenceGraph)

        # `in` replaces the deprecated dict.has_key(), which Python 3 removed.
        if token in sentenceGraph.entitiesByToken:
            for eventEntity in sentenceGraph.entitiesByToken[token]:
                eventEntityType = eventEntity.get("type")
                self.setFeature("rootType_" + eventEntityType, 1)
                self.setFeature("predStrength" + eventEntityType, self.getPredictionStrength(eventEntity))
                self.triggerFeatureBuilder.setFeatureVector(self.features)
                self.triggerFeatureBuilder.tag = "trg" + eventEntityType + "_"
                self.triggerFeatureBuilder.buildFeatures(token)
                self.triggerFeatureBuilder.tag = None

        argThemeCount = 0
        argCauseCount = 0
        # Current example's edge combination
        for i, arg in enumerate(interactions):
            if arg.get("type") == "Theme":
                argThemeCount += 1
                argTag = "argTheme"
            else:  # Cause
                argCauseCount += 1
                argTag = "argCause"
            # Generic tag first, then the position-specific one.
            self.buildArgumentFeatures(sentenceGraph, paths, self.features, token, arg, argTag)
            self.buildArgumentFeatures(sentenceGraph, paths, self.features, token, arg, argTag + str(i))

        self.setFeature("argCount", len(interactions))
        self.setFeature("argCount_" + str(len(interactions)), 1)

        self.setFeature("argThemeCount", argThemeCount)
        self.setFeature("argThemeCount_" + str(argThemeCount), 1)
        self.setFeature("argCauseCount", argCauseCount)
        self.setFeature("argCauseCount_" + str(argCauseCount), 1)

        # Leave the shared trigger feature builder in a neutral state.
        self.triggerFeatureBuilder.tag = ""
        self.triggerFeatureBuilder.setFeatureVector(None)

    def buildArgumentFeatures(self, sentenceGraph, paths, features, eventToken, arg, tag):
        """
        Add the features describing one argument of an event.

        The argument entity is the one whose id is stored in the "e2"
        attribute of ``arg``. The method builds edge features between
        ``eventToken`` and the argument's head token, trigger features for
        that head token (tagged ``<tag>trg_``), and then marks the argument
        as a protein (``isName`` equal to "True") or as a nested event, plus
        its entity type.

        Parameters:
          sentenceGraph -- graph providing ``entitiesById`` and
                           ``entityHeadTokenByEntity``
          paths         -- passed through to ``buildEdgeFeatures``
          features      -- feature vector passed to ``buildEdgeFeatures``
          eventToken    -- head token of the event trigger
          arg           -- interaction element describing the argument
          tag           -- prefix for every feature name generated here
        """
        targetEntity = sentenceGraph.entitiesById[arg.get("e2")]
        targetHead = sentenceGraph.entityHeadTokenByEntity[targetEntity]

        self.buildEdgeFeatures(sentenceGraph, paths, features, eventToken, targetHead, tag)

        triggerBuilder = self.triggerFeatureBuilder
        triggerBuilder.tag = tag + "trg_"
        triggerBuilder.buildFeatures(targetHead)

        if argEntityIsEvent(targetEntity) if False else targetEntity.get("isName") != "True":
            # Not a given named entity, so the argument is itself an event.
            self.setFeature(tag + "Event", 1)
            self.setFeature("nestingEvent", 1)
        else:
            self.setFeature(tag + "Protein", 1)
        self.setFeature(tag + "_" + targetEntity.get("type"), 1)

    def buildEdgeFeatures(self, sentenceGraph, paths, features, eventToken, argToken, tag):
        """
        Build path-based features between an event token and an argument token.

        The multi-edge feature builder is pointed at ``features`` with the
        tag ``<tag>_``, run over the path between the two tokens and then
        reset (no feature vector, empty tag). A ``<tag>_present`` feature is
        always set through ``self.setFeature``.

        If the tokens differ and ``paths[eventToken][argToken]`` exists, that
        path and its edges (from ``getEdges`` on
        ``sentenceGraph.dependencyGraph``) are used. Otherwise the path is
        just the two tokens and the edges are ``None``.

        Individual feature groups are skipped when ``self.styles`` contains
        "disable_entity_features", "disable_single_element_features",
        "disable_ngram_features" or "disable_path_edge_features".

        Parameters:
          sentenceGraph -- graph providing ``dependencyGraph``
          paths         -- nested mapping: token -> token -> path
          features      -- feature vector handed to the multi-edge builder
          eventToken    -- head token of the event trigger
          argToken      -- head token of the argument
          tag           -- prefix for the generated feature names
        """
        self.multiEdgeFeatureBuilder.tag = tag + "_"
        self.multiEdgeFeatureBuilder.setFeatureVector(features, None, None, False)

        self.setFeature(tag + "_present", 1)

        # `in` replaces the deprecated dict.has_key(), which Python 3 removed.
        if eventToken != argToken and eventToken in paths and argToken in paths[eventToken]:
            path = paths[eventToken][argToken]
            edges = self.multiEdgeFeatureBuilder.getEdges(sentenceGraph.dependencyGraph, path)
        else:
            # Same token or no known path: fall back to the bare token pair.
            path = [eventToken, argToken]
            edges = None

        if "disable_entity_features" not in self.styles:
            self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph)
        self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path)
        # if not "disable_terminus_features" in self.styles:
        #    self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast
        if "disable_single_element_features" not in self.styles:
            self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, edges, sentenceGraph)
        if "disable_ngram_features" not in self.styles:
            self.multiEdgeFeatureBuilder.buildPathGrams(2, path, edges, sentenceGraph)  # remove for fast
            self.multiEdgeFeatureBuilder.buildPathGrams(3, path, edges, sentenceGraph)  # remove for fast
            self.multiEdgeFeatureBuilder.buildPathGrams(4, path, edges, sentenceGraph)  # remove for fast
        if "disable_path_edge_features" not in self.styles:
            self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, edges, sentenceGraph)
        # self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph)

        # Detach the shared builder from this feature vector again.
        self.multiEdgeFeatureBuilder.setFeatureVector(None, None, None, False)
        self.multiEdgeFeatureBuilder.tag = ""

    def buildInterArgumentBagOfWords(self, arguments, sentenceGraph):
        """
        Add bag-of-words features for the tokens lying between the arguments.

        Nothing is generated for fewer than two arguments. Otherwise the head
        tokens of the argument entities (looked up through each argument's
        "e2" attribute) define a span from the leftmost to the rightmost head.
        The features are:
          * ``argBoWRange`` (span width) and ``argBoWRange_<width>``,
          * ``argBoW_<text>`` for each distinct text of a token strictly
            inside the span that is neither an entity head nor a name,
            emitted in sorted order,
          * ``argBoW_slashOrHyphen`` when such a text is "/" or "-",
          * ``argBoWonly_<text>`` (and ``argBoWonly_slashOrHyphen``) when
            exactly one distinct text was collected.

        Parameters:
          arguments     -- interaction elements of the current event
          sentenceGraph -- graph providing ``tokens``, ``entitiesById``,
                           ``entityHeadTokenByEntity``, ``tokenIsEntityHead``
                           and ``tokenIsName``
        """
        if len(arguments) < 2:
            return

        # Position of every token within the sentence.
        positionOf = dict((tok, pos) for pos, tok in enumerate(sentenceGraph.tokens))

        headPositions = []
        for argument in arguments:
            entity = sentenceGraph.entitiesById[argument.get("e2")]
            headToken = sentenceGraph.entityHeadTokenByEntity[entity]
            headPositions.append(positionOf[headToken])
        leftmost = min(headPositions)
        rightmost = max(headPositions)
        width = rightmost - leftmost
        self.setFeature("argBoWRange", width)
        self.setFeature("argBoWRange_" + str(width), 1)

        # Distinct texts of the plain tokens strictly between the two ends.
        texts = set()
        for pos in range(leftmost + 1, rightmost):
            between = sentenceGraph.tokens[pos]
            if len(sentenceGraph.tokenIsEntityHead[between]) != 0 or sentenceGraph.tokenIsName[between]:
                continue
            texts.add(between.get("text"))

        ordered = sorted(texts)
        for text in ordered:
            self.setFeature("argBoW_" + text, 1)
            if text == "/" or text == "-":
                self.setFeature("argBoW_slashOrHyphen", 1)
        if len(ordered) == 1:
            only = ordered[0]
            self.setFeature("argBoWonly_" + only, 1)
            if only == "/" or only == "-":
                self.setFeature("argBoWonly_slashOrHyphen", 1)