Example #1
0
class PathGazetteer(ExampleBuilder):
    def __init__(self, includeNeg=False):
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(None)
        self.gazetteer = {}
        self.includeNeg = includeNeg

    @classmethod
    def build(cls, input, output, parse, tokenization=None, includeNeg=False):
        p = PathGazetteer(includeNeg)
        sentences = cls.getSentences(input, parse, tokenization)

        counter = ProgressCounter(len(sentences), "Build path gazetteer")
        for sentence in sentences:
            counter.update(
                1, "Building path gazetteer (" + sentence[0].getSentenceId() +
                "): ")
            p.processSentence(sentence[0])
        p.calculateFractions()

        f = open(output, "wt")
        for key in sorted(p.gazetteer.keys()):
            v = p.gazetteer[key]
            f.write(key + " " + str(v[0]) + " " + str(v[1]) + " " + str(v[2]) +
                    " " + str(v[3]) + "\n")
        f.close()

    @classmethod
    def load(cls, filename):
        gazetteer = {}
        f = open(filename, "rt")
        for line in f.readlines():
            splits = line.split()
            gazetteer[splits[0]] = (splits[1], splits[2], splits[3], splits[4])
        f.close()
        return gazetteer

    @classmethod
    def getDependencies(cls, gazetteer):
        dependencies = set()
        for k in gazetteer.keys():
            k = k.strip()
            k = k.replace("<", "")
            k = k.replace(">", "")
            for dep in k.split("."):
                dependencies.add(dep)
        return dependencies

    @classmethod
    def getPairs(self, gazetteer):
        pairs = []
        for k in gazetteer.keys():
            k = k.strip()
            k = k.replace("<", "")
            k = k.replace(">", "")
            splits = k.split(".")
            pairs.append((splits[0], splits[-1]))
        return pairs

    def calculateFractions(self):
        numInstances = 0
        numPos = 0
        numNeg = 0
        for v in self.gazetteer.values():
            numInstances += v[2] + v[3]
            numPos += v[2]
            numNeg += v[3]
        print >> sys.stderr, "Total paths:", numInstances
        print >> sys.stderr, "Positives:", numPos
        print >> sys.stderr, "Negatives:", numNeg

        for v in self.gazetteer.values():
            assert v[0] == None and v[1] == None
            if v[3] == None:
                assert v[2] != None
                v[0] = 1
            else:
                v[0] = float(v[2]) / float(v[2] + v[3])
            v[1] = float(v[2] + v[3]) / float(numInstances)

    def nxMultiDiGraphToUndirected(self, graph):
        undirected = NX10.MultiGraph(name=graph.name)
        undirected.add_nodes_from(graph)
        undirected.add_edges_from(graph.edges_iter())
        return undirected

    def processSentence(self, sentenceGraph):
        #undirected = sentenceGraph.dependencyGraph.to_undirected()
        undirected = self.nxMultiDiGraphToUndirected(
            sentenceGraph.dependencyGraph)
        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
        self.multiEdgeFeatureBuilder.setFeatureVector()

        positivePaths = {
        }  # per sentence, a path may still be negative in another sentence
        for interaction in sentenceGraph.interactions:
            e1 = sentenceGraph.entitiesById[interaction.get("e1")]
            e1Token = sentenceGraph.entityHeadTokenByEntity[e1]
            e2 = sentenceGraph.entitiesById[interaction.get("e2")]
            e2Token = sentenceGraph.entityHeadTokenByEntity[e2]

            if paths.has_key(e1Token) and paths[e1Token].has_key(e2Token):
                if not positivePaths.has_key(e1Token):
                    positivePaths[e1Token] = {}
                positivePaths[e1Token][e2Token] = True

                path = paths[e1Token][e2Token]
                for comb in self.multiEdgeFeatureBuilder.getEdgeCombinations(
                        sentenceGraph.dependencyGraph, path):
                    if not self.gazetteer.has_key(comb):
                        self.gazetteer[comb] = [None, None, 0, 0]
                    self.gazetteer[comb][2] += 1

        if self.includeNeg:
            for t1 in sentenceGraph.tokens:
                for t2 in sentenceGraph.tokens:
                    if t1 == t2:
                        continue
                    if positivePaths.has_key(t1) and positivePaths[t1].has_key(
                            t2):
                        continue

                    if paths.has_key(t1) and paths[t1].has_key(t2):
                        path = paths[t1][t2]
                        for comb in self.multiEdgeFeatureBuilder.getEdgeCombinations(
                                sentenceGraph.dependencyGraph, path):
                            if not self.gazetteer.has_key(comb):
                                self.gazetteer[comb] = [None, None, 0, 0]
                            self.gazetteer[comb][3] += 1
class DirectEventExampleBuilder(ExampleBuilder):
    def __init__(self, style=["typed","directed","headsOnly"], length=None, types=[], featureSet=None, classSet=None, gazetteer=None, pathGazetteer=None, negFrac=None):
        if featureSet == None:
            featureSet = IdSet()
        if classSet == None:
            classSet = IdSet(1)
        else:
            classSet = classSet
        assert( classSet.getId("neg") == 1 )
        
        if gazetteer != None:
            print >> sys.stderr, "Loading gazetteer from", gazetteer
            self.gazetteer=Gazetteer.loadGztr(gazetteer)
        else:
            print >> sys.stderr, "No gazetteer loaded"
            self.gazetteer=None
        
        self.pathGazetteer=None
        self.pathGazetteerDependencies = None
        self.pathGazetteerPairs = None
        if pathGazetteer != None:
            print >> sys.stderr, "Loading path gazetteer from", pathGazetteer
            self.pathGazetteer=PathGazetteer.load(pathGazetteer)
            self.pathGazetteerDependencies = PathGazetteer.getDependencies(self.pathGazetteer)
            self.pathGazetteerPairs = PathGazetteer.getPairs(self.pathGazetteer)
        else:
            print >> sys.stderr, "No path gazetteer loaded"
        
        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        self.styles = style
        self.negFrac = negFrac
        print >> sys.stderr, "Downsampling negatives to", negFrac
        self.negRand = random.Random()
        
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        if True:#"noAnnType" in self.styles:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True
        
        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
        #self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        #if "ontology" in self.styles:
        #    self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
        self.pathLengths = length
        assert(self.pathLengths == None)
        self.types = types
        
        self.eventsByOrigId = {}
        self.headTokensByOrigId = {}
        self.interSentenceEvents = set()
        
        self.examplesByEventOrigId = {}
        self.skippedByType = {}
        self.skippedByTypeAndReason = {}
        self.builtByType = {}
        
        self.gazMatchCache = {}
        
        #self.outFile = open("exampleTempFile.txt","wt")

    @classmethod
    def run(cls, input, output, parse, tokenization, style, idFileTag=None, gazetteer=None, pathGazetteer=None, negFrac=None):
        classSet, featureSet = cls.getIdSets(idFileTag)
        if style != None:
            e = DirectEventExampleBuilder(style=style, classSet=classSet, featureSet=featureSet, gazetteer=gazetteer, pathGazetteer=pathGazetteer, negFrac=negFrac)
        else:
            e = DirectEventExampleBuilder(classSet=classSet, featureSet=featureSet, gazetteer=gazetteer, pathGazetteer=pathGazetteer, negFrac=negFrac)
        sentences = cls.getSentences(input, parse, tokenization)
        e.buildExamplesForSentences(sentences, output, idFileTag)
        e.printStats()
    
    def getGazetteerMatch(self, string):
        if string in self.gazMatchCache:
            return self.gazMatchCache[string]
        
        origString = string
        if "stem_gazetteer" in self.styles:
            string = PorterStemmer.stem(string)
        
        if string in self.gazetteer:
            self.gazMatchCache[origString] = string
            return string
        elif string.find("-") != -1:
            replaced = string.replace("-","")
        else:
            self.gazMatchCache[origString] = None
            return None
        
        if replaced in self.gazetteer:
            self.gazMatchCache[origString] = replaced
            return replaced
        else:
            splitted = string.rsplit("-",1)[-1]
        
        if splitted  in self.gazetteer:
            self.gazMatchCache[origString] = splitted
            return splitted
        else:
            self.gazMatchCache[origString] = None
            return None
    
    def isInGazetteer(self, string):
        return self.getGazetteerMatch(string) != None
    
    def printStats(self):
        eventsByType = {}
        for event in self.eventsByOrigId.values():
            eventsByType[event.get("type")] = eventsByType.get(event.get("type"),0) + 1
        
        f = open("missed-events", "wt")
        missedEvents = {}
        for key in self.examplesByEventOrigId.keys():
            if self.examplesByEventOrigId[key] == 0:
                if not missedEvents.has_key(self.eventsByOrigId[key].get("type")):
                    missedEvents[self.eventsByOrigId[key].get("type")] = []
                missedEvents[self.eventsByOrigId[key].get("type")].append(key)

        for key in sorted(missedEvents.keys()):
            f.write(key + "\n")
            for id in sorted(missedEvents[key]):
                f.write(" " + id + " ")
                if id in self.interSentenceEvents:
                    f.write("intersentence ")
                text = self.headTokensByOrigId[id].get("text").lower()
                if not self.isInGazetteer(text):
                    text = self.headTokensByOrigId[id].get("text").lower()
                    if "stem_gazetteer" in self.styles:
                        stemmed = PorterStemmer.stem(text)
                    f.write("not-in-gazetteer (" + text + " / " + stemmed +")" )
                f.write("\n")
        f.close()

        print >> sys.stderr, "Example selection missed events (other, intersentence, non-gazetteer)"
        for key in sorted(eventsByType.keys()):
            inter = 0
            other = 0
            nongaz = 0
            if missedEvents.has_key(key):
                for id in missedEvents[key]:
                    tokText = self.headTokensByOrigId[id].get("text").lower()
                    if id in self.interSentenceEvents:
                        inter += 1
                    elif not self.isInGazetteer(tokText):
                        nongaz += 1
                    else:
                        other += 1
            if inter == other == nongaz == 0:
                print >> sys.stderr, " " + key + " (" + str(eventsByType[key]) + "): missed none"
            else:
                print >> sys.stderr, " " + key + " (" + str(eventsByType[key]) + "): " + str(other) + ", " + str(inter) + ", " + str(nongaz)
        print >> sys.stderr, "Example generation (total, built/skipped)"
        for key in sorted(list(set(self.skippedByType.keys() + self.builtByType.keys()))):
            string = " " + key + ": (" + str(self.builtByType.get(key,0)+self.skippedByType.get(key,0)) + ", " + str(self.builtByType.get(key,0)) + "/" + str(self.skippedByType.get(key,0)) + ") ["
            for key2 in sorted(self.skippedByTypeAndReason[key].keys()):
                string += key2 + ":" + str(self.skippedByTypeAndReason[key][key2]) + " "
            string += "]"
            print >> sys.stderr, string
    
    def definePredictedValueRange(self, sentences, elementName):
        self.multiEdgeFeatureBuilder.definePredictedValueRange(sentences, elementName)                        
    
    def getPredictedValueRange(self):
        return self.multiEdgeFeatureBuilder.predictedRange
    
    def preProcessExamples(self, allExamples):
        if "normalize" in self.styles:
            print >> sys.stderr, " Normalizing feature vectors"
            ExampleUtils.normalizeFeatureVectors(allExamples)
        return allExamples   
    
#    def isPotentialGeniaInteraction(self, e1, e2):
#        if e1.get("isName") == "True" and e2.get("isName") == "True":
#            return False
#        elif e1.get("isName") == "True" and e2.get("isName") == "False":
#            return False
#        else:
#            return True
    
    def getArgumentEntities(self, sentenceGraph, entityNode):
        eId = entityNode.get("id")
        assert(eId != None)
        themeNodes = []
        causeNodes = []
        for edge in sentenceGraph.interactions:
            if edge.get("e1") == eId:
                edgeType = edge.get("type")
                assert(edgeType in ["Theme", "Cause"]), edgeType
                if edgeType == "Theme":
                    themeNodes.append( sentenceGraph.entitiesById[edge.get("e2")] )
                elif edgeType == "Cause":
                    causeNodes.append( sentenceGraph.entitiesById[edge.get("e2")] )
        return themeNodes, causeNodes
    
    def makeGSEvents(self, sentenceGraph):
        self.namedEntityHeadTokenIds = set()
        self.gsEvents = {} # [token]->[event-type]->[1-n argument sets]
        for token in sentenceGraph.tokens:
            self.gsEvents[token] = {}
        
        for entity in sentenceGraph.entities:
            if entity.get("type") == "neg":
                continue
            elif entity.get("isName") == "True":
                self.namedEntityHeadTokenIds.add(sentenceGraph.entityHeadTokenByEntity[entity].get("id"))
                continue
            
            eId = entity.get("id")
            eOrigId = entity.get("origId")
            assert not self.eventsByOrigId.has_key(eOrigId)
            self.eventsByOrigId[eOrigId] = entity
            if not self.examplesByEventOrigId.has_key(eOrigId):
                self.examplesByEventOrigId[eOrigId] = 0
            if len(sentenceGraph.interSentenceInteractions) > 0:
                for interaction in sentenceGraph.interSentenceInteractions:
                    if interaction.get("e1") == eId:
                        self.interSentenceEvents.add(eOrigId)
            eType = entity.get("type")
            arguments = set()
            for interaction in sentenceGraph.interactions:
                if interaction.get("e1") == eId:
                    e2 = sentenceGraph.entitiesById[interaction.get("e2")]
                    e2TokenId = sentenceGraph.entityHeadTokenByEntity[e2].get("id")
                    arguments.add( (interaction.get("type"), e2TokenId ) )
                    #arguments.add( (interaction.get("type"), interaction.get("e2") ) )
            arguments = tuple(sorted(list(arguments)))
            eHeadToken = sentenceGraph.entityHeadTokenByEntity[entity]
            self.headTokensByOrigId[eOrigId] = eHeadToken
            if not self.gsEvents[eHeadToken].has_key(eType):
                self.gsEvents[eHeadToken][eType] = {}
            if len(arguments) > 0:
                if not self.gsEvents[eHeadToken][eType].has_key(arguments):
                    self.gsEvents[eHeadToken][eType][arguments] = []
                self.gsEvents[eHeadToken][eType][arguments].append(eOrigId)
    
    def getGSEventType(self, sentenceGraph, eHeadToken, themeTokens, causeTokens):
        #eHeadToken = sentenceGraph.entityHeadTokenByEntity[entity]
        #eType = entity.get("type")
        if len(self.gsEvents[eHeadToken]) == 0:
            return "neg", []
            
        argumentSet = set()
        for themeNode in themeTokens:
            if themeNode != None:
                argumentSet.add( ("Theme", themeNode.get("id")) )
        for causeNode in causeTokens:
            if causeNode != None:
                argumentSet.add( ("Cause", causeNode.get("id")) )
        argumentSet = tuple(sorted(list(argumentSet)))
        
        gsTypes = set()
        eventIds = []
        for eventType in sorted(self.gsEvents[eHeadToken].keys()):
            if argumentSet in self.gsEvents[eHeadToken][eventType].keys():
                gsTypes.add(eventType)
                eventIds.extend(self.gsEvents[eHeadToken][eventType][argumentSet])
        
        if len(gsTypes) == 0:
            return "neg", eventIds
        elif len(gsTypes) == 1:
            return list(gsTypes)[0], eventIds
        else:
            gsTypes = sorted(list(gsTypes))
            string = gsTypes[0]
            for gsType in gsTypes[1:]:
                string += "---" + gsType
            return string, eventIds

    def nxMultiDiGraphToUndirected(self, graph):
        undirected = NX10.MultiGraph(name=graph.name)
        undirected.add_nodes_from(graph)
        undirected.add_edges_from(graph.edges_iter())
        return undirected
                        
    def buildExamples(self, sentenceGraph):
        self.makeGSEvents(sentenceGraph)
        self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
        self.triggerFeatureBuilder.initSentence(sentenceGraph)
        
        examples = []
        exampleIndex = 0
        
        #undirected = sentenceGraph.dependencyGraph.to_undirected()
        undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
        
        eventTokens = []
        nameTokens = []
        gazCategories = {None:{"neg":-1}}
        #stems = {}
        for token in sentenceGraph.tokens:
            gazText = self.getGazetteerMatch(token.get("text").lower())
            if gazText != None:
                gazCategories[token] = self.gazetteer[gazText]
            else:
                gazCategories[token] = {"neg":-1}
            
            if token.get("id") in self.namedEntityHeadTokenIds:
                nameTokens.append(token)
            elif gazText != None:
                eventTokens.append(token)
        allTokens = eventTokens + nameTokens
        
        #if len(nameTokens) == 0: # there can be no events in this sentence
        #    self.gsEvents = None
        #    return []
        
        for token in eventTokens:
            #gazCategories = self.gazetteer[token.get("text").lower()]
            #print token.get("text").lower(), gazCategories
            
            #multiargument = False
            potentialRegulation = False
            potentialBinding = False
            for key in gazCategories[token].keys():
                if key in ["Regulation","Positive_regulation","Negative_regulation"]:
                    #multiargument = True
                    potentialRegulation = True
                    break
            for key in gazCategories[token].keys():
                if key in ["Binding"]:
                    #multiargument = True
                    potentialBinding = True
                    break
            
            if potentialRegulation:
                combinations = combine.combine(allTokens, allTokens+[None])
            else:
                combinations = []
                for t2 in nameTokens: #allTokens:
                    combinations.append( (t2, None) )
            
            if potentialBinding:
                for i in range(len(nameTokens) - 1):
                    for j in range(i+1, len(nameTokens)):
                        combinations.append( ((nameTokens[i],nameTokens[j]), None) )
                
            for combination in combinations:
                theme2Binding = False
                if type(combination[0]) == types.ListType or type(combination[0]) == types.TupleType:
                    theme2Binding = True
                    categoryName, eventIds = self.getGSEventType(sentenceGraph, token, combination[0], [combination[1]])
                else:
                    categoryName, eventIds = self.getGSEventType(sentenceGraph, token, [combination[0]], [combination[1]])
                
                for id in eventIds:
                    self.examplesByEventOrigId[id] += 1
                

                skip = False
                s = self.skippedByTypeAndReason
                if not s.has_key(categoryName):
                    s[categoryName] = {}
                if gazCategories[token].get("neg",-1) > 0.99:
                    pass
                if combination[0] == combination[1]:
                    pass #skip = True
                if combination[0] == token or combination[1] == token:
                    if theme2Binding or gazCategories[combination[0]].get("Positive_regulation",-1) < 0:
                        skip = True
                        s[categoryName]["duparg"] = s[categoryName].get("duparg", 0) + 1
                if combination[0] == None and combination[1] == None:
                    skip = True
                    s[categoryName]["noncmb"] = s[categoryName].get("noncmb", 0) + 1
                
                validCat = self.isValidEvent(paths, sentenceGraph, token, combination)
                if validCat != "OK": #not self.isValidEvent(paths, sentenceGraph, token, combination):
                    skip = True
                    #s[categoryName]["valid"] = s[categoryName].get("valid", 0) + 1
                    s[categoryName][validCat] = s[categoryName].get(validCat, 0) + 1
                
                if len(nameTokens) == 0:
                    skip = True
                    s[categoryName]["non"] = s[categoryName].get("non", 0) + 1
                
                if theme2Binding:
                    if gazCategories[combination[0][0]].get("neg",-1) > 0.99 or gazCategories[combination[0][1]].get("neg",-1) > 0.99:
                        skip = True
                        s[categoryName]["gazarg"] = s[categoryName].get("gazarg", 0) + 1
                else:
                    if gazCategories[combination[0]].get("neg",-1) > 0.99 or gazCategories[combination[1]].get("neg",-1) > 0.99:
                        skip = True
                        s[categoryName]["gazarg"] = s[categoryName].get("gazarg", 0) + 1
                
                if (skip and self.negFrac == None) or (skip and self.negFrac != None and categoryName == "neg"):
                    self.skippedByType[categoryName] = self.skippedByType.get(categoryName, 0) + 1
                else:
                    if self.negFrac == None or categoryName != "neg" or (categoryName == "neg" and self.negRand.random() < self.negFrac):
                        self.builtByType[categoryName] = self.builtByType.get(categoryName, 0) + 1
                        if theme2Binding:
                            newExample = self.buildExample(exampleIndex, sentenceGraph, paths, token, combination[0], [combination[1]])
                        else:
                            newExample = self.buildExample(exampleIndex, sentenceGraph, paths, token, [combination[0]], [combination[1]])
                        if len(eventIds) > 0: 
                            newExample[3]["numEv"] = str(len(eventIds))
                        examples.append( newExample )
                        exampleIndex += 1
        
        self.gsEvents = None
        return examples
    
    def isValidEvent(self, paths, sentenceGraph, eventToken, argTokens):
        # This one lets through Positive_regulations that are
        # excluded from the duparg-rule
        oneTokenEvent = True
        for argToken in argTokens:
            if argToken != None and eventToken != argToken:
                oneTokenEvent = False
                break
        if oneTokenEvent:
            return "OK" #True
            
        if not paths.has_key(eventToken):
            return "nopaths" #False
        
        newArgTokens = []
        for argToken in argTokens:
            if type(argToken) == types.ListType or type(argToken) == types.TupleType:
                newArgTokens.extend(argToken)
            else:
                newArgTokens.append(argToken)
        argTokens = newArgTokens
        
        oneArgValid = True
        if False:
            oneArgValid = False    
            for argToken in argTokens:
                if argToken == None:
                    continue
                if paths[eventToken].has_key(argToken):
                    path = paths[eventToken][argToken]
                else:
                    #print argToken, argToken.get("text")
                    #return False
                    continue
                depPaths = self.multiEdgeFeatureBuilder.getEdgeCombinations(sentenceGraph.dependencyGraph, path)
                validArg = False
                for p in depPaths:
                    if p in self.pathGazetteer and self.pathGazetteer[p][0] > 0:
                        validArg = True
                        break
                if validArg:
                    oneArgValid = True
        
        # The first and last dependency of a path
        if False:
            oneEdgeValid = False
            for argToken in argTokens:
                if argToken == None:
                    continue
                if paths[eventToken].has_key(argToken):
                    path = paths[eventToken][argToken]
                else:
                    #print argToken, argToken.get("text")
                    #return False
                    continue
                depPaths = self.multiEdgeFeatureBuilder.getEdgeCombinations(sentenceGraph.dependencyGraph, path)
                validArg = False
                for p in depPaths:
                    p = p.replace("<","")
                    p = p.replace(">","")
                    p = p.split(".")
                    pair = (p[0], p[-1])
                    if pair in self.pathGazetteerPairs:
                        validArg = True
                        break
                if validArg:
                    oneEdgeValid = True
                    break
            if not oneEdgeValid:
                return "pair"

        # Event must not have unseen dependencies in any of its paths
        if False:
            for argToken in argTokens:
                if argToken == None:
                    continue
                if paths[eventToken].has_key(argToken):
                    path = paths[eventToken][argToken]
                else:
                    continue
                deps = self.multiEdgeFeatureBuilder.getEdgeSet(sentenceGraph.dependencyGraph, path)
                for d in deps:
                    if d[2].get("type") not in self.pathGazetteerDependencies:
                        #print "Unk", d[2].get("type")
                        return "unkdep"
                
#            validArg = True
#            for p in depPaths:
#                if p in self.pathGazetteer and self.pathGazetteer[p][0] == 0:
#                    validArg = False
#                    break
            
#            if not validArg:
#                return False
        if not oneArgValid:
            return "novalidarg" #False
        return "OK" #True
    
    def setGazetteerFeatures(self, token, tag):
        gazText = self.getGazetteerMatch(token.get("text").lower())
        if gazText != None:
            gazCategories = self.gazetteer[gazText]
            for k,v in gazCategories.iteritems():
                self.setFeature(tag+"gaz_event_value_"+k, v)
                self.setFeature(tag+"gaz_event_"+k, 1)
                if k.find("egulation") != -1:
                    self.setFeature(tag+"potReg", 1)
        else:
            self.setFeature(tag+"notInGaz", 1)
    
    def buildExample(self, exampleIndex, sentenceGraph, paths, eventToken, themeTokens, causeTokens=None):
        features = {}
        self.features = features
        
        categoryName, eventIds = self.getGSEventType(sentenceGraph, eventToken, themeTokens, causeTokens)
        category = self.classSet.getId(categoryName)
        
        potentialRegulation = False
        eventTokenText = eventToken.get("text").lower()
        gazText = self.getGazetteerMatch(eventTokenText)
        gazCategories = self.gazetteer[gazText]
        for k,v in gazCategories.iteritems():
            if k.find("egulation") != -1:
                potentialRegulation = True
        self.setGazetteerFeatures(eventToken,"")
        
        self.triggerFeatureBuilder.setFeatureVector(self.features)
        self.triggerFeatureBuilder.tag = "trg_"
        self.triggerFeatureBuilder.buildFeatures(eventToken)
        
        themeEntities = []
        hasTheme = False
        if len(themeTokens) > 1:
            self.setFeature("multiTheme", 1)
            potentialRegulation = False
        for themeToken in themeTokens:
            if themeToken != None:
                hasTheme = True
                self.setGazetteerFeatures(themeToken,"theme_")
                self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, themeToken, "theme_")
                self.triggerFeatureBuilder.tag = "ttrg_"
                self.triggerFeatureBuilder.buildFeatures(themeToken)
                themeEntity = None
                if sentenceGraph.entitiesByToken.has_key(themeToken):
                    for themeEntity in sentenceGraph.entitiesByToken[themeToken]:
                        if themeEntity.get("isName") == "True":
                            self.setFeature("themeProtein", 1)
                            if potentialRegulation:
                                self.setFeature("regulationThemeProtein", 1)
                            themeEntities.append(themeEntity)
                            break
                if not features.has_key("themeProtein"):
                    self.setFeature("themeEvent", 1)
                    self.setFeature("nestingEvent", 1)
                    if potentialRegulation:
                        self.setFeature("regulationThemeEvent", 1)
        if hasTheme:
            self.setFeature("noTheme", 1)
        
        causeEntities = []
        hasCause = False
        for causeToken in causeTokens:
            if causeToken != None:
                hasCause = True
                self.setGazetteerFeatures(causeToken,"cause_")
                self.buildArgumentFeatures(sentenceGraph, paths, features, eventToken, causeToken, "cause_")
                self.triggerFeatureBuilder.tag = "ctrg_"
                self.triggerFeatureBuilder.buildFeatures(causeToken)
                causeEntity = None
                if sentenceGraph.entitiesByToken.has_key(causeToken):
                    for causeEntity in sentenceGraph.entitiesByToken[causeToken]:
                        if causeEntity.get("isName") == "True":
                            self.setFeature("causeProtein", 1)
                            if potentialRegulation:
                                self.setFeature("regulationCauseProtein", 1)
                            causeEntities.append(causeEntity)
                            break
                if not features.has_key("causeProtein"):
                    self.setFeature("causeEvent", 1)
                    self.setFeature("nestingEvent", 1)
                    if potentialRegulation:
                        self.setFeature("regulationCauseEvent", 1)
        if not hasCause:
            self.setFeature("noCause", 1)
        
        self.triggerFeatureBuilder.tag = ""
        self.triggerFeatureBuilder.setFeatureVector(None)
        
        # Common features
#        if e1Type.find("egulation") != -1: # leave r out to avoid problems with capitalization
#            if entity2.get("isName") == "True":
#                features[self.featureSet.getId("GENIA_regulation_of_protein")] = 1
#            else:
#                features[self.featureSet.getId("GENIA_regulation_of_event")] = 1

        # define extra attributes
        extra = {"xtype":"event","type":categoryName}
        extra["et"] = eventToken.get("id")
        if len(eventIds) > 0:
            eventIds.sort()
            extra["eids"] = ""
            for eventId in eventIds:
                extra["eids"] += str(eventId) + ","
            extra["eids"] = extra["eids"][:-1]
        for themeToken in themeTokens:
            if themeToken != None:
                if extra.has_key("tt"):
                    extra["tt"] = extra["tt"] + "," + themeToken.get("id")
                else:
                    extra["tt"] = themeToken.get("id")
        for themeEntity in themeEntities:
            if extra.has_key("t"):
                extra["t"] = extra["t"] + "," + themeEntity.get("id")
            else:
                extra["t"] = themeEntity.get("id")
        for causeToken in causeTokens:
            if causeToken != None:
                extra["ct"] = causeTokens[0].get("id")
        if len(causeEntities) > 0:
            extra["c"] = causeEntities[0].get("id")
        sentenceOrigId = sentenceGraph.sentenceElement.get("origId")
        if sentenceOrigId != None:
            extra["SOID"] = sentenceOrigId       
        # make example
        #assert (category == 1 or category == -1)
        self.features = None      
        return (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)
    
    def buildArgumentFeatures(self, sentenceGraph, paths, features, eventToken, argToken, tag):
        #eventToken = sentenceGraph.entityHeadTokenByEntity[eventNode]
        #argToken = sentenceGraph.entityHeadTokenByEntity[argNode]
        self.multiEdgeFeatureBuilder.tag = tag
        self.multiEdgeFeatureBuilder.setFeatureVector(features, None, None, False)
        
        if eventToken != argToken and paths.has_key(eventToken) and paths[eventToken].has_key(argToken):
            path = paths[eventToken][argToken]
            edges = self.multiEdgeFeatureBuilder.getEdges(sentenceGraph.dependencyGraph, path)
        else:
            path = [eventToken, argToken]
            edges = None
        
#        if not "disable_entity_features" in self.styles:
#            # doesn't improve beyond 52.32
#            self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph)
#        # buildPathLengthFeatures 52.32 -> 51-51
#        self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path)
#        if not "disable_terminus_features" in self.styles:
#            # didn't improve from 52.32
#            self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast
        if not "disable_single_element_features" in self.styles:
            # 50.74 -> 52.32
            self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, edges, sentenceGraph)
        if not "disable_ngram_features" in self.styles:
            # ngrams alone - 50.74
            self.multiEdgeFeatureBuilder.buildPathGrams(2, path, edges, sentenceGraph) # remove for fast
            self.multiEdgeFeatureBuilder.buildPathGrams(3, path, edges, sentenceGraph) # remove for fast
            self.multiEdgeFeatureBuilder.buildPathGrams(4, path, edges, sentenceGraph) # remove for fast
            # disabling length 4 drops performance
#        if not "disable_path_edge_features" in self.styles:
#            self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, edges, sentenceGraph)
#        self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph)
# buildSentenceFeatures seems to decrease performance by 8 %-points
        self.multiEdgeFeatureBuilder.setFeatureVector(None, None, None, False)
        self.multiEdgeFeatureBuilder.tag = ""
class DirectEventExampleBuilder(ExampleBuilder):
    def __init__(self,
                 style=["typed", "directed", "headsOnly"],
                 length=None,
                 types=[],
                 featureSet=None,
                 classSet=None,
                 gazetteer=None,
                 pathGazetteer=None,
                 negFrac=None):
        if featureSet == None:
            featureSet = IdSet()
        if classSet == None:
            classSet = IdSet(1)
        else:
            classSet = classSet
        assert (classSet.getId("neg") == 1)

        if gazetteer != None:
            print >> sys.stderr, "Loading gazetteer from", gazetteer
            self.gazetteer = Gazetteer.loadGztr(gazetteer)
        else:
            print >> sys.stderr, "No gazetteer loaded"
            self.gazetteer = None

        self.pathGazetteer = None
        self.pathGazetteerDependencies = None
        self.pathGazetteerPairs = None
        if pathGazetteer != None:
            print >> sys.stderr, "Loading path gazetteer from", pathGazetteer
            self.pathGazetteer = PathGazetteer.load(pathGazetteer)
            self.pathGazetteerDependencies = PathGazetteer.getDependencies(
                self.pathGazetteer)
            self.pathGazetteerPairs = PathGazetteer.getPairs(
                self.pathGazetteer)
        else:
            print >> sys.stderr, "No path gazetteer loaded"

        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        self.styles = style
        self.negFrac = negFrac
        print >> sys.stderr, "Downsampling negatives to", negFrac
        self.negRand = random.Random()

        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        if True:  #"noAnnType" in self.styles:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True

        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
        #self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        #if "ontology" in self.styles:
        #    self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
        self.pathLengths = length
        assert (self.pathLengths == None)
        self.types = types

        self.eventsByOrigId = {}
        self.headTokensByOrigId = {}
        self.interSentenceEvents = set()

        self.examplesByEventOrigId = {}
        self.skippedByType = {}
        self.skippedByTypeAndReason = {}
        self.builtByType = {}

        self.gazMatchCache = {}

        #self.outFile = open("exampleTempFile.txt","wt")

    @classmethod
    def run(cls,
            input,
            output,
            parse,
            tokenization,
            style,
            idFileTag=None,
            gazetteer=None,
            pathGazetteer=None,
            negFrac=None):
        classSet, featureSet = cls.getIdSets(idFileTag)
        if style != None:
            e = DirectEventExampleBuilder(style=style,
                                          classSet=classSet,
                                          featureSet=featureSet,
                                          gazetteer=gazetteer,
                                          pathGazetteer=pathGazetteer,
                                          negFrac=negFrac)
        else:
            e = DirectEventExampleBuilder(classSet=classSet,
                                          featureSet=featureSet,
                                          gazetteer=gazetteer,
                                          pathGazetteer=pathGazetteer,
                                          negFrac=negFrac)
        sentences = cls.getSentences(input, parse, tokenization)
        e.buildExamplesForSentences(sentences, output, idFileTag)
        e.printStats()

    def getGazetteerMatch(self, string):
        if string in self.gazMatchCache:
            return self.gazMatchCache[string]

        origString = string
        if "stem_gazetteer" in self.styles:
            string = PorterStemmer.stem(string)

        if string in self.gazetteer:
            self.gazMatchCache[origString] = string
            return string
        elif string.find("-") != -1:
            replaced = string.replace("-", "")
        else:
            self.gazMatchCache[origString] = None
            return None

        if replaced in self.gazetteer:
            self.gazMatchCache[origString] = replaced
            return replaced
        else:
            splitted = string.rsplit("-", 1)[-1]

        if splitted in self.gazetteer:
            self.gazMatchCache[origString] = splitted
            return splitted
        else:
            self.gazMatchCache[origString] = None
            return None

    def isInGazetteer(self, string):
        return self.getGazetteerMatch(string) != None

    def printStats(self):
        eventsByType = {}
        for event in self.eventsByOrigId.values():
            eventsByType[event.get("type")] = eventsByType.get(
                event.get("type"), 0) + 1

        f = open("missed-events", "wt")
        missedEvents = {}
        for key in self.examplesByEventOrigId.keys():
            if self.examplesByEventOrigId[key] == 0:
                if not missedEvents.has_key(
                        self.eventsByOrigId[key].get("type")):
                    missedEvents[self.eventsByOrigId[key].get("type")] = []
                missedEvents[self.eventsByOrigId[key].get("type")].append(key)

        for key in sorted(missedEvents.keys()):
            f.write(key + "\n")
            for id in sorted(missedEvents[key]):
                f.write(" " + id + " ")
                if id in self.interSentenceEvents:
                    f.write("intersentence ")
                text = self.headTokensByOrigId[id].get("text").lower()
                if not self.isInGazetteer(text):
                    text = self.headTokensByOrigId[id].get("text").lower()
                    if "stem_gazetteer" in self.styles:
                        stemmed = PorterStemmer.stem(text)
                    f.write("not-in-gazetteer (" + text + " / " + stemmed +
                            ")")
                f.write("\n")
        f.close()

        print >> sys.stderr, "Example selection missed events (other, intersentence, non-gazetteer)"
        for key in sorted(eventsByType.keys()):
            inter = 0
            other = 0
            nongaz = 0
            if missedEvents.has_key(key):
                for id in missedEvents[key]:
                    tokText = self.headTokensByOrigId[id].get("text").lower()
                    if id in self.interSentenceEvents:
                        inter += 1
                    elif not self.isInGazetteer(tokText):
                        nongaz += 1
                    else:
                        other += 1
            if inter == other == nongaz == 0:
                print >> sys.stderr, " " + key + " (" + str(
                    eventsByType[key]) + "): missed none"
            else:
                print >> sys.stderr, " " + key + " (" + str(
                    eventsByType[key]) + "): " + str(other) + ", " + str(
                        inter) + ", " + str(nongaz)
        print >> sys.stderr, "Example generation (total, built/skipped)"
        for key in sorted(
                list(set(self.skippedByType.keys() +
                         self.builtByType.keys()))):
            string = " " + key + ": (" + str(
                self.builtByType.get(key, 0) +
                self.skippedByType.get(key, 0)) + ", " + str(
                    self.builtByType.get(key, 0)) + "/" + str(
                        self.skippedByType.get(key, 0)) + ") ["
            for key2 in sorted(self.skippedByTypeAndReason[key].keys()):
                string += key2 + ":" + str(
                    self.skippedByTypeAndReason[key][key2]) + " "
            string += "]"
            print >> sys.stderr, string

    def definePredictedValueRange(self, sentences, elementName):
        self.multiEdgeFeatureBuilder.definePredictedValueRange(
            sentences, elementName)

    def getPredictedValueRange(self):
        return self.multiEdgeFeatureBuilder.predictedRange

    def preProcessExamples(self, allExamples):
        if "normalize" in self.styles:
            print >> sys.stderr, " Normalizing feature vectors"
            ExampleUtils.normalizeFeatureVectors(allExamples)
        return allExamples

#    def isPotentialGeniaInteraction(self, e1, e2):
#        if e1.get("isName") == "True" and e2.get("isName") == "True":
#            return False
#        elif e1.get("isName") == "True" and e2.get("isName") == "False":
#            return False
#        else:
#            return True

    def getArgumentEntities(self, sentenceGraph, entityNode):
        eId = entityNode.get("id")
        assert (eId != None)
        themeNodes = []
        causeNodes = []
        for edge in sentenceGraph.interactions:
            if edge.get("e1") == eId:
                edgeType = edge.get("type")
                assert (edgeType in ["Theme", "Cause"]), edgeType
                if edgeType == "Theme":
                    themeNodes.append(
                        sentenceGraph.entitiesById[edge.get("e2")])
                elif edgeType == "Cause":
                    causeNodes.append(
                        sentenceGraph.entitiesById[edge.get("e2")])
        return themeNodes, causeNodes

    def makeGSEvents(self, sentenceGraph):
        self.namedEntityHeadTokenIds = set()
        self.gsEvents = {}  # [token]->[event-type]->[1-n argument sets]
        for token in sentenceGraph.tokens:
            self.gsEvents[token] = {}

        for entity in sentenceGraph.entities:
            if entity.get("type") == "neg":
                continue
            elif entity.get("isName") == "True":
                self.namedEntityHeadTokenIds.add(
                    sentenceGraph.entityHeadTokenByEntity[entity].get("id"))
                continue

            eId = entity.get("id")
            eOrigId = entity.get("origId")
            assert not self.eventsByOrigId.has_key(eOrigId)
            self.eventsByOrigId[eOrigId] = entity
            if not self.examplesByEventOrigId.has_key(eOrigId):
                self.examplesByEventOrigId[eOrigId] = 0
            if len(sentenceGraph.interSentenceInteractions) > 0:
                for interaction in sentenceGraph.interSentenceInteractions:
                    if interaction.get("e1") == eId:
                        self.interSentenceEvents.add(eOrigId)
            eType = entity.get("type")
            arguments = set()
            for interaction in sentenceGraph.interactions:
                if interaction.get("e1") == eId:
                    e2 = sentenceGraph.entitiesById[interaction.get("e2")]
                    e2TokenId = sentenceGraph.entityHeadTokenByEntity[e2].get(
                        "id")
                    arguments.add((interaction.get("type"), e2TokenId))
                    #arguments.add( (interaction.get("type"), interaction.get("e2") ) )
            arguments = tuple(sorted(list(arguments)))
            eHeadToken = sentenceGraph.entityHeadTokenByEntity[entity]
            self.headTokensByOrigId[eOrigId] = eHeadToken
            if not self.gsEvents[eHeadToken].has_key(eType):
                self.gsEvents[eHeadToken][eType] = {}
            if len(arguments) > 0:
                if not self.gsEvents[eHeadToken][eType].has_key(arguments):
                    self.gsEvents[eHeadToken][eType][arguments] = []
                self.gsEvents[eHeadToken][eType][arguments].append(eOrigId)

    def getGSEventType(self, sentenceGraph, eHeadToken, themeTokens,
                       causeTokens):
        #eHeadToken = sentenceGraph.entityHeadTokenByEntity[entity]
        #eType = entity.get("type")
        if len(self.gsEvents[eHeadToken]) == 0:
            return "neg", []

        argumentSet = set()
        for themeNode in themeTokens:
            if themeNode != None:
                argumentSet.add(("Theme", themeNode.get("id")))
        for causeNode in causeTokens:
            if causeNode != None:
                argumentSet.add(("Cause", causeNode.get("id")))
        argumentSet = tuple(sorted(list(argumentSet)))

        gsTypes = set()
        eventIds = []
        for eventType in sorted(self.gsEvents[eHeadToken].keys()):
            if argumentSet in self.gsEvents[eHeadToken][eventType].keys():
                gsTypes.add(eventType)
                eventIds.extend(
                    self.gsEvents[eHeadToken][eventType][argumentSet])

        if len(gsTypes) == 0:
            return "neg", eventIds
        elif len(gsTypes) == 1:
            return list(gsTypes)[0], eventIds
        else:
            gsTypes = sorted(list(gsTypes))
            string = gsTypes[0]
            for gsType in gsTypes[1:]:
                string += "---" + gsType
            return string, eventIds

    def nxMultiDiGraphToUndirected(self, graph):
        undirected = NX10.MultiGraph(name=graph.name)
        undirected.add_nodes_from(graph)
        undirected.add_edges_from(graph.edges_iter())
        return undirected

    def buildExamples(self, sentenceGraph):
        self.makeGSEvents(sentenceGraph)
        self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
        self.triggerFeatureBuilder.initSentence(sentenceGraph)

        examples = []
        exampleIndex = 0

        #undirected = sentenceGraph.dependencyGraph.to_undirected()
        undirected = self.nxMultiDiGraphToUndirected(
            sentenceGraph.dependencyGraph)
        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)

        eventTokens = []
        nameTokens = []
        gazCategories = {None: {"neg": -1}}
        #stems = {}
        for token in sentenceGraph.tokens:
            gazText = self.getGazetteerMatch(token.get("text").lower())
            if gazText != None:
                gazCategories[token] = self.gazetteer[gazText]
            else:
                gazCategories[token] = {"neg": -1}

            if token.get("id") in self.namedEntityHeadTokenIds:
                nameTokens.append(token)
            elif gazText != None:
                eventTokens.append(token)
        allTokens = eventTokens + nameTokens

        #if len(nameTokens) == 0: # there can be no events in this sentence
        #    self.gsEvents = None
        #    return []

        for token in eventTokens:
            #gazCategories = self.gazetteer[token.get("text").lower()]
            #print token.get("text").lower(), gazCategories

            #multiargument = False
            potentialRegulation = False
            potentialBinding = False
            for key in gazCategories[token].keys():
                if key in [
                        "Regulation", "Positive_regulation",
                        "Negative_regulation"
                ]:
                    #multiargument = True
                    potentialRegulation = True
                    break
            for key in gazCategories[token].keys():
                if key in ["Binding"]:
                    #multiargument = True
                    potentialBinding = True
                    break

            if potentialRegulation:
                combinations = combine.combine(allTokens, allTokens + [None])
            else:
                combinations = []
                for t2 in nameTokens:  #allTokens:
                    combinations.append((t2, None))

            if potentialBinding:
                for i in range(len(nameTokens) - 1):
                    for j in range(i + 1, len(nameTokens)):
                        combinations.append(
                            ((nameTokens[i], nameTokens[j]), None))

            for combination in combinations:
                theme2Binding = False
                if type(combination[0]) == types.ListType or type(
                        combination[0]) == types.TupleType:
                    theme2Binding = True
                    categoryName, eventIds = self.getGSEventType(
                        sentenceGraph, token, combination[0], [combination[1]])
                else:
                    categoryName, eventIds = self.getGSEventType(
                        sentenceGraph, token, [combination[0]],
                        [combination[1]])

                for id in eventIds:
                    self.examplesByEventOrigId[id] += 1

                skip = False
                s = self.skippedByTypeAndReason
                if not s.has_key(categoryName):
                    s[categoryName] = {}
                if gazCategories[token].get("neg", -1) > 0.99:
                    pass
                if combination[0] == combination[1]:
                    pass  #skip = True
                if combination[0] == token or combination[1] == token:
                    if theme2Binding or gazCategories[combination[0]].get(
                            "Positive_regulation", -1) < 0:
                        skip = True
                        s[categoryName]["duparg"] = s[categoryName].get(
                            "duparg", 0) + 1
                if combination[0] == None and combination[1] == None:
                    skip = True
                    s[categoryName]["noncmb"] = s[categoryName].get(
                        "noncmb", 0) + 1

                validCat = self.isValidEvent(paths, sentenceGraph, token,
                                             combination)
                if validCat != "OK":  #not self.isValidEvent(paths, sentenceGraph, token, combination):
                    skip = True
                    #s[categoryName]["valid"] = s[categoryName].get("valid", 0) + 1
                    s[categoryName][validCat] = s[categoryName].get(
                        validCat, 0) + 1

                if len(nameTokens) == 0:
                    skip = True
                    s[categoryName]["non"] = s[categoryName].get("non", 0) + 1

                if theme2Binding:
                    if gazCategories[combination[0][0]].get(
                            "neg",
                            -1) > 0.99 or gazCategories[combination[0][1]].get(
                                "neg", -1) > 0.99:
                        skip = True
                        s[categoryName]["gazarg"] = s[categoryName].get(
                            "gazarg", 0) + 1
                else:
                    if gazCategories[combination[0]].get(
                            "neg",
                            -1) > 0.99 or gazCategories[combination[1]].get(
                                "neg", -1) > 0.99:
                        skip = True
                        s[categoryName]["gazarg"] = s[categoryName].get(
                            "gazarg", 0) + 1

                if (skip and self.negFrac
                        == None) or (skip and self.negFrac != None
                                     and categoryName == "neg"):
                    self.skippedByType[categoryName] = self.skippedByType.get(
                        categoryName, 0) + 1
                else:
                    if self.negFrac == None or categoryName != "neg" or (
                            categoryName == "neg"
                            and self.negRand.random() < self.negFrac):
                        self.builtByType[categoryName] = self.builtByType.get(
                            categoryName, 0) + 1
                        if theme2Binding:
                            newExample = self.buildExample(
                                exampleIndex, sentenceGraph, paths, token,
                                combination[0], [combination[1]])
                        else:
                            newExample = self.buildExample(
                                exampleIndex, sentenceGraph, paths, token,
                                [combination[0]], [combination[1]])
                        if len(eventIds) > 0:
                            newExample[3]["numEv"] = str(len(eventIds))
                        examples.append(newExample)
                        exampleIndex += 1

        self.gsEvents = None
        return examples

    def isValidEvent(self, paths, sentenceGraph, eventToken, argTokens):
        # This one lets through Positive_regulations that are
        # excluded from the duparg-rule
        oneTokenEvent = True
        for argToken in argTokens:
            if argToken != None and eventToken != argToken:
                oneTokenEvent = False
                break
        if oneTokenEvent:
            return "OK"  #True

        if not paths.has_key(eventToken):
            return "nopaths"  #False

        newArgTokens = []
        for argToken in argTokens:
            if type(argToken) == types.ListType or type(
                    argToken) == types.TupleType:
                newArgTokens.extend(argToken)
            else:
                newArgTokens.append(argToken)
        argTokens = newArgTokens

        oneArgValid = True
        if False:
            oneArgValid = False
            for argToken in argTokens:
                if argToken == None:
                    continue
                if paths[eventToken].has_key(argToken):
                    path = paths[eventToken][argToken]
                else:
                    #print argToken, argToken.get("text")
                    #return False
                    continue
                depPaths = self.multiEdgeFeatureBuilder.getEdgeCombinations(
                    sentenceGraph.dependencyGraph, path)
                validArg = False
                for p in depPaths:
                    if p in self.pathGazetteer and self.pathGazetteer[p][0] > 0:
                        validArg = True
                        break
                if validArg:
                    oneArgValid = True

        # The first and last dependency of a path
        if False:
            oneEdgeValid = False
            for argToken in argTokens:
                if argToken == None:
                    continue
                if paths[eventToken].has_key(argToken):
                    path = paths[eventToken][argToken]
                else:
                    #print argToken, argToken.get("text")
                    #return False
                    continue
                depPaths = self.multiEdgeFeatureBuilder.getEdgeCombinations(
                    sentenceGraph.dependencyGraph, path)
                validArg = False
                for p in depPaths:
                    p = p.replace("<", "")
                    p = p.replace(">", "")
                    p = p.split(".")
                    pair = (p[0], p[-1])
                    if pair in self.pathGazetteerPairs:
                        validArg = True
                        break
                if validArg:
                    oneEdgeValid = True
                    break
            if not oneEdgeValid:
                return "pair"

        # Event must not have unseen dependencies in any of its paths
        if False:
            for argToken in argTokens:
                if argToken == None:
                    continue
                if paths[eventToken].has_key(argToken):
                    path = paths[eventToken][argToken]
                else:
                    continue
                deps = self.multiEdgeFeatureBuilder.getEdgeSet(
                    sentenceGraph.dependencyGraph, path)
                for d in deps:
                    if d[2].get("type") not in self.pathGazetteerDependencies:
                        #print "Unk", d[2].get("type")
                        return "unkdep"

#            validArg = True
#            for p in depPaths:
#                if p in self.pathGazetteer and self.pathGazetteer[p][0] == 0:
#                    validArg = False
#                    break

#            if not validArg:
#                return False
        if not oneArgValid:
            return "novalidarg"  #False
        return "OK"  #True

    def setGazetteerFeatures(self, token, tag):
        gazText = self.getGazetteerMatch(token.get("text").lower())
        if gazText != None:
            gazCategories = self.gazetteer[gazText]
            for k, v in gazCategories.iteritems():
                self.setFeature(tag + "gaz_event_value_" + k, v)
                self.setFeature(tag + "gaz_event_" + k, 1)
                if k.find("egulation") != -1:
                    self.setFeature(tag + "potReg", 1)
        else:
            self.setFeature(tag + "notInGaz", 1)

    def buildExample(self,
                     exampleIndex,
                     sentenceGraph,
                     paths,
                     eventToken,
                     themeTokens,
                     causeTokens=None):
        features = {}
        self.features = features

        categoryName, eventIds = self.getGSEventType(sentenceGraph, eventToken,
                                                     themeTokens, causeTokens)
        category = self.classSet.getId(categoryName)

        potentialRegulation = False
        eventTokenText = eventToken.get("text").lower()
        gazText = self.getGazetteerMatch(eventTokenText)
        gazCategories = self.gazetteer[gazText]
        for k, v in gazCategories.iteritems():
            if k.find("egulation") != -1:
                potentialRegulation = True
        self.setGazetteerFeatures(eventToken, "")

        self.triggerFeatureBuilder.setFeatureVector(self.features)
        self.triggerFeatureBuilder.tag = "trg_"
        self.triggerFeatureBuilder.buildFeatures(eventToken)

        themeEntities = []
        hasTheme = False
        if len(themeTokens) > 1:
            self.setFeature("multiTheme", 1)
            potentialRegulation = False
        for themeToken in themeTokens:
            if themeToken != None:
                hasTheme = True
                self.setGazetteerFeatures(themeToken, "theme_")
                self.buildArgumentFeatures(sentenceGraph, paths, features,
                                           eventToken, themeToken, "theme_")
                self.triggerFeatureBuilder.tag = "ttrg_"
                self.triggerFeatureBuilder.buildFeatures(themeToken)
                themeEntity = None
                if sentenceGraph.entitiesByToken.has_key(themeToken):
                    for themeEntity in sentenceGraph.entitiesByToken[
                            themeToken]:
                        if themeEntity.get("isName") == "True":
                            self.setFeature("themeProtein", 1)
                            if potentialRegulation:
                                self.setFeature("regulationThemeProtein", 1)
                            themeEntities.append(themeEntity)
                            break
                if not features.has_key("themeProtein"):
                    self.setFeature("themeEvent", 1)
                    self.setFeature("nestingEvent", 1)
                    if potentialRegulation:
                        self.setFeature("regulationThemeEvent", 1)
        if hasTheme:
            self.setFeature("noTheme", 1)

        causeEntities = []
        hasCause = False
        for causeToken in causeTokens:
            if causeToken != None:
                hasCause = True
                self.setGazetteerFeatures(causeToken, "cause_")
                self.buildArgumentFeatures(sentenceGraph, paths, features,
                                           eventToken, causeToken, "cause_")
                self.triggerFeatureBuilder.tag = "ctrg_"
                self.triggerFeatureBuilder.buildFeatures(causeToken)
                causeEntity = None
                if sentenceGraph.entitiesByToken.has_key(causeToken):
                    for causeEntity in sentenceGraph.entitiesByToken[
                            causeToken]:
                        if causeEntity.get("isName") == "True":
                            self.setFeature("causeProtein", 1)
                            if potentialRegulation:
                                self.setFeature("regulationCauseProtein", 1)
                            causeEntities.append(causeEntity)
                            break
                if not features.has_key("causeProtein"):
                    self.setFeature("causeEvent", 1)
                    self.setFeature("nestingEvent", 1)
                    if potentialRegulation:
                        self.setFeature("regulationCauseEvent", 1)
        if not hasCause:
            self.setFeature("noCause", 1)

        self.triggerFeatureBuilder.tag = ""
        self.triggerFeatureBuilder.setFeatureVector(None)

        # Common features
        #        if e1Type.find("egulation") != -1: # leave r out to avoid problems with capitalization
        #            if entity2.get("isName") == "True":
        #                features[self.featureSet.getId("GENIA_regulation_of_protein")] = 1
        #            else:
        #                features[self.featureSet.getId("GENIA_regulation_of_event")] = 1

        # define extra attributes
        extra = {"xtype": "event", "type": categoryName}
        extra["et"] = eventToken.get("id")
        if len(eventIds) > 0:
            eventIds.sort()
            extra["eids"] = ""
            for eventId in eventIds:
                extra["eids"] += str(eventId) + ","
            extra["eids"] = extra["eids"][:-1]
        for themeToken in themeTokens:
            if themeToken != None:
                if extra.has_key("tt"):
                    extra["tt"] = extra["tt"] + "," + themeToken.get("id")
                else:
                    extra["tt"] = themeToken.get("id")
        for themeEntity in themeEntities:
            if extra.has_key("t"):
                extra["t"] = extra["t"] + "," + themeEntity.get("id")
            else:
                extra["t"] = themeEntity.get("id")
        for causeToken in causeTokens:
            if causeToken != None:
                extra["ct"] = causeTokens[0].get("id")
        if len(causeEntities) > 0:
            extra["c"] = causeEntities[0].get("id")
        sentenceOrigId = sentenceGraph.sentenceElement.get("origId")
        if sentenceOrigId != None:
            extra["SOID"] = sentenceOrigId
        # make example
        #assert (category == 1 or category == -1)
        self.features = None
        return (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex),
                category, features, extra)

    def buildArgumentFeatures(self, sentenceGraph, paths, features, eventToken,
                              argToken, tag):
        #eventToken = sentenceGraph.entityHeadTokenByEntity[eventNode]
        #argToken = sentenceGraph.entityHeadTokenByEntity[argNode]
        self.multiEdgeFeatureBuilder.tag = tag
        self.multiEdgeFeatureBuilder.setFeatureVector(features, None, None,
                                                      False)

        if eventToken != argToken and paths.has_key(
                eventToken) and paths[eventToken].has_key(argToken):
            path = paths[eventToken][argToken]
            edges = self.multiEdgeFeatureBuilder.getEdges(
                sentenceGraph.dependencyGraph, path)
        else:
            path = [eventToken, argToken]
            edges = None

#        if not "disable_entity_features" in self.styles:
#            # doesn't improve beyond 52.32
#            self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph)
#        # buildPathLengthFeatures 52.32 -> 51-51
#        self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path)
#        if not "disable_terminus_features" in self.styles:
#            # didn't improve from 52.32
#            self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast
        if not "disable_single_element_features" in self.styles:
            # 50.74 -> 52.32
            self.multiEdgeFeatureBuilder.buildSingleElementFeatures(
                path, edges, sentenceGraph)
        if not "disable_ngram_features" in self.styles:
            # ngrams alone - 50.74
            self.multiEdgeFeatureBuilder.buildPathGrams(
                2, path, edges, sentenceGraph)  # remove for fast
            self.multiEdgeFeatureBuilder.buildPathGrams(
                3, path, edges, sentenceGraph)  # remove for fast
            self.multiEdgeFeatureBuilder.buildPathGrams(
                4, path, edges, sentenceGraph)  # remove for fast
            # disabling length 4 drops performance
#        if not "disable_path_edge_features" in self.styles:
#            self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, edges, sentenceGraph)
#        self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph)
# buildSentenceFeatures seems to decrease performance by 8 %-points
        self.multiEdgeFeatureBuilder.setFeatureVector(None, None, None, False)
        self.multiEdgeFeatureBuilder.tag = ""
Example #4
0
class PathGazetteer(ExampleBuilder):
    def __init__(self, includeNeg=False):
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(None)
        self.gazetteer = {}
        self.includeNeg = includeNeg
    
    @classmethod
    def build(cls, input, output, parse, tokenization=None, includeNeg=False):
        p = PathGazetteer(includeNeg)
        sentences = cls.getSentences(input, parse, tokenization)
        
        counter = ProgressCounter(len(sentences), "Build path gazetteer")
        for sentence in sentences:
            counter.update(1, "Building path gazetteer ("+sentence[0].getSentenceId()+"): ")
            p.processSentence(sentence[0])
        p.calculateFractions()
        
        f = open(output, "wt")
        for key in sorted(p.gazetteer.keys()):
            v = p.gazetteer[key]
            f.write(key + " " + str(v[0]) + " " + str(v[1]) + " " + str(v[2]) + " " + str(v[3]) + "\n")
        f.close()
    
    @classmethod
    def load(cls, filename):
        gazetteer = {}
        f = open(filename, "rt")
        for line in f.readlines():
            splits = line.split()
            gazetteer[splits[0]] = (splits[1], splits[2], splits[3], splits[4])
        f.close()
        return gazetteer
    
    @classmethod
    def getDependencies(cls, gazetteer):
        dependencies = set()
        for k in gazetteer.keys():
            k = k.strip()
            k = k.replace("<","")
            k = k.replace(">","")
            for dep in k.split("."):
                dependencies.add(dep)
        return dependencies
    
    @classmethod
    def getPairs(self, gazetteer):
        pairs = []
        for k in gazetteer.keys():
            k = k.strip()
            k = k.replace("<","")
            k = k.replace(">","")
            splits = k.split(".")
            pairs.append( (splits[0], splits[-1]) )
        return pairs
    
    def calculateFractions(self):
        numInstances = 0
        numPos = 0
        numNeg = 0
        for v in self.gazetteer.values():
            numInstances += v[2] + v[3]
            numPos += v[2]
            numNeg += v[3]
        print >> sys.stderr, "Total paths:", numInstances
        print >> sys.stderr, "Positives:", numPos
        print >> sys.stderr, "Negatives:", numNeg
        
        for v in self.gazetteer.values():
            assert v[0] == None and v[1] == None
            if v[3] == None:
                assert v[2] != None
                v[0] = 1
            else:
                v[0] = float(v[2]) / float(v[2] + v[3])
            v[1] = float(v[2] + v[3]) / float(numInstances)

    def nxMultiDiGraphToUndirected(self, graph):
        undirected = NX10.MultiGraph(name=graph.name)
        undirected.add_nodes_from(graph)
        undirected.add_edges_from(graph.edges_iter())
        return undirected
                        
    def processSentence(self, sentenceGraph):        
        #undirected = sentenceGraph.dependencyGraph.to_undirected()
        undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
        self.multiEdgeFeatureBuilder.setFeatureVector()
        
        positivePaths = {} # per sentence, a path may still be negative in another sentence
        for interaction in sentenceGraph.interactions:
            e1 = sentenceGraph.entitiesById[interaction.get("e1")]
            e1Token = sentenceGraph.entityHeadTokenByEntity[e1]
            e2 = sentenceGraph.entitiesById[interaction.get("e2")]
            e2Token = sentenceGraph.entityHeadTokenByEntity[e2]
            
            if paths.has_key(e1Token) and paths[e1Token].has_key(e2Token):
                if not positivePaths.has_key(e1Token): 
                    positivePaths[e1Token] = {}
                positivePaths[e1Token][e2Token] = True
                
                path = paths[e1Token][e2Token]
                for comb in self.multiEdgeFeatureBuilder.getEdgeCombinations(sentenceGraph.dependencyGraph, path):
                    if not self.gazetteer.has_key(comb):
                        self.gazetteer[comb] = [None,None,0,0]
                    self.gazetteer[comb][2] += 1
        
        if self.includeNeg:
            for t1 in sentenceGraph.tokens:
                for t2 in sentenceGraph.tokens:
                    if t1 == t2:
                        continue
                    if positivePaths.has_key(t1) and positivePaths[t1].has_key(t2):
                        continue
                    
                    if paths.has_key(t1) and paths[t1].has_key(t2):
                        path = paths[t1][t2]
                        for comb in self.multiEdgeFeatureBuilder.getEdgeCombinations(sentenceGraph.dependencyGraph, path):
                            if not self.gazetteer.has_key(comb):
                                self.gazetteer[comb] = [None,None,0,0]
                            self.gazetteer[comb][3] += 1