Code Example #1
File: PathGazetteer.py  Project: thiagoki/Tdevel
    def processSentence(self, sentenceGraph):
        #undirected = sentenceGraph.dependencyGraph.to_undirected()
        undirected = self.nxMultiDiGraphToUndirected(
            sentenceGraph.dependencyGraph)
        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
        self.multiEdgeFeatureBuilder.setFeatureVector()

        positivePaths = {
        }  # per sentence, a path may still be negative in another sentence
        for interaction in sentenceGraph.interactions:
            e1 = sentenceGraph.entitiesById[interaction.get("e1")]
            e1Token = sentenceGraph.entityHeadTokenByEntity[e1]
            e2 = sentenceGraph.entitiesById[interaction.get("e2")]
            e2Token = sentenceGraph.entityHeadTokenByEntity[e2]

            if paths.has_key(e1Token) and paths[e1Token].has_key(e2Token):
                if not positivePaths.has_key(e1Token):
                    positivePaths[e1Token] = {}
                positivePaths[e1Token][e2Token] = True

                path = paths[e1Token][e2Token]
                for comb in self.multiEdgeFeatureBuilder.getEdgeCombinations(
                        sentenceGraph.dependencyGraph, path):
                    if not self.gazetteer.has_key(comb):
                        self.gazetteer[comb] = [None, None, 0, 0]
                    self.gazetteer[comb][2] += 1

        if self.includeNeg:
            for t1 in sentenceGraph.tokens:
                for t2 in sentenceGraph.tokens:
                    if t1 == t2:
                        continue
                    if positivePaths.has_key(t1) and positivePaths[t1].has_key(
                            t2):
                        continue

                    if paths.has_key(t1) and paths[t1].has_key(t2):
                        path = paths[t1][t2]
                        for comb in self.multiEdgeFeatureBuilder.getEdgeCombinations(
                                sentenceGraph.dependencyGraph, path):
                            if not self.gazetteer.has_key(comb):
                                self.gazetteer[comb] = [None, None, 0, 0]
                            self.gazetteer[comb][3] += 1
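
The snippet above builds its undirected path index with NX10.all_pairs_shortest_path, which in NetworkX 1.x returns a dict of dicts keyed by source and target token. In NetworkX 2.x and later the same function returns an iterator of (source, paths) pairs, so a minimal sketch of the equivalent lookup structure with a modern NetworkX would look like the following; the graph contents and node names here are invented for illustration only.

import networkx as nx

# Minimal sketch, assuming NetworkX 2.x/3.x: all_pairs_shortest_path returns an
# iterator of (source, {target: path}) pairs, so it is wrapped in dict() to get
# the same lookup table that the snippet above indexes directly.
G = nx.MultiDiGraph()
G.add_edge("likes", "John")   # hypothetical dependency edges (governor -> dependent)
G.add_edge("likes", "Mary")

undirected = nx.MultiGraph(G)  # undirected view that keeps parallel edges
paths = dict(nx.all_pairs_shortest_path(undirected, cutoff=999))

if "John" in paths and "Mary" in paths["John"]:
    print(paths["John"]["Mary"])  # ['John', 'likes', 'Mary']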
Code Example #2
File: PathGazetteer.py  Project: jbjorne/Tdevel
 def processSentence(self, sentenceGraph):        
     #undirected = sentenceGraph.dependencyGraph.to_undirected()
     undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
     paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
     self.multiEdgeFeatureBuilder.setFeatureVector()
     
     positivePaths = {} # per sentence, a path may still be negative in another sentence
     for interaction in sentenceGraph.interactions:
         e1 = sentenceGraph.entitiesById[interaction.get("e1")]
         e1Token = sentenceGraph.entityHeadTokenByEntity[e1]
         e2 = sentenceGraph.entitiesById[interaction.get("e2")]
         e2Token = sentenceGraph.entityHeadTokenByEntity[e2]
         
         if paths.has_key(e1Token) and paths[e1Token].has_key(e2Token):
             if not positivePaths.has_key(e1Token): 
                 positivePaths[e1Token] = {}
             positivePaths[e1Token][e2Token] = True
             
             path = paths[e1Token][e2Token]
             for comb in self.multiEdgeFeatureBuilder.getEdgeCombinations(sentenceGraph.dependencyGraph, path):
                 if not self.gazetteer.has_key(comb):
                     self.gazetteer[comb] = [None,None,0,0]
                 self.gazetteer[comb][2] += 1
     
     if self.includeNeg:
         for t1 in sentenceGraph.tokens:
             for t2 in sentenceGraph.tokens:
                 if t1 == t2:
                     continue
                 if positivePaths.has_key(t1) and positivePaths[t1].has_key(t2):
                     continue
                 
                 if paths.has_key(t1) and paths[t1].has_key(t2):
                     path = paths[t1][t2]
                     for comb in self.multiEdgeFeatureBuilder.getEdgeCombinations(sentenceGraph.dependencyGraph, path):
                         if not self.gazetteer.has_key(comb):
                             self.gazetteer[comb] = [None,None,0,0]
                         self.gazetteer[comb][3] += 1
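
Both versions of processSentence accumulate counts in self.gazetteer, a dict mapping an edge-combination key to a four-slot list whose third and fourth slots count how often that combination appears on positive (interaction) paths and on all other token-pair paths. A standalone sketch of just that bookkeeping, using collections.defaultdict in place of the has_key checks and an assumed key string:

from collections import defaultdict

# Sketch of the gazetteer bookkeeping above (assumed structure, not project code):
# slot 2 counts combinations seen on paths between interacting entities,
# slot 3 counts those seen on all other token-pair paths.
gazetteer = defaultdict(lambda: [None, None, 0, 0])

def count_combination(comb, positive):
    entry = gazetteer[comb]          # [reserved, reserved, posCount, negCount]
    entry[2 if positive else 3] += 1

count_combination("nsubj>.<dobj", positive=True)    # hypothetical edge combination
count_combination("nsubj>.<dobj", positive=False)
print(gazetteer["nsubj>.<dobj"])                     # [None, None, 1, 1]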
Code Example #3
 def buildExamples(self, sentenceGraph):
     examples = []
     exampleIndex = 0
     
     #undirected = sentenceGraph.getUndirectedDependencyGraph()
     undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
     ##undirected = sentenceGraph.dependencyGraph.to_undirected()
     ###undirected = NX10.MultiGraph(sentenceGraph.dependencyGraph) This didn't work
     paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
     
     # Determine overlapping entity precedence
     #levelByEntity, levelByInteraction = self.getPrecedenceLevels(sentenceGraph, paths)
     levelByEntity = self.getPrecedenceLevels(sentenceGraph, paths)
     
     entities = []
     # There is one entity group for each token, for each type of entity
     for token in sentenceGraph.tokens: # per token
         entitiesByType = {}
         for entity in sentenceGraph.tokenIsEntityHead[token]: # per type
             if entity.get("isName") == "True": # Names can never have duplicates
                 entities.append( (entity, 0, False) )
                 continue
             eType = entity.get("type")
             if eType == "neg":
                 continue
             if not entitiesByType.has_key(eType):
                 entitiesByType[eType] = []
             entitiesByType[eType].append(entity)
         # Create slot groups for tokens for which exists at least one entity
         eTypes = sorted(entitiesByType.keys())
         if len(eTypes) == 0:
             continue
         # Create slot groups and insert GS data there
         for eType in eTypes:
             # Use first entity of a type as the dummy entity for unfilled slots
             dummyEntity = entitiesByType[eType][0]
             # Define entity slots
             entityGroup = [None, None, None, None]
             #entityGroup = [None, None]
             # Insert existing entities into slots
             for entity in entitiesByType[eType]:
                 if levelByEntity.has_key(entity):
                     level = levelByEntity[entity]
                     if level < len(entityGroup):
                         entityGroup[level] = (entity, level, False)
             # Create dummies for potential entities
             for i in range(len(entityGroup)):
                 if entityGroup[i] == None:
                     entityGroup[i] = (dummyEntity, i, True)
             # Put all slots into one potential entity list
             #print entityGroup
             for e in entityGroup:
                 entities.append(e)
     
     # Generate examples based on interactions between entities
     for i in range(len(entities)-1):
         for j in range(i+1,len(entities)):
             eI = entities[i][0]
             eJ = entities[j][0]
             tI = sentenceGraph.entityHeadTokenByEntity[eI]
             tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
             
             # define forward example
             categoryName = self.getCategoryName(sentenceGraph, entities[i], entities[j], True)
             if (not "genia_limits" in self.styles) or self.isPotentialGeniaInteraction(eI, eJ):
                 examples.append( self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, entities[i], entities[j]) )
                 exampleIndex += 1
             
             # define reverse
             categoryName = self.getCategoryName(sentenceGraph, entities[j], entities[i], True)
             if (not "genia_limits" in self.styles) or self.isPotentialGeniaInteraction(eJ, eI):
                 examples.append( self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, entities[j], entities[i]) )
                 exampleIndex += 1
     
     return examples
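
The slot-group loop above fills a fixed number of entity slots per token and entity type according to precedence level, and pads any unfilled slot with a dummy copy of the first entity of that type. A hedged, self-contained sketch of just that step, with entity objects replaced by plain strings:

# Sketch of the slot-group construction above (names and data are hypothetical).
def build_slot_group(entities_of_type, level_by_entity, slots=4):
    group = [None] * slots
    for entity in entities_of_type:
        level = level_by_entity.get(entity)
        if level is not None and level < slots:
            group[level] = (entity, level, False)      # real entity at its level
    dummy = entities_of_type[0]
    return [g if g is not None else (dummy, i, True)   # dummy for an unfilled slot
            for i, g in enumerate(group)]

# Two "Protein"-type entities with precedence levels 0 and 2:
print(build_slot_group(["e1", "e2"], {"e1": 0, "e2": 2}))
# [('e1', 0, False), ('e1', 1, True), ('e2', 2, False), ('e1', 3, True)]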
Code Example #4
    def buildExamples(self, sentenceGraph):
        examples = []
        exampleIndex = 0

        clearGraph = sentenceGraph.getCleared()

        #undirected = sentenceGraph.getUndirectedDependencyGraph()
        undirected = self.nxMultiDiGraphToUndirected(
            sentenceGraph.dependencyGraph)
        ##undirected = sentenceGraph.dependencyGraph.to_undirected()
        ###undirected = NX10.MultiGraph(sentenceGraph.dependencyGraph) This didn't work
        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)

        self.triggerFeatureBuilder.initSentence(clearGraph)

        # Generate examples based on interactions between entities or interactions between tokens
        if "entities" in self.styles:
            loopRange = len(sentenceGraph.entities)
        else:
            loopRange = len(sentenceGraph.tokens)
        #for i in range(loopRange-1):
        for i in range(loopRange):  # allow self-interactions
            #for j in range(i+1,loopRange):
            for j in range(i, loopRange):  # allow self-interactions
                eI = None
                eJ = None
                if "entities" in self.styles:
                    eI = sentenceGraph.entities[i]
                    eJ = sentenceGraph.entities[j]
                    tI = sentenceGraph.entityHeadTokenByEntity[eI]
                    tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                    #if "no_ne_interactions" in self.styles and eI.get("isName") == "True" and eJ.get("isName") == "True":
                    #    continue
                    if eI.get("type") == "neg" or eJ.get("type") == "neg":
                        continue
                else:
                    tI = sentenceGraph.tokens[i]
                    tJ = sentenceGraph.tokens[j]
#                # only consider paths between entities (NOTE! entities, not only named entities)
#                if "headsOnly" in self.styles:
#                    if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0):
#                        continue

                if "directed" in self.styles:
                    # define forward
                    if "entities" in self.styles:
                        categoryName = self.getCategoryName(
                            sentenceGraph, eI, eJ, True)
                    else:
                        categoryName = self.getCategoryNameFromTokens(
                            sentenceGraph, tI, tJ, True)
                    self.exampleStats.beginExample(categoryName)
                    if self.negFrac == None or categoryName != "neg" or (
                            categoryName == "neg"
                            and self.negRand.random() < self.negFrac):
                        makeExample = True
                        if ("genia_limits" in self.styles
                            ) and not self.isPotentialGeniaInteraction(eI, eJ):
                            makeExample = False
                            self.exampleStats.filter("genia_limits")
                        if self.posPairGaz.getNegFrac(
                            (tI.get("POS"), tJ.get("POS"))) == 1.0:
                            makeExample = False
                            self.exampleStats.filter("pos_pair")
                        if makeExample:
                            if not sentenceGraph.tokenIsName[tI]:
                                examples.append(
                                    self.buildExample(tI, tJ, paths,
                                                      clearGraph, categoryName,
                                                      exampleIndex, eI, eJ))
                                exampleIndex += 1
                            else:
                                self.exampleStats.filter("genia_token_limits")
                    else:
                        self.exampleStats.filter("neg_frac")
                    self.exampleStats.endExample()

                    # define reverse
                    if "entities" in self.styles:
                        categoryName = self.getCategoryName(
                            sentenceGraph, eJ, eI, True)
                    else:
                        categoryName = self.getCategoryNameFromTokens(
                            sentenceGraph, tJ, tI, True)
                    self.exampleStats.beginExample(categoryName)
                    if self.negFrac == None or categoryName != "neg" or (
                            categoryName == "neg"
                            and self.negRand.random() < self.negFrac):
                        makeExample = True
                        if ("genia_limits" in self.styles
                            ) and not self.isPotentialGeniaInteraction(eJ, eI):
                            makeExample = False
                            self.exampleStats.filter("genia_limits")
                        if ("bioinfer_limits" in self.styles
                            ) and not self.isPotentialBioInferInteraction(
                                eJ, eI, categoryName):
                            makeExample = False
                            self.exampleStats.filter("bioinfer_limits")
                        if self.posPairGaz.getNegFrac(
                            (tJ.get("POS"), tI.get("POS"))) == 1.0:
                            makeExample = False
                            self.exampleStats.filter("pos_pair")
                        if makeExample:
                            if not sentenceGraph.tokenIsName[tJ]:
                                examples.append(
                                    self.buildExample(tJ, tI, paths,
                                                      clearGraph, categoryName,
                                                      exampleIndex, eJ, eI))
                                exampleIndex += 1
                            else:
                                self.exampleStats.filter("genia_token_limits")
                    else:
                        self.exampleStats.filter("neg_frac")
                    self.exampleStats.endExample()
#                else:
#                    if "entities" in self.styles:
#                        categoryName = self.getCategoryName(sentenceGraph, eI, eJ, False)
#                    else:
#                        categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, False)
#                    forwardExample = self.buildExample(tI, tJ, paths, clearGraph, categoryName, exampleIndex, eI, eJ)
#                    if not "graph_kernel" in self.styles:
#                        reverseExample = self.buildExample(tJ, tI, paths, clearGraph, categoryName, exampleIndex, eJ, eI)
#                        forwardExample[2].update(reverseExample[2])
#                    examples.append(forwardExample)
#                    exampleIndex += 1

        return examples
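
This variant downsamples negatives: a candidate whose category is "neg" is only turned into an example with probability self.negFrac, while all other categories always pass. A small illustrative sketch of that filter, with hypothetical names:

import random

# Minimal sketch of the negative-example downsampling used above:
# "neg" candidates survive only with probability neg_frac.
def keep_example(category, neg_frac, rng=random):
    if neg_frac is None or category != "neg":
        return True
    return rng.random() < neg_frac

rng = random.Random(0)
kept = sum(keep_example("neg", 0.2, rng) for _ in range(10000))
print(kept)  # roughly 2000 of 10000 negatives survive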
Code Example #5
    def buildExamplesInner(self, sentenceGraph, goldGraph):
        """
        Build one example for each token of the sentence
        """
        if sentenceGraph.sentenceElement.get("origId") in self.skiplist:
            print >> sys.stderr, "Skipping sentence", sentenceGraph.sentenceElement.get(
                "origId")
            return []

        self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
        self.triggerFeatureBuilder.initSentence(sentenceGraph)

        undirected = self.nxMultiDiGraphToUndirected(
            sentenceGraph.dependencyGraph)
        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
        # Get argument order
        self.interactionLengths = self.getInteractionEdgeLengths(
            sentenceGraph, paths)
        self.interactionLengths = self.interactionLengths.values()
        self.interactionLengths.sort(compareInteractionPrecedence)
        # Map tokens to entities
        tokenByOffset = {}
        for i in range(len(sentenceGraph.tokens)):
            token = sentenceGraph.tokens[i]
            if goldGraph != None:
                goldToken = goldGraph.tokens[i]
                assert token.get("id") == goldToken.get("id") and token.get(
                    "charOffset") == goldToken.get("charOffset")
            tokenByOffset[token.get("charOffset")] = token.get("id")
        # Map gold entities to their head offsets
        goldEntitiesByOffset = {}
        for token in sentenceGraph.tokens:
            goldEntitiesByOffset[token.get("charOffset")] = []
        entityToGold = {}
        for entity in sentenceGraph.entities:
            entityToGold[entity] = []
        if goldGraph != None:
            for entity in goldGraph.entities:
                offset = entity.get("headOffset")
                assert offset != None
                goldEntitiesByOffset[offset].append(entity)
            # Map predicted entities to gold entities
            for entity in sentenceGraph.entities:
                eType = entity.get("type")
                eOffset = entity.get("headOffset")
                for goldEntity in goldEntitiesByOffset[eOffset]:
                    if goldEntity.get("type") == eType:
                        entityToGold[entity].append(goldEntity)
        # Map entities to interactions
        #interactionsByEntityId = {}
        #for entity in sentenceGraph.entities:
        #    interactionsByEntityId[entity.get("id")] = []
        # Map tokens to interactions
        interactionsByToken = {}
        for token in sentenceGraph.tokens:
            interactionsByToken[token] = []
        for interactionTuple in self.interactionLengths:
            interaction = interactionTuple[0]
            if interaction.get("type") == "neg":
                continue
            e1Id = interaction.get("e1")
            token = sentenceGraph.entityHeadTokenByEntity[
                sentenceGraph.entitiesById[e1Id]]
            interactionsByToken[token].append(interaction)

        examples = []
        exampleIndex = 0

        self.tokenFeatures = {}

        #namedEntityNorStrings = set()
        namedEntityHeadTokens = []
        if not "names" in self.styles:
            namedEntityCount = 0
            for entity in sentenceGraph.entities:
                if entity.get(
                        "isName"
                ) == "True":  # known data which can be used for features
                    namedEntityCount += 1
                    #namedEntityNorStrings.add( entity.get("text").replace("-","").replace("/","").replace(",","").replace("\\","").replace(" ","").lower() )
            namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
            #if namedEntityCount == 0: # no names, no need for triggers
            #    return []

            if "pos_pairs" in self.styles:
                namedEntityHeadTokens = self.getNamedEntityHeadTokens(
                    sentenceGraph)

        #neFeatures = {} # F: 69.35 -> 69.14
        #for norString in namedEntityNorStrings:
        #    neFeatures[self.featureSet.getId("norNE_" + norString)] = 1

        bagOfWords = {}
        for token in sentenceGraph.tokens:
            text = "bow_" + token.get("text")
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
            if sentenceGraph.tokenIsName[token]:
                text = "ne_" + text
                if not bagOfWords.has_key(text):
                    bagOfWords[text] = 0
                bagOfWords[text] += 1
        bowFeatures = {}
        for k, v in bagOfWords.iteritems():
            bowFeatures[self.featureSet.getId(k)] = v

        self.inEdgesByToken = {}
        self.outEdgesByToken = {}
        self.edgeSetByToken = {}
        for token in sentenceGraph.tokens:
            inEdges = sentenceGraph.dependencyGraph.in_edges(token, data=True)
            fixedInEdges = []
            for edge in inEdges:
                fixedInEdges.append((edge[0], edge[1], edge[2]["element"]))
            inEdges = fixedInEdges
            inEdges.sort(compareDependencyEdgesById)
            self.inEdgesByToken[token] = inEdges
            outEdges = sentenceGraph.dependencyGraph.out_edges(token,
                                                               data=True)
            fixedOutEdges = []
            for edge in outEdges:
                fixedOutEdges.append((edge[0], edge[1], edge[2]["element"]))
            outEdges = fixedOutEdges
            outEdges.sort(compareDependencyEdgesById)
            self.outEdgesByToken[token] = outEdges
            self.edgeSetByToken[token] = set(inEdges + outEdges)

        for i in range(len(sentenceGraph.tokens)):
            token = sentenceGraph.tokens[i]
            # Recognize only non-named entities (i.e. interaction words)
            if sentenceGraph.tokenIsName[
                    token] and not "names" in self.styles and not "all_tokens" in self.styles:
                continue

            # CLASS
            #if len(sentenceGraph.tokenIsEntityHead[token]) > 0:
            #    category = self.classSet.getId(self.getMergedEntityType(sentenceGraph.tokenIsEntityHead[token]))
            #else:
            #    category = 1
            offset = token.get("charOffset")
            if len(goldEntitiesByOffset[offset]) > 0:
                category = self.classSet.getId(
                    self.getMergedEntityType(goldEntitiesByOffset[offset]))
            else:
                category = 1

            tokenText = token.get("text").lower()
            if "stem_gazetteer" in self.styles:
                tokenText = PorterStemmer.stem(tokenText)
            if ("exclude_gazetteer" in self.styles
                ) and self.gazetteer and tokenText not in self.gazetteer:
                features = {}
                features[self.featureSet.getId("exclude_gazetteer")] = 1
                extra = {
                    "xtype": "token",
                    "t": token.get("id"),
                    "excluded": "True"
                }
                examples.append(
                    (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex),
                     category, features, extra))
                exampleIndex += 1
                continue

            # FEATURES
            features = {}
            self.features = features

            if not "names" in self.styles:
                features[self.featureSet.getId(namedEntityCountFeature)] = 1
            #for k,v in bagOfWords.iteritems():
            #    features[self.featureSet.getId(k)] = v
            # pre-calculate bow _features_
            features.update(bowFeatures)
            #features.update(neFeatures)

            #            for j in range(len(sentenceGraph.tokens)):
            #                text = "bow_" + sentenceGraph.tokens[j].get("text")
            #                if j < i:
            #                    features[self.featureSet.getId("bf_" + text)] = 1
            #                elif j > i:
            #                    features[self.featureSet.getId("af_" + text)] = 1

            # Main features
            text = token.get("text")
            features[self.featureSet.getId("txt_" + text)] = 1
            features[self.featureSet.getId("POS_" + token.get("POS"))] = 1
            stem = PorterStemmer.stem(text)
            features[self.featureSet.getId("stem_" + stem)] = 1
            features[self.featureSet.getId("nonstem_" + text[len(stem):])] = 1

            # Normalized versions of the string (if same as non-normalized, overlap without effect)
            normalizedText = text.replace("-", "").replace("/", "").replace(
                ",", "").replace("\\", "").replace(" ", "").lower()
            if normalizedText == "bound":  # should be for all irregular verbs
                normalizedText = "bind"
            features[self.featureSet.getId("txt_" + normalizedText)] = 1
            norStem = PorterStemmer.stem(normalizedText)
            features[self.featureSet.getId("stem_" + norStem)] = 1
            features[self.featureSet.getId("nonstem_" +
                                           normalizedText[len(norStem):])] = 1

            if "gazetteer_features_maintoken" in self.styles:
                tokTxtLower = text.lower()
                if "stem_gazetteer" in self.styles:
                    tokTxtLower = PorterStemmer.stem(tokTxtLower)
                if self.gazetteer and tokTxtLower in self.gazetteer:
                    for label, weight in self.gazetteer[tokTxtLower].items():
                        features[self.featureSet.getId(
                            "gaz_knownLabel_" +
                            label)] = weight  # 1 performs slightly worse

            # Linear order features
            #for index in [-3,-2,-1,1,2,3,4,5]: # 69.35 -> 68.97
            for index in [-3, -2, -1, 1, 2, 3]:
                if i + index > 0 and i + index < len(sentenceGraph.tokens):
                    self.buildLinearOrderFeatures(sentenceGraph, i + index,
                                                  str(index), features)

            # Content
            if i > 0 and text[0].isalpha() and text[0].isupper():
                features[self.featureSet.getId("upper_case_start")] = 1
            for j in range(len(text)):
                if j > 0 and text[j].isalpha() and text[j].isupper():
                    features[self.featureSet.getId("upper_case_middle")] = 1
                # numbers and special characters
                if text[j].isdigit():
                    features[self.featureSet.getId("has_digits")] = 1
                    if j > 0 and text[j - 1] == "-":
                        features[self.featureSet.getId(
                            "has_hyphenated_digit")] = 1
                elif text[j] == "-":
                    features[self.featureSet.getId("has_hyphen")] = 1
                elif text[j] == "/":
                    features[self.featureSet.getId("has_fslash")] = 1
                elif text[j] == "\\":
                    features[self.featureSet.getId("has_bslash")] = 1
                # duplets
                if j > 0:
                    features[self.featureSet.getId("dt_" +
                                                   text[j - 1:j +
                                                        1].lower())] = 1
                # triplets
                if j > 1:
                    features[self.featureSet.getId("tt_" +
                                                   text[j - 2:j +
                                                        1].lower())] = 1

            # Attached edges (Hanging in and out edges)
            t1InEdges = self.inEdgesByToken[token]
            for edge in t1InEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HIn_" + edgeType)] = 1
                features[self.featureSet.getId("t1HIn_" +
                                               edge[0].get("POS"))] = 1
                features[self.featureSet.getId("t1HIn_" + edgeType + "_" +
                                               edge[0].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[0])
                features[self.featureSet.getId("t1HIn_" + tokenText)] = 1
                features[self.featureSet.getId("t1HIn_" + edgeType + "_" +
                                               tokenText)] = 1
            t1OutEdges = self.outEdgesByToken[token]
            for edge in t1OutEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HOut_" + edgeType)] = 1
                features[self.featureSet.getId("t1HOut_" +
                                               edge[1].get("POS"))] = 1
                features[self.featureSet.getId("t1HOut_" + edgeType + "_" +
                                               edge[1].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[1])
                features[self.featureSet.getId("t1HOut_" + tokenText)] = 1
                features[self.featureSet.getId("t1HOut_" + edgeType + "_" +
                                               tokenText)] = 1

            extra = {"xtype": "token", "t": token.get("id")}
            examples.append(
                (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex),
                 category, features, extra))
            exampleIndex += 1

            # chains
            self.buildChains(token, sentenceGraph, features)

            if "pos_pairs" in self.styles:
                self.buildPOSPairs(token, namedEntityHeadTokens, features)

            self.buildPredictionFeatures(sentenceGraph, paths, token,
                                         interactionsByToken[token])
        return examples
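
Among the feature groups above, the "Content" block derives purely surface-level features from the token string: capitalization flags, digit and punctuation indicators, and lowercased character bigrams (dt_) and trigrams (tt_). A standalone sketch of that part, returning feature names as a set instead of featureSet ids:

# Sketch (not project code) of the per-character surface features built above.
def character_features(text):
    feats = set()
    for j, ch in enumerate(text):
        if j > 0 and ch.isalpha() and ch.isupper():
            feats.add("upper_case_middle")
        if ch.isdigit():
            feats.add("has_digits")
            if j > 0 and text[j - 1] == "-":
                feats.add("has_hyphenated_digit")
        elif ch == "-":
            feats.add("has_hyphen")
        if j > 0:
            feats.add("dt_" + text[j - 1:j + 1].lower())   # character duplet
        if j > 1:
            feats.add("tt_" + text[j - 2:j + 1].lower())   # character triplet
    return feats

print(sorted(character_features("IL-2")))
# includes 'has_hyphenated_digit', 'dt_il', 'tt_l-2', 'upper_case_middle', ...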
Code Example #6
    def buildExamples(self, sentenceGraph):
        self.makeGSEvents(sentenceGraph)
        self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
        self.triggerFeatureBuilder.initSentence(sentenceGraph)

        examples = []
        exampleIndex = 0

        #undirected = sentenceGraph.dependencyGraph.to_undirected()
        undirected = self.nxMultiDiGraphToUndirected(
            sentenceGraph.dependencyGraph)
        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)

        eventTokens = []
        nameTokens = []
        gazCategories = {None: {"neg": -1}}
        #stems = {}
        for token in sentenceGraph.tokens:
            gazText = self.getGazetteerMatch(token.get("text").lower())
            if gazText != None:
                gazCategories[token] = self.gazetteer[gazText]
            else:
                gazCategories[token] = {"neg": -1}

            if token.get("id") in self.namedEntityHeadTokenIds:
                nameTokens.append(token)
            elif gazText != None:
                eventTokens.append(token)
        allTokens = eventTokens + nameTokens

        #if len(nameTokens) == 0: # there can be no events in this sentence
        #    self.gsEvents = None
        #    return []

        for token in eventTokens:
            #gazCategories = self.gazetteer[token.get("text").lower()]
            #print token.get("text").lower(), gazCategories

            #multiargument = False
            potentialRegulation = False
            potentialBinding = False
            for key in gazCategories[token].keys():
                if key in [
                        "Regulation", "Positive_regulation",
                        "Negative_regulation"
                ]:
                    #multiargument = True
                    potentialRegulation = True
                    break
            for key in gazCategories[token].keys():
                if key in ["Binding"]:
                    #multiargument = True
                    potentialBinding = True
                    break

            if potentialRegulation:
                combinations = combine.combine(allTokens, allTokens + [None])
            else:
                combinations = []
                for t2 in nameTokens:  #allTokens:
                    combinations.append((t2, None))

            if potentialBinding:
                for i in range(len(nameTokens) - 1):
                    for j in range(i + 1, len(nameTokens)):
                        combinations.append(
                            ((nameTokens[i], nameTokens[j]), None))

            for combination in combinations:
                theme2Binding = False
                if type(combination[0]) == types.ListType or type(
                        combination[0]) == types.TupleType:
                    theme2Binding = True
                    categoryName, eventIds = self.getGSEventType(
                        sentenceGraph, token, combination[0], [combination[1]])
                else:
                    categoryName, eventIds = self.getGSEventType(
                        sentenceGraph, token, [combination[0]],
                        [combination[1]])

                for id in eventIds:
                    self.examplesByEventOrigId[id] += 1

                skip = False
                s = self.skippedByTypeAndReason
                if not s.has_key(categoryName):
                    s[categoryName] = {}
                if gazCategories[token].get("neg", -1) > 0.99:
                    pass
                if combination[0] == combination[1]:
                    pass  #skip = True
                if combination[0] == token or combination[1] == token:
                    if theme2Binding or gazCategories[combination[0]].get(
                            "Positive_regulation", -1) < 0:
                        skip = True
                        s[categoryName]["duparg"] = s[categoryName].get(
                            "duparg", 0) + 1
                if combination[0] == None and combination[1] == None:
                    skip = True
                    s[categoryName]["noncmb"] = s[categoryName].get(
                        "noncmb", 0) + 1

                validCat = self.isValidEvent(paths, sentenceGraph, token,
                                             combination)
                if validCat != "OK":  #not self.isValidEvent(paths, sentenceGraph, token, combination):
                    skip = True
                    #s[categoryName]["valid"] = s[categoryName].get("valid", 0) + 1
                    s[categoryName][validCat] = s[categoryName].get(
                        validCat, 0) + 1

                if len(nameTokens) == 0:
                    skip = True
                    s[categoryName]["non"] = s[categoryName].get("non", 0) + 1

                if theme2Binding:
                    if gazCategories[combination[0][0]].get(
                            "neg",
                            -1) > 0.99 or gazCategories[combination[0][1]].get(
                                "neg", -1) > 0.99:
                        skip = True
                        s[categoryName]["gazarg"] = s[categoryName].get(
                            "gazarg", 0) + 1
                else:
                    if gazCategories[combination[0]].get(
                            "neg",
                            -1) > 0.99 or gazCategories[combination[1]].get(
                                "neg", -1) > 0.99:
                        skip = True
                        s[categoryName]["gazarg"] = s[categoryName].get(
                            "gazarg", 0) + 1

                if (skip and self.negFrac
                        == None) or (skip and self.negFrac != None
                                     and categoryName == "neg"):
                    self.skippedByType[categoryName] = self.skippedByType.get(
                        categoryName, 0) + 1
                else:
                    if self.negFrac == None or categoryName != "neg" or (
                            categoryName == "neg"
                            and self.negRand.random() < self.negFrac):
                        self.builtByType[categoryName] = self.builtByType.get(
                            categoryName, 0) + 1
                        if theme2Binding:
                            newExample = self.buildExample(
                                exampleIndex, sentenceGraph, paths, token,
                                combination[0], [combination[1]])
                        else:
                            newExample = self.buildExample(
                                exampleIndex, sentenceGraph, paths, token,
                                [combination[0]], [combination[1]])
                        if len(eventIds) > 0:
                            newExample[3]["numEv"] = str(len(eventIds))
                        examples.append(newExample)
                        exampleIndex += 1

        self.gsEvents = None
        return examples
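
For potential regulation triggers, the example generator enumerates candidate (theme, cause) argument pairs with combine.combine(allTokens, allTokens + [None]). Assuming that helper behaves like a plain cartesian product, the same enumeration can be sketched with itertools; the token ids are hypothetical:

from itertools import product

all_tokens = ["t1", "t2"]                       # hypothetical token ids
combinations = list(product(all_tokens, all_tokens + [None]))
print(combinations)
# [('t1', 't1'), ('t1', 't2'), ('t1', None), ('t2', 't1'), ('t2', 't2'), ('t2', None)]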
Code Example #7
    def buildExamples(self, sentenceGraph):
        examples = []
        exampleIndex = 0

        #undirected = sentenceGraph.getUndirectedDependencyGraph()
        undirected = self.nxMultiDiGraphToUndirected(
            sentenceGraph.dependencyGraph)
        ##undirected = sentenceGraph.dependencyGraph.to_undirected()
        ###undirected = NX10.MultiGraph(sentenceGraph.dependencyGraph) This didn't work
        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)

        # Determine overlapping entity precedence
        #levelByEntity, levelByInteraction = self.getPrecedenceLevels(sentenceGraph, paths)
        levelByEntity = self.getPrecedenceLevels(sentenceGraph, paths)

        entities = []
        # There is one entity group for each token, for each type of entity
        for token in sentenceGraph.tokens:  # per token
            entitiesByType = {}
            for entity in sentenceGraph.tokenIsEntityHead[token]:  # per type
                if entity.get(
                        "isName") == "True":  # Names can never have duplicates
                    entities.append((entity, 0, False))
                    continue
                eType = entity.get("type")
                if eType == "neg":
                    continue
                if not entitiesByType.has_key(eType):
                    entitiesByType[eType] = []
                entitiesByType[eType].append(entity)
            # Create slot groups for tokens for which exists at least one entity
            eTypes = sorted(entitiesByType.keys())
            if len(eTypes) == 0:
                continue
            # Create slot groups and insert GS data there
            for eType in eTypes:
                # Use first entity of a type as the dummy entity for unfilled slots
                dummyEntity = entitiesByType[eType][0]
                # Define entity slots
                entityGroup = [None, None, None, None]
                #entityGroup = [None, None]
                # Insert existing entities into slots
                for entity in entitiesByType[eType]:
                    if levelByEntity.has_key(entity):
                        level = levelByEntity[entity]
                        if level < len(entityGroup):
                            entityGroup[level] = (entity, level, False)
                # Create dummies for potential entities
                for i in range(len(entityGroup)):
                    if entityGroup[i] == None:
                        entityGroup[i] = (dummyEntity, i, True)
                # Put all slots into one potential entity list
                #print entityGroup
                for e in entityGroup:
                    entities.append(e)

        # Generate examples based on interactions between entities
        for i in range(len(entities) - 1):
            for j in range(i + 1, len(entities)):
                eI = entities[i][0]
                eJ = entities[j][0]
                tI = sentenceGraph.entityHeadTokenByEntity[eI]
                tJ = sentenceGraph.entityHeadTokenByEntity[eJ]

                # define forward example
                categoryName = self.getCategoryName(sentenceGraph, entities[i],
                                                    entities[j], True)
                if (not "genia_limits"
                        in self.styles) or self.isPotentialGeniaInteraction(
                            eI, eJ):
                    examples.append(
                        self.buildExample(tI, tJ, paths, sentenceGraph,
                                          categoryName, exampleIndex,
                                          entities[i], entities[j]))
                    exampleIndex += 1

                # define reverse
                categoryName = self.getCategoryName(sentenceGraph, entities[j],
                                                    entities[i], True)
                if (not "genia_limits"
                        in self.styles) or self.isPotentialGeniaInteraction(
                            eJ, eI):
                    examples.append(
                        self.buildExample(tJ, tI, paths, sentenceGraph,
                                          categoryName, exampleIndex,
                                          entities[j], entities[i]))
                    exampleIndex += 1

        return examples
Code Example #8
File: CorpusAnalysis.py  Project: thiagoki/Tdevel
def analyzeLengths(corpusElements):
    interactionEdges = 0
    dependencyEdges = 0
    pathsByLength = {}
    pathsBetweenAllEntitiesByLength = {}
    for sentence in corpusElements.sentences:
        sentenceGraph = sentence.sentenceGraph
        #interactionEdges += len(sentenceGraph.interactionGraph.edges())
        interactionEdges += len(sentence.interactions)
        dependencyEdges += len(sentenceGraph.dependencyGraph.edges())

        undirected = sentenceGraph.dependencyGraph.to_undirected()
        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
        # Shortest path for interaction edge
        for interaction in sentence.interactions:
            e1 = sentence.entitiesById[interaction.attrib["e1"]]
            e2 = sentence.entitiesById[interaction.attrib["e2"]]
            t1 = sentenceGraph.entityHeadTokenByEntity[e1]
            t2 = sentenceGraph.entityHeadTokenByEntity[e2]
            if paths.has_key(t1) and paths[t1].has_key(t2):
                path = paths[t1][t2]
                if not pathsByLength.has_key(len(path) - 1):
                    pathsByLength[len(path) - 1] = 0
                pathsByLength[len(path) - 1] += 1
            else:
                if not pathsByLength.has_key("none"):
                    pathsByLength["none"] = 0
                pathsByLength["none"] += 1

#        for intEdge in sentenceGraph.interactionGraph.edges():
#            if paths.has_key(intEdge[0]) and paths[intEdge[0]].has_key(intEdge[1]):
#                path = paths[intEdge[0]][intEdge[1]]
#                if not pathsByLength.has_key(len(path)-1):
#                    pathsByLength[len(path)-1] = 0
#                pathsByLength[len(path)-1] += 1
#            else:
#                if not pathsByLength.has_key("none"):
#                    pathsByLength["none"] = 0
#                pathsByLength["none"] += 1
        # Shortest paths between all entities
        for i in range(len(sentence.entities) - 1):
            for j in range(i + 1, len(sentence.entities)):
                tI = sentenceGraph.entityHeadTokenByEntity[
                    sentence.entities[i]]
                tJ = sentenceGraph.entityHeadTokenByEntity[
                    sentence.entities[j]]
                if paths.has_key(tI) and paths[tI].has_key(tJ):
                    path = paths[tI][tJ]
                    if not pathsBetweenAllEntitiesByLength.has_key(
                            len(path) - 1):
                        pathsBetweenAllEntitiesByLength[len(path) - 1] = 0
                    pathsBetweenAllEntitiesByLength[len(path) - 1] += 1
                elif tI == tJ:
                    if not pathsBetweenAllEntitiesByLength.has_key(0):
                        pathsBetweenAllEntitiesByLength[0] = 0
                    pathsBetweenAllEntitiesByLength[0] += 1
                else:
                    if not pathsBetweenAllEntitiesByLength.has_key("none"):
                        pathsBetweenAllEntitiesByLength["none"] = 0
                    pathsBetweenAllEntitiesByLength["none"] += 1


#        for i in range(len(sentenceGraph.tokens)-1):
#            for j in range(i+1,len(sentenceGraph.tokens)):
#                tI = sentenceGraph.tokens[i]
#                tJ = sentenceGraph.tokens[j]
#                if sentenceGraph.tokenIsEntityHead[tI] == None or sentenceGraph.tokenIsEntityHead[tJ] == None:
#                    continue
#                if paths.has_key(tI) and paths[tI].has_key(tJ):
#                    path = paths[tI][tJ]
#                    if not pathsBetweenAllEntitiesByLength.has_key(len(path)-1):
#                        pathsBetweenAllEntitiesByLength[len(path)-1] = 0
#                    pathsBetweenAllEntitiesByLength[len(path)-1] += 1
#                else:
#                    if not pathsBetweenAllEntitiesByLength.has_key("none"):
#                        pathsBetweenAllEntitiesByLength["none"] = 0
#                    pathsBetweenAllEntitiesByLength["none"] += 1

    print >> sys.stderr, "Interaction edges:", interactionEdges
    print >> sys.stderr, "Dependency edges:", dependencyEdges
    print >> sys.stderr, "Shortest path of dependencies for interaction edge:"
    printPathDistribution(pathsByLength)
    if options.output != None:
        pathsByLength["corpus"] = options.input
        pathsByLength["parse"] = options.parse
        TableUtils.addToCSV(pathsByLength,
                            options.output + "/pathsByLength.csv")
    print >> sys.stderr, "Shortest path of dependencies between all entities:"
    printPathDistribution(pathsBetweenAllEntitiesByLength)
    if options.output != None:
        pathsByLength["corpus"] = options.input
        pathsByLength["parse"] = options.parse
        TableUtils.addToCSV(
            pathsBetweenAllEntitiesByLength,
            options.output + "/pathsBetweenAllEntitiesByLength.csv")
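
analyzeLengths tallies, for each interaction or entity pair, the dependency-path length (nodes on the path minus one) or "none" when no path exists. A compact sketch of that tally with collections.Counter, using a toy paths table in place of the NetworkX result:

from collections import Counter

# Standalone sketch (assumed names) of the path-length tally in analyzeLengths.
def tally_path_lengths(paths, token_pairs):
    lengths = Counter()
    for t1, t2 in token_pairs:
        if t1 in paths and t2 in paths[t1]:
            lengths[len(paths[t1][t2]) - 1] += 1   # edges = nodes on path - 1
        else:
            lengths["none"] += 1
    return lengths

paths = {"a": {"a": ["a"], "b": ["a", "b"], "c": ["a", "x", "c"]}}
print(tally_path_lengths(paths, [("a", "b"), ("a", "c"), ("b", "c")]))
# Counter({1: 1, 2: 1, 'none': 1})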
Code Example #9
    def buildExamples(self, sentenceGraph):
        self.makeGSEvents(sentenceGraph)
        self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
        self.triggerFeatureBuilder.initSentence(sentenceGraph)
        
        examples = []
        exampleIndex = 0
        
        #undirected = sentenceGraph.dependencyGraph.to_undirected()
        undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
        
        eventTokens = []
        nameTokens = []
        gazCategories = {None:{"neg":-1}}
        #stems = {}
        for token in sentenceGraph.tokens:
            gazText = self.getGazetteerMatch(token.get("text").lower())
            if gazText != None:
                gazCategories[token] = self.gazetteer[gazText]
            else:
                gazCategories[token] = {"neg":-1}
            
            if token.get("id") in self.namedEntityHeadTokenIds:
                nameTokens.append(token)
            elif gazText != None:
                eventTokens.append(token)
        allTokens = eventTokens + nameTokens
        
        #if len(nameTokens) == 0: # there can be no events in this sentence
        #    self.gsEvents = None
        #    return []
        
        for token in eventTokens:
            #gazCategories = self.gazetteer[token.get("text").lower()]
            #print token.get("text").lower(), gazCategories
            
            #multiargument = False
            potentialRegulation = False
            potentialBinding = False
            for key in gazCategories[token].keys():
                if key in ["Regulation","Positive_regulation","Negative_regulation"]:
                    #multiargument = True
                    potentialRegulation = True
                    break
            for key in gazCategories[token].keys():
                if key in ["Binding"]:
                    #multiargument = True
                    potentialBinding = True
                    break
            
            if potentialRegulation:
                combinations = combine.combine(allTokens, allTokens+[None])
            else:
                combinations = []
                for t2 in nameTokens: #allTokens:
                    combinations.append( (t2, None) )
            
            if potentialBinding:
                for i in range(len(nameTokens) - 1):
                    for j in range(i+1, len(nameTokens)):
                        combinations.append( ((nameTokens[i],nameTokens[j]), None) )
                
            for combination in combinations:
                theme2Binding = False
                if type(combination[0]) == types.ListType or type(combination[0]) == types.TupleType:
                    theme2Binding = True
                    categoryName, eventIds = self.getGSEventType(sentenceGraph, token, combination[0], [combination[1]])
                else:
                    categoryName, eventIds = self.getGSEventType(sentenceGraph, token, [combination[0]], [combination[1]])
                
                for id in eventIds:
                    self.examplesByEventOrigId[id] += 1
                

                skip = False
                s = self.skippedByTypeAndReason
                if not s.has_key(categoryName):
                    s[categoryName] = {}
                if gazCategories[token].get("neg",-1) > 0.99:
                    pass
                if combination[0] == combination[1]:
                    pass #skip = True
                if combination[0] == token or combination[1] == token:
                    if theme2Binding or gazCategories[combination[0]].get("Positive_regulation",-1) < 0:
                        skip = True
                        s[categoryName]["duparg"] = s[categoryName].get("duparg", 0) + 1
                if combination[0] == None and combination[1] == None:
                    skip = True
                    s[categoryName]["noncmb"] = s[categoryName].get("noncmb", 0) + 1
                
                validCat = self.isValidEvent(paths, sentenceGraph, token, combination)
                if validCat != "OK": #not self.isValidEvent(paths, sentenceGraph, token, combination):
                    skip = True
                    #s[categoryName]["valid"] = s[categoryName].get("valid", 0) + 1
                    s[categoryName][validCat] = s[categoryName].get(validCat, 0) + 1
                
                if len(nameTokens) == 0:
                    skip = True
                    s[categoryName]["non"] = s[categoryName].get("non", 0) + 1
                
                if theme2Binding:
                    if gazCategories[combination[0][0]].get("neg",-1) > 0.99 or gazCategories[combination[0][1]].get("neg",-1) > 0.99:
                        skip = True
                        s[categoryName]["gazarg"] = s[categoryName].get("gazarg", 0) + 1
                else:
                    if gazCategories[combination[0]].get("neg",-1) > 0.99 or gazCategories[combination[1]].get("neg",-1) > 0.99:
                        skip = True
                        s[categoryName]["gazarg"] = s[categoryName].get("gazarg", 0) + 1
                
                if (skip and self.negFrac == None) or (skip and self.negFrac != None and categoryName == "neg"):
                    self.skippedByType[categoryName] = self.skippedByType.get(categoryName, 0) + 1
                else:
                    if self.negFrac == None or categoryName != "neg" or (categoryName == "neg" and self.negRand.random() < self.negFrac):
                        self.builtByType[categoryName] = self.builtByType.get(categoryName, 0) + 1
                        if theme2Binding:
                            newExample = self.buildExample(exampleIndex, sentenceGraph, paths, token, combination[0], [combination[1]])
                        else:
                            newExample = self.buildExample(exampleIndex, sentenceGraph, paths, token, [combination[0]], [combination[1]])
                        if len(eventIds) > 0: 
                            newExample[3]["numEv"] = str(len(eventIds))
                        examples.append( newExample )
                        exampleIndex += 1
        
        self.gsEvents = None
        return examples
Code Example #10
File: CorpusAnalysis.py  Project: jbjorne/Tdevel
def analyzeLengths(corpusElements):
    interactionEdges = 0
    dependencyEdges = 0
    pathsByLength = {}
    pathsBetweenAllEntitiesByLength = {}
    for sentence in corpusElements.sentences:
        sentenceGraph = sentence.sentenceGraph
        #interactionEdges += len(sentenceGraph.interactionGraph.edges())
        interactionEdges += len(sentence.interactions)
        dependencyEdges += len(sentenceGraph.dependencyGraph.edges())
        
        undirected = sentenceGraph.dependencyGraph.to_undirected()
        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
        # Shortest path for interaction edge
        for interaction in sentence.interactions:
            e1 = sentence.entitiesById[interaction.attrib["e1"]]
            e2 = sentence.entitiesById[interaction.attrib["e2"]]
            t1 = sentenceGraph.entityHeadTokenByEntity[e1]
            t2 = sentenceGraph.entityHeadTokenByEntity[e2]
            if paths.has_key(t1) and paths[t1].has_key(t2):
                path = paths[t1][t2]
                if not pathsByLength.has_key(len(path)-1):
                    pathsByLength[len(path)-1] = 0
                pathsByLength[len(path)-1] += 1
            else:
                if not pathsByLength.has_key("none"):
                    pathsByLength["none"] = 0
                pathsByLength["none"] += 1

#        for intEdge in sentenceGraph.interactionGraph.edges():
#            if paths.has_key(intEdge[0]) and paths[intEdge[0]].has_key(intEdge[1]):
#                path = paths[intEdge[0]][intEdge[1]]
#                if not pathsByLength.has_key(len(path)-1):
#                    pathsByLength[len(path)-1] = 0
#                pathsByLength[len(path)-1] += 1
#            else:
#                if not pathsByLength.has_key("none"):
#                    pathsByLength["none"] = 0
#                pathsByLength["none"] += 1
        # Shortest paths between all entities
        for i in range(len(sentence.entities)-1):
            for j in range(i+1,len(sentence.entities)):
                tI = sentenceGraph.entityHeadTokenByEntity[sentence.entities[i]]
                tJ = sentenceGraph.entityHeadTokenByEntity[sentence.entities[j]]
                if paths.has_key(tI) and paths[tI].has_key(tJ):
                    path = paths[tI][tJ]
                    if not pathsBetweenAllEntitiesByLength.has_key(len(path)-1):
                        pathsBetweenAllEntitiesByLength[len(path)-1] = 0
                    pathsBetweenAllEntitiesByLength[len(path)-1] += 1
                elif tI == tJ:
                    if not pathsBetweenAllEntitiesByLength.has_key(0):
                        pathsBetweenAllEntitiesByLength[0] = 0
                    pathsBetweenAllEntitiesByLength[0] += 1
                else:
                    if not pathsBetweenAllEntitiesByLength.has_key("none"):
                        pathsBetweenAllEntitiesByLength["none"] = 0
                    pathsBetweenAllEntitiesByLength["none"] += 1

#        for i in range(len(sentenceGraph.tokens)-1):
#            for j in range(i+1,len(sentenceGraph.tokens)):
#                tI = sentenceGraph.tokens[i]
#                tJ = sentenceGraph.tokens[j]
#                if sentenceGraph.tokenIsEntityHead[tI] == None or sentenceGraph.tokenIsEntityHead[tJ] == None:
#                    continue
#                if paths.has_key(tI) and paths[tI].has_key(tJ):
#                    path = paths[tI][tJ]
#                    if not pathsBetweenAllEntitiesByLength.has_key(len(path)-1):
#                        pathsBetweenAllEntitiesByLength[len(path)-1] = 0
#                    pathsBetweenAllEntitiesByLength[len(path)-1] += 1
#                else:
#                    if not pathsBetweenAllEntitiesByLength.has_key("none"):
#                        pathsBetweenAllEntitiesByLength["none"] = 0
#                    pathsBetweenAllEntitiesByLength["none"] += 1
    
    print >> sys.stderr, "Interaction edges:", interactionEdges
    print >> sys.stderr, "Dependency edges:", dependencyEdges
    print >> sys.stderr, "Shortest path of dependencies for interaction edge:"
    printPathDistribution(pathsByLength)
    if options.output != None:
        pathsByLength["corpus"] = options.input
        pathsByLength["parse"] = options.parse
        TableUtils.addToCSV(pathsByLength, options.output+"/pathsByLength.csv")
    print >> sys.stderr, "Shortest path of dependencies between all entities:"
    printPathDistribution(pathsBetweenAllEntitiesByLength)
    if options.output != None:
        pathsByLength["corpus"] = options.input
        pathsByLength["parse"] = options.parse
        TableUtils.addToCSV(pathsBetweenAllEntitiesByLength, options.output+"/pathsBetweenAllEntitiesByLength.csv")
Code Example #11
    def buildExamples(self, sentenceGraph):
        examples = []
        exampleIndex = 0
        
        clearGraph = sentenceGraph.getCleared()
        
        #undirected = sentenceGraph.getUndirectedDependencyGraph()
        undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
        ##undirected = sentenceGraph.dependencyGraph.to_undirected()
        ###undirected = NX10.MultiGraph(sentenceGraph.dependencyGraph) This didn't work
        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
        
        self.triggerFeatureBuilder.initSentence(clearGraph)
        
        # Generate examples based on interactions between entities or interactions between tokens
        if "entities" in self.styles:
            loopRange = len(sentenceGraph.entities)
        else:
            loopRange = len(sentenceGraph.tokens)
        #for i in range(loopRange-1):
        for i in range(loopRange): # allow self-interactions
            #for j in range(i+1,loopRange):
            for j in range(i,loopRange): # allow self-interactions
                eI = None
                eJ = None
                if "entities" in self.styles:
                    eI = sentenceGraph.entities[i]
                    eJ = sentenceGraph.entities[j]
                    tI = sentenceGraph.entityHeadTokenByEntity[eI]
                    tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                    #if "no_ne_interactions" in self.styles and eI.get("isName") == "True" and eJ.get("isName") == "True":
                    #    continue
                    if eI.get("type") == "neg" or eJ.get("type") == "neg":
                        continue
                else:
                    tI = sentenceGraph.tokens[i]
                    tJ = sentenceGraph.tokens[j]
#                # only consider paths between entities (NOTE! entities, not only named entities)
#                if "headsOnly" in self.styles:
#                    if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0):
#                        continue
                
                if "directed" in self.styles:
                    # define forward
                    if "entities" in self.styles:
                        categoryName = self.getCategoryName(sentenceGraph, eI, eJ, True)
                    else:
                        categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, True)
                    self.exampleStats.beginExample(categoryName)
                    if self.negFrac == None or categoryName != "neg" or (categoryName == "neg" and self.negRand.random() < self.negFrac):
                        makeExample = True
                        if ("genia_limits" in self.styles) and not self.isPotentialGeniaInteraction(eI, eJ):
                            makeExample = False
                            self.exampleStats.filter("genia_limits")
                        if self.posPairGaz.getNegFrac((tI.get("POS"), tJ.get("POS"))) == 1.0:
                            makeExample = False
                            self.exampleStats.filter("pos_pair")
                        if makeExample:
                            if not sentenceGraph.tokenIsName[tI]:
                                examples.append( self.buildExample(tI, tJ, paths, clearGraph, categoryName, exampleIndex, eI, eJ) )
                                exampleIndex += 1
                            else:
                                self.exampleStats.filter("genia_token_limits")
                    else:
                        self.exampleStats.filter("neg_frac")
                    self.exampleStats.endExample()
                    
                    # define reverse
                    if "entities" in self.styles:
                        categoryName = self.getCategoryName(sentenceGraph, eJ, eI, True)
                    else:
                        categoryName = self.getCategoryNameFromTokens(sentenceGraph, tJ, tI, True)
                    self.exampleStats.beginExample(categoryName)
                    if self.negFrac == None or categoryName != "neg" or (categoryName == "neg" and self.negRand.random() < self.negFrac):
                        makeExample = True
                        if ("genia_limits" in self.styles) and not self.isPotentialGeniaInteraction(eJ, eI):
                            makeExample = False
                            self.exampleStats.filter("genia_limits")
                        if ("bioinfer_limits" in self.styles) and not self.isPotentialBioInferInteraction(eJ, eI, categoryName):
                            makeExample = False
                            self.exampleStats.filter("bioinfer_limits")
                        if self.posPairGaz.getNegFrac((tJ.get("POS"), tI.get("POS"))) == 1.0:
                            makeExample = False
                            self.exampleStats.filter("pos_pair")
                        if makeExample:
                            if not sentenceGraph.tokenIsName[tJ]:
                                examples.append( self.buildExample(tJ, tI, paths, clearGraph, categoryName, exampleIndex, eJ, eI) )
                                exampleIndex += 1
                            else:
                                self.exampleStats.filter("genia_token_limits")
                    else:
                        self.exampleStats.filter("neg_frac")
                    self.exampleStats.endExample()
#                else:
#                    if "entities" in self.styles:
#                        categoryName = self.getCategoryName(sentenceGraph, eI, eJ, False)
#                    else:
#                        categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, False)
#                    forwardExample = self.buildExample(tI, tJ, paths, clearGraph, categoryName, exampleIndex, eI, eJ)
#                    if not "graph_kernel" in self.styles:
#                        reverseExample = self.buildExample(tJ, tI, paths, clearGraph, categoryName, exampleIndex, eJ, eI)
#                        forwardExample[2].update(reverseExample[2])
#                    examples.append(forwardExample)
#                    exampleIndex += 1
        
        return examples
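
The NX10 module used above is a compatibility wrapper around the NetworkX 1.0 API. As a rough sketch only, the same shortest-path precomputation with a current NetworkX release might look like this (assumed modern API, not part of the original code):

import networkx as nx

def all_pairs_dependency_paths(dependency_graph, cutoff=999):
    # Collapse edge direction while keeping parallel dependency edges.
    undirected = dependency_graph.to_undirected()
    # Since NetworkX 2.x, all_pairs_shortest_path returns a generator of
    # (node, paths) pairs, so it is wrapped in dict() to recover the
    # nested-dict shape that the example-building code expects.
    return dict(nx.all_pairs_shortest_path(undirected, cutoff=cutoff))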
Code Example #12
    def buildExamplesInner(self, sentenceGraph, goldGraph):
        """
        Build one example for each token of the sentence
        """
        if sentenceGraph.sentenceElement.get("origId") in self.skiplist:
            print >>sys.stderr, "Skipping sentence", sentenceGraph.sentenceElement.get("origId")
            return []

        self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
        self.triggerFeatureBuilder.initSentence(sentenceGraph)

        undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
        # Get argument order
        self.interactionLengths = self.getInteractionEdgeLengths(sentenceGraph, paths)
        self.interactionLengths = self.interactionLengths.values()
        self.interactionLengths.sort(compareInteractionPrecedence)
        # Map tokens to entities
        tokenByOffset = {}
        for i in range(len(sentenceGraph.tokens)):
            token = sentenceGraph.tokens[i]
            if goldGraph != None:
                goldToken = goldGraph.tokens[i]
                assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset")
            tokenByOffset[token.get("charOffset")] = token.get("id")
        # Map gold entities to their head offsets
        goldEntitiesByOffset = {}
        for token in sentenceGraph.tokens:
            goldEntitiesByOffset[token.get("charOffset")] = []
        entityToGold = {}
        for entity in sentenceGraph.entities:
            entityToGold[entity] = []
        if goldGraph != None:
            for entity in goldGraph.entities:
                offset = entity.get("headOffset")
                assert offset != None
                goldEntitiesByOffset[offset].append(entity)
            # Map predicted entities to gold entities
            for entity in sentenceGraph.entities:
                eType = entity.get("type")
                eOffset = entity.get("headOffset")
                for goldEntity in goldEntitiesByOffset[eOffset]:
                    if goldEntity.get("type") == eType:
                        entityToGold[entity].append(goldEntity)
        # Map entities to interactions
        # interactionsByEntityId = {}
        # for entity in sentenceGraph.entities:
        #    interactionsByEntityId[entity.get("id")] = []
        # Map tokens to interactions
        interactionsByToken = {}
        for token in sentenceGraph.tokens:
            interactionsByToken[token] = []
        for interactionTuple in self.interactionLengths:
            interaction = interactionTuple[0]
            if interaction.get("type") == "neg":
                continue
            e1Id = interaction.get("e1")
            token = sentenceGraph.entityHeadTokenByEntity[sentenceGraph.entitiesById[e1Id]]
            interactionsByToken[token].append(interaction)

        examples = []
        exampleIndex = 0

        self.tokenFeatures = {}

        # namedEntityNorStrings = set()
        namedEntityHeadTokens = []
        if not "names" in self.styles:
            namedEntityCount = 0
            for entity in sentenceGraph.entities:
                if entity.get("isName") == "True":  # known data which can be used for features
                    namedEntityCount += 1
                    # namedEntityNorStrings.add( entity.get("text").replace("-","").replace("/","").replace(",","").replace("\\","").replace(" ","").lower() )
            namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
            # if namedEntityCount == 0: # no names, no need for triggers
            #    return []

            if "pos_pairs" in self.styles:
                namedEntityHeadTokens = self.getNamedEntityHeadTokens(sentenceGraph)

        # neFeatures = {} # F: 69.35 -> 69.14
        # for norString in namedEntityNorStrings:
        #    neFeatures[self.featureSet.getId("norNE_" + norString)] = 1

        # Sentence-level bag-of-words counts; tokens that belong to a named
        # entity are additionally counted under an "ne_"-prefixed key.
        bagOfWords = {}
        for token in sentenceGraph.tokens:
            text = "bow_" + token.get("text")
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
            if sentenceGraph.tokenIsName[token]:
                text = "ne_" + text
                if not bagOfWords.has_key(text):
                    bagOfWords[text] = 0
                bagOfWords[text] += 1
        bowFeatures = {}
        for k, v in bagOfWords.iteritems():
            bowFeatures[self.featureSet.getId(k)] = v

        # Pre-collect each token's incoming and outgoing dependency edges,
        # sorted by dependency id, so that the attached-edge features below
        # are built in a deterministic order.
        self.inEdgesByToken = {}
        self.outEdgesByToken = {}
        self.edgeSetByToken = {}
        for token in sentenceGraph.tokens:
            inEdges = sentenceGraph.dependencyGraph.in_edges(token, data=True)
            fixedInEdges = []
            for edge in inEdges:
                fixedInEdges.append((edge[0], edge[1], edge[2]["element"]))
            inEdges = fixedInEdges
            inEdges.sort(compareDependencyEdgesById)
            self.inEdgesByToken[token] = inEdges
            outEdges = sentenceGraph.dependencyGraph.out_edges(token, data=True)
            fixedOutEdges = []
            for edge in outEdges:
                fixedOutEdges.append((edge[0], edge[1], edge[2]["element"]))
            outEdges = fixedOutEdges
            outEdges.sort(compareDependencyEdgesById)
            self.outEdgesByToken[token] = outEdges
            self.edgeSetByToken[token] = set(inEdges + outEdges)

        for i in range(len(sentenceGraph.tokens)):
            token = sentenceGraph.tokens[i]
            # Recognize only non-named entities (i.e. interaction words)
            if sentenceGraph.tokenIsName[token] and not "names" in self.styles and not "all_tokens" in self.styles:
                continue

            # CLASS
            # if len(sentenceGraph.tokenIsEntityHead[token]) > 0:
            #    category = self.classSet.getId(self.getMergedEntityType(sentenceGraph.tokenIsEntityHead[token]))
            # else:
            #    category = 1
            offset = token.get("charOffset")
            if len(goldEntitiesByOffset[offset]) > 0:
                category = self.classSet.getId(self.getMergedEntityType(goldEntitiesByOffset[offset]))
            else:
                category = 1

            tokenText = token.get("text").lower()
            if "stem_gazetteer" in self.styles:
                tokenText = PorterStemmer.stem(tokenText)
            if ("exclude_gazetteer" in self.styles) and self.gazetteer and tokenText not in self.gazetteer:
                features = {}
                features[self.featureSet.getId("exclude_gazetteer")] = 1
                extra = {"xtype": "token", "t": token.get("id"), "excluded": "True"}
                examples.append((sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra))
                exampleIndex += 1
                continue

            # FEATURES
            features = {}
            self.features = features

            if not "names" in self.styles:
                features[self.featureSet.getId(namedEntityCountFeature)] = 1
            # for k,v in bagOfWords.iteritems():
            #    features[self.featureSet.getId(k)] = v
            # use the pre-calculated bag-of-words features (bowFeatures) instead of recomputing them here
            features.update(bowFeatures)
            # features.update(neFeatures)

            #            for j in range(len(sentenceGraph.tokens)):
            #                text = "bow_" + sentenceGraph.tokens[j].get("text")
            #                if j < i:
            #                    features[self.featureSet.getId("bf_" + text)] = 1
            #                elif j > i:
            #                    features[self.featureSet.getId("af_" + text)] = 1

            # Main features
            text = token.get("text")
            features[self.featureSet.getId("txt_" + text)] = 1
            features[self.featureSet.getId("POS_" + token.get("POS"))] = 1
            stem = PorterStemmer.stem(text)
            features[self.featureSet.getId("stem_" + stem)] = 1
            features[self.featureSet.getId("nonstem_" + text[len(stem) :])] = 1

            # Normalized versions of the string (if same as non-normalized, overlap without effect)
            normalizedText = (
                text.replace("-", "").replace("/", "").replace(",", "").replace("\\", "").replace(" ", "").lower()
            )
            if normalizedText == "bound":  # should be for all irregular verbs
                normalizedText = "bind"
            features[self.featureSet.getId("txt_" + normalizedText)] = 1
            norStem = PorterStemmer.stem(normalizedText)
            features[self.featureSet.getId("stem_" + norStem)] = 1
            features[self.featureSet.getId("nonstem_" + normalizedText[len(norStem) :])] = 1

            if "gazetteer_features_maintoken" in self.styles:
                tokTxtLower = text.lower()
                if "stem_gazetteer" in self.styles:
                    tokTxtLower = PorterStemmer.stem(tokTxtLower)
                if self.gazetteer and tokTxtLower in self.gazetteer:
                    for label, weight in self.gazetteer[tokTxtLower].items():
                        features[self.featureSet.getId("gaz_knownLabel_" + label)] = weight  # 1 performs slightly worse

            # Linear order features
            # for index in [-3,-2,-1,1,2,3,4,5]: # 69.35 -> 68.97
            for index in [-3, -2, -1, 1, 2, 3]:
                if i + index > 0 and i + index < len(sentenceGraph.tokens):
                    self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)

            # Content
            if i > 0 and text[0].isalpha() and text[0].isupper():
                features[self.featureSet.getId("upper_case_start")] = 1
            for j in range(len(text)):
                if j > 0 and text[j].isalpha() and text[j].isupper():
                    features[self.featureSet.getId("upper_case_middle")] = 1
                # numbers and special characters
                if text[j].isdigit():
                    features[self.featureSet.getId("has_digits")] = 1
                    if j > 0 and text[j - 1] == "-":
                        features[self.featureSet.getId("has_hyphenated_digit")] = 1
                elif text[j] == "-":
                    features[self.featureSet.getId("has_hyphen")] = 1
                elif text[j] == "/":
                    features[self.featureSet.getId("has_fslash")] = 1
                elif text[j] == "\\":
                    features[self.featureSet.getId("has_bslash")] = 1
                # duplets
                if j > 0:
                    features[self.featureSet.getId("dt_" + text[j - 1 : j + 1].lower())] = 1
                # triplets
                if j > 1:
                    features[self.featureSet.getId("tt_" + text[j - 2 : j + 1].lower())] = 1

            # Attached edges (Hanging in and out edges)
            t1InEdges = self.inEdgesByToken[token]
            for edge in t1InEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HIn_" + edgeType)] = 1
                features[self.featureSet.getId("t1HIn_" + edge[0].get("POS"))] = 1
                features[self.featureSet.getId("t1HIn_" + edgeType + "_" + edge[0].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[0])
                features[self.featureSet.getId("t1HIn_" + tokenText)] = 1
                features[self.featureSet.getId("t1HIn_" + edgeType + "_" + tokenText)] = 1
            t1OutEdges = self.outEdgesByToken[token]
            for edge in t1OutEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HOut_" + edgeType)] = 1
                features[self.featureSet.getId("t1HOut_" + edge[1].get("POS"))] = 1
                features[self.featureSet.getId("t1HOut_" + edgeType + "_" + edge[1].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[1])
                features[self.featureSet.getId("t1HOut_" + tokenText)] = 1
                features[self.featureSet.getId("t1HOut_" + edgeType + "_" + tokenText)] = 1

            extra = {"xtype": "token", "t": token.get("id")}
            examples.append((sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra))
            exampleIndex += 1

            # chains
            self.buildChains(token, sentenceGraph, features)

            if "pos_pairs" in self.styles:
                self.buildPOSPairs(token, namedEntityHeadTokens, features)

            self.buildPredictionFeatures(sentenceGraph, paths, token, interactionsByToken[token])
        return examples
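
For reference, a self-contained sketch of the character-level surface features built inside the token loop above (digit and punctuation flags plus lower-cased character bigrams and trigrams). The helper name is illustrative; the original code writes the same flags directly into the shared feature-id set:

def character_features(text):
    """Character-level surface features for one token, mirroring the loop above."""
    feats = {}
    for j, ch in enumerate(text):
        if ch.isdigit():
            feats["has_digits"] = 1
            if j > 0 and text[j - 1] == "-":
                feats["has_hyphenated_digit"] = 1
        elif ch == "-":
            feats["has_hyphen"] = 1
        elif ch == "/":
            feats["has_fslash"] = 1
        elif ch == "\\":
            feats["has_bslash"] = 1
        if j > 0:
            feats["dt_" + text[j - 1:j + 1].lower()] = 1  # character duplets
        if j > 1:
            feats["tt_" + text[j - 2:j + 1].lower()] = 1  # character triplets
    return feats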