def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
    """
    Build edge examples for a single sentence and append them to outfile.

    Candidate pairs are taken from the merged entities of the sentence when
    styles["entities"] is set, otherwise from the sentence tokens. With
    styles["directed"] each pair is considered in both directions (forward
    first, then reverse) and every enabled "*_limits" style may filter the
    candidate out. Without it, one example is written per pair.

    sentenceGraph -- graph of the sentence to process
    outfile       -- target passed to ExampleUtils.appendExamples
    goldGraph     -- optional gold-standard graph; when given, the category
                     of directed entity examples is read from it

    Returns the number of examples written (not a list of examples).
    See Core/ExampleUtils for example format.
    """
    exampleIndex = 0

    if self.styles["trigger_features"]:
        self.triggerFeatureBuilder.initSentence(sentenceGraph)
    if self.styles["evex"]:
        self.evexFeatureBuilder.initSentence(sentenceGraph)

    # Filter entities, if needed
    sentenceGraph.mergeInteractionGraph(True)
    entities = sentenceGraph.mergedEntities
    self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))

    # Connect to optional gold graph
    entityToGold = None
    if goldGraph is not None:
        entityToGold = EvaluateInteractionXML.mapEntities(entities, goldGraph.entities)

    # The undirected dependency graph is handed to buildExample as "paths"
    paths = None
    if not self.styles["no_path"]:
        paths = sentenceGraph.dependencyGraph.toUndirected()

    # Generate examples based on interactions between entities or interactions between tokens
    useEntities = self.styles["entities"]
    if useEntities:
        loopRange = len(entities)
    else:
        loopRange = len(sentenceGraph.tokens)
    for i in range(loopRange - 1):
        for j in range(i + 1, loopRange):
            eI = None
            eJ = None
            if useEntities:
                eI = entities[i]
                eJ = entities[j]
                tI = sentenceGraph.entityHeadTokenByEntity[eI]
                tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                if eI.get("type") == "neg" or eJ.get("type") == "neg":
                    continue
                if self.styles["skip_extra_triggers"]:
                    if eI.get("source") is not None or eJ.get("source") is not None:
                        continue
            else:
                tI = sentenceGraph.tokens[i]
                tJ = sentenceGraph.tokens[j]
            # only consider paths between entities (NOTE! entities, not only named entities)
            if self.styles["headsOnly"]:
                if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0):
                    continue

            if self.styles["directed"]:
                # Forward (I -> J) first, then reverse (J -> I). The order
                # matters because exampleIndex numbers the written examples.
                # NOTE(review): in token mode eA/eB are None, so the filters
                # below that call eA.get(...) presumably require
                # styles["entities"] -- confirm with callers.
                for eA, eB, tA, tB in ((eI, eJ, tI, tJ), (eJ, eI, tJ, tI)):
                    # define the category of this direction
                    if useEntities:
                        categoryName = self.getCategoryName(sentenceGraph, eA, eB, True)
                        if goldGraph is not None:
                            categoryName = self.getGoldCategoryName(goldGraph, entityToGold, eA, eB, True)
                    else:
                        categoryName = self.getCategoryNameFromTokens(sentenceGraph, tA, tB, True)
                    # apply the filters; every matching filter is recorded in the statistics
                    self.exampleStats.beginExample(categoryName)
                    makeExample = True
                    if self.styles["genia_limits"] and not self.isPotentialGeniaInteraction(eA, eB):
                        makeExample = False
                        self.exampleStats.filter("genia_limits")
                    if self.styles["genia_task1"] and (eA.get("type") == "Entity" or eB.get("type") == "Entity"):
                        makeExample = False
                        self.exampleStats.filter("genia_task1")
                    if self.styles["rel_limits"] and not self.isPotentialRELInteraction(eA, eB):
                        makeExample = False
                        self.exampleStats.filter("rel_limits")
                    if self.styles["co_limits"] and not self.isPotentialCOInteraction(eA, eB, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("co_limits")
                    if self.styles["bb_limits"] and not self.isPotentialBBInteraction(eA, eB, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("bb_limits")
                        if categoryName != "neg":
                            self.exampleStats.filter("bb_limits(" + categoryName + ":" + eA.get("type") + "/" + eB.get("type") + ")")
                    # isPotentialBIInteraction receives exampleStats itself, so no filter call here
                    if self.styles["bi_limits"] and not self.isPotentialBIInteraction(eA, eB, sentenceGraph, self.exampleStats):
                        makeExample = False
                    if self.styles["epi_limits"] and not self.isPotentialEPIInteraction(eA, eB, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("epi_limits")
                    if self.styles["id_limits"] and not self.isPotentialIDInteraction(eA, eB, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("id_limits")
                    if self.styles["pos_only"] and categoryName == "neg":
                        makeExample = False
                        self.exampleStats.filter("pos_only")
                    if makeExample:
                        example = self.buildExample(tA, tB, paths, sentenceGraph, categoryName, exampleIndex, eA, eB)
                        ExampleUtils.appendExamples([example], outfile)
                        exampleIndex += 1
                    self.exampleStats.endExample()
            else:
                # NOTE(review): the gold graph is not consulted for undirected examples
                if useEntities:
                    categoryName = self.getCategoryName(sentenceGraph, eI, eJ, False)
                else:
                    categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, False)
                self.exampleStats.beginExample(categoryName)
                forwardExample = self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ)
                if not self.styles["graph_kernel"]:
                    # merge the features of the reverse direction into the single undirected example
                    reverseExample = self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI)
                    forwardExample[2].update(reverseExample[2])
                ExampleUtils.appendExamples([forwardExample], outfile)
                exampleIndex += 1
                self.exampleStats.endExample()

    return exampleIndex
def buildExamplesFromGraph(self, sentenceGraph, examples, goldGraph=None):
    """
    Build edge examples for every candidate pair of a single sentence.

    Pairs are taken from the token features when styles "token_nodes" is
    set, otherwise from the merged entities of the sentence. Each pair is
    handed to self.buildExample together with the "examples" argument:
    twice (forward, then reverse) for directed examples, once (ordered by
    token index) for undirected ones.

    sentenceGraph -- graph of the sentence to process
    examples      -- passed through unchanged to self.buildExample
    goldGraph     -- optional gold-standard graph; when given, the merged
                     entities are mapped to its entities

    Returns None.
    """
    # example directionality: a truthy "directed" or "undirected" style
    # forces the direction. When neither is switched on (unset or False)
    # the directedness is determined from the corpus. Previously a False
    # value left examplesAreDirected unbound.
    directedStyle = self.styles.get("directed")
    undirectedStyle = self.styles.get("undirected")
    if directedStyle:
        assert undirectedStyle in [None, False]
        examplesAreDirected = True
    elif undirectedStyle:
        assert directedStyle in [None, False]
        examplesAreDirected = False
    elif self.structureAnalyzer is not None:
        examplesAreDirected = self.structureAnalyzer.hasDirectedTargets()
    else:
        examplesAreDirected = True

    # Filter entities, if needed
    sentenceGraph.mergeInteractionGraph(True)
    entities = sentenceGraph.mergedEntities
    self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))

    # Connect to optional gold graph
    entityToGold = None
    if goldGraph is not None:
        entityToGold = EvaluateInteractionXML.mapEntities(entities, goldGraph.entities)

    dg = sentenceGraph.dependencyGraph
    undirected = dg.toUndirected()
    # number of dependency edges (incoming plus outgoing) attached to each token
    edgeCounts = {x: len(dg.getInEdges(x) + dg.getOutEdges(x)) for x in sentenceGraph.tokens}

    tokens, tokenMap = self.getTokenFeatures(sentenceGraph)

    # Generate examples based on interactions between entities or interactions between tokens
    useTokenNodes = self.styles.get("token_nodes")
    if useTokenNodes:
        loopRange = len(tokens)
    else:
        loopRange = len(entities)
    for i in range(loopRange - 1):
        for j in range(i + 1, loopRange):
            eI = None
            eJ = None
            if useTokenNodes:
                tI = tokens[i]["element"]
                tJ = tokens[j]["element"]
            else:
                eI = entities[i]
                eJ = entities[j]
                tI = sentenceGraph.entityHeadTokenByEntity[eI]
                tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                if eI.get("type") == "neg" or eJ.get("type") == "neg":
                    continue
                if self.styles.get("skip_extra_triggers"):
                    if eI.get("source") is not None or eJ.get("source") is not None:
                        continue
            # only consider paths between entities (NOTE! entities, not only named entities)
            if self.styles.get("headsOnly"):
                if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0):
                    continue

            if examplesAreDirected:
                # forward, then reverse
                self.buildExample(examples, tI, tJ, eI, eJ, tokens, tokenMap, sentenceGraph, goldGraph, entityToGold, undirected, edgeCounts)
                self.buildExample(examples, tJ, tI, eJ, eI, tokens, tokenMap, sentenceGraph, goldGraph, entityToGold, undirected, edgeCounts)
            else:
                # order the pair so that the token with the lower index comes first
                if tokenMap[tJ]["index"] < tokenMap[tI]["index"]:
                    tI, tJ = tJ, tI
                    eI, eJ = eJ, eI
                self.buildExample(examples, tI, tJ, eI, eJ, tokens, tokenMap, sentenceGraph, goldGraph, entityToGold, undirected, edgeCounts, False)
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
    """
    Build edge examples for a single sentence and append them to outfile.

    Candidate pairs are taken from the sentence tokens when
    styles["token_nodes"] is set, otherwise from the merged entities. The
    examples of each pair come from self.buildExamplesForPair and are
    written one by one with ExampleUtils.appendExamples.

    sentenceGraph     -- graph of the sentence to process
    outfile           -- target passed to ExampleUtils.appendExamples
    goldGraph         -- optional gold-standard graph; when given, the merged
                         entities are mapped to its entities
    structureAnalyzer -- optional analyzer used to determine directedness
                         when no style forces it; also passed on to
                         buildExamplesForPair

    Returns the number of examples written (not a list of examples).
    See Core/ExampleUtils for example format.
    """
    exampleIndex = 0
    # example directionality: a truthy "directed" or "undirected" style
    # forces the direction. When neither is switched on (None or False)
    # the directedness is determined from the corpus. Previously a False
    # value left examplesAreDirected unbound.
    directedStyle = self.styles["directed"]
    undirectedStyle = self.styles["undirected"]
    if directedStyle:
        assert undirectedStyle in [None, False]
        examplesAreDirected = True
    elif undirectedStyle:
        assert directedStyle in [None, False]
        examplesAreDirected = False
    elif structureAnalyzer is not None:
        examplesAreDirected = structureAnalyzer.hasDirectedTargets()
    else:
        examplesAreDirected = True

    if not self.styles["no_trigger_features"]:
        self.triggerFeatureBuilder.initSentence(sentenceGraph)
    if self.styles["evex"]:
        self.evexFeatureBuilder.initSentence(sentenceGraph)

    # Filter entities, if needed
    sentenceGraph.mergeInteractionGraph(True)
    entities = sentenceGraph.mergedEntities
    self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))

    # Connect to optional gold graph
    entityToGold = None
    if goldGraph is not None:
        entityToGold = EvaluateInteractionXML.mapEntities(entities, goldGraph.entities)

    # The undirected dependency graph is handed on as "paths"
    paths = None
    if not self.styles["no_path"]:
        paths = sentenceGraph.dependencyGraph.toUndirected()
        if self.styles["filter_shortest_path"] is not None:  # For DDI use filter_shortest_path=conj_and
            paths.resetAnalyses()  # just in case
            paths.FloydWarshall(self.filterEdge, {"edgeTypes": self.styles["filter_shortest_path"]})

    # Generate examples based on interactions between entities or interactions between tokens
    useTokenNodes = self.styles["token_nodes"]
    if useTokenNodes:
        loopRange = len(sentenceGraph.tokens)
    else:
        loopRange = len(entities)
    for i in range(loopRange - 1):
        for j in range(i + 1, loopRange):
            eI = None
            eJ = None
            if useTokenNodes:
                tI = sentenceGraph.tokens[i]
                tJ = sentenceGraph.tokens[j]
            else:
                eI = entities[i]
                eJ = entities[j]
                tI = sentenceGraph.entityHeadTokenByEntity[eI]
                tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                if eI.get("type") == "neg" or eJ.get("type") == "neg":
                    continue
                if self.styles["skip_extra_triggers"]:
                    if eI.get("source") is not None or eJ.get("source") is not None:
                        continue
            # only consider paths between entities (NOTE! entities, not only named entities)
            if self.styles["headsOnly"]:
                if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0):
                    continue

            examples = self.buildExamplesForPair(tI, tJ, paths, sentenceGraph, goldGraph, entityToGold, eI, eJ, structureAnalyzer, examplesAreDirected)
            for categoryName, features, extra in examples:
                # make example
                if self.styles["binary"]:
                    # binary mode: every non-"neg" category collapses into class 1
                    if categoryName != "neg":
                        category = 1
                    else:
                        category = -1
                    extra["categoryName"] = "i"
                else:
                    category = self.classSet.getId(categoryName)
                example = [sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra]
                ExampleUtils.appendExamples([example], outfile)
                exampleIndex += 1

    return exampleIndex
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
    """
    Build one modifier (task 3) example for each non-given entity of the sentence.

    Entities whose "given" attribute is "True" are only counted for the
    sentence-level features. Every other entity produces one example built
    from the features of its head token, which is appended to outfile with
    ExampleUtils.appendExamples.

    sentenceGraph     -- graph of the sentence to process
    outfile           -- target passed to ExampleUtils.appendExamples
    goldGraph         -- optional gold-standard graph; when given, the
                         "speculation"/"negation" class is read from the
                         first mapped gold entity
    structureAnalyzer -- accepted for interface compatibility, not used here

    Returns the number of examples written.
    """
    exampleIndex = 0

    self.tokenFeatures = {}

    if goldGraph is not None:
        entityToGold = EvaluateInteractionXML.mapEntities(sentenceGraph.entities, goldGraph.entities)

    # Sentence-level counts of given and non-given entities
    namedEntityCount = 0
    entityCount = 0
    for entity in sentenceGraph.entities:
        if entity.get("given") == "True":  # known data which can be used for features
            namedEntityCount += 1
        else:  # known data which can be used for features
            entityCount += 1
    namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
    entityCountFeature = "entityCount_" + str(entityCount)

    # Bag of words of the sentence. dict.get replaces the Python-2-only
    # has_key and keeps the insertion order of the keys unchanged.
    bagOfWords = {}
    for token in sentenceGraph.tokens:
        text = "bow_" + token.get("text")
        bagOfWords[text] = bagOfWords.get(text, 0) + 1
        if sentenceGraph.tokenIsName[token]:
            text = "ne_" + text
            bagOfWords[text] = bagOfWords.get(text, 0) + 1
        # NOTE(review): this tests the size of the whole tokenIsEntityHead
        # mapping rather than the entry of this token; left as is because
        # changing it would change the generated features.
        if len(sentenceGraph.tokenIsEntityHead) > 0:
            text = "ge_" + text
            bagOfWords[text] = bagOfWords.get(text, 0) + 1

        text = token.get("text")
        if self.styles["speculation_words"] and text in self.specWords:
            specKey = "spec_bow_" + text
            bagOfWords[specKey] = bagOfWords.get(specKey, 0) + 1
            bagOfWords["spec_sentence"] = 1

    # pre-calculate bow _features_ (items() works on both Python 2 and 3)
    bowFeatures = {}
    for k, v in bagOfWords.items():
        bowFeatures[self.featureSet.getId(k)] = v

    # Cache the dependency edges of every token on the builder
    self.inEdgesByToken = {}
    self.outEdgesByToken = {}
    self.edgeSetByToken = {}
    for token in sentenceGraph.tokens:
        inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
        self.inEdgesByToken[token] = inEdges
        outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
        self.outEdgesByToken[token] = outEdges
        self.edgeSetByToken[token] = set(inEdges + outEdges)

    for entity in sentenceGraph.entities:
        token = sentenceGraph.entityHeadTokenByEntity[entity]
        # Recognize only non-named entities (i.e. interaction words)
        if entity.get("given") == "True":
            continue

        # CLASS
        # NOTE(review): any other value of styles["classification"] leaves
        # category, categoryName and task3Type unbound.
        if self.styles["classification"] == "multiclass":
            task3Type = "multiclass"
            categoryName = ""
            if entity.get("negation") == "True":
                categoryName += "negation"
            if entity.get("speculation") == "True":
                if categoryName != "":
                    categoryName += "---"
                categoryName += "speculation"
            if categoryName == "":
                categoryName = "neg"
            category = self.classSet.getId(categoryName)
        elif self.styles["classification"] == "speculation":
            task3Type = "speculation"
            if entity.get("speculation") == "True":
                category = self.classSet.getId("speculation")
            else:
                category = 1
            # a gold graph, when present, overrides the class read from the entity
            if goldGraph is not None:
                if len(entityToGold[entity]) > 0 and entityToGold[entity][0].get("speculation") == "True":
                    category = self.classSet.getId("speculation")
                else:
                    category = 1
            categoryName = self.classSet.getName(category)
        elif self.styles["classification"] == "negation":
            task3Type = "negation"
            if entity.get("negation") == "True":
                category = self.classSet.getId("negation")
            else:
                category = 1
            # a gold graph, when present, overrides the class read from the entity
            if goldGraph is not None:
                if len(entityToGold[entity]) > 0 and entityToGold[entity][0].get("negation") == "True":
                    category = self.classSet.getId("negation")
                else:
                    category = 1
            categoryName = self.classSet.getName(category)
        self.exampleStats.beginExample(categoryName)

        # FEATURES
        features = {}

        # ENTITY TYPE
        #entityType = self.classSet.getId(self.getMergedEntityType(entity))
        #del self.classSet.Ids[self.getMergedEntityType(entity)]
        #IF LOCAL
        # There's a mistake here. The entityType should be the string, not
        # the id of the type. But there's also another issue. getMergedEntityType
        # expects a list, not an item. Therefore the type is always empty ->
        # types don't get used in classification. But this is the code used in
        # the publication, so it will now be published as is, and fixed in a later
        # release.
        #
        # Besides, using the classSet here generates an unneeded
        # additional class, that shows up in evaluations etc. However, to be
        # able to publish the exact models used for the publication experiments,
        # this can't be fixed so it breaks feature id consistency. Therefore I'll
        # now just remove the redundant class id from the classSet.
        #ENDIF
        #features[self.featureSet.getId(entityType)] = 1

        features[self.featureSet.getId(namedEntityCountFeature)] = 1
        features[self.featureSet.getId(entityCountFeature)] = 1
        features.update(bowFeatures)

        # Main features
        text = token.get("text")
        features[self.featureSet.getId("txt_" + text)] = 1
        features[self.featureSet.getId("POS_" + token.get("POS"))] = 1
        stem = PorterStemmer.stem(text)
        features[self.featureSet.getId("stem_" + stem)] = 1
        features[self.featureSet.getId("nonstem_" + text[len(stem):])] = 1

        if self.styles["speculation_words"]:
            if text in self.specWords:
                features[self.featureSet.getId("ent_spec")] = 1
            if stem in self.specWordStems:
                features[self.featureSet.getId("ent_spec_stem")] = 1

        # Linear order features
        # Find the position of the head token. As before, i stays at the
        # last index when the token is not found.
        for i, candidate in enumerate(sentenceGraph.tokens):
            if token == candidate:
                break
        for index in [-3, -2, -1, 1, 2, 3]:
            # NOTE(review): "> 0" never uses the first token of the sentence
            # as a neighbour; left unchanged to keep the features identical.
            if i + index > 0 and i + index < len(sentenceGraph.tokens):
                self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)

        # Content
        if i > 0 and text[0].isalpha() and text[0].isupper():
            features[self.featureSet.getId("upper_case_start")] = 1
        for j in range(len(text)):
            if j > 0 and text[j].isalpha() and text[j].isupper():
                features[self.featureSet.getId("upper_case_middle")] = 1
            # numbers and special characters
            if text[j].isdigit():
                features[self.featureSet.getId("has_digits")] = 1
                if j > 0 and text[j - 1] == "-":
                    features[self.featureSet.getId("has_hyphenated_digit")] = 1
            elif text[j] == "-":
                features[self.featureSet.getId("has_hyphen")] = 1
            elif text[j] == "/":
                features[self.featureSet.getId("has_fslash")] = 1
            elif text[j] == "\\":
                features[self.featureSet.getId("has_bslash")] = 1
            # duplets
            if j > 0:
                features[self.featureSet.getId("dt_" + text[j - 1:j + 1].lower())] = 1
            # triplets
            if j > 1:
                features[self.featureSet.getId("tt_" + text[j - 2:j + 1].lower())] = 1

        # Attached edges (Hanging in and out edges)
        for edge in self.inEdgesByToken[token]:
            edgeType = edge[2].get("type")
            features[self.featureSet.getId("t1HIn_" + edgeType)] = 1
            features[self.featureSet.getId("t1HIn_" + edge[0].get("POS"))] = 1
            features[self.featureSet.getId("t1HIn_" + edgeType + "_" + edge[0].get("POS"))] = 1
            tokenText = sentenceGraph.getTokenText(edge[0])
            features[self.featureSet.getId("t1HIn_" + tokenText)] = 1
            features[self.featureSet.getId("t1HIn_" + edgeType + "_" + tokenText)] = 1
        for edge in self.outEdgesByToken[token]:
            edgeType = edge[2].get("type")
            features[self.featureSet.getId("t1HOut_" + edgeType)] = 1
            features[self.featureSet.getId("t1HOut_" + edge[1].get("POS"))] = 1
            features[self.featureSet.getId("t1HOut_" + edgeType + "_" + edge[1].get("POS"))] = 1
            tokenText = sentenceGraph.getTokenText(edge[1])
            features[self.featureSet.getId("t1HOut_" + tokenText)] = 1
            features[self.featureSet.getId("t1HOut_" + edgeType + "_" + tokenText)] = 1

        self.buildChains(token, sentenceGraph, features)

        extra = {"xtype": "task3", "t3type": task3Type, "t": token.get("id"), "entity": entity.get("id")}
        example = (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra)
        ExampleUtils.appendExamples([example], outfile)
        exampleIndex += 1
        self.exampleStats.endExample()
    return exampleIndex
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
    """
    Build edge examples for a single sentence and append them to outfile.

    Candidate pairs are taken from the merged entities of the sentence when
    styles["entities"] is set, otherwise from the sentence tokens. With
    styles["directed"] each pair is considered in both directions (forward
    first, then reverse) and every enabled "*_limits" style may filter the
    candidate out. Without it, one example is written per pair.

    sentenceGraph -- graph of the sentence to process
    outfile       -- target passed to ExampleUtils.appendExamples
    goldGraph     -- optional gold-standard graph; when given, the category
                     of directed entity examples is read from it

    Returns the number of examples written (not a list of examples).
    See Core/ExampleUtils for example format.
    """
    exampleIndex = 0

    if self.styles["trigger_features"]:
        self.triggerFeatureBuilder.initSentence(sentenceGraph)
    if self.styles["evex"]:
        self.evexFeatureBuilder.initSentence(sentenceGraph)

    # Filter entities, if needed
    sentenceGraph.mergeInteractionGraph(True)
    entities = sentenceGraph.mergedEntities
    self.exampleStats.addValue("Duplicate entities skipped",
                               len(sentenceGraph.entities) - len(entities))

    # Connect to optional gold graph
    entityToGold = None
    if goldGraph is not None:
        entityToGold = EvaluateInteractionXML.mapEntities(
            entities, goldGraph.entities)

    # The undirected dependency graph is handed to buildExample as "paths"
    paths = None
    if not self.styles["no_path"]:
        paths = sentenceGraph.dependencyGraph.toUndirected()

    # Generate examples based on interactions between entities or interactions between tokens
    useEntities = self.styles["entities"]
    if useEntities:
        loopRange = len(entities)
    else:
        loopRange = len(sentenceGraph.tokens)
    for i in range(loopRange - 1):
        for j in range(i + 1, loopRange):
            eI = None
            eJ = None
            if useEntities:
                eI = entities[i]
                eJ = entities[j]
                tI = sentenceGraph.entityHeadTokenByEntity[eI]
                tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                if eI.get("type") == "neg" or eJ.get("type") == "neg":
                    continue
                if self.styles["skip_extra_triggers"]:
                    if eI.get("source") is not None or eJ.get("source") is not None:
                        continue
            else:
                tI = sentenceGraph.tokens[i]
                tJ = sentenceGraph.tokens[j]
            # only consider paths between entities (NOTE! entities, not only named entities)
            if self.styles["headsOnly"]:
                if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (
                        len(sentenceGraph.tokenIsEntityHead[tJ]) == 0):
                    continue

            if self.styles["directed"]:
                # Forward (I -> J) first, then reverse (J -> I). The order
                # matters because exampleIndex numbers the written examples.
                # NOTE(review): in token mode eA/eB are None, so the filters
                # below that call eA.get(...) presumably require
                # styles["entities"] -- confirm with callers.
                for eA, eB, tA, tB in ((eI, eJ, tI, tJ), (eJ, eI, tJ, tI)):
                    # define the category of this direction
                    if useEntities:
                        categoryName = self.getCategoryName(
                            sentenceGraph, eA, eB, True)
                        if goldGraph is not None:
                            categoryName = self.getGoldCategoryName(
                                goldGraph, entityToGold, eA, eB, True)
                    else:
                        categoryName = self.getCategoryNameFromTokens(
                            sentenceGraph, tA, tB, True)
                    # apply the filters; every matching filter is recorded in the statistics
                    self.exampleStats.beginExample(categoryName)
                    makeExample = True
                    if self.styles["genia_limits"] and not self.isPotentialGeniaInteraction(eA, eB):
                        makeExample = False
                        self.exampleStats.filter("genia_limits")
                    if self.styles["genia_task1"] and (
                            eA.get("type") == "Entity" or eB.get("type") == "Entity"):
                        makeExample = False
                        self.exampleStats.filter("genia_task1")
                    if self.styles["rel_limits"] and not self.isPotentialRELInteraction(eA, eB):
                        makeExample = False
                        self.exampleStats.filter("rel_limits")
                    if self.styles["co_limits"] and not self.isPotentialCOInteraction(
                            eA, eB, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("co_limits")
                    if self.styles["bb_limits"] and not self.isPotentialBBInteraction(
                            eA, eB, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("bb_limits")
                        if categoryName != "neg":
                            self.exampleStats.filter(
                                "bb_limits(" + categoryName + ":" +
                                eA.get("type") + "/" + eB.get("type") + ")")
                    # isPotentialBIInteraction receives exampleStats itself, so no filter call here
                    if self.styles["bi_limits"] and not self.isPotentialBIInteraction(
                            eA, eB, sentenceGraph, self.exampleStats):
                        makeExample = False
                    if self.styles["epi_limits"] and not self.isPotentialEPIInteraction(
                            eA, eB, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("epi_limits")
                    if self.styles["id_limits"] and not self.isPotentialIDInteraction(
                            eA, eB, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("id_limits")
                    if self.styles["pos_only"] and categoryName == "neg":
                        makeExample = False
                        self.exampleStats.filter("pos_only")
                    if makeExample:
                        example = self.buildExample(
                            tA, tB, paths, sentenceGraph, categoryName,
                            exampleIndex, eA, eB)
                        ExampleUtils.appendExamples([example], outfile)
                        exampleIndex += 1
                    self.exampleStats.endExample()
            else:
                # NOTE(review): the gold graph is not consulted for undirected examples
                if useEntities:
                    categoryName = self.getCategoryName(
                        sentenceGraph, eI, eJ, False)
                else:
                    categoryName = self.getCategoryNameFromTokens(
                        sentenceGraph, tI, tJ, False)
                self.exampleStats.beginExample(categoryName)
                forwardExample = self.buildExample(
                    tI, tJ, paths, sentenceGraph, categoryName, exampleIndex,
                    eI, eJ)
                if not self.styles["graph_kernel"]:
                    # merge the features of the reverse direction into the single undirected example
                    reverseExample = self.buildExample(
                        tJ, tI, paths, sentenceGraph, categoryName,
                        exampleIndex, eJ, eI)
                    forwardExample[2].update(reverseExample[2])
                ExampleUtils.appendExamples([forwardExample], outfile)
                exampleIndex += 1
                self.exampleStats.endExample()

    return exampleIndex
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
    """
    Build one task 3 (negation/speculation) example per non-named entity.

    Every entity of the sentence whose "isName" attribute is not "True"
    yields one example ``(id, category, features, extra)`` which is written
    to ``outfile`` through ExampleUtils.appendExamples.

    Side effects: resets self.tokenFeatures and rebuilds
    self.inEdgesByToken, self.outEdgesByToken and self.edgeSetByToken for
    the tokens of this sentence.

    Parameters:
        sentenceGraph: the sentence to build examples from (tokens,
            entities, dependency graph and the token/entity lookup tables).
        outfile: passed unchanged to ExampleUtils.appendExamples.
        goldGraph: optional gold graph. When given, the "speculation" and
            "negation" classification modes take the class from the first
            gold entity mapped to each entity instead of from the entity.

    Returns:
        The number of examples written.

    Raises:
        ValueError: if self.styles["classification"] is not one of
            "multiclass", "speculation" or "negation" and the sentence has
            at least one non-named entity.
    """
    exampleIndex = 0
    self.tokenFeatures = {}
    getId = self.featureSet.getId

    if goldGraph is not None:
        entityToGold = EvaluateInteractionXML.mapEntities(
            sentenceGraph.entities, goldGraph.entities)

    # Sentence-level counts of named and non-named entities (known data
    # which can be used for features).
    namedEntityCount = 0
    entityCount = 0
    for entity in sentenceGraph.entities:
        if entity.get("isName") == "True":
            namedEntityCount += 1
        else:
            entityCount += 1
    namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
    entityCountFeature = "entityCount_" + str(entityCount)

    # Sentence-level bag of words, shared by all examples of the sentence.
    bagOfWords = {}
    for token in sentenceGraph.tokens:
        text = "bow_" + token.get("text")
        bagOfWords[text] = bagOfWords.get(text, 0) + 1
        if sentenceGraph.tokenIsName[token]:
            text = "ne_" + text
            bagOfWords[text] = bagOfWords.get(text, 0) + 1
        # NOTE(review): this tests whether the whole tokenIsEntityHead
        # mapping is non-empty, not whether *this* token heads an entity
        # (tokenIsEntityHead[token]). Left unchanged on purpose: fixing it
        # would change the generated features and therefore break
        # compatibility with already trained models.
        if len(sentenceGraph.tokenIsEntityHead) > 0:
            text = "ge_" + text
            bagOfWords[text] = bagOfWords.get(text, 0) + 1

        text = token.get("text")
        if self.styles["speculation_words"] and text in self.specWords:
            specKey = "spec_bow_" + text
            bagOfWords[specKey] = bagOfWords.get(specKey, 0) + 1
            bagOfWords["spec_sentence"] = 1

    # Pre-calculate the bag-of-words features once per sentence.
    bowFeatures = {}
    for name, count in bagOfWords.items():
        bowFeatures[getId(name)] = count

    # Cache the dependency edges attached to each token.
    self.inEdgesByToken = {}
    self.outEdgesByToken = {}
    self.edgeSetByToken = {}
    for token in sentenceGraph.tokens:
        inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
        self.inEdgesByToken[token] = inEdges
        outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
        self.outEdgesByToken[token] = outEdges
        self.edgeSetByToken[token] = set(inEdges + outEdges)

    for entity in sentenceGraph.entities:
        token = sentenceGraph.entityHeadTokenByEntity[entity]
        # Recognize only non-named entities (i.e. interaction words)
        if entity.get("isName") == "True":
            continue

        # CLASS
        classification = self.styles["classification"]
        if classification == "multiclass":
            task3Type = "multiclass"
            categoryName = ""
            if entity.get("negation") == "True":
                categoryName += "negation"
            if entity.get("speculation") == "True":
                if categoryName != "":
                    categoryName += "---"
                categoryName += "speculation"
            if categoryName == "":
                categoryName = "neg"
            category = self.classSet.getId(categoryName)
        elif classification == "speculation" or classification == "negation":
            # Binary mode: the class id of the modifier, or 1 for the
            # negative class. The gold graph, when present, overrides the
            # value read from the entity itself.
            task3Type = classification
            if entity.get(classification) == "True":
                category = self.classSet.getId(classification)
            else:
                category = 1
            if goldGraph is not None:
                goldEntities = entityToGold[entity]
                if len(goldEntities) > 0 and goldEntities[0].get(classification) == "True":
                    category = self.classSet.getId(classification)
                else:
                    category = 1
            categoryName = self.classSet.getName(category)
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on categoryName.
            raise ValueError("Unsupported task 3 classification style: "
                             + str(classification))

        self.exampleStats.beginExample(categoryName)

        # FEATURES
        features = {}

        # ENTITY TYPE
        # The entity type is deliberately not used as a feature. Per the
        # original author's note, the earlier implementation used the class
        # id instead of the type string and passed a single entity to
        # getMergedEntityType (which expects a list), so the type was always
        # empty; it was kept out to stay consistent with published models.
        features[getId(namedEntityCountFeature)] = 1
        features[getId(entityCountFeature)] = 1
        features.update(bowFeatures)

        # Main features
        text = token.get("text")
        features[getId("txt_" + text)] = 1
        features[getId("POS_" + token.get("POS"))] = 1
        stem = PorterStemmer.stem(text)
        features[getId("stem_" + stem)] = 1
        features[getId("nonstem_" + text[len(stem):])] = 1

        if self.styles["speculation_words"]:
            if text in self.specWords:
                features[getId("ent_spec")] = 1
            if stem in self.specWordStems:
                features[getId("ent_spec_stem")] = 1

        # Linear order features: locate the head token, then describe up to
        # three neighbours on each side.
        for i, candidate in enumerate(sentenceGraph.tokens):
            if token == candidate:
                break
        tokenCount = len(sentenceGraph.tokens)
        for index in [-3, -2, -1, 1, 2, 3]:
            # NOTE(review): "> 0" means the first token of the sentence is
            # never used as a neighbour (">= 0" looks intended). Kept as is
            # to preserve the feature set of existing models.
            if i + index > 0 and i + index < tokenCount:
                self.buildLinearOrderFeatures(sentenceGraph, i + index,
                                              str(index), features)

        # Content
        if i > 0 and text[0].isalpha() and text[0].isupper():
            features[getId("upper_case_start")] = 1
        for j, char in enumerate(text):
            if j > 0 and char.isalpha() and char.isupper():
                features[getId("upper_case_middle")] = 1
            # numbers and special characters
            if char.isdigit():
                features[getId("has_digits")] = 1
                if j > 0 and text[j - 1] == "-":
                    features[getId("has_hyphenated_digit")] = 1
            elif char == "-":
                features[getId("has_hyphen")] = 1
            elif char == "/":
                features[getId("has_fslash")] = 1
            elif char == "\\":
                features[getId("has_bslash")] = 1
            # duplets
            if j > 0:
                features[getId("dt_" + text[j - 1:j + 1].lower())] = 1
            # triplets
            if j > 1:
                features[getId("tt_" + text[j - 2:j + 1].lower())] = 1

        # Attached edges (Hanging in and out edges)
        for edge in self.inEdgesByToken[token]:
            edgeType = edge[2].get("type")
            governorPOS = edge[0].get("POS")
            features[getId("t1HIn_" + edgeType)] = 1
            features[getId("t1HIn_" + governorPOS)] = 1
            features[getId("t1HIn_" + edgeType + "_" + governorPOS)] = 1
            tokenText = sentenceGraph.getTokenText(edge[0])
            features[getId("t1HIn_" + tokenText)] = 1
            features[getId("t1HIn_" + edgeType + "_" + tokenText)] = 1
        for edge in self.outEdgesByToken[token]:
            edgeType = edge[2].get("type")
            dependentPOS = edge[1].get("POS")
            features[getId("t1HOut_" + edgeType)] = 1
            features[getId("t1HOut_" + dependentPOS)] = 1
            features[getId("t1HOut_" + edgeType + "_" + dependentPOS)] = 1
            tokenText = sentenceGraph.getTokenText(edge[1])
            features[getId("t1HOut_" + tokenText)] = 1
            features[getId("t1HOut_" + edgeType + "_" + tokenText)] = 1

        self.buildChains(token, sentenceGraph, features)

        extra = {
            "xtype": "task3",
            "t3type": task3Type,
            "t": token.get("id"),
            "entity": entity.get("id")
        }
        example = (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex),
                   category, features, extra)
        ExampleUtils.appendExamples([example], outfile)
        exampleIndex += 1
        self.exampleStats.endExample()
    return exampleIndex
def makeExampleGraphWithGold(self, builder, sentenceGraph, goldGraph, sentenceIndex):
    """
    Render the predicted graph of a sentence against its gold graph.

    Predicted entities and interactions of ``sentenceGraph`` are compared
    with those of ``goldGraph``; the result is drawn over the gold tokens,
    written as an SVG file under ``self.outDir + "/svg/"`` and embedded into
    the page through ``builder``.

    Fill/stroke colours used:
        green    - prediction whose type equals the mapped gold type
        brown    - matching entity whose gold "isName" is "True"
        red      - prediction with no gold counterpart or a different type
        #79BAEC  - gold entity/interaction without a prediction

    Parameters:
        builder: page builder; header(), svg() and lineBreak() are called.
        sentenceGraph: graph holding the predicted entities/interactions.
        goldGraph: graph holding the gold entities/interactions and the
            tokens that are drawn.
        sentenceIndex: used in the name of the SVG file.

    Returns:
        dict with the counters "entities", "edges", "tp", "fp", "tn" and
        "fn". Entity and interaction matches are added to the same
        counters; "tn" is never incremented here.
    """
    exampleGraph = NX10.MultiDiGraph()
    for token in goldGraph.tokens:
        exampleGraph.add_node(token)
    extraByToken = {}  # gold token -> [label, style dict] (or None)
    stats = {"entities":0,"edges":0,"tp":0,"fp":0,"tn":0,"fn":0}

    entityMap = EvaluateInteractionXML.mapEntities(sentenceGraph.entities, goldGraph.entities, goldGraph.tokens)
    tokenMap = self.getTokenMap(sentenceGraph, goldGraph)

    # Entities: walk the predicted -> gold mapping.
    toEntitiesWithPredictions = set()
    for entityFrom, entitiesTo in entityMap.items():
        stats["entities"] += 1
        entityFromHeadToken = sentenceGraph.entityHeadTokenByEntity[entityFrom]
        for entityTo in entitiesTo:
            toEntitiesWithPredictions.add(entityTo)
            entityToHeadToken = goldGraph.entityHeadTokenByEntity[entityTo]
            # Start from the style already stored for this gold token, if any.
            style = extraByToken.get(entityToHeadToken)
            eToType = entityTo.get("type")
            if entityFrom.get("type") == eToType and eToType != "neg":
                if style is None or style[1]["fill"] == "#79BAEC":
                    style = [eToType, {"fill":"green"}]
                if entityTo.get("isName") == "True":
                    style = [eToType, {"fill":"brown"}]
                else:
                    stats["tp"] += 1
            # On a type mismatch the previous style (possibly None) is kept.
            extraByToken[entityToHeadToken] = style
        if len(entitiesTo) == 0:
            # Predicted entity without any gold counterpart.
            stats["fp"] += 1
            goldToken = tokenMap[entityFromHeadToken]
            previous = extraByToken.get(goldToken)
            # A stored style can be None (see above); treat that like a
            # missing one instead of failing on previous[1].
            if previous is None or previous[1]["fill"] != "green":
                extraByToken[goldToken] = [entityFrom.get("type"),{"fill":"red"}]

    # Gold entities that no prediction was mapped to.
    for entity in goldGraph.entities:
        if entity not in toEntitiesWithPredictions:
            stats["fn"] += 1
            extraByToken[goldGraph.entityHeadTokenByEntity[entity]] = [entity.get("type"),{"fill":"#79BAEC"}]

    # Interactions: compare each predicted interaction with the gold ones
    # between the gold entities its end points are mapped to.
    toInteractionsWithPredictions = set()
    for interactionFrom in sentenceGraph.interactions:
        iFromType = interactionFrom.get("type")
        if iFromType == "neg":
            continue
        stats["edges"] += 1
        e1 = sentenceGraph.entitiesById[interactionFrom.get("e1")]
        e2 = sentenceGraph.entitiesById[interactionFrom.get("e2")]
        e1Ids = [goldEntity.get("id") for goldEntity in entityMap[e1]]
        e2Ids = [goldEntity.get("id") for goldEntity in entityMap[e2]]
        t1 = tokenMap[sentenceGraph.entityHeadTokenByEntity[e1]]
        t2 = tokenMap[sentenceGraph.entityHeadTokenByEntity[e2]]
        found = False
        for interactionTo in goldGraph.interactions:
            if interactionTo.get("e1") in e1Ids and interactionTo.get("e2") in e2Ids:
                toInteractionsWithPredictions.add(interactionTo)
                iToType = interactionTo.get("type")
                exampleGraph.add_edge(t1, t2, element=interactionFrom)
                edge = self.getNXEdge(exampleGraph, t1, t2, interactionFrom)
                if t1 != t2:
                    if iToType == iFromType:
                        edge[2]["arcStyles"] = {"stroke":"green"}
                        edge[2]["labelStyles"] = {"fill":"green"}
                        stats["tp"] += 1
                    else:
                        edge[2]["arcStyles"] = {"stroke":"red"}
                        edge[2]["labelStyles"] = {"fill":"red"}
                        stats["fp"] += 1
                found = True
        if not found: # false positive prediction
            if t1 != t2:
                exampleGraph.add_edge(t1, t2, element=interactionFrom)
                edge = self.getNXEdge(exampleGraph, t1, t2, interactionFrom)
                edge[2]["arcStyles"] = {"stroke":"red"}
                edge[2]["labelStyles"] = {"fill":"red"}
                stats["fp"] += 1

    for interactionTo in goldGraph.interactions:
        if interactionTo not in toInteractionsWithPredictions: # false negative gold
            t1 = goldGraph.entityHeadTokenByEntity[goldGraph.entitiesById[interactionTo.get("e1")]]
            t2 = goldGraph.entityHeadTokenByEntity[goldGraph.entitiesById[interactionTo.get("e2")]]
            if t1 != t2:
                exampleGraph.add_edge(t1, t2, element=interactionTo)
                edge = self.getNXEdge(exampleGraph, t1, t2, interactionTo)
                edge[2]["arcStyles"] = {"stroke":"#79BAEC"}
                edge[2]["labelStyles"] = {"fill":"#79BAEC"}
                stats["fn"] += 1

    # Draw the comparison over the gold tokens and embed it in the page.
    builder.header("Classification",4)
    svgTokens = GraphToSVG.tokensToSVG(goldGraph.tokens,False,None,extraByToken)
    svgEdges = GraphToSVG.edgesToSVG(svgTokens, exampleGraph, "type", None)
    sentenceId = sentenceGraph.getSentenceId()
    svgName = sentenceId + "-" + str(sentenceIndex) + "_learned.svg"
    svgElement = GraphToSVG.writeSVG(svgTokens, svgEdges, self.outDir + "/svg/" + svgName)
    builder.svg("../svg/" + svgName,svgElement.attrib["width"],svgElement.attrib["height"],id="learned_graph")
    builder.lineBreak()
    return stats