def buildExamples(self, sentenceGraph):
    # Statistics-only pass: no classification examples are built here; the
    # method tallies entity counts per head token and returns an empty list.
    examples = []
    exampleIndex = 0
    # Distribution of sentence lengths (in tokens)
    if not self.tokenCounts.has_key(len(sentenceGraph.tokens)):
        self.tokenCounts[len(sentenceGraph.tokens)] = 0
    self.tokenCounts[len(sentenceGraph.tokens)] += 1
    for token in sentenceGraph.tokens:
        # Count the entities of each type that have this token as their head
        entityCounts = {}
        for entity in sentenceGraph.tokenIsEntityHead[token]:
            t = entity.get("type")
            if not entityCounts.has_key(t):
                entityCounts[t] = 0
            entityCounts[t] += 1
        for k, v in entityCounts.iteritems():
            if not self.counts.has_key(v):
                self.counts[v] = 0
            self.counts[v] += 1
            # per type
            if not self.countsPerType.has_key(k):
                self.countsPerType[k] = {}
            if not self.countsPerType[k].has_key(v):
                self.countsPerType[k][v] = 0
            self.countsPerType[k][v] += 1
        # Distribution of entities per head token, regardless of type
        numEntities = len(sentenceGraph.tokenIsEntityHead[token])
        if not self.untypedCounts.has_key(numEntities):
            self.untypedCounts[numEntities] = 0
        self.untypedCounts[numEntities] += 1
        #count = len(sentenceGraph.tokenIsEntityHead[token])
        #if not self.counts.has_key(count): self.counts[count] = 0
        #self.counts[count] += 1
        # Flag tokens that head an unusually large number of same-type entities
        if max(entityCounts.values() + [0]) >= 8:
            print "======================================"
            print "Entity", token.get("id")
            for e in sentenceGraph.tokenIsEntityHead[token]:
                print ETUtils.toStr(e)
            print "======================================"
    return []
#        for entity in sentenceGraph.entities:
#            if entity
#
#        #undirected = sentenceGraph.getUndirectedDependencyGraph()
#        undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
#        ##undirected = sentenceGraph.dependencyGraph.to_undirected()
#        ###undirected = NX10.MultiGraph(sentenceGraph.dependencyGraph) This didn't work
#        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
#
#        # Generate examples based on interactions between entities or interactions between tokens
#        if "entities" in self.styles:
#            loopRange = len(sentenceGraph.entities)
#        else:
#            loopRange = len(sentenceGraph.tokens)
#        for i in range(loopRange-1):
#            for j in range(i+1,loopRange):
#                eI = None
#                eJ = None
#                if "entities" in self.styles:
#                    eI = sentenceGraph.entities[i]
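# A minimal standalone sketch (not part of the original class) of the same
# per-type tally written with collections.defaultdict, which avoids the
# repeated has_key initialization used above. "entities" stands in for
# sentenceGraph.tokenIsEntityHead[token]; its elements are assumed to
# support .get("type") like the ElementTree entities used in this module.
from collections import defaultdict

def countEntityTypes(entities):
    entityCounts = defaultdict(int)  # entity type string -> occurrence count
    for entity in entities:
        entityCounts[entity.get("type")] += 1
    return dict(entityCounts)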
def loadEventXML(path, verbose=False):
    xml = ETUtils.ETFromObj(path)
    # Map sentence text -> list of (begin, end, eventType, clueText) tuples
    sentDict = {}
    for sentence in xml.getiterator("sentence"):
        sentenceText = getText(sentence).strip()
        if not sentDict.has_key(sentenceText):
            sentDict[sentenceText] = []
    for event in xml.getiterator("event"):
        sentenceText = getText(event).strip()
        if not sentDict.has_key(sentenceText):
            sentDict[sentenceText] = []
        events = sentDict[sentenceText]

        clue = event.find("clue")
        clueTuple = getClue(clue)
        eventType = event.find("type").get("class")
        if eventType == "Protein_amino_acid_phosphorylation":
            eventType = "Phosphorylation"

        if type(clueTuple) == types.StringType:
            # getClue returned plain text, i.e. the clue has no clueType span
            if verbose:
                print "Event", eventType, "clue with no clueType:", ETUtils.toStr(clue)
        else:
            # Verify that the clue span actually matches the sentence text
            assert sentenceText[clueTuple[1]:clueTuple[2]+1] == clueTuple[0], (sentenceText, sentenceText[clueTuple[1]:clueTuple[2]+1], clueTuple)
            eventTuple = (clueTuple[1], clueTuple[2], eventType, clueTuple[0])
            if eventTuple not in events:
                events.append(eventTuple)
    return sentDict
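# A usage sketch under assumptions: "events.xml" is a hypothetical input
# file, and loadEventXML is importable together with the ETUtils, getText
# and getClue helpers it depends on. Each value in the returned dict is a
# list of (begin, end, eventType, clueText) tuples keyed by sentence text.
if __name__ == "__main__":
    sentDict = loadEventXML("events.xml", verbose=True)
    for sentenceText in sorted(sentDict.keys()):
        events = sentDict[sentenceText]
        print len(events), "event(s) in:", sentenceText[:60]
        for begin, end, eventType, clueText in events:
            print "   ", eventType, repr(clueText), (begin, end)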
def processCorpus(input, parserName):
    # Assumes module-level imports: sys, collections.defaultdict and ETUtils
    # (plus Range for the commented-out span fixing), and the makePhrases,
    # getNECounts, getMatchingPhrases, selectBestMatch and phraseToStr
    # helpers defined elsewhere in this module.
    print >> sys.stderr, "Loading corpus file", input
    corpusRoot = ETUtils.ETFromObj(input).getroot()
    documents = corpusRoot.findall("document")

    counts = defaultdict(int)
    matchByType = defaultdict(lambda: [0, 0])
    filteredMatchByType = defaultdict(lambda: [0, 0])
    # Note: "filter" shadows the builtin of the same name within this function
    #filter = set(["NP", "TOK-tIN", "WHADVP", "WHNP", "TOK-tWP$", "TOK-tPRP$", "NP-IN"])
    filter = set(["ADJP", "DT(-)-NP-IN", "DT(-)-NP", "NP", "NP-IN", "PP", "S", "S1", "TOK-tJJ", "TOK-tNN", "TOK-tNNP", "TOK-tNNS", "VP", "VP-IN"])

#    # fix spans
#    for document in documents:
#        for sentence in document.findall("sentence"):
#            sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
#            for entity in sentence.findall("entity"):
#                altOffsetString = entity.get("altOffset")
#                if altOffsetString == None:
#                    continue
#                #print altOffsetString
#                altOffsets = Range.charOffsetToTuples(altOffsetString)
#                assert len(altOffsets) == 1
#                for i in range(len(altOffsets)):
#                    altOffset = altOffsets[i]
#                    altOffsets[i] = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])
#                entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))

    #counter = ProgressCounter(len(documents), "Documents")
    for document in documents:
        for sentence in document.findall("sentence"):
            entities = sentence.findall("entity")
            parse = ETUtils.getElementByAttrib(sentence.find("sentenceanalyses"), "parse", {"parser": parserName})
            if parse == None:
                continue
            tokenization = ETUtils.getElementByAttrib(sentence.find("sentenceanalyses"), "tokenization", {"tokenizer": parse.get("tokenizer")})
            phrases, phraseDict = makePhrases(parse, tokenization, entities)
            phraseOffsets = phraseDict.keys()
            #phraseOffsets.sort()
            phraseNECounts = getNECounts(phrases, entities)

            # Tally phrase and token statistics for this sentence
            for value in phraseDict.values():
                counts["phrases"] += len(value)
                for phrase in value:
                    matchByType[phrase.get("type")][0] += 1
                    if phrase.get("type") in filter:
                        filteredMatchByType[phrase.get("type")][0] += 1
                        counts["phrases-filtered"] += 1
                    if phrase.get("type").find("NP") != -1:
                        matchByType[phrase.get("type")+"_NE"+str(phraseNECounts[phrase])][0] += 1
            counts["tokens"] += len(tokenization.findall("token"))

            # Record coreference roles for entities in Coref interactions
            corefType = {}
            for interaction in sentence.findall("interaction"):
                if interaction.get("type") == "Coref":
                    corefType[interaction.get("e1")] = "Anaphora"
                    corefType[interaction.get("e2")] = "Antecedent"

            for entity in entities:
                if entity.get("isName") == "True":
                    continue
                counts["entity"] += 1
                print "entity", entity.get("id")
                print ETUtils.toStr(entity)
                matches = getMatchingPhrases(entity, phraseOffsets, phraseDict)
                count = 0
                filteredCount = 0
                for phrase in matches:
                    cType = "UNKNOWN"
                    if corefType.has_key(entity.get("id")):
                        cType = corefType[entity.get("id")]
                    print " match", count, ETUtils.toStr(phrase), "NE" + str(phraseNECounts[phrase]), "ctype:" + cType, "ent:" + ETUtils.toStr(entity)
                    count += 1
                    matchByType[phrase.get("type")][1] += 1
                    matchByType[phrase.get("type")+"_"+cType][1] += 1
                    matchByType[phrase.get("type")+"_"+cType+"_NE"+str(phraseNECounts[phrase])][1] += 1
                    if phrase.get("type") in filter:
                        filteredCount += 1
                        filteredMatchByType[phrase.get("type")][1] += 1
                # Matching
                if count == 0:
                    print " NO MATCH", ETUtils.toStr(entity)
                    for phrase in phrases:
                        print " " + phraseToStr(phrase)
                    counts["no-match"] += 1
                else:
                    counts["match"] += 1
                # Multimatching
                if len(matches) > 1:
                    bestMatch = selectBestMatch(entity, matches)
                    print " MULTIMATCH(" + entity.get("charOffset") + "," + str(entity.get("altOffset")) + ")", ", ".join([x.get("type") + "_" + x.get("charOffset") for x in matches]), "SEL(" + bestMatch.get("type") + "_" + bestMatch.get("charOffset") + ")"
                # Filtered matching
                if filteredCount == 0:
                    counts["no-match-filtered"] += 1
                else:
                    counts["match-filtered"] += 1

    print "Match"
    for key in sorted(matchByType.keys()):
        print " ", key, " ", matchByType[key]
    print "Filtered", filteredMatchByType
    print "Counts", counts
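# A hypothetical command-line entry point (not in the original source),
# assuming this module defines processCorpus and its helpers; the script
# name shown in the usage string is illustrative only:
if __name__ == "__main__":
    import sys
    if len(sys.argv) != 3:
        print >> sys.stderr, "Usage: python matchPhrases.py <corpus.xml> <parserName>"
        sys.exit(1)
    processCorpus(sys.argv[1], sys.argv[2])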