def loadEventXML(path, verbose=False): xml = ETUtils.ETFromObj(path) sentDict = {} for sentence in xml.getiterator("sentence"): sentenceText = getText(sentence).strip() if not sentDict.has_key(sentenceText): sentDict[sentenceText] = [] for event in xml.getiterator("event"): sentenceText = getText(event).strip() if not sentDict.has_key(sentenceText): sentDict[sentenceText] = [] events = sentDict[sentenceText] clue = event.find("clue") clueTuple = getClue(clue) eventType = event.find("type").get("class") if eventType == "Protein_amino_acid_phosphorylation": eventType = "Phosphorylation" if type(clueTuple) == types.StringType: if verbose: print "Event", eventType, "clue with no clueType:", ETUtils.toStr(clue) else: assert sentenceText[clueTuple[1]:clueTuple[2]+1] == clueTuple[0], (sentenceText, sentenceText[clueTuple[1]:clueTuple[2]+1], clueTuple) event = (clueTuple[1], clueTuple[2], eventType, clueTuple[0]) if event not in events: events.append(event) return sentDict
# NOTE(review): this definition is a byte-for-byte duplicate of the
# loadEventXML defined immediately above; at import time this later copy
# silently replaces the earlier one. One of the two should be deleted.
def loadEventXML(path, verbose=False):
    """Map stripped sentence text to (clueStart, clueEnd, eventType, clueText)
    event tuples read from the event XML at *path* (end offset inclusive)."""
    xml = ETUtils.ETFromObj(path)
    sentDict = {}
    # Pre-register every sentence so event-less sentences still get an entry.
    for sentence in xml.getiterator("sentence"):
        sentenceText = getText(sentence).strip()
        if not sentDict.has_key(sentenceText):
            sentDict[sentenceText] = []
    for event in xml.getiterator("event"):
        sentenceText = getText(event).strip()
        if not sentDict.has_key(sentenceText):
            sentDict[sentenceText] = []
        events = sentDict[sentenceText]
        clue = event.find("clue")
        clueTuple = getClue(clue)
        eventType = event.find("type").get("class")
        # Normalize the corpus-specific subtype to the shared-task name.
        if eventType == "Protein_amino_acid_phosphorylation":
            eventType = "Phosphorylation"
        if type(clueTuple) == types.StringType:
            # getClue returned plain text: the clue has no typed span to align.
            if verbose:
                print "Event", eventType, "clue with no clueType:", ETUtils.toStr(clue)
        else:
            # The clue span must reproduce the sentence text exactly.
            assert sentenceText[clueTuple[1]:clueTuple[2]+1] == clueTuple[0], (sentenceText, sentenceText[clueTuple[1]:clueTuple[2]+1], clueTuple)
            # NOTE(review): rebinds the loop variable "event" — harmless here
            # (the for-loop reassigns it next iteration) but confusing.
            event = (clueTuple[1], clueTuple[2], eventType, clueTuple[0])
            if event not in events:
                events.append(event)
    return sentDict
def processCorpus(input, parserName): print >> sys.stderr, "Loading corpus file", input corpusRoot = ETUtils.ETFromObj(input).getroot() documents = corpusRoot.findall("document") counts = defaultdict(int) matchByType = defaultdict(lambda: [0, 0]) filteredMatchByType = defaultdict(lambda: [0, 0]) filter = set(["NP", "TOK-tIN", "WHADVP", "WHNP", "TOK-tWP$", "TOK-tPRP$", "NP-IN"]) # # fix spans # for document in documents: # for sentence in document.findall("sentence"): # sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset")) # for entity in sentence.findall("entity"): # altOffsetString = entity.get("altOffset") # if altOffsetString == None: # continue # #print altOffsetString # altOffsets = Range.charOffsetToTuples(altOffsetString) # assert len(altOffsets) == 1 # for i in range(len(altOffsets)): # altOffset = altOffsets[i] # altOffsets[i] = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0]) # entity.set("altOffset", Range.tuplesToCharOffset(altOffsets)) # counter = ProgressCounter(len(documents), "Documents") for document in documents: for sentence in document.findall("sentence"): entities = sentence.findall("entity") parse = ETUtils.getElementByAttrib(sentence.find("sentenceanalyses"), "parse", {"parser": parserName}) if parse == None: continue tokenization = ETUtils.getElementByAttrib( sentence.find("sentenceanalyses"), "tokenization", {"tokenizer": parse.get("tokenizer")} ) phrases, phraseDict = makePhrases(parse, tokenization, entities) phraseOffsets = phraseDict.keys() # phraseOffsets.sort() phraseNECounts = getNECounts(phrases, entities) for value in phraseDict.values(): counts["phrases"] += len(value) for phrase in value: matchByType[phrase.get("type")][0] += 1 if phrase.get("type") in filter: filteredMatchByType[phrase.get("type")][0] += 1 counts["phrases-filtered"] += 1 if phrase.get("type").find("NP") != -1: matchByType[phrase.get("type") + "_NE" + str(phraseNECounts[phrase])][0] += 1 counts["tokens"] += 
len(tokenization.findall("token")) corefType = {} for interaction in sentence.findall("interaction"): if interaction.get("type") == "Coref": corefType[interaction.get("e1")] = "Anaphora" corefType[interaction.get("e2")] = "Antecedent" for entity in entities: if entity.get("given") == "True": continue counts["entity"] += 1 print "entity", entity.get("id") print ETUtils.toStr(entity) matches = getMatchingPhrases(entity, phraseOffsets, phraseDict) count = 0 filteredCount = 0 for phrase in matches: cType = "UNKNOWN" if corefType.has_key(entity.get("id")): cType = corefType[entity.get("id")] print " match", count, ETUtils.toStr(phrase), "NE" + str( phraseNECounts[phrase] ), "ctype:" + cType, "ent:" + ETUtils.toStr(entity) count += 1 matchByType[phrase.get("type")][1] += 1 matchByType[phrase.get("type") + "_" + cType][1] += 1 matchByType[phrase.get("type") + "_" + cType + "_NE" + str(phraseNECounts[phrase])][1] += 1 if phrase.get("type") in filter: filteredCount += 1 filteredMatchByType[phrase.get("type")][1] += 1 # Matching if count == 0: print " NO MATCH", ETUtils.toStr(entity) counts["no-match"] += 1 else: counts["match"] += 1 # Multimatching if len(matches) > 1: bestMatch = selectBestMatch(entity, matches) print " MULTIMATCH(" + entity.get("charOffset") + "," + str( entity.get("altOffset") ) + ")", ", ".join( [x.get("type") + "_" + x.get("charOffset") for x in matches] ), "SEL(" + bestMatch.get( "type" ) + "_" + bestMatch.get( "charOffset" ) + ")" # Filtered matching if filteredCount == 0: counts["no-match-filtered"] += 1 else: counts["match-filtered"] += 1 print "Match" for key in sorted(matchByType.keys()): print " ", key, " ", matchByType[key] print "Filtered", filteredMatchByType print "Counts", counts
# NOTE(review): this definition is a duplicate of the processCorpus defined
# above; at import time this later copy silently replaces the earlier one.
# One of the two should be deleted.
def processCorpus(input, parserName):
    """Print statistics on how annotated entities align with syntactic phrases
    of the parse named *parserName*; the corpus itself is not modified."""
    print >> sys.stderr, "Loading corpus file", input
    corpusRoot = ETUtils.ETFromObj(input).getroot()
    documents = corpusRoot.findall("document")
    counts = defaultdict(int)
    # value = [phrase occurrence count, entity match count] per phrase type key
    matchByType = defaultdict(lambda: [0, 0])
    filteredMatchByType = defaultdict(lambda: [0, 0])
    # NOTE(review): local name shadows the builtin "filter".
    filter = set(["NP", "TOK-tIN", "WHADVP", "WHNP", "TOK-tWP$", "TOK-tPRP$", "NP-IN"])
    # # fix spans
    # for document in documents:
    #     for sentence in document.findall("sentence"):
    #         sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
    #         for entity in sentence.findall("entity"):
    #             altOffsetString = entity.get("altOffset")
    #             if altOffsetString == None:
    #                 continue
    #             #print altOffsetString
    #             altOffsets = Range.charOffsetToTuples(altOffsetString)
    #             assert len(altOffsets) == 1
    #             for i in range(len(altOffsets)):
    #                 altOffset = altOffsets[i]
    #                 altOffsets[i] = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])
    #             entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))
    #counter = ProgressCounter(len(documents), "Documents")
    for document in documents:
        for sentence in document.findall("sentence"):
            entities = sentence.findall("entity")
            parse = ETUtils.getElementByAttrib(sentence.find("sentenceanalyses"), "parse", {"parser": parserName})
            if parse == None:
                continue
            tokenization = ETUtils.getElementByAttrib(sentence.find("sentenceanalyses"), "tokenization", {"tokenizer": parse.get("tokenizer")})
            phrases, phraseDict = makePhrases(parse, tokenization, entities)
            phraseOffsets = phraseDict.keys()
            #phraseOffsets.sort()
            phraseNECounts = getNECounts(phrases, entities)
            # Count every candidate phrase, per type.
            for value in phraseDict.values():
                counts["phrases"] += len(value)
                for phrase in value:
                    matchByType[phrase.get("type")][0] += 1
                    if phrase.get("type") in filter:
                        filteredMatchByType[phrase.get("type")][0] += 1
                        counts["phrases-filtered"] += 1
                    if phrase.get("type").find("NP") != -1:
                        matchByType[phrase.get("type") + "_NE" + str(phraseNECounts[phrase])][0] += 1
            counts["tokens"] += len(tokenization.findall("token"))
            # Coreference role per entity id, derived from Coref interactions.
            corefType = {}
            for interaction in sentence.findall("interaction"):
                if interaction.get("type") == "Coref":
                    corefType[interaction.get("e1")] = "Anaphora"
                    corefType[interaction.get("e2")] = "Antecedent"
            for entity in entities:
                # "given" entities are input annotation, not predictions to score.
                if entity.get("given") == "True":
                    continue
                counts["entity"] += 1
                print "entity", entity.get("id")
                print ETUtils.toStr(entity)
                matches = getMatchingPhrases(entity, phraseOffsets, phraseDict)
                count = 0
                filteredCount = 0
                for phrase in matches:
                    cType = "UNKNOWN"
                    if corefType.has_key(entity.get("id")):
                        cType = corefType[entity.get("id")]
                    print " match", count, ETUtils.toStr(phrase), "NE" + str(phraseNECounts[phrase]), "ctype:" + cType, "ent:" + ETUtils.toStr(entity)
                    count += 1
                    matchByType[phrase.get("type")][1] += 1
                    matchByType[phrase.get("type") + "_" + cType][1] += 1
                    matchByType[phrase.get("type") + "_" + cType + "_NE" + str(phraseNECounts[phrase])][1] += 1
                    if phrase.get("type") in filter:
                        filteredCount += 1
                        filteredMatchByType[phrase.get("type")][1] += 1
                # Matching
                if count == 0:
                    print " NO MATCH", ETUtils.toStr(entity)
                    counts["no-match"] += 1
                else:
                    counts["match"] += 1
                # Multimatching: several phrases matched; report the chosen one.
                if len(matches) > 1:
                    bestMatch = selectBestMatch(entity, matches)
                    print " MULTIMATCH(" + entity.get("charOffset") + "," + str(entity.get("altOffset")) + ")", ", ".join([x.get("type") + "_" + x.get("charOffset") for x in matches]), "SEL(" + bestMatch.get("type") + "_" + bestMatch.get("charOffset") + ")"
                # Filtered matching
                if filteredCount == 0:
                    counts["no-match-filtered"] += 1
                else:
                    counts["match-filtered"] += 1
    print "Match"
    for key in sorted(matchByType.keys()):
        print " ", key, " ", matchByType[key]
    print "Filtered", filteredMatchByType
    print "Counts", counts
def run(input, output=None, elementName="entity", processElement="document", splitNewlines=False, debug=False, pathnerPath=None, trovePath=None): print >> sys.stderr, "Loading corpus", input corpusTree = ETUtils.ETFromObj(input) print >> sys.stderr, "Corpus file loaded" corpusRoot = corpusTree.getroot() # Write text to input file workdir = tempfile.mkdtemp() if debug: print >> sys.stderr, "PathNER work directory at", workdir infilePath = os.path.join(workdir, "pathner-in.txt") infile = codecs.open(infilePath, "wt", "utf-8") outfilePath = os.path.join(workdir, "pathner-out.txt") idCount = 0 # Put sentences in dictionary sDict = {} sentenceHasEntities = {} sCount = 0 for sentence in corpusRoot.getiterator(processElement): #infile.write("U" + str(idCount) + " " + sentence.get("text").replace("\n", " ").replace("\n", " ") + "\n") infile.write(sentence.get("text").replace("\n", " ").replace("\n", " ") + "\n") idCount += 1 sDict["U" + str(sCount)] = sentence sentenceHasEntities["U" + str(sCount)] = False sCount += 1 infile.close() # Define classpath for java if pathnerPath == None: pathnerPath = Settings.PATHNER_DIR libPath = "/lib/" if debug: print >> sys.stderr, "Directory of PathNER:", pathnerPath pathnerJarPath = pathnerPath + "/PathNER.jar" assert os.path.exists(pathnerJarPath), pathnerPath classPath = pathnerPath + "/bin" classPath += ":" + pathnerPath + libPath + "*" # Run parser print >> sys.stderr, "Running PathNER", pathnerJarPath cwd = os.getcwd() os.chdir(pathnerPath) args = Settings.JAVA.split() + ["-jar", pathnerJarPath, "--test", infilePath, "--output", outfilePath] print >> sys.stderr, "PathNER command:", " ".join(args) startTime = time.time() exitCode = subprocess.call(args) assert exitCode == 0, exitCode print >> sys.stderr, "PathNER time:", str(datetime.timedelta(seconds=time.time()-startTime)) os.chdir(cwd) sentencesWithEntities = 0 totalEntities = 0 nonSplitCount = 0 splitEventCount = 0 pathnerEntityCount = 0 removedEntityCount = 0 #Will use a simple 
method here: read the PathNER results and then do the matching in the sentences # Read PathNER results print >> sys.stderr, "Inserting entities" sentenceEntityCount = {} #mentionfile = codecs.open(os.path.join(workdir, "file_test_result.txt"), "rt", "utf-8") #outfilePath = pathnerPath + "/" + outfilePath print >>sys.stderr, 'Getting PathNER results from', outfilePath if os.path.isfile(outfilePath): #pathway mentions detected mentionfile = codecs.open(outfilePath, "rt", "utf-8") menDict = {} menSet = set() for line in mentionfile: #bannerId, offsets, word = line.strip().split("|", 2) pathNerTag, mention, pathNerId, confidence = line.strip().split("\t") menDict[mention] = pathNerId menSet.add(mention) mentionfile.close() print menSet #count for pathway entities epCount = 0 for sentence in corpusRoot.getiterator(processElement): #infile.write("U" + str(idCount) + " " + sentence.get("text").replace("\n", " ").replace("\n", " ") + "\n") sentText = sentence.get("text").replace("\n", " ").replace("\n", " ") + "\n" startOffsets = [] endOffsets = [] bannerEntities = sentence.findall("entity") bannerEntityCount = 0 for bannerEntity in bannerEntities: source = bannerEntity.get('source') text = bannerEntity.get('text') if not source == 'BANNER': print source, text bannerEntityCount += 1 startOffset = 0 endOffset = 0 bannerEntity2removed = set() for mention in menSet: starts = [match.start() for match in re.finditer(re.escape(mention), sentText)] #print 'Finding PathNER mention:', mention, starts for startOffset in starts: endOffset = startOffset + len(mention) if startOffset < 0: continue entities = makeEntityElements(int(startOffset), int(endOffset), sentence.get("text"), splitNewlines, elementName) for ent in entities: #Add processing for entities that are overlapped with the PathNER result entOffsets = ent.get("charOffset").split('-') entStart = int(entOffsets[0]) entEnd = int(entOffsets[1]) for bannerEntity in bannerEntities: bannerOffsets = 
bannerEntity.get('charOffset').split('-') bannerStart = int(bannerOffsets[0]) bannerEnd = int(bannerOffsets[1]) if debug: print 'PathNER entity:', entStart, entEnd, 'Banner entity:', bannerStart, bannerEnd #Are offsets overlapped or not? if entEnd <= bannerStart or bannerEnd <= entStart: #not overlapped continue else:#overlapped, show remove the banner entity bannerEntity2removed.add(bannerEntity) bannerEntityCount += 1 ent.set("id", sentence.get("id") + ".e" + str(bannerEntityCount)) epCount += 1 sentence.append(ent) pathnerEntityCount += 1 if debug: print 'Adding PathNER resutl:', mention print ETUtils.toStr(sentence) #Now really to delete the overlapped BANNER entities for bEntity in bannerEntity2removed: removedEntityCount += 1 sentence.remove(bEntity) if debug: print 'Removing entity ', bannerEntity.get('text'), bannerEntity.get('id') print ETUtils.toStr(sentence) print >> sys.stderr, "PathNER found", pathnerEntityCount, "entities and remove ", removedEntityCount, " overlapping BANNER entities. " print >> sys.stderr, "(" + str(sCount) + " sentences processed)" print >> sys.stderr, "New", elementName + "-elements:", totalEntities, "(Split", splitEventCount, "PathNER entities with newlines)" # Remove work directory if not debug: shutil.rmtree(workdir) else: print >> sys.stderr, "PathNER working directory for debugging at", workdir if output != None: print >> sys.stderr, "Writing output to", output ETUtils.write(corpusRoot, output) return corpusTree