def findHeadsSyntactic(corpus, parse, tokenization):
    """
    Assign a syntactic head token to every entity that does not have one yet.

    For each sentence the requested tokenization and parse are looked up under
    "sentenceanalyses". Token head scores are computed from the parse's
    "dependency" elements with getTokenHeadScores, and every entity without a
    "headOffset" attribute receives the token picked by getEntityHeadToken.
    The original documentation describes the head token as the token closest
    to the root of the dependency subtree spanned by the element's text.

    The entity elements are modified in place: "headOffset" is set to the head
    token's "charOffset", "headMethod" to "Syntax" and "headString" to the
    head token's "text".

    Sentences that lack the requested parse or tokenization are reported on
    stderr and skipped.

    @param corpus: element (or tree) whose "sentence" descendants are processed
    @param parse: value of the "parser" attribute identifying the parse to use
    @param tokenization: value of the "tokenizer" attribute identifying the
        tokenization to use
    @return: two-item list; the first item is the number of entities that were
        given a head, the second item is never updated here and stays 0
    """
    counts = [0, 0]
    # Element.getiterator() is deprecated in favour of iter(); fall back to it
    # only for element implementations that do not provide iter().
    iterate = getattr(corpus, "iter", None) or corpus.getiterator
    # Materialise the sentences first so the progress counter knows the total.
    sentences = list(iterate("sentence"))
    counter = ProgressCounter(len(sentences), "SYNTAX")
    for sentence in sentences:
        counter.update()
        tokElement = ETUtils.getElementByAttrib(
            sentence, "sentenceanalyses/tokenizations/tokenization",
            {"tokenizer": tokenization})
        parseElement = ETUtils.getElementByAttrib(
            sentence, "sentenceanalyses/parses/parse", {"parser": parse})
        if tokElement is None or parseElement is None:
            sys.stderr.write("Warning, sentence %s missing parse or tokenization\n"
                             % (sentence.get("id"),))
            # Without both elements no head can be computed; previously the
            # code fell through and failed calling findall() on None.
            continue
        tokens = tokElement.findall("token")
        tokenHeadScores = getTokenHeadScores(
            tokens, parseElement.findall("dependency"),
            sentenceId=sentence.get("id"))
        for entity in sentence.findall("entity"):
            # Entities that already carry a head offset are left untouched.
            if entity.get("headOffset") is None:
                headToken = getEntityHeadToken(entity, tokens, tokenHeadScores)
                # The ElementTree entity-element is modified by setting the headOffset attribute
                entity.set("headOffset", headToken.get("charOffset"))
                entity.set("headMethod", "Syntax")
                entity.set("headString", headToken.get("text"))
                counts[0] += 1
    return counts
def getSentences(corpusRoot, requireEntities=False, skipIds=(), skipParsed=True):
    """
    Generate the "sentence" elements found under corpusRoot, optionally filtered.

    A sentence is left out when any of the following holds:
      - its "id" attribute is in skipIds (a notice is written to stderr),
      - requireEntities is true and the sentence has no "entity" child,
      - skipParsed is true and ETUtils.getElementByAttrib finds a "parse"
        element with parser="McCC" for the sentence.

    @param corpusRoot: element whose "sentence" descendants are iterated
    @param requireEntities: when true, yield only sentences with an entity
    @param skipIds: container of sentence ids to skip; defaults to an empty
        tuple so no mutable object is shared between calls
    @param skipParsed: when true, skip sentences that already have a McCC parse
    @return: generator over the remaining sentence elements, in document order
    """
    # Element.getiterator() is deprecated in favour of iter(); fall back to it
    # only for element implementations that do not provide iter().
    iterate = getattr(corpusRoot, "iter", None) or corpusRoot.getiterator
    for sentence in iterate("sentence"):
        sentenceId = sentence.get("id")
        if sentenceId in skipIds:
            sys.stderr.write("Skipping sentence %s\n" % (sentenceId,))
            continue
        if requireEntities and sentence.find("entity") is None:
            continue
        if skipParsed and ETUtils.getElementByAttrib(
                sentence, "parse", {"parser": "McCC"}) is not None:
            continue
        yield sentence
def findHeadsSyntactic(corpus, parse, tokenization):
    """
    Assign a syntactic head token to every entity that does not have one yet.

    For each sentence the requested tokenization and parse are looked up under
    "sentenceanalyses". Token head scores are computed from the parse's
    "dependency" elements with getTokenHeadScores, and every entity without a
    "headOffset" attribute receives the token picked by getEntityHeadToken.
    The original documentation describes the head token as the token closest
    to the root of the dependency subtree spanned by the element's text.

    The entity elements are modified in place: "headOffset" is set to the head
    token's "charOffset", "headMethod" to "Syntax" and "headString" to the
    head token's "text".

    Sentences that lack the requested parse or tokenization are reported on
    stderr and skipped.

    @param corpus: element (or tree) whose "sentence" descendants are processed
    @param parse: value of the "parser" attribute identifying the parse to use
    @param tokenization: value of the "tokenizer" attribute identifying the
        tokenization to use
    @return: two-item list; the first item is the number of entities that were
        given a head, the second item is never updated here and stays 0
    """
    counts = [0, 0]
    # Element.getiterator() is deprecated in favour of iter(); fall back to it
    # only for element implementations that do not provide iter().
    iterate = getattr(corpus, "iter", None) or corpus.getiterator
    # Materialise the sentences first so the progress counter knows the total.
    sentences = list(iterate("sentence"))
    counter = ProgressCounter(len(sentences), "SYNTAX")
    for sentence in sentences:
        counter.update()
        tokElement = ETUtils.getElementByAttrib(
            sentence, "sentenceanalyses/tokenizations/tokenization",
            {"tokenizer": tokenization})
        parseElement = ETUtils.getElementByAttrib(
            sentence, "sentenceanalyses/parses/parse", {"parser": parse})
        if tokElement is None or parseElement is None:
            sys.stderr.write("Warning, sentence %s missing parse or tokenization\n"
                             % (sentence.get("id"),))
            # Without both elements no head can be computed; previously the
            # code fell through and failed calling findall() on None.
            continue
        tokens = tokElement.findall("token")
        tokenHeadScores = getTokenHeadScores(
            tokens, parseElement.findall("dependency"),
            sentenceId=sentence.get("id"))
        for entity in sentence.findall("entity"):
            # Entities that already carry a head offset are left untouched.
            if entity.get("headOffset") is None:
                headToken = getEntityHeadToken(entity, tokens, tokenHeadScores)
                # The ElementTree entity-element is modified by setting the headOffset attribute
                entity.set("headOffset", headToken.get("charOffset"))
                entity.set("headMethod", "Syntax")
                entity.set("headString", headToken.get("text"))
                counts[0] += 1
    return counts
def processCorpus(input, parserName): print >> sys.stderr, "Loading corpus file", input corpusRoot = ETUtils.ETFromObj(input).getroot() documents = corpusRoot.findall("document") counts = defaultdict(int) matchByType = defaultdict(lambda: [0, 0]) filteredMatchByType = defaultdict(lambda: [0, 0]) filter = set(["NP", "TOK-tIN", "WHADVP", "WHNP", "TOK-tWP$", "TOK-tPRP$", "NP-IN"]) # # fix spans # for document in documents: # for sentence in document.findall("sentence"): # sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset")) # for entity in sentence.findall("entity"): # altOffsetString = entity.get("altOffset") # if altOffsetString == None: # continue # #print altOffsetString # altOffsets = Range.charOffsetToTuples(altOffsetString) # assert len(altOffsets) == 1 # for i in range(len(altOffsets)): # altOffset = altOffsets[i] # altOffsets[i] = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0]) # entity.set("altOffset", Range.tuplesToCharOffset(altOffsets)) # counter = ProgressCounter(len(documents), "Documents") for document in documents: for sentence in document.findall("sentence"): entities = sentence.findall("entity") parse = ETUtils.getElementByAttrib(sentence.find("sentenceanalyses"), "parse", {"parser": parserName}) if parse == None: continue tokenization = ETUtils.getElementByAttrib( sentence.find("sentenceanalyses"), "tokenization", {"tokenizer": parse.get("tokenizer")} ) phrases, phraseDict = makePhrases(parse, tokenization, entities) phraseOffsets = phraseDict.keys() # phraseOffsets.sort() phraseNECounts = getNECounts(phrases, entities) for value in phraseDict.values(): counts["phrases"] += len(value) for phrase in value: matchByType[phrase.get("type")][0] += 1 if phrase.get("type") in filter: filteredMatchByType[phrase.get("type")][0] += 1 counts["phrases-filtered"] += 1 if phrase.get("type").find("NP") != -1: matchByType[phrase.get("type") + "_NE" + str(phraseNECounts[phrase])][0] += 1 counts["tokens"] += 
len(tokenization.findall("token")) corefType = {} for interaction in sentence.findall("interaction"): if interaction.get("type") == "Coref": corefType[interaction.get("e1")] = "Anaphora" corefType[interaction.get("e2")] = "Antecedent" for entity in entities: if entity.get("given") == "True": continue counts["entity"] += 1 print "entity", entity.get("id") print ETUtils.toStr(entity) matches = getMatchingPhrases(entity, phraseOffsets, phraseDict) count = 0 filteredCount = 0 for phrase in matches: cType = "UNKNOWN" if corefType.has_key(entity.get("id")): cType = corefType[entity.get("id")] print " match", count, ETUtils.toStr(phrase), "NE" + str( phraseNECounts[phrase] ), "ctype:" + cType, "ent:" + ETUtils.toStr(entity) count += 1 matchByType[phrase.get("type")][1] += 1 matchByType[phrase.get("type") + "_" + cType][1] += 1 matchByType[phrase.get("type") + "_" + cType + "_NE" + str(phraseNECounts[phrase])][1] += 1 if phrase.get("type") in filter: filteredCount += 1 filteredMatchByType[phrase.get("type")][1] += 1 # Matching if count == 0: print " NO MATCH", ETUtils.toStr(entity) counts["no-match"] += 1 else: counts["match"] += 1 # Multimatching if len(matches) > 1: bestMatch = selectBestMatch(entity, matches) print " MULTIMATCH(" + entity.get("charOffset") + "," + str( entity.get("altOffset") ) + ")", ", ".join( [x.get("type") + "_" + x.get("charOffset") for x in matches] ), "SEL(" + bestMatch.get( "type" ) + "_" + bestMatch.get( "charOffset" ) + ")" # Filtered matching if filteredCount == 0: counts["no-match-filtered"] += 1 else: counts["match-filtered"] += 1 print "Match" for key in sorted(matchByType.keys()): print " ", key, " ", matchByType[key] print "Filtered", filteredMatchByType print "Counts", counts
def processCorpus(input, parserName): print >> sys.stderr, "Loading corpus file", input corpusRoot = ETUtils.ETFromObj(input).getroot() documents = corpusRoot.findall("document") counts = defaultdict(int) matchByType = defaultdict(lambda: [0, 0]) filteredMatchByType = defaultdict(lambda: [0, 0]) filter = set( ["NP", "TOK-tIN", "WHADVP", "WHNP", "TOK-tWP$", "TOK-tPRP$", "NP-IN"]) # # fix spans # for document in documents: # for sentence in document.findall("sentence"): # sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset")) # for entity in sentence.findall("entity"): # altOffsetString = entity.get("altOffset") # if altOffsetString == None: # continue # #print altOffsetString # altOffsets = Range.charOffsetToTuples(altOffsetString) # assert len(altOffsets) == 1 # for i in range(len(altOffsets)): # altOffset = altOffsets[i] # altOffsets[i] = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0]) # entity.set("altOffset", Range.tuplesToCharOffset(altOffsets)) #counter = ProgressCounter(len(documents), "Documents") for document in documents: for sentence in document.findall("sentence"): entities = sentence.findall("entity") parse = ETUtils.getElementByAttrib( sentence.find("sentenceanalyses"), "parse", {"parser": parserName}) if parse == None: continue tokenization = ETUtils.getElementByAttrib( sentence.find("sentenceanalyses"), "tokenization", {"tokenizer": parse.get("tokenizer")}) phrases, phraseDict = makePhrases(parse, tokenization, entities) phraseOffsets = phraseDict.keys() #phraseOffsets.sort() phraseNECounts = getNECounts(phrases, entities) for value in phraseDict.values(): counts["phrases"] += len(value) for phrase in value: matchByType[phrase.get("type")][0] += 1 if phrase.get("type") in filter: filteredMatchByType[phrase.get("type")][0] += 1 counts["phrases-filtered"] += 1 if phrase.get("type").find("NP") != -1: matchByType[phrase.get("type") + "_NE" + str(phraseNECounts[phrase])][0] += 1 counts["tokens"] += 
len(tokenization.findall("token")) corefType = {} for interaction in sentence.findall("interaction"): if interaction.get("type") == "Coref": corefType[interaction.get("e1")] = "Anaphora" corefType[interaction.get("e2")] = "Antecedent" for entity in entities: if entity.get("given") == "True": continue counts["entity"] += 1 print "entity", entity.get("id") print ETUtils.toStr(entity) matches = getMatchingPhrases(entity, phraseOffsets, phraseDict) count = 0 filteredCount = 0 for phrase in matches: cType = "UNKNOWN" if corefType.has_key(entity.get("id")): cType = corefType[entity.get("id")] print " match", count, ETUtils.toStr(phrase), "NE" + str( phraseNECounts[phrase] ), "ctype:" + cType, "ent:" + ETUtils.toStr(entity) count += 1 matchByType[phrase.get("type")][1] += 1 matchByType[phrase.get("type") + "_" + cType][1] += 1 matchByType[phrase.get("type") + "_" + cType + "_NE" + str(phraseNECounts[phrase])][1] += 1 if phrase.get("type") in filter: filteredCount += 1 filteredMatchByType[phrase.get("type")][1] += 1 # Matching if count == 0: print " NO MATCH", ETUtils.toStr(entity) counts["no-match"] += 1 else: counts["match"] += 1 # Multimatching if len(matches) > 1: bestMatch = selectBestMatch(entity, matches) print " MULTIMATCH(" + entity.get( "charOffset" ) + "," + str(entity.get("altOffset")) + ")", ", ".join([ x.get("type") + "_" + x.get("charOffset") for x in matches ]), "SEL(" + bestMatch.get("type") + "_" + bestMatch.get( "charOffset") + ")" # Filtered matching if filteredCount == 0: counts["no-match-filtered"] += 1 else: counts["match-filtered"] += 1 print "Match" for key in sorted(matchByType.keys()): print " ", key, " ", matchByType[key] print "Filtered", filteredMatchByType print "Counts", counts