Example #1
import sys
# ETUtils, ProgressCounter and the helper functions used below are project-internal modules assumed to be in scope.

def findHeadsSyntactic(corpus, parse, tokenization):
    """
    Determine the head token for each named entity or trigger. The head token is the token
    closest to the root of the dependency parse subtree spanned by the text of the element.

    @param corpus: corpus root element
    @type corpus: cElementTree.Element
    @param parse: name of the parse to use
    @type parse: str
    @param tokenization: name of the tokenization to use
    @type tokenization: str
    """
    counts = [0, 0]
    sentences = [x for x in corpus.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "SYNTAX")
    for sentence in sentences:
        counter.update()
        tokElement = ETUtils.getElementByAttrib(sentence, "sentenceanalyses/tokenizations/tokenization", {"tokenizer":tokenization})
        parseElement = ETUtils.getElementByAttrib(sentence, "sentenceanalyses/parses/parse", {"parser":parse})
        if tokElement is None or parseElement is None:
            print >> sys.stderr, "Warning, sentence", sentence.get("id"), "missing parse or tokenization"
            continue
        tokens = tokElement.findall("token")
        tokenHeadScores = getTokenHeadScores(tokens, parseElement.findall("dependency"), sentenceId=sentence.get("id"))
        for entity in sentence.findall("entity"):
            if entity.get("headOffset") is None:
                headToken = getEntityHeadToken(entity, tokens, tokenHeadScores)
                # The ElementTree entity-element is modified by setting the headOffset attribute
                entity.set("headOffset", headToken.get("charOffset"))
                entity.set("headMethod", "Syntax")
                entity.set("headString", headToken.get("text"))
                counts[0] += 1
    return counts
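
The helpers getTokenHeadScores and getEntityHeadToken are not part of this listing. Below is a minimal sketch of what they could look like, based only on the docstring's description of the algorithm (the head is the token closest to the root of the dependency subtree covering the entity). The t1/t2 attribute names, the scoring scheme, and the end-exclusive offset convention are all assumptions for illustration, not the project's actual implementation.

# Hypothetical sketches; the real helpers may differ.
def getTokenHeadScores(tokens, dependencies, sentenceId=None):
    # Every token starts at score 0; each governor is lifted above its dependents.
    scores = dict([(token, 0) for token in tokens])
    tokenById = dict([(token.get("id"), token) for token in tokens])
    for iteration in range(len(tokens)):  # cap the passes so cyclic parses still terminate
        changed = False
        for dep in dependencies:
            governor = tokenById.get(dep.get("t1"))
            dependent = tokenById.get(dep.get("t2"))
            if governor is None or dependent is None:
                continue
            if scores[governor] <= scores[dependent]:
                scores[governor] = scores[dependent] + 1
                changed = True
        if not changed:
            break
    return scores

def getEntityHeadToken(entity, tokens, tokenHeadScores):
    # The head is the highest-scoring token whose span overlaps one of the
    # entity's spans; returns None if no token overlaps.
    entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
    head = None
    for token in tokens:
        tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
        overlapping = [x for x in entityOffsets if x[0] < tokenOffset[1] and tokenOffset[0] < x[1]]
        if overlapping and (head is None or tokenHeadScores[token] > tokenHeadScores[head]):
            head = token
    return head
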
def getSentences(corpusRoot, requireEntities=False, skipIds=(), skipParsed=True):
    for sentence in corpusRoot.getiterator("sentence"):
        if sentence.get("id") in skipIds:
            print >> sys.stderr, "Skipping sentence", sentence.get("id")
            continue
        if requireEntities:
            if sentence.find("entity") is None:
                continue
        if skipParsed:
            if ETUtils.getElementByAttrib(sentence, "parse", {"parser":"McCC"}) is not None:
                continue
        yield sentence
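
A brief usage sketch for getSentences: load a corpus and iterate over the sentences that still lack a McCC parse. The file name is a placeholder; ETUtils.ETFromObj is the loader used elsewhere in this listing.

# Hypothetical usage; "corpus.xml" stands in for a real Interaction XML file.
corpusRoot = ETUtils.ETFromObj("corpus.xml").getroot()
for sentence in getSentences(corpusRoot, requireEntities=True, skipParsed=True):
    print >> sys.stderr, "Needs parsing:", sentence.get("id")
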
Example #4
import sys
from collections import defaultdict

def processCorpus(input, parserName):
    print >> sys.stderr, "Loading corpus file", input
    corpusRoot = ETUtils.ETFromObj(input).getroot()
    documents = corpusRoot.findall("document")

    counts = defaultdict(int)
    matchByType = defaultdict(lambda: [0, 0])
    filteredMatchByType = defaultdict(lambda: [0, 0])
    # Phrase types counted in the filtered statistics
    phraseFilter = set(["NP", "TOK-tIN", "WHADVP", "WHNP", "TOK-tWP$", "TOK-tPRP$", "NP-IN"])

    #    # fix spans
    #    for document in documents:
    #        for sentence in document.findall("sentence"):
    #            sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
    #            for entity in sentence.findall("entity"):
    #                altOffsetString = entity.get("altOffset")
    #                if altOffsetString == None:
    #                    continue
    #                #print altOffsetString
    #                altOffsets = Range.charOffsetToTuples(altOffsetString)
    #                assert len(altOffsets) == 1
    #                for i in range(len(altOffsets)):
    #                    altOffset = altOffsets[i]
    #                    altOffsets[i] = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])
    #                entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))
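    #                # e.g. with sentOffset == (100, 150), an altOffset of (120, 130) becomes (20, 30)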

    # counter = ProgressCounter(len(documents), "Documents")
    for document in documents:
        for sentence in document.findall("sentence"):
            entities = sentence.findall("entity")
            parse = ETUtils.getElementByAttrib(sentence.find("sentenceanalyses"), "parse", {"parser": parserName})
            if parse is None:
                continue
            tokenization = ETUtils.getElementByAttrib(
                sentence.find("sentenceanalyses"), "tokenization", {"tokenizer": parse.get("tokenizer")}
            )
            phrases, phraseDict = makePhrases(parse, tokenization, entities)
            phraseOffsets = phraseDict.keys()
            # phraseOffsets.sort()
            phraseNECounts = getNECounts(phrases, entities)

            for value in phraseDict.values():
                counts["phrases"] += len(value)
                for phrase in value:
                    matchByType[phrase.get("type")][0] += 1
                    if phrase.get("type") in phraseFilter:
                        filteredMatchByType[phrase.get("type")][0] += 1
                        counts["phrases-filtered"] += 1
                    if "NP" in phrase.get("type"):
                        matchByType[phrase.get("type") + "_NE" + str(phraseNECounts[phrase])][0] += 1
            counts["tokens"] += len(tokenization.findall("token"))

            corefType = {}
            for interaction in sentence.findall("interaction"):
                if interaction.get("type") == "Coref":
                    corefType[interaction.get("e1")] = "Anaphora"
                    corefType[interaction.get("e2")] = "Antecedent"

            for entity in entities:
                if entity.get("given") == "True":
                    continue
                counts["entity"] += 1
                print "entity", entity.get("id")
                print ETUtils.toStr(entity)
                matches = getMatchingPhrases(entity, phraseOffsets, phraseDict)
                count = 0
                filteredCount = 0
                for phrase in matches:
                    cType = "UNKNOWN"
                    if corefType.has_key(entity.get("id")):
                        cType = corefType[entity.get("id")]
                    print "  match", count, ETUtils.toStr(phrase), "NE" + str(
                        phraseNECounts[phrase]
                    ), "ctype:" + cType, "ent:" + ETUtils.toStr(entity)
                    count += 1
                    matchByType[phrase.get("type")][1] += 1
                    matchByType[phrase.get("type") + "_" + cType][1] += 1
                    matchByType[phrase.get("type") + "_" + cType + "_NE" + str(phraseNECounts[phrase])][1] += 1
                    if phrase.get("type") in phraseFilter:
                        filteredCount += 1
                        filteredMatchByType[phrase.get("type")][1] += 1
                # Matching
                if count == 0:
                    print "  NO MATCH", ETUtils.toStr(entity)
                    counts["no-match"] += 1
                else:
                    counts["match"] += 1
                # Multimatching
                if len(matches) > 1:
                    bestMatch = selectBestMatch(entity, matches)
                    print "  MULTIMATCH(" + entity.get("charOffset") + "," + str(
                        entity.get("altOffset")
                    ) + ")", ", ".join(
                        [x.get("type") + "_" + x.get("charOffset") for x in matches]
                    ), "SEL(" + bestMatch.get(
                        "type"
                    ) + "_" + bestMatch.get(
                        "charOffset"
                    ) + ")"
                # Filtered matching
                if filteredCount == 0:
                    counts["no-match-filtered"] += 1
                else:
                    counts["match-filtered"] += 1
    print "Match"
    for key in sorted(matchByType.keys()):
        print "  ", key, " ", matchByType[key]
    print "Filtered", filteredMatchByType
    print "Counts", counts