Beispiel #1
0
    def buildExamples(self, sentenceGraph):
        """
        Collect entity-head statistics for one sentence; builds no examples.

        The counters updated on self are:
          tokenCounts   -- number of tokens in a sentence -> sentences seen
          counts        -- n -> how many (token, entity type) pairs had n
                           entities of that type listed for the token
          countsPerType -- entity type -> {n -> count}; as counts, per type
          untypedCounts -- n -> how many tokens had n entities listed,
                           regardless of type

        Entities for a token are read from
        sentenceGraph.tokenIsEntityHead[token]. A token with 8 or more
        entities of one type is dumped to stdout for inspection.

        Returns an empty list in all cases.
        """
        numTokens = len(sentenceGraph.tokens)
        self.tokenCounts[numTokens] = self.tokenCounts.get(numTokens, 0) + 1

        for token in sentenceGraph.tokens:
            headedEntities = sentenceGraph.tokenIsEntityHead[token]

            # Number of entities of each type listed for this token.
            entityCounts = {}
            for entity in headedEntities:
                entityType = entity.get("type")
                entityCounts[entityType] = entityCounts.get(entityType, 0) + 1

            for entityType, count in entityCounts.items():
                self.counts[count] = self.counts.get(count, 0) + 1
                # per type
                perType = self.countsPerType.setdefault(entityType, {})
                perType[count] = perType.get(count, 0) + 1

            numEntities = len(headedEntities)
            self.untypedCounts[numEntities] = self.untypedCounts.get(numEntities, 0) + 1

            # Dump tokens that carry unusually many same-type entities.
            # Each print takes one pre-formatted string so the output is the
            # same whether print is a statement or a function.
            if entityCounts and max(entityCounts.values()) >= 8:
                print("======================================")
                print("Entity %s" % (token.get("id"),))
                for e in headedEntities:
                    print(ETUtils.toStr(e))
                print("======================================")

        return []


#        for entity in sentenceGraph.entities:
#            if entity

#        #undirected = sentenceGraph.getUndirectedDependencyGraph()
#        undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
#        ##undirected = sentenceGraph.dependencyGraph.to_undirected()
#        ###undirected = NX10.MultiGraph(sentenceGraph.dependencyGraph) This didn't work
#        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
#
#        # Generate examples based on interactions between entities or interactions between tokens
#        if "entities" in self.styles:
#            loopRange = len(sentenceGraph.entities)
#        else:
#            loopRange = len(sentenceGraph.tokens)
#        for i in range(loopRange-1):
#            for j in range(i+1,loopRange):
#                eI = None
#                eJ = None
#                if "entities" in self.styles:
#                    eI = sentenceGraph.entities[i]
Beispiel #2
0
    def buildExamples(self, sentenceGraph):
        """
        Collect entity-head statistics for one sentence; builds no examples.

        The counters updated on self are:
          tokenCounts   -- number of tokens in a sentence -> sentences seen
          counts        -- n -> how many (token, entity type) pairs had n
                           entities of that type listed for the token
          countsPerType -- entity type -> {n -> count}; as counts, per type
          untypedCounts -- n -> how many tokens had n entities listed,
                           regardless of type

        Entities for a token are read from
        sentenceGraph.tokenIsEntityHead[token]. A token with 8 or more
        entities of one type is dumped to stdout for inspection.

        Returns an empty list in all cases.
        """
        numTokens = len(sentenceGraph.tokens)
        self.tokenCounts[numTokens] = self.tokenCounts.get(numTokens, 0) + 1

        for token in sentenceGraph.tokens:
            headedEntities = sentenceGraph.tokenIsEntityHead[token]

            # Number of entities of each type listed for this token.
            entityCounts = {}
            for entity in headedEntities:
                entityType = entity.get("type")
                entityCounts[entityType] = entityCounts.get(entityType, 0) + 1

            for entityType, count in entityCounts.items():
                self.counts[count] = self.counts.get(count, 0) + 1
                # per type
                perType = self.countsPerType.setdefault(entityType, {})
                perType[count] = perType.get(count, 0) + 1

            numEntities = len(headedEntities)
            self.untypedCounts[numEntities] = self.untypedCounts.get(numEntities, 0) + 1

            # Dump tokens that carry unusually many same-type entities.
            # Each print takes one pre-formatted string so the output is the
            # same whether print is a statement or a function.
            if entityCounts and max(entityCounts.values()) >= 8:
                print("======================================")
                print("Entity %s" % (token.get("id"),))
                for e in headedEntities:
                    print(ETUtils.toStr(e))
                print("======================================")

        return []
        
#        for entity in sentenceGraph.entities:
#            if entity
            
        
#        #undirected = sentenceGraph.getUndirectedDependencyGraph()
#        undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
#        ##undirected = sentenceGraph.dependencyGraph.to_undirected()
#        ###undirected = NX10.MultiGraph(sentenceGraph.dependencyGraph) This didn't work
#        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
#        
#        # Generate examples based on interactions between entities or interactions between tokens
#        if "entities" in self.styles:
#            loopRange = len(sentenceGraph.entities)
#        else:
#            loopRange = len(sentenceGraph.tokens)
#        for i in range(loopRange-1):
#            for j in range(i+1,loopRange):
#                eI = None
#                eJ = None
#                if "entities" in self.styles:
#                    eI = sentenceGraph.entities[i]
def loadEventXML(path, verbose=False):
    """
    Load events from an XML source and group them by sentence text.

    Args:
        path: XML source, handed unchanged to ETUtils.ETFromObj.
        verbose: when true, report every event whose clue getClue() returned
            as a plain string instead of a tuple.

    Returns:
        A dict mapping stripped sentence text to a list of unique
        (clueStart, clueEnd, eventType, clueText) tuples in first-seen
        order. The text of every "sentence" element is present as a key,
        with an empty list if no event was recorded for it.

    Raises:
        AssertionError: if a clue's offsets do not select the clue text
            within the event's sentence text.
    """
    xml = ETUtils.ETFromObj(path)
    sentDict = {}
    # NOTE(review): getiterator() is deprecated in favour of iter() and was
    # removed from ElementTree in Python 3.9; it is kept here because the
    # ElementTree version this file targets is not visible from this function.
    for sentence in xml.getiterator("sentence"):
        sentDict.setdefault(getText(sentence).strip(), [])

    for event in xml.getiterator("event"):
        sentenceText = getText(event).strip()
        events = sentDict.setdefault(sentenceText, [])

        clue = event.find("clue")
        clueTuple = getClue(clue)
        eventType = event.find("type").get("class")
        if eventType == "Protein_amino_acid_phosphorylation":
            eventType = "Phosphorylation"

        if isinstance(clueTuple, str):
            # getClue() returned a bare string rather than a
            # (text, start, end) tuple: nothing is recorded for this event.
            if verbose:
                print("Event %s clue with no clueType: %s"
                      % (eventType, ETUtils.toStr(clue)))
            continue

        clueText, clueStart, clueEnd = clueTuple[0], clueTuple[1], clueTuple[2]
        # The end offset is inclusive, hence the +1 when slicing.
        clueSpan = sentenceText[clueStart:clueEnd + 1]
        assert clueSpan == clueText, (sentenceText, clueSpan, clueTuple)
        # A separate name is used so the XML element bound to the loop
        # variable is not shadowed by the result tuple.
        eventTuple = (clueStart, clueEnd, eventType, clueText)
        if eventTuple not in events:
            events.append(eventTuple)
    return sentDict
def loadEventXML(path, verbose=False):
    """
    Load events from an XML source and group them by sentence text.

    Args:
        path: XML source, handed unchanged to ETUtils.ETFromObj.
        verbose: when true, report every event whose clue getClue() returned
            as a plain string instead of a tuple.

    Returns:
        A dict mapping stripped sentence text to a list of unique
        (clueStart, clueEnd, eventType, clueText) tuples in first-seen
        order. The text of every "sentence" element is present as a key,
        with an empty list if no event was recorded for it.

    Raises:
        AssertionError: if a clue's offsets do not select the clue text
            within the event's sentence text.
    """
    xml = ETUtils.ETFromObj(path)
    sentDict = {}
    # NOTE(review): getiterator() is deprecated in favour of iter() and was
    # removed from ElementTree in Python 3.9; it is kept here because the
    # ElementTree version this file targets is not visible from this function.
    for sentence in xml.getiterator("sentence"):
        sentDict.setdefault(getText(sentence).strip(), [])

    for event in xml.getiterator("event"):
        sentenceText = getText(event).strip()
        events = sentDict.setdefault(sentenceText, [])

        clue = event.find("clue")
        clueTuple = getClue(clue)
        eventType = event.find("type").get("class")
        if eventType == "Protein_amino_acid_phosphorylation":
            eventType = "Phosphorylation"

        if isinstance(clueTuple, str):
            # getClue() returned a bare string rather than a
            # (text, start, end) tuple: nothing is recorded for this event.
            if verbose:
                print("Event %s clue with no clueType: %s"
                      % (eventType, ETUtils.toStr(clue)))
            continue

        clueText, clueStart, clueEnd = clueTuple[0], clueTuple[1], clueTuple[2]
        # The end offset is inclusive, hence the +1 when slicing.
        clueSpan = sentenceText[clueStart:clueEnd + 1]
        assert clueSpan == clueText, (sentenceText, clueSpan, clueTuple)
        # A separate name is used so the XML element bound to the loop
        # variable is not shadowed by the result tuple.
        eventTuple = (clueStart, clueEnd, eventType, clueText)
        if eventTuple not in events:
            events.append(eventTuple)
    return sentDict
Beispiel #5
0
def processCorpus(input, parserName):
    """
    Print statistics on how non-name entities line up with parse phrases.

    For every sentence that has a parse from the requested parser, phrases
    are built with makePhrases() and each entity whose "isName" attribute is
    not "True" is matched against them with getMatchingPhrases(). Per-entity
    diagnostics and the final tallies are written to stdout.

    Args:
        input: corpus source, handed unchanged to ETUtils.ETFromObj. (The
            name shadows the builtin but is part of the call signature.)
        parserName: value of the "parser" attribute selecting the parse
            element to use in each sentence.

    Returns:
        None.

    Tallies printed at the end:
        matchByType: key -> [phrases counted, entity matches counted]; keys
            are phrase types, optionally suffixed with the coreference role
            and/or "_NE<count>".
        filteredMatchByType: the same pair, restricted to the accepted
            phrase types.
        counts: "phrases", "phrases-filtered", "tokens", "entity", "match",
            "no-match", "match-filtered", "no-match-filtered".
    """
    sys.stderr.write("Loading corpus file %s\n" % (input,))
    corpusRoot = ETUtils.ETFromObj(input).getroot()
    documents = corpusRoot.findall("document")

    counts = defaultdict(int)
    matchByType = defaultdict(lambda: [0, 0])
    filteredMatchByType = defaultdict(lambda: [0, 0])
    # Phrase types that count towards the "filtered" statistics.
    acceptedTypes = set(
        ["NP", "TOK-tIN", "WHADVP", "WHNP", "TOK-tWP$", "TOK-tPRP$", "NP-IN"])

    for document in documents:
        for sentence in document.findall("sentence"):
            entities = sentence.findall("entity")
            analyses = sentence.find("sentenceanalyses")
            parse = ETUtils.getElementByAttrib(
                analyses, "parse", {"parser": parserName})
            if parse is None:
                continue  # no parse from this parser: sentence is skipped
            # NOTE(review): the lookup below is not checked for None the way
            # the parse lookup is; a missing tokenization would fail further
            # down at tokenization.findall() -- confirm it is always present.
            tokenization = ETUtils.getElementByAttrib(
                analyses, "tokenization",
                {"tokenizer": parse.get("tokenizer")})
            phrases, phraseDict = makePhrases(parse, tokenization, entities)
            # Materialised so the helper receives a real list on any Python.
            phraseOffsets = list(phraseDict.keys())
            phraseNECounts = getNECounts(phrases, entities)

            # Slot [0] of each tally counts phrases of a given type.
            for value in phraseDict.values():
                counts["phrases"] += len(value)
                for phrase in value:
                    phraseType = phrase.get("type")
                    matchByType[phraseType][0] += 1
                    if phraseType in acceptedTypes:
                        filteredMatchByType[phraseType][0] += 1
                        counts["phrases-filtered"] += 1
                    if "NP" in phraseType:
                        neKey = phraseType + "_NE" + str(phraseNECounts[phrase])
                        matchByType[neKey][0] += 1
            counts["tokens"] += len(tokenization.findall("token"))

            # Role of each entity id in "Coref" interactions of the sentence.
            corefType = {}
            for interaction in sentence.findall("interaction"):
                if interaction.get("type") == "Coref":
                    corefType[interaction.get("e1")] = "Anaphora"
                    corefType[interaction.get("e2")] = "Antecedent"

            for entity in entities:
                if entity.get("isName") == "True":
                    continue
                counts["entity"] += 1
                print("entity %s" % (entity.get("id"),))
                print(ETUtils.toStr(entity))
                matches = getMatchingPhrases(entity, phraseOffsets, phraseDict)
                # The coreference role depends only on the entity.
                cType = corefType.get(entity.get("id"), "UNKNOWN")
                count = 0
                filteredCount = 0
                # Slot [1] of each tally counts entity-to-phrase matches.
                for phrase in matches:
                    phraseType = phrase.get("type")
                    neSuffix = "_NE" + str(phraseNECounts[phrase])
                    print("  match %s %s NE%s ctype:%s ent:%s" % (
                        count, ETUtils.toStr(phrase), phraseNECounts[phrase],
                        cType, ETUtils.toStr(entity)))
                    count += 1
                    matchByType[phraseType][1] += 1
                    matchByType[phraseType + "_" + cType][1] += 1
                    matchByType[phraseType + "_" + cType + neSuffix][1] += 1
                    if phraseType in acceptedTypes:
                        filteredCount += 1
                        filteredMatchByType[phraseType][1] += 1
                # Matching
                if count == 0:
                    print("  NO MATCH %s" % (ETUtils.toStr(entity),))
                    counts["no-match"] += 1
                else:
                    counts["match"] += 1
                # Multimatching
                if len(matches) > 1:
                    bestMatch = selectBestMatch(entity, matches)
                    header = ("  MULTIMATCH(" + entity.get("charOffset") + ","
                              + str(entity.get("altOffset")) + ")")
                    candidates = ", ".join(
                        [x.get("type") + "_" + x.get("charOffset")
                         for x in matches])
                    selected = ("SEL(" + bestMatch.get("type") + "_"
                                + bestMatch.get("charOffset") + ")")
                    print("%s %s %s" % (header, candidates, selected))
                # Filtered matching
                if filteredCount == 0:
                    counts["no-match-filtered"] += 1
                else:
                    counts["match-filtered"] += 1

    print("Match")
    for key in sorted(matchByType):
        print("   %s   %s" % (key, matchByType[key]))
    print("Filtered %s" % (filteredMatchByType,))
    print("Counts %s" % (counts,))
Beispiel #6
0
def processCorpus(input, parserName):
    """
    Print statistics on how non-name entities line up with parse phrases.

    For every sentence that has a parse from the requested parser, phrases
    are built with makePhrases() and each entity whose "isName" attribute is
    not "True" is matched against them with getMatchingPhrases(). Per-entity
    diagnostics (including all phrases of the sentence when an entity has no
    match) and the final tallies are written to stdout.

    Args:
        input: corpus source, handed unchanged to ETUtils.ETFromObj. (The
            name shadows the builtin but is part of the call signature.)
        parserName: value of the "parser" attribute selecting the parse
            element to use in each sentence.

    Returns:
        None.

    Tallies printed at the end:
        matchByType: key -> [phrases counted, entity matches counted]; keys
            are phrase types, optionally suffixed with the coreference role
            and/or "_NE<count>".
        filteredMatchByType: the same pair, restricted to the accepted
            phrase types.
        counts: "phrases", "phrases-filtered", "tokens", "entity", "match",
            "no-match", "match-filtered", "no-match-filtered".
    """
    sys.stderr.write("Loading corpus file %s\n" % (input,))
    corpusRoot = ETUtils.ETFromObj(input).getroot()
    documents = corpusRoot.findall("document")

    counts = defaultdict(int)
    matchByType = defaultdict(lambda: [0, 0])
    filteredMatchByType = defaultdict(lambda: [0, 0])
    # Phrase types that count towards the "filtered" statistics.
    acceptedTypes = set(["ADJP",
                         "DT(-)-NP-IN",
                         "DT(-)-NP",
                         "NP",
                         "NP-IN",
                         "PP",
                         "S",
                         "S1",
                         "TOK-tJJ",
                         "TOK-tNN",
                         "TOK-tNNP",
                         "TOK-tNNS",
                         "VP",
                         "VP-IN"])

    for document in documents:
        for sentence in document.findall("sentence"):
            entities = sentence.findall("entity")
            analyses = sentence.find("sentenceanalyses")
            parse = ETUtils.getElementByAttrib(
                analyses, "parse", {"parser": parserName})
            if parse is None:
                continue  # no parse from this parser: sentence is skipped
            # NOTE(review): the lookup below is not checked for None the way
            # the parse lookup is; a missing tokenization would fail further
            # down at tokenization.findall() -- confirm it is always present.
            tokenization = ETUtils.getElementByAttrib(
                analyses, "tokenization",
                {"tokenizer": parse.get("tokenizer")})
            phrases, phraseDict = makePhrases(parse, tokenization, entities)
            # Materialised so the helper receives a real list on any Python.
            phraseOffsets = list(phraseDict.keys())
            phraseNECounts = getNECounts(phrases, entities)

            # Slot [0] of each tally counts phrases of a given type.
            for value in phraseDict.values():
                counts["phrases"] += len(value)
                for phrase in value:
                    phraseType = phrase.get("type")
                    matchByType[phraseType][0] += 1
                    if phraseType in acceptedTypes:
                        filteredMatchByType[phraseType][0] += 1
                        counts["phrases-filtered"] += 1
                    if "NP" in phraseType:
                        neKey = phraseType + "_NE" + str(phraseNECounts[phrase])
                        matchByType[neKey][0] += 1
            counts["tokens"] += len(tokenization.findall("token"))

            # Role of each entity id in "Coref" interactions of the sentence.
            corefType = {}
            for interaction in sentence.findall("interaction"):
                if interaction.get("type") == "Coref":
                    corefType[interaction.get("e1")] = "Anaphora"
                    corefType[interaction.get("e2")] = "Antecedent"

            for entity in entities:
                if entity.get("isName") == "True":
                    continue
                counts["entity"] += 1
                print("entity %s" % (entity.get("id"),))
                print(ETUtils.toStr(entity))
                matches = getMatchingPhrases(entity, phraseOffsets, phraseDict)
                # The coreference role depends only on the entity.
                cType = corefType.get(entity.get("id"), "UNKNOWN")
                count = 0
                filteredCount = 0
                # Slot [1] of each tally counts entity-to-phrase matches.
                for phrase in matches:
                    phraseType = phrase.get("type")
                    neSuffix = "_NE" + str(phraseNECounts[phrase])
                    print("  match %s %s NE%s ctype:%s ent:%s" % (
                        count, ETUtils.toStr(phrase), phraseNECounts[phrase],
                        cType, ETUtils.toStr(entity)))
                    count += 1
                    matchByType[phraseType][1] += 1
                    matchByType[phraseType + "_" + cType][1] += 1
                    matchByType[phraseType + "_" + cType + neSuffix][1] += 1
                    if phraseType in acceptedTypes:
                        filteredCount += 1
                        filteredMatchByType[phraseType][1] += 1
                # Matching
                if count == 0:
                    print("  NO MATCH %s" % (ETUtils.toStr(entity),))
                    # List every phrase of the sentence to help see why
                    # nothing matched.
                    for phrase in phrases:
                        print("     " + phraseToStr(phrase))
                    counts["no-match"] += 1
                else:
                    counts["match"] += 1
                # Multimatching
                if len(matches) > 1:
                    bestMatch = selectBestMatch(entity, matches)
                    header = ("  MULTIMATCH(" + entity.get("charOffset") + ","
                              + str(entity.get("altOffset")) + ")")
                    candidates = ", ".join(
                        [x.get("type") + "_" + x.get("charOffset")
                         for x in matches])
                    selected = ("SEL(" + bestMatch.get("type") + "_"
                                + bestMatch.get("charOffset") + ")")
                    print("%s %s %s" % (header, candidates, selected))
                # Filtered matching
                if filteredCount == 0:
                    counts["no-match-filtered"] += 1
                else:
                    counts["match-filtered"] += 1

    print("Match")
    for key in sorted(matchByType):
        print("   %s   %s" % (key, matchByType[key]))
    print("Filtered %s" % (filteredMatchByType,))
    print("Counts %s" % (counts,))