Ejemplo n.º 1
0
def fixAltOffsets(input, output=None):
    """Rebase entity altOffsets from document-level to sentence-level coordinates.

    input: corpus file path or ElementTree-like object (anything ETFromObj accepts)
    output: optional path; when given the modified corpus is also written there
    Returns the (modified in place) corpus ElementTree.
    """
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    docCount = 0
    sentencesCreated = 0
    sentenceList = [s for s in corpusRoot.getiterator("sentence")]
    progress = ProgressCounter(len(sentenceList), "FixAltOffsets")
    fixCount = 0
    # fix spans
    for sentence in sentenceList:
        progress.update(1, "Fixing AltOffsets for sentence ("+sentence.get("id")+"): ")
        sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        for entity in sentence.findall("entity"):
            altOffsetString = entity.get("altOffset")
            if altOffsetString == None:
                continue
            altOffsets = Range.charOffsetToTuples(altOffsetString)
            assert len(altOffsets) == 1
            # Shift each span so it is relative to the sentence start
            altOffsets = [(begin - sentOffset[0], end - sentOffset[0]) for (begin, end) in altOffsets]
            entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))
            fixCount += 1

    print >> sys.stderr, "Fixed", fixCount, "altOffsets"

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Ejemplo n.º 2
0
    def _markNamedEntities(self):
        """
        Mark which tokens belong to _named_ entities.

        Named entities are sometimes masked when testing learning of
        interactions, to prevent the system making a trivial decision based
        on commonly interacting names. All given entities are assumed to be
        named entities.

        Fills three per-token dictionaries:
        tokenIsName       - True when the token overlaps a "given" entity
        tokenIsEntity     - True when the token overlaps any entity
        tokenIsEntityHead - entities whose head offset overlaps the token
        """
        self.tokenIsName = {}
        self.tokenIsEntity = {}
        self.tokenIsEntityHead = {}
        # Default state: token belongs to nothing
        for token in self.tokens:
            self.tokenIsName[token] = False
            self.tokenIsEntity[token] = False
            self.tokenIsEntityHead[token] = []
        for entity in self.entities:
            offsets = Range.charOffsetToTuples(entity.get("charOffset"))
            headOffset = Range.charOffsetToSingleTuple(entity.get("headOffset"))
            isGiven = entity.get("given") == "True"
            for token in self.tokens:
                tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
                if any(Range.overlap(span, tokenOffset) for span in offsets):
                    self.tokenIsEntity[token] = True
                    if isGiven:
                        self.tokenIsName[token] = True
                if Range.overlap(headOffset, tokenOffset):
                    self.tokenIsEntityHead[token].append(entity)
Ejemplo n.º 3
0
def makeINSubPhrases(phrases, tokens, phraseDict, filter=None):
    """Create shortened "-IN" sub-phrases that end just before an embedded
    preposition (POS tag "IN").

    phrases: phrase elements to process
    tokens: the sentence's token elements (indexed by phrase begin/end)
    phraseDict: offset -> [phrase] mapping, updated in place with new spans
    filter: optional collection of accepted phrase types
    Returns the list of newly created phrase elements.
    """
    newPhrases = []
    for phrase in phrases:
        if filter != None and phrase.get("type") not in filter:
            continue  # keep only the requested phrase types
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        begin = int(phrase.get("begin"))
        end = int(phrase.get("end"))
        prevToken = None
        for tokIndex, token in enumerate(tokens[begin:end + 1]):
            if prevToken != None and token.get("POS") == "IN":
                # The sub-phrase ends where the token before the "IN" ends
                prevEnd = Range.charOffsetToSingleTuple(prevToken.get("charOffset"))[-1]
                subOffset = (phraseOffset[0], prevEnd)
                subPhrase = makePhrase(phrase.get("type") + "-IN", subOffset, begin, begin + tokIndex - 1)
                if subOffset not in phraseDict:
                    newPhrases.append(subPhrase)
                    phraseDict[subOffset] = [subPhrase]
            prevToken = token
    return newPhrases
Ejemplo n.º 4
0
 def getMetaMapFeatures(self, token, sentenceGraph, features):
     """Add MetaMap phrase attributes as features for a token.

     Features are inserted into the 'features' dict (in place) for every
     metamap phrase whose character span overlaps the token's span. Does
     nothing when the sentence has no metamap analysis.
     """
     analyses = sentenceGraph.sentenceElement.find("analyses")
     if analyses == None:
         return
     metamap = analyses.find("metamap")
     if metamap == None:
         return
     tokenSpan = Range.charOffsetToSingleTuple(token.get("charOffset"))
     ignored = set(["charOffset", "text"])  # spans and surface text are not features
     for phrase in metamap.findall("phrase"):
         phraseSpan = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
         if not Range.overlap(tokenSpan, phraseSpan):
             continue
         attrib = phrase.attrib
         for name in sorted(attrib.keys()):
             if name in ignored:
                 continue
             if name == "score":
                 # Scale the MetaMap score magnitude to a small feature weight
                 features["_metamap_score"] = 0.001 * abs(int(attrib[name]))
             else:
                 # Comma-separated attribute values become binary features
                 for value in attrib[name].split(","):
                     features["_metamap_" + name + "_" + value.replace(" ", "-")] = 1
Ejemplo n.º 5
0
def insertElements(corpus, specAnn):
    """Insert species annotation spans into the sentences of a corpus.

    corpus: corpus XML root element
    specAnn: dict mapping document origId to a list of span elements; each
        span that fits fully inside a sentence is rebased to sentence
        coordinates, moved into that sentence's analyses/entities element
        and removed from the list (spans crossing sentence boundaries are
        left in place).
    """
    for document in corpus.iter('document'):
        docId = document.get("origId")
        assert docId in specAnn, docId
        for sentence in document.iter('sentence'):
            sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
            analyses = sentence.find("analyses")
            # BUGFIX: must test 'is None', not truthiness -- an ElementTree
            # Element with no children evaluates as False, so an existing but
            # empty <analyses> would get a duplicate sibling created here.
            if analyses is None:
                analyses = ET.SubElement(sentence, "analyses")
            # Find (or create) the container element for the inserted spans
            container = analyses.find("entities")
            if container is None:  # same Element-truthiness pitfall as above
                container = ET.SubElement(analyses, "entities")
            # Map the spans; iterate over a copy since matched spans are removed
            for span in specAnn[docId][:]:
                offset = span.get("offset")  # assumed to be an (int, int) tuple set by the caller -- TODO confirm
                if Range.overlap(offset, sentOffset):
                    if sentOffset[0] > offset[0] or sentOffset[1] < offset[1]:
                        continue  # only spans fully inside the sentence are moved
                    specAnn[docId].remove(span)
                    # Convert to sentence-relative coordinates
                    charOffset = (offset[0] - sentOffset[0], offset[1] - sentOffset[0])
                    matchingText = sentence.get("text")[charOffset[0]:charOffset[1]]
                    spanText = span.get("text")
                    assert matchingText == spanText, (matchingText, spanText, charOffset)
                    span.set("charOffset", "-".join([str(x) for x in charOffset]))
                    # A "--" would indicate a negative coordinate slipped through
                    assert not "--" in span.get("charOffset"), [str(x) for x in charOffset]
                    del span.attrib["offset"]
                    container.append(span)
Ejemplo n.º 6
0
 def _markNamedEntities(self):
     """
     Mark which tokens belong to _named_ entities.

     Named entities are sometimes masked when testing learning of
     interactions, to prevent the system making a trivial decision based on
     commonly interacting names. Entities lacking an "isName" attribute are
     assumed to be names and have the attribute set to "True" here.
     """
     self.tokenIsName = {}
     self.tokenIsEntity = {}
     self.tokenIsEntityHead = {}
     # Default state: token belongs to nothing
     for token in self.tokens:
         self.tokenIsName[token] = False
         self.tokenIsEntity[token] = False
         self.tokenIsEntityHead[token] = []
     for entity in self.entities:
         entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
         headOffset = Range.charOffsetToSingleTuple(entity.get("headOffset"))
         for token in self.tokens:
             tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
             for entityOffset in entityOffsets:
                 if not Range.overlap(entityOffset, tokenOffset):
                     continue
                 self.tokenIsEntity[token] = True
                 isName = entity.get("isName")
                 if isName is None:
                     # Unmarked entities default to being names
                     entity.set("isName", "True")
                     self.tokenIsName[token] = True
                 elif isName == "True":
                     self.tokenIsName[token] = True
             if Range.overlap(headOffset, tokenOffset):
                 self.tokenIsEntityHead[token].append(entity)
Ejemplo n.º 7
0
def getHeads(corpus):
    """Count head-word strings per entity type over a corpus.

    Returns a dict: entity type -> {head text -> count}; the "None" key
    collects tokens that are not the head of any entity.

    NOTE(review): 'sentenceText' and 'tokens' are never defined in this
    function, so the substring branch and the token loop raise NameError
    when reached -- presumably sentence.get("text") and the sentence's
    token elements were intended; verify against the original module.
    """
    corpus = ETUtils.ETFromObj(corpus)
    headDict = {}
    headDict["None"] = {}
    for sentence in corpus.getiterator("sentence"):
        headOffsetStrings = set()
        for entity in sentence.findall("entity"):
            eType = entity.get("type")
            if not headDict.has_key(eType):
                headDict[eType] = {}
            eText = entity.get("text")
            headOffset = entity.get("headOffset")
            headOffsetStrings.add(headOffset)
            headOffset = Range.charOffsetToSingleTuple(headOffset)
            charOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
            # When the head span equals the whole entity span, the entity text is the head
            if headOffset == charOffset:
                if not headDict[eType].has_key(eText): headDict[eType][eText] = 0
                headDict[eType][eText] += 1
            else:
                # Extract the head substring using offsets relative to the entity start
                headText = sentenceText[headOffset[0]-charOffset[0]:headOffset[1]-charOffset[0]+1]
                if not headDict[eType].has_key(headText): headDict[eType][headText] = 0
                headDict[eType][headText] += 1
        for token in tokens:
            if not token.get("charOffset") in headOffsetStrings: # token is not the head of any entity
                headText = token.get("text")
                if not headDict["None"].has_key(headText): headDict["None"][headText] = 0
                headDict["None"][headText] += 1
                
    return headDict
Ejemplo n.º 8
0
def moveElements(document):
    """Move document-level entities and interactions into their sentences.

    Each entity is appended to the first sentence whose span overlaps any of
    its offsets, given a sentence-scoped id, and has its charOffset rebased
    to sentence coordinates (the original offset is kept in "origOffset").
    Interactions follow their e1 entity's sentence and have their e1/e2
    references remapped to the new entity ids.
    """
    entMap = {}            # old entity id -> new sentence-scoped id
    entSentence = {}       # old entity id -> sentence element it was moved to
    entSentenceIndex = {}  # old entity id -> index of that sentence
    sentences = document.findall("sentence")
    sentenceCount = 0
    for sentence in sentences:
        sentenceOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        # Move entities
        entCount = 0
        for entity in document.findall("entity"):
            entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
            overlaps = False
            for entityOffset in entityOffsets:
                if Range.overlap(sentenceOffset, entityOffset):
                    overlaps = True
                    break
            if overlaps:
                document.remove(entity)
                sentence.append(entity)
                entityId = entity.get("id")
                entityIdLastPart = entityId.rsplit(".", 1)[-1]
                # Keep an existing "e<N>" suffix; otherwise assign a fresh index
                if entityIdLastPart.startswith("e"):
                    entity.set("id", sentence.get("id") + "." + entityIdLastPart)
                    entMap[entityId] = sentence.get("id") + "." + entityIdLastPart
                else:
                    entity.set("docId", entityId)
                    entity.set("id", sentence.get("id") + ".e" + str(entCount))
                    entMap[entityId] = sentence.get("id") + ".e" + str(entCount)
                entSentence[entityId] = sentence
                entSentenceIndex[entityId] = sentenceCount
                #newEntityOffset = (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0])
                # Rebase every offset so it is relative to the sentence start
                newEntityOffsets = []
                for entityOffset in entityOffsets:
                    newEntityOffsets.append( (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0]) )
                entity.set("origOffset", entity.get("charOffset"))
                #entity.set("charOffset", str(newEntityOffset[0]) + "-" + str(newEntityOffset[1]))
                entity.set("charOffset", Range.tuplesToCharOffset(newEntityOffsets)) 
                entCount += 1
        sentenceCount += 1
    # Move interactions
    intCount = 0
    for interaction in document.findall("interaction"):
        #if entSentenceIndex[interaction.get("e1")] < entSentenceIndex[interaction.get("e2")]:
        #    targetSentence = entSentence[interaction.get("e1")]
        #else:
        #    targetSentence = entSentence[interaction.get("e2")]
        
        # Interactions go to a sentence always by e1, as this is the event they are an argument of.
        # If an intersentence interaction is a relation, this shouldn't matter.
        targetSentence = entSentence[interaction.get("e1")]  
        document.remove(interaction)
        targetSentence.append(interaction)
        interaction.set("id", targetSentence.get("id") + ".i" + str(intCount))
        interaction.set("e1", entMap[interaction.get("e1")])
        interaction.set("e2", entMap[interaction.get("e2")])
        intCount += 1
Ejemplo n.º 9
0
def selectBestMatch(entity, phrases):
    """Return the phrase whose span best matches the entity's span.

    The entity's altOffset is preferred over its charOffset when present.
    The phrase with the smallest Range.mismatch value wins; returns None
    when 'phrases' is empty.
    """
    offsetString = entity.get("altOffset")
    if offsetString == None:
        offsetString = entity.get("charOffset")
    entSpan = Range.charOffsetToSingleTuple(offsetString)
    bestValue, bestPhrase = sys.maxint, None
    for phrase in phrases:
        value = Range.mismatch(entSpan, Range.charOffsetToSingleTuple(phrase.get("charOffset")))
        if value < bestValue:
            bestValue, bestPhrase = value, phrase
    return bestPhrase
Ejemplo n.º 10
0
def exportChemProtPredictions(xml, outPath, fileTypes="predictions", setNames=None):
    """Export a ChemProt interaction XML corpus into tab-separated ChemProt files.

    xml: corpus file path or ElementTree-like object
    outPath: output location passed through to openOutFile
    fileTypes: "all", a comma-separated string, or a list drawn from
        "predictions", "abstracts", "entities" and "relations"
    setNames: optional mapping used to rename document set names
    Returns the corpus ElementTree. Raises Exception on an unknown file type.
    """
    if fileTypes == "all":
        fileTypes = ["predictions", "abstracts", "entities", "relations"]
    elif isinstance(fileTypes, basestring):
        fileTypes = fileTypes.split(",")
    for fileType in fileTypes:
        if fileType not in ["predictions", "abstracts", "entities", "relations"]:
            raise Exception("Unknown ChemProt file type '" + str(fileType) + "'")
    xml = ETUtils.ETFromObj(xml)
    #with open(outPath, "wt") as f
    outFiles = {}   # per-set bookkeeping used by openOutFile
    openFiles = {}  # all file handles, closed at the end
    for document in xml.getiterator("document"):
        docId = document.get("origId")
        setName = document.get("set")
        if setNames != None:
            setName = setNames.get(setName, setName)
        if setName not in outFiles:
            outFiles[setName] = {}
        outFile = openOutFile(setName, outPath, "abstracts", fileTypes, outFiles, openFiles)
        if outFile != None:
            docText = document.get("text")
            #assert docText.count("\t") == 1, (docText.count("\t"), document.attrib)
            #title, abstract = docText.split("\t")
            #titleLength = document.get("titleLength")
            # Split the document text into title and abstract at the title offset
            titleOffset = Range.charOffsetToSingleTuple(document.get("titleOffset"))
            assert titleOffset[0] == 0
            outFile.write("\t".join([docId, docText[:titleOffset[1]], docText[titleOffset[1]+1:]]) + "\n")  
        entityById = {}
        for entity in document.getiterator("entity"):
            outFile = openOutFile(setName, outPath, "entities", fileTypes, outFiles, openFiles)
            if outFile != None:
                eType = entity.get("type")
                # Normalized GENE entities carry a -Y/-N suffix in the ChemProt format
                if entity.get("normalized") != None and entity.get("type") == "GENE":
                    eType += "-Y" if entity.get("normalized") == "True" else "-N"
                offset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
                outFile.write("\t".join([docId, entity.get("origId"), eType, str(offset[0]), str(offset[1]), entity.get("text")]) + "\n")
            assert entity.get("id") not in entityById
            entityById[entity.get("id")] = entity
        for interaction in document.getiterator("interaction"):
            e1 = entityById[interaction.get("e1")]
            e2 = entityById[interaction.get("e2")]
            outFile = openOutFile(setName, outPath, "relations", fileTypes, outFiles, openFiles)
            if outFile != None:
                # "X" marks relations without evaluation information
                evaluated = "X"
                if interaction.get("evaluated") != None:
                    evaluated = "Y " if interaction.get("evaluated") == "True" else "N "
                outFile.write("\t".join([docId, interaction.get("type"), evaluated, interaction.get("relType"), "Arg1:" + e1.get("origId"), "Arg2:" + e2.get("origId")]) + "\n")
            outFile = openOutFile(setName, outPath, "predictions", fileTypes, outFiles, openFiles)
            if outFile != None:
                outFile.write("\t".join([docId, interaction.get("type"), "Arg1:" + e1.get("origId"), "Arg2:" + e2.get("origId")]) + "\n")
    print >> sys.stderr, "Closing output files"
    for f in openFiles.values():
        f.close()
    return xml 
Ejemplo n.º 11
0
 def getTokens(self, entity, tokenTuples):
     """Collect the texts of the tokens overlapping an entity's span.

     tokenTuples: (offsetTuple, tokenElement) pairs, assumed ordered by
     offset -- scanning stops at the first non-overlapping token after a
     match. Returns a list of token text strings.
     """
     offsetString = entity.get("charOffset")
     assert offsetString != None
     entitySpan = Range.charOffsetToSingleTuple(offsetString)
     texts = []
     for tokenTuple in tokenTuples:
         if Range.overlap(entitySpan, tokenTuple[0]):
             texts.append(tokenTuple[1].get("text"))
         elif texts:
             break  # ordered input: the entity span has been passed
     return texts
Ejemplo n.º 12
0
def getNECounts(phrases, entities):
    """Count, per phrase, how many named ("given") entities its span contains.

    Returns a dict mapping phrase element -> count.
    """
    counts = {}
    for phrase in phrases:
        phraseSpan = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        numNames = 0
        for entity in entities:
            if entity.get("given") != "True":
                continue  # only named entities are counted
            entitySpan = Range.charOffsetToSingleTuple(entity.get("charOffset"))
            if Range.contains(phraseSpan, entitySpan):
                numNames += 1
        counts[phrase] = numNames
    return counts
Ejemplo n.º 13
0
def processElements(xml):
    """Normalize DDI corpus elements for TEES 2.0+.

    Renames "ddi" elements to "interaction", marks every entity as given,
    and converts entity character offsets from inclusive to exclusive end
    coordinates (";"-separated disjoint spans are re-joined in the current
    range format).
    """
    for ddi in xml.getiterator("ddi"):
        ddi.tag = "interaction"
    for entity in xml.getiterator("entity"):
        entity.set("given", "True")
        spans = Range.charOffsetToTuples(entity.get("charOffset"), rangeSep=";")
        # End coordinates are inclusive in the source data; make them exclusive
        spans = [(begin, end + 1) for (begin, end) in spans]
        entity.set("charOffset", Range.tuplesToCharOffset(spans))
Ejemplo n.º 14
0
 def getTokens(self, entity, tokenTuples):
     """Return the texts of all tokens that overlap the entity's span.

     tokenTuples holds (offset, token element) pairs assumed to be sorted
     by offset, so scanning stops once the entity span has been passed.
     """
     charOffset = entity.get("charOffset")
     assert charOffset != None
     span = Range.charOffsetToSingleTuple(charOffset)
     collected = []
     for pair in tokenTuples:
         if Range.overlap(span, pair[0]):
             collected.append(pair[1].get("text"))
         elif len(collected) > 0:
             # Sorted input: no further overlaps are possible
             break
     return collected
Ejemplo n.º 15
0
def selectBestMatch(entity, phrases):
    """Pick the phrase whose character span is closest to the entity's.

    Uses the entity's altOffset when available, otherwise its charOffset;
    closeness is measured with Range.mismatch (smaller is better). Returns
    None for an empty phrase list.
    """
    if entity.get("altOffset") != None:
        entSpan = Range.charOffsetToSingleTuple(entity.get("altOffset"))
    else:
        entSpan = Range.charOffsetToSingleTuple(entity.get("charOffset"))
    bestScore = sys.maxint
    bestPhrase = None
    for phrase in phrases:
        phraseSpan = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        score = Range.mismatch(entSpan, phraseSpan)
        if score < bestScore:
            bestScore = score
            bestPhrase = phrase
    return bestPhrase
Ejemplo n.º 16
0
def processElements(xml):
    """Prepare DDI elements for TEES: rename "ddi" tags to "interaction",
    flag every entity as given, and shift entity offsets from inclusive to
    exclusive end coordinates (TEES 2.0+ range format)."""
    for element in xml.getiterator("ddi"):
        element.tag = "interaction"
    for entity in xml.getiterator("entity"):
        entity.set("given", "True")
        offsets = Range.charOffsetToTuples(entity.get("charOffset"), rangeSep=";")
        fixed = []
        for begin, end in offsets:
            # Inclusive end -> exclusive end
            fixed.append((begin, end + 1))
        entity.set("charOffset", Range.tuplesToCharOffset(fixed))
Ejemplo n.º 17
0
def fixEntities(xml):
    counts = defaultdict(int)
    for sentence in xml.getiterator("sentence"):
        sText = sentence.get("text")
        for entity in sentence.findall("entity"):
            charOffset = entity.get("charOffset")
            if charOffset == "-":
                assert False, str(entity)
                sentence.remove(entity)
                counts["removed-invalid"] += 1
            else:
                charOffset = Range.charOffsetToSingleTuple(charOffset)
                # fix length
                realLength = len(entity.get("text"))
                lenDiff = (charOffset[1] - charOffset[0] + 1) - realLength
                if lenDiff != realLength:
                    counts["incorrect-ent-offset"] += 1
                    counts["incorrect-ent-offset-diff"+str(lenDiff)] += 1
                    if abs(lenDiff) > 2:
                        print "Warning, lenDiff:", (lenDiff, charOffset, sText, entity.get("text"), entity.get("id"))
                charOffset = (charOffset[0], charOffset[0] + realLength)
                # find starting position
                entIndex = sText.find(entity.get("text"), charOffset[0])
                if entIndex == -1:
                    for i in [-1,-2,-3]:
                        entIndex = sText.find(entity.get("text"), charOffset[0]+i)
                        if entIndex != -1:
                            break
                if entIndex != 0: # could be lowercase
                    sTextLower = sText.lower()
                    for i in [0,-1,-2,-3]:
                        lowerEntIndex = sTextLower.find(entity.get("text"), charOffset[0]+i)
                        if lowerEntIndex != -1:
                            break
                    if lowerEntIndex != -1 and abs(lowerEntIndex - charOffset[0]) < abs(entIndex - charOffset[0]):
                        entIndex = lowerEntIndex
                assert entIndex != -1, (charOffset, sText, entity.get("text"), entity.get("id"))
                indexDiff = entIndex - charOffset[0]
                if indexDiff != 0:
                    counts["incorrect-ent-index"] += 1
                    counts["incorrect-ent-index-diff"+str(indexDiff)] += 1
                    print "Warning, indexDiff:", (indexDiff, charOffset, sText, entity.get("text"), entity.get("id"))
                # move offset       
                charOffset = (charOffset[0]+indexDiff, charOffset[1]+indexDiff)
                # validate new offset
                sEntity = sText[charOffset[0]:charOffset[1]]
                assert sEntity == entity.get("text") or sEntity.lower() == entity.get("text"), (charOffset, sText, entity.get("text"), entity.get("id"))
                entity.set("charOffset", Range.tuplesToCharOffset( (charOffset[0], charOffset[1])))
                entity.set("given", "True")
        for interaction in sentence.findall("interaction"):
            interaction.set("type", "DDI")
    print "Fix counts:", counts
Ejemplo n.º 18
0
def getNECounts(phrases, entities):
    """Map each phrase element to the number of named ("given") entities
    whose character span it fully contains."""
    counts = {}
    # Only "given" (named) entities participate in the counts
    givenSpans = [Range.charOffsetToSingleTuple(entity.get("charOffset"))
                  for entity in entities if entity.get("given") == "True"]
    for phrase in phrases:
        phraseSpan = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        counts[phrase] = len([s for s in givenSpans if Range.contains(phraseSpan, s)])
    return counts
Ejemplo n.º 19
0
 def getRelativePosition(self, entity1Range, entity2Range, token):
     """Classify a token's position relative to an entity pair.

     Returns "Entity1"/"Entity2" when the token overlaps the respective
     entity span, otherwise "Fore", "Between" or "After" depending on
     where the token ends relative to the combined span of both entities.
     """
     tokenSpan = Range.charOffsetToSingleTuple(token.get("charOffset"))
     if Range.overlap(entity1Range, tokenSpan):
         return "Entity1"
     if Range.overlap(entity2Range, tokenSpan):
         return "Entity2"
     combinedBegin = min(entity1Range[0], entity2Range[0])
     combinedEnd = max(entity1Range[1], entity2Range[1])
     if tokenSpan[1] < combinedBegin:
         return "Fore"
     if tokenSpan[1] > combinedEnd:
         return "After"
     return "Between"
Ejemplo n.º 20
0
 def getRelativePosition(self, entity1Range, entity2Range, token):
     """Tell where a token lies with respect to two entity spans.

     Tokens overlapping an entity map to "Entity1"/"Entity2"; other tokens
     are "Fore" (before both), "After" (past both) or "Between".
     """
     offset = Range.charOffsetToSingleTuple(token.get("charOffset"))
     for entityRange, label in ((entity1Range, "Entity1"), (entity2Range, "Entity2")):
         if Range.overlap(entityRange, offset):
             return label
     # Combined extent of the two entity spans
     span = (min(entity1Range[0], entity2Range[0]),
             max(entity1Range[1], entity2Range[1]))
     if offset[1] < span[0]:
         return "Fore"
     elif offset[1] > span[1]:
         return "After"
     else:
         return "Between"
Ejemplo n.º 21
0
def getMatchingPhrases(entity, phraseOffsets, phraseDict):
    """Collect phrases whose span lies between the entity's minimal and
    maximal spans.

    The maximal span is the entity's charOffset, the minimal span its
    altOffset when present (otherwise the charOffset). Named entities
    (isName == "True") never match. Returns phrase elements from phraseDict.
    """
    if entity.get("isName") == "True":
        return []
    maxSpan = Range.charOffsetToSingleTuple(entity.get("charOffset"))
    altOffset = entity.get("altOffset")
    minSpan = Range.charOffsetToSingleTuple(altOffset) if altOffset != None else maxSpan
    matches = []
    for phraseOffset in phraseOffsets:
        # Phrase must contain the minimal span and fit inside the maximal span
        if Range.contains(maxSpan, phraseOffset) and Range.contains(phraseOffset, minSpan):
            matches.extend(phraseDict[phraseOffset])
    return matches
Ejemplo n.º 22
0
def getMatchingPhrases(entity, phraseOffsets, phraseDict):
    """Return every phrase whose offset contains the entity's minimal span
    and is itself contained by the entity's maximal span.

    Non-name entities only; the altOffset (when set) acts as the minimal
    span, the charOffset as the maximal one.
    """
    if entity.get("isName") == "True":
        return []
    maximal = Range.charOffsetToSingleTuple(entity.get("charOffset"))
    minimal = entity.get("altOffset")
    if minimal == None:
        minimal = maximal
    else:
        minimal = Range.charOffsetToSingleTuple(minimal)
    selected = [offset for offset in phraseOffsets
                if Range.contains(maximal, offset) and Range.contains(offset, minimal)]
    matches = []
    for offset in selected:
        matches.extend(phraseDict[offset])
    return matches
Ejemplo n.º 23
0
def makeDDI13SubmissionFile(input, output, mode="interactions", idfilter=None):
    """Write a DDI'13 shared task submission file from an interaction XML corpus.

    input: corpus file path or ElementTree-like object
    output: path of the pipe-separated submission file to write
    mode: "entities" writes one line per non-negative entity;
        "interactions" writes one line per entity pair with its predicted type
    idfilter: optional substring; sentences whose id does not contain it are skipped
    """
    xml = ETUtils.ETFromObj(input)
    outFile = open(output, "wt")
    for sentence in xml.getiterator("sentence"):
        sentenceId = sentence.get("id")
        if idfilter != None and idfilter not in sentenceId:
            continue
        # Output entities
        if mode == "entities":
            for entity in sentence.findall("entity"):
                if entity.get("type") != "neg":
                    outFile.write(sentenceId)
                    offsets = Range.charOffsetToTuples(
                        entity.get("charOffset"))
                    # DDI13 uses end-inclusive offsets, so shift the ends back
                    for i in range(len(offsets)):
                        offsets[i] = (offsets[i][0], offsets[i][1] - 1)
                    outFile.write(
                        "|" + Range.tuplesToCharOffset(offsets, rangeSep=";"))
                    outFile.write("|" + entity.get("text"))
                    outFile.write("|" + entity.get("type"))
                    outFile.write("\n")
        if mode == "interactions":
            # First determine which pairs interact
            intMap = defaultdict(lambda: defaultdict(lambda: None))
            for interaction in sentence.findall("interaction"):
                # Make mapping both ways to discard edge directionality. This isn't actually needed,
                # since MultiEdgeExampleBuilder builds entity pairs in the same order as this function,
                # but shouldn't harm to include it and now it works regardless of pair direction.
                if interaction.get("type") != "neg" and interaction.get(
                        "given") != "True":
                    intMap[interaction.get("e1")][interaction.get(
                        "e2")] = interaction
                    intMap[interaction.get("e2")][interaction.get(
                        "e1")] = interaction
            # Then write all pairs to the output file
            entities = sentence.findall("entity")
            for i in range(0, len(entities) - 1):
                for j in range(i + 1, len(entities)):
                    eIId = entities[i].get("id")
                    eJId = entities[j].get("id")
                    outFile.write(sentenceId + "|" + eIId + "|" + eJId + "|")
                    # "1|<type>" for interacting pairs, "0|null" otherwise
                    if intMap[eIId][eJId] != None:
                        interaction = intMap[eIId][eJId]
                        assert interaction.get("type") != "neg"
                        outFile.write("1|" + interaction.get("type") + "\n")
                    else:
                        outFile.write("0|null\n")
    outFile.close()
Ejemplo n.º 24
0
def addParseElements(doc, docEl):
    """Attach gold tokenization and dependency parse elements to a sentence.

    Builds analyses/parse and analyses/tokenization children of docEl from
    doc.words and doc.dependencies. POS tags start as "None" and are filled
    in from typed dependency names of the form "...:pos1-pos2(...)".
    Does nothing unless docEl is a sentence element.
    """
    if docEl.tag != "sentence":
        return
    analysesEl = ET.SubElement(docEl, "analyses")
    parseEl = ET.SubElement(analysesEl, "parse")
    tokenizationEl = ET.SubElement(analysesEl, "tokenization")
    parseEl.set("parser", "gold")
    parseEl.set("tokenizer", "gold")
    tokenizationEl.set("tokenizer", "gold")
    # One token element per word, indexed by word id for the POS updates below
    tokenById = {}
    for word in doc.words:
        tokenEl = ET.SubElement(tokenizationEl, "token")
        tokenEl.set("id", word.id)
        tokenEl.set("text", word.text)
        tokenEl.set("POS", "None")
        assert len(word.charOffsets) == 1, (word, word.charOffsets)
        tokenEl.set("charOffset", Range.tuplesToCharOffset(word.charOffsets))
        tokenById[word.id] = tokenEl
    for dep in doc.dependencies:
        depEl = ET.SubElement(parseEl, "dependency")
        depEl.set("id", dep.id)
        depEl.set("type", dep.type)
        assert len(dep.arguments) == 2
        t1 = dep.arguments[0].target.id
        t2 = dep.arguments[1].target.id
        depEl.set("t1", t1)
        depEl.set("t2", t2)
        if ":" in dep.type:
            # e.g. "label:NN-VB(...)" carries the POS tags of the two tokens
            word1Type, word2Type = dep.type.split("(")[0].split(":")[-1].split("-")
            tokenById[t1].set("POS", word1Type)
            tokenById[t2].set("POS", word2Type)
Ejemplo n.º 25
0
def makeEntityElement(ann, idCount, docEl):
    """Build an <entity> XML element from an annotation object.

    ann: annotation carrying type, text, offsets and source file type
    idCount: running index used to build the entity id within the document
    docEl: parent document element supplying the id / origId prefixes
    Returns the new entity element; "a1" annotations are marked as given.
    """
    entEl = ET.Element("entity")
    entEl.set("type", ann.type)
    entEl.set("text", ann.text)
    # identifiers
    entEl.set("id", docEl.get("id") + ".e" + str(idCount))
    if ann.id != None:
        entEl.set("origId", docEl.get("origId") + "." + str(ann.id))
    # offsets (alternative offsets are written end-inclusive)
    entEl.set("charOffset", Range.tuplesToCharOffset(ann.charOffsets))
    if len(ann.alternativeOffsets) > 0:
        altStrings = [str(alt[0]) + "-" + str(alt[1] - 1)
                      for alt in ann.alternativeOffsets]
        entEl.set("altOffset", ",".join(altStrings))
    if ann.normalization != None:
        entEl.set("normalization", ann.normalization)
    addExtraToElement(entEl, ann.extra)
    # Only "a1" annotations are the given (named) entities
    assert ann.fileType in ["a1", "a2", "rel"], ann.fileType
    if ann.fileType == "a1":
        entEl.set("given", "True")
    return entEl
Ejemplo n.º 26
0
def addParseElements(doc, docEl):
    """Attach gold tokenization and dependency parse elements to a sentence.

    Does nothing unless *docEl* is a sentence element. Tokens get POS "None"
    by default; dependency types of the form "...:A-B" carry the POS tags of
    their two argument tokens.
    """
    if docEl.tag != "sentence":
        return
    analysesEl = ET.SubElement(docEl, "analyses")
    parseEl = ET.SubElement(analysesEl, "parse")
    tokenizationEl = ET.SubElement(analysesEl, "tokenization")
    parseEl.set("parser", "gold")
    parseEl.set("tokenizer", "gold")
    tokenizationEl.set("tokenizer", "gold")
    tokenElById = {}
    for word in doc.words:
        tokenEl = ET.SubElement(tokenizationEl, "token")
        tokenEl.set("id", word.id)
        tokenEl.set("text", word.text)
        tokenEl.set("POS", "None")
        assert len(word.charOffsets) == 1, (word, word.charOffsets)
        tokenEl.set("charOffset", Range.tuplesToCharOffset(word.charOffsets))
        tokenElById[word.id] = tokenEl
    for dep in doc.dependencies:
        depEl = ET.SubElement(parseEl, "dependency")
        depEl.set("id", dep.id)
        depEl.set("type", dep.type)
        assert len(dep.arguments) == 2
        firstId = dep.arguments[0].target.id
        secondId = dep.arguments[1].target.id
        depEl.set("t1", firstId)
        depEl.set("t2", secondId)
        if ":" in dep.type:
            # e.g. "rel(x):NN-VB" -> POS tags "NN" and "VB"
            pos1, pos2 = dep.type.split("(")[0].split(":")[-1].split("-")
            tokenElById[firstId].set("POS", pos1)
            tokenElById[secondId].set("POS", pos2)
Ejemplo n.º 27
0
def makeEntityElement(ann, idCount, docEl):
    """Create an 'entity' XML element from annotation *ann* under *docEl*'s id space."""
    element = ET.Element("entity")
    element.set("type", ann.type)
    element.set("text", ann.text)
    # identifiers: document id + running entity count
    element.set("id", docEl.get("id") + ".e" + str(idCount))
    if ann.id != None:
        element.set("origId", docEl.get("origId") + "." + str(ann.id))
    # offsets (altOffset values use an inclusive end, so subtract one)
    element.set("charOffset", Range.tuplesToCharOffset(ann.charOffsets))
    if len(ann.alternativeOffsets) > 0:
        parts = []
        for begin, end in [(o[0], o[1]) for o in ann.alternativeOffsets]:
            parts.append(str(begin) + "-" + str(end - 1))
        element.set("altOffset", ",".join(parts))
    if ann.normalization != None:
        element.set("normalization", ann.normalization)
    addExtraToElement(element, ann.extra)
    # entities read from an a1 file are given (known) data
    assert ann.fileType in ["a1", "a2", "rel"], ann.fileType
    if ann.fileType == "a1":
        element.set("given", "True")
    return element
Ejemplo n.º 28
0
 def addSentence(self, sentenceGraph):
     """Accumulate interaction- and event-span statistics for one sentence.

     Spans are measured as the absolute distance between the (offset-sorted)
     indices of the two head tokens of an interaction; event spans cover the
     min..max token index touched by all interactions of one event entity.
     Updates self.interactionSpans/self.intSpan and
     self.eventSpans/self.eventSpan in place.
     """
     if sentenceGraph == None:
         return
     # Sort tokens by character offset so indices reflect surface order
     tokens = sorted([(Range.charOffsetToSingleTuple(x.get("charOffset")), x) for x in sentenceGraph.tokens])
     indexByTokenId = {tokens[i][1].get("id"):i for i in range(len(tokens))}
     assert len(indexByTokenId) == len(tokens) # check that there were no duplicate ids
     entityById = {x.get("id"):x for x in sentenceGraph.entities}
     events = {}
     for interaction in sentenceGraph.interactions:
         e1Id = interaction.get("e1")
         e2Id = interaction.get("e2")
         e1 = entityById[e1Id]
         e2 = entityById[e2Id]
         t1 = sentenceGraph.entityHeadTokenByEntity[e1]
         t2 = sentenceGraph.entityHeadTokenByEntity[e2]
         index1 = indexByTokenId[t1.get("id")]
         index2 = indexByTokenId[t2.get("id")]
         # Token-index distance between the two interaction arguments
         intSpan = abs(index1 - index2)
         self.interactionSpans[intSpan] = self.interactionSpans.get(intSpan, 0) + 1
         # NOTE(review): get() yields None for a missing key; in Python 2,
         # min(None, x) is None — assumes "min"/"max" are pre-initialized
         self.intSpan["min"] = min(self.intSpan.get("min"), intSpan)
         self.intSpan["max"] = max(self.intSpan.get("max"), intSpan)
         if interaction.get("event") == "True":
             # Track the token-index extent of each event (keyed by e1 id)
             if e1Id not in events:
                 events[e1Id] = {"min":9999, "max":-9999}
             events[e1Id]["min"] = min(events[e1Id]["min"], index1, index2)
             events[e1Id]["max"] = max(events[e1Id]["max"], index1, index2)
     for eventId in sorted(events.keys()):
         eventSpan = events[eventId]["max"] - events[eventId]["min"]
         self.eventSpans[eventSpan] = self.eventSpans.get(eventSpan, 0) + 1
         self.eventSpan["min"] = min(self.eventSpan.get("min"), eventSpan)
         self.eventSpan["max"] = max(self.eventSpan.get("max"), eventSpan)
Ejemplo n.º 29
0
 def prepareTokens(self, tokens):
     """Pair each token element with its (begin, end) character-offset tuple."""
     return [(Range.charOffsetToSingleTuple(token.get("charOffset")), token)
             for token in tokens]
Ejemplo n.º 30
0
    def getPatterns(self, e1, e2):
        """Collect word patterns before, between and after the entity pair.

        Returns three dicts (fore+between, between, between+after) built via
        self.addToPattern from lower-cased token texts plus up to two
        preceding token texts. Named-entity tokens are skipped, and the
        previous-token context resets whenever the relative position changes.
        """
        range1 = Range.charOffsetToSingleTuple(e1.get("charOffset"))
        range2 = Range.charOffsetToSingleTuple(e2.get("charOffset"))

        # Relative position ("Fore"/"Between"/"After"/...) of every token
        positionById = {}
        for token in self.sentenceGraph.tokens:
            positionById[token.get("id")] = self.getRelativePosition(
                range1, range2, token)

        foreBetween = {}
        between = {}
        betweenAfter = {}
        prevText = None
        prevText2 = None
        prevPosition = None
        for token in self.sentenceGraph.tokens:
            if self.sentenceGraph.tokenIsName[token]:
                continue

            tokenId = token.get("id")
            text = token.get("text").lower()
            position = positionById[tokenId]

            # Context does not carry across position-region boundaries
            if prevPosition != position:
                prevText = None
                prevText2 = None

            if position == "Fore":
                self.addToPattern(foreBetween, text, prevText, prevText2)
            elif position == "Between":
                self.addToPattern(foreBetween, text, prevText, prevText2)
                self.addToPattern(between, text, prevText, prevText2)
                self.addToPattern(betweenAfter, text, prevText, prevText2)
            elif position == "After":
                self.addToPattern(betweenAfter, text, prevText, prevText2)

            prevPosition = position
            prevText2 = prevText
            prevText = text

        return foreBetween, between, betweenAfter
Ejemplo n.º 31
0
def getPhraseDict(phrases):
    """Group phrase elements by their character-offset tuple.

    Returns a dict mapping (begin, end) offset tuple to the list of phrase
    elements occupying exactly that offset.
    """
    phraseDict = {}
    for phrase in phrases:
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        # setdefault replaces the removed-in-Python-3 has_key() check
        phraseDict.setdefault(phraseOffset, []).append(phrase)
    return phraseDict
Ejemplo n.º 32
0
def makeDETSubPhrases(phrases, tokens, phraseDict, filter=None):
    """Extend determiner-preceded phrases with a "DT-"-prefixed variant.

    For each phrase (optionally restricted to the types in *filter*) whose
    preceding token is a determiner (POS "DT"), create a new phrase that also
    covers the determiner. New phrases are added to *phraseDict* in place and
    returned as a list; offsets already present in phraseDict are skipped.
    """
    newPhrases = []
    for phrase in phrases:
        if filter != None and phrase.get("type") not in filter:
            continue
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        phraseBegin = int(phrase.get("begin"))
        phraseEnd = int(phrase.get("end"))
        if phraseBegin > 0 and tokens[phraseBegin - 1].get("POS") == "DT":
            # New phrase starts where the determiner token starts
            newPhraseOffset = (Range.charOffsetToSingleTuple(
                tokens[phraseBegin - 1].get("charOffset"))[0], phraseOffset[1])
            newPhrase = makePhrase("DT-" + phrase.get("type"), newPhraseOffset,
                                   phraseBegin - 1, phraseEnd)
            # "in" test replaces the removed-in-Python-3 has_key()
            if newPhraseOffset not in phraseDict:
                newPhrases.append(newPhrase)
                phraseDict[newPhraseOffset] = [newPhrase]
    return newPhrases
Ejemplo n.º 33
0
def getAttributes(element):
    """Copy an element's attributes, parsing any *offset* attribute into tuples.

    Offsets parsing to a single tuple are unwrapped from their list.
    """
    attributes = element.attrib.copy()
    for name in attributes:
        if "offset" not in name.lower():
            continue
        parsed = Range.charOffsetToTuples(attributes[name])
        attributes[name] = parsed[0] if len(parsed) == 1 else parsed
    return attributes
Ejemplo n.º 34
0
def getAttributes(element):
    """Return a copy of the element's attributes with offset strings parsed.

    Any attribute whose name contains "offset" is converted to offset tuples;
    a single tuple is stored unwrapped instead of as a one-element list.
    """
    result = {}
    for name, value in element.attrib.items():
        if "offset" in name.lower():
            tuples = Range.charOffsetToTuples(value)
            value = tuples[0] if len(tuples) == 1 else tuples
        result[name] = value
    return result
Ejemplo n.º 35
0
def getPhraseDict(phrases):
    """Index phrase elements by their (begin, end) character offset.

    Returns a dict from offset tuple to the list of phrases at that offset.
    """
    phraseDict = {}
    for phrase in phrases:
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        # setdefault replaces the removed-in-Python-3 has_key() pattern
        phraseDict.setdefault(phraseOffset, []).append(phrase)
    return phraseDict
Ejemplo n.º 36
0
def moveElements(document):
    """Move document-level entities and interactions into their sentences.

    Entities overlapping a sentence are re-parented under it, given
    sentence-scoped ids, and their charOffsets converted to
    sentence-relative coordinates (the document-level offset is kept in
    "origOffset"). Interactions are then moved into the sentence of their
    earlier argument and renumbered, with e1/e2 remapped to the new ids.
    """
    entMap = {}
    entSentence = {}
    entSentenceIndex = {}
    sentences = document.findall("sentence")
    sentenceCount = 0
    for sentence in sentences:
        sentenceOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        # Move entities
        entCount = 0
        for entity in document.findall("entity"):
            entityOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
            if Range.overlap(sentenceOffset, entityOffset):
                document.remove(entity)
                sentence.append(entity)
                entityId = entity.get("id")
                entityIdLastPart = entityId.rsplit(".", 1)[-1]
                if entityIdLastPart.startswith("e"):
                    # Id already has an entity part: rebase it onto the sentence id
                    entity.set("id", sentence.get("id") + "." + entityIdLastPart)
                    entMap[entityId] = sentence.get("id") + "." + entityIdLastPart
                else:
                    # No entity part: keep the old id in "docId" and renumber
                    entity.set("docId", entityId)
                    entity.set("id", sentence.get("id") + ".e" + str(entCount))
                    entMap[entityId] = sentence.get("id") + ".e" + str(entCount)
                entSentence[entityId] = sentence
                entSentenceIndex[entityId] = sentenceCount
                # Convert document-level offset to sentence-relative coordinates
                newEntityOffset = (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0])
                entity.set("origOffset", entity.get("charOffset"))
                entity.set("charOffset", str(newEntityOffset[0]) + "-" + str(newEntityOffset[1])) 
                entCount += 1
        sentenceCount += 1
    # Move interactions
    intCount = 0
    for interaction in document.findall("interaction"):
        # Place the interaction in whichever argument's sentence comes first
        if entSentenceIndex[interaction.get("e1")] < entSentenceIndex[interaction.get("e2")]:
            targetSentence = entSentence[interaction.get("e1")]
        else:
            targetSentence = entSentence[interaction.get("e2")]
        document.remove(interaction)
        targetSentence.append(interaction)
        interaction.set("id", targetSentence.get("id") + ".i" + str(intCount))
        interaction.set("e1", entMap[interaction.get("e1")])
        interaction.set("e2", entMap[interaction.get("e2")])
        intCount += 1
Ejemplo n.º 37
0
def makeDDI13SubmissionFile(input, output, mode="interactions", idfilter=None):
    """Write a DDI'13 shared task submission file from Interaction XML.

    input: Interaction XML (file path, tree or root element)
    output: path of the submission file to write
    mode: "entities" writes one line per non-negative entity;
          "interactions" writes one line per entity pair with its label
    idfilter: if given, only sentences whose id contains it are processed
    """
    xml = ETUtils.ETFromObj(input)
    # Context manager ensures the output file is closed even if writing fails
    # (the original leaked the handle on an exception)
    with open(output, "wt") as outFile:
        for sentence in xml.getiterator("sentence"):
            sentenceId = sentence.get("id")
            if idfilter != None and idfilter not in sentenceId:
                continue
            # Output entities
            if mode == "entities":
                for entity in sentence.findall("entity"):
                    if entity.get("type") != "neg":
                        outFile.write(sentenceId)
                        # Convert to the task's inclusive-end offset convention
                        offsets = Range.charOffsetToTuples(entity.get("charOffset"))
                        for i in range(len(offsets)):
                            offsets[i] = (offsets[i][0], offsets[i][1]-1)
                        outFile.write("|" + Range.tuplesToCharOffset(offsets, rangeSep=";"))
                        outFile.write("|" + entity.get("text"))
                        outFile.write("|" + entity.get("type"))
                        outFile.write("\n")
            if mode == "interactions":
                # First determine which pairs interact
                intMap = defaultdict(lambda:defaultdict(lambda:None))
                for interaction in sentence.findall("interaction"):
                    # Make mapping both ways to discard edge directionality. This isn't actually needed,
                    # since MultiEdgeExampleBuilder builds entity pairs in the same order as this function,
                    # but shouldn't harm to include it and now it works regardless of pair direction.
                    if interaction.get("type") != "neg" and interaction.get("given") != "True":
                        intMap[interaction.get("e1")][interaction.get("e2")] = interaction
                        intMap[interaction.get("e2")][interaction.get("e1")] = interaction
                # Then write all pairs to the output file
                entities = sentence.findall("entity")
                for i in range(0, len(entities)-1):
                    for j in range(i+1, len(entities)):
                        eIId = entities[i].get("id")
                        eJId = entities[j].get("id")
                        outFile.write(sentenceId + "|" + eIId + "|" + eJId + "|")
                        if intMap[eIId][eJId] != None:
                            interaction = intMap[eIId][eJId]
                            assert interaction.get("type") != "neg"
                            outFile.write("1|" + interaction.get("type") + "\n")
                        else:
                            outFile.write("0|null\n")
Ejemplo n.º 38
0
def makeDETSubPhrases(phrases, tokens, phraseDict, filter=None):
    """Create "DT-"-prefixed variants of phrases preceded by a determiner.

    A new phrase covering the determiner token plus the original phrase is
    built for every eligible phrase; phraseDict is updated in place and the
    newly created phrases are returned. *filter*, if given, restricts the
    processed phrase types.
    """
    newPhrases = []
    for phrase in phrases:
        if filter != None and phrase.get("type") not in filter:
            continue
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        phraseBegin = int(phrase.get("begin"))
        phraseEnd = int(phrase.get("end"))
        if phraseBegin > 0 and tokens[phraseBegin - 1].get("POS") == "DT":
            # Extend the span back to the start of the determiner token
            newPhraseOffset = (
                Range.charOffsetToSingleTuple(tokens[phraseBegin - 1].get("charOffset"))[0],
                phraseOffset[1],
            )
            newPhrase = makePhrase("DT-" + phrase.get("type"), newPhraseOffset, phraseBegin - 1, phraseEnd)
            # "not in" replaces the removed-in-Python-3 has_key()
            if newPhraseOffset not in phraseDict:
                newPhrases.append(newPhrase)
                phraseDict[newPhraseOffset] = [newPhrase]
    return newPhrases
Ejemplo n.º 39
0
def updateXML(root, removeAnalyses=True):
    """Migrate an old-format Interaction XML tree to the current format.

    Per sentence: optionally strips "sentenceanalyses" elements, assigns an
    artificial running charOffset so sentences form one continuous document,
    converts entity offsets from inclusive (begin, end) to exclusive
    (begin, end+1) while verifying them against the sentence text, and turns
    positive "pair" elements into "interaction" elements. Returns *root* and
    prints a summary of counts to stderr.
    """
    counts = defaultdict(int)
    for document in root.findall("document"):
        sentencePos = 0
        counts["documents"] += 1
        for sentence in document.findall("sentence"):
            counts["sentences"] += 1
            # Remove the original parses
            analyses = sentence.find("sentenceanalyses")
            if analyses != None:
                counts["analyses"] += 1
                if removeAnalyses:
                    counts["removed-analyses"] += 1
                    sentence.remove(analyses)
            # Add an artifical sentence offset so that sentences can be exported as a single document
            sentenceText = sentence.get("text")
            sentence.set("charOffset", Range.tuplesToCharOffset((sentencePos, sentencePos + len(sentenceText))))
            # Update the character offsets of all entities from the old format (begin,end) to the new one (begin,end+1)
            for entity in sentence.findall("entity"):
                counts["entities"] += 1
                offsets = [(x[0], x[1] + 1) for x in Range.charOffsetToTuples(entity.get("charOffset"))]
                entityText = entity.get("text")
                # Verify each converted offset against the sentence text,
                # consuming the entity text span by span
                for offset, entitySpan in zip(offsets, [sentenceText[x[0]:x[1]] for x in offsets]):
                    counts["entity-offsets"] += 1
                    lenOffset = offset[1] - offset[0]
                    offsetText, entityText = entityText[:lenOffset].strip(), entityText[lenOffset:].strip()
                    assert offsetText == entitySpan, (offsets, (entity.get("text"), entitySpan), (offsetText, entityText), sentenceText)
                entity.set("charOffset", Range.tuplesToCharOffset(offsets))
            # Convert positive pairs into interaction elements
            numInteractions = 0
            for pair in sentence.findall("pair"):
                counts["pairs"] += 1
                sentence.remove(pair)
                if pair.get("interaction") == "True":
                    del pair.attrib["interaction"]
                    pair.set("id", pair.get("id").rsplit(".", 1)[0] + ".i" + str(numInteractions))
                    pair.set("type", "PPI")
                    ET.SubElement(sentence, "interaction", pair.attrib)
                    numInteractions += 1
                    counts["interactions"] += 1
            # +1 accounts for a separator between consecutive sentences
            sentencePos += len(sentenceText) + 1
    print >> sys.stderr, "Updated Interaction XML format:", dict(counts)
    return root
Ejemplo n.º 40
0
def insertElements(corpus, specAnn):
    """Insert species annotation spans into their corpus sentences.

    Each span in specAnn[docId] whose document-level offset lies fully inside
    a sentence is removed from specAnn, has its offset converted to
    sentence-relative coordinates (verified against the sentence text), and
    is appended under the sentence's analyses/entities container.

    corpus: Interaction XML root element
    specAnn: dict mapping document origId to a list of span elements, each
             with an "offset" tuple attribute and a "text" attribute;
             consumed spans are removed from the lists in place
    """
    for document in corpus.iter('document'):
        docId = document.get("origId")
        assert docId in specAnn, docId
        for sentence in document.iter('sentence'):
            sentOffset = Range.charOffsetToSingleTuple(
                sentence.get("charOffset"))
            analyses = sentence.find("analyses")
            # Fix: must compare against None. An Element with no children is
            # falsy, so "if not analyses" wrongly created a duplicate
            # "analyses" element whenever an empty one already existed.
            if analyses is None:
                analyses = ET.SubElement(sentence, "analyses")
            # Find (or create) the container for the inserted spans
            container = analyses.find("entities")
            if container is None:
                container = ET.SubElement(analyses, "entities")
            # Map the spans; iterate over a copy since the list is mutated
            for span in specAnn[docId][:]:
                offset = span.get("offset")
                if Range.overlap(offset, sentOffset):
                    # Skip spans that only partially overlap the sentence
                    if sentOffset[0] > offset[0] or sentOffset[1] < offset[1]:
                        continue
                    specAnn[docId].remove(span)
                    charOffset = (offset[0] - sentOffset[0],
                                  offset[1] - sentOffset[0])
                    matchingText = sentence.get(
                        "text")[charOffset[0]:charOffset[1]]
                    spanText = span.get("text")
                    # Sanity check: the sentence slice must match the span text
                    assert matchingText == spanText, (matchingText, spanText,
                                                      charOffset)
                    span.set("charOffset",
                             "-".join([str(x) for x in charOffset]))
                    assert not "--" in span.get("charOffset"), [
                        str(x) for x in charOffset
                    ]
                    del span.attrib["offset"]
                    container.append(span)
Ejemplo n.º 41
0
def makeTokenSubPhrases(tokens, phraseDict, includePOS=["PRP$", "IN", "WP$"]):
    """Add single-token phrases for tokens whose POS tag is in *includePOS*.

    New phrases (type "TOK-t<POS>") are added to *phraseDict* in place for
    token offsets not already covered, and returned as a list.
    Note: the mutable default is safe here because includePOS is never mutated.
    """
    newPhrases = []
    for i, token in enumerate(tokens):
        tokPOS = token.get("POS")
        if tokPOS in includePOS:
            tokOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
            # "not in" replaces the removed-in-Python-3 has_key()
            if tokOffset not in phraseDict:
                newPhrase = makePhrase("TOK-t" + tokPOS, tokOffset, i, i)
                newPhrases.append(newPhrase)
                phraseDict[tokOffset] = [newPhrase]
    return newPhrases
Ejemplo n.º 42
0
def makeTokenSubPhrases(tokens, phraseDict, includePOS=["PRP$", "IN", "WP$"]):
    """Create one-token phrases for selected POS tags.

    For each token with a POS tag in *includePOS* whose offset is not yet in
    *phraseDict*, a "TOK-t<POS>" phrase spanning just that token is created,
    registered in phraseDict and collected into the returned list.
    Note: includePOS is read-only, so the list default is harmless.
    """
    newPhrases = []
    for i, token in enumerate(tokens):
        tokPOS = token.get("POS")
        if tokPOS in includePOS:
            tokOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
            # membership test replaces the removed-in-Python-3 has_key()
            if tokOffset not in phraseDict:
                newPhrase = makePhrase("TOK-t" + tokPOS, tokOffset, i, i)
                newPhrases.append(newPhrase)
                phraseDict[tokOffset] = [newPhrase]
    return newPhrases
Ejemplo n.º 43
0
 def getPatterns(self, e1, e2):
     """Build word patterns for the regions before, between and after e1/e2.

     Returns three dicts filled via self.addToPattern: tokens located
     fore-or-between the pair, strictly between, and between-or-after.
     Named-entity tokens are skipped; the two-token text context is reset
     whenever the relative position (per self.getRelativePosition) changes.
     """
     e1Range = Range.charOffsetToSingleTuple(e1.get("charOffset"))
     e2Range = Range.charOffsetToSingleTuple(e2.get("charOffset"))
     
     # Relative position of each token with respect to the entity pair
     tokenPositions = {}
     for token in self.sentenceGraph.tokens:
         tokenPositions[token.get("id")] = self.getRelativePosition(e1Range,e2Range,token)
     
     prevTokenText = None
     prevToken2Text = None
     prevPosition = None
     patternForeBetween = {}
     patternBetween = {}
     patternBetweenAfter = {}
     for token in self.sentenceGraph.tokens:
         if self.sentenceGraph.tokenIsName[token]:
             continue
             
         id = token.get("id")
         text = token.get("text").lower()
         
         # Context does not carry across position-region boundaries
         if prevPosition != tokenPositions[id]:
             prevTokenText = None
             prevToken2Text = None
         
         if tokenPositions[id] == "Fore":
             self.addToPattern(patternForeBetween, text, prevTokenText, prevToken2Text)
         elif tokenPositions[id] == "Between":
             self.addToPattern(patternForeBetween, text, prevTokenText, prevToken2Text)
             self.addToPattern(patternBetween, text, prevTokenText, prevToken2Text)
             self.addToPattern(patternBetweenAfter, text, prevTokenText, prevToken2Text)
         elif tokenPositions[id] == "After":
             self.addToPattern(patternBetweenAfter, text, prevTokenText, prevToken2Text)
         
         prevPosition = tokenPositions[id]
         #if tokenPositions[id].find("Entity") != -1:
         prevToken2Text = prevTokenText
         prevTokenText = text
 
     return patternForeBetween, patternBetween, patternBetweenAfter
Ejemplo n.º 44
0
    def writeXMLSentence(self,
                         examples,
                         predictionsByExample,
                         sentenceObject,
                         classSet,
                         classIds,
                         goldSentence=None,
                         exampleStyle=None,
                         structureAnalyzer=None):
        """Write predicted entities for one sentence back into its XML element.

        Detaches the sentence's analyses element, clears old pairs,
        interactions and non-name entities, then creates one new entity
        element per example using the prediction in *predictionsByExample*
        (keyed by example id), and finally re-attaches the analyses element.
        """
        self.assertSameSentence(examples)

        sentenceElement = sentenceObject.sentence
        sentenceId = sentenceElement.get("id")
        sentenceText = sentenceElement.get("text")
        # detach analyses-element (old "sentenceanalyses" name checked first)
        sentenceAnalysesElement = None
        sentenceAnalysesElement = sentenceElement.find("sentenceanalyses")
        if sentenceAnalysesElement == None:
            sentenceAnalysesElement = sentenceElement.find("analyses")
        if sentenceAnalysesElement != None:
            sentenceElement.remove(sentenceAnalysesElement)
        # remove pairs and interactions
        interactions = self.removeChildren(sentenceElement,
                                           ["pair", "interaction"])
        # remove entities, keeping the id counter past any existing entity ids
        newEntityIdCount = IDUtils.getNextFreeId(
            sentenceElement.findall("entity"))
        nonNameEntities = self.removeNonNameEntities(sentenceElement)

        # add new pairs
        for example in examples:
            prediction = predictionsByExample[example[0]]
            entityElement = ET.Element("entity")
            # example[3] holds the example's feature dict; resolve the head
            # token id to the actual token element
            headToken = example[3]["t"]
            for token in sentenceObject.tokens:
                if token.get("id") == headToken:
                    headToken = token
                    break
            entityElement.set("charOffset", example[3]["charOffset"])
            entityElement.set("headOffset", headToken.get("charOffset"))
            entityElement.set("phraseType", example[3]["ptype"])
            # Recover the entity text from the sentence by its offset
            entOffset = Range.charOffsetToSingleTuple(example[3]["charOffset"])
            entityElement.set("text", sentenceText[entOffset[0]:entOffset[1]])
            entityElement.set("id", sentenceId + ".e" + str(newEntityIdCount))
            self.setElementType(entityElement, prediction, classSet, classIds)
            newEntityIdCount += 1
            sentenceElement.append(entityElement)

        # re-attach the analyses-element
        if sentenceAnalysesElement != None:
            sentenceElement.append(sentenceAnalysesElement)
Ejemplo n.º 45
0
def fixIndices(phrases, tokens):
    """Re-align each phrase's begin/end token indices with the token offsets.

    A phrase's "begin" ("end") attribute is rewritten to the index of the
    token whose character offset starts (ends) at the phrase's start (end).
    """
    fixCount = 0
    phraseCount = 0
    for phrase in phrases:
        changed = False
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        beginIndex = int(phrase.get("begin"))
        endIndex = int(phrase.get("end"))
        for i, token in enumerate(tokens):
            tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
            if tokenOffset[0] == phraseOffset[0] and beginIndex != i:
                phrase.set("begin", str(i))
                changed = True
            if tokenOffset[1] == phraseOffset[1]:
                if endIndex != i:
                    phrase.set("end", str(i))
                    changed = True
                break
        if changed:
            fixCount += 1
        phraseCount += 1
Ejemplo n.º 46
0
def makeINSubPhrases(phrases, tokens, phraseDict, filter=None):
    """Create "-IN" subphrases ending just before a preposition.

    For each phrase (optionally restricted to types in *filter*), every "IN"
    token inside the phrase (except at its start) yields a new phrase that
    spans from the phrase start to the end of the preceding token. New
    phrases are registered in *phraseDict* in place and returned.
    """
    newPhrases = []
    for phrase in phrases:
        if filter != None and phrase.get("type") not in filter:
            continue
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        phraseBegin = int(phrase.get("begin"))
        phraseEnd = int(phrase.get("end"))
        prevToken = None
        tokCount = 0
        for token in tokens[phraseBegin : phraseEnd + 1]:
            if token.get("POS") == "IN" and prevToken != None:
                # Cut the phrase off at the end of the token before the "IN"
                newPhraseOffset = (phraseOffset[0], Range.charOffsetToSingleTuple(prevToken.get("charOffset"))[-1])
                newPhrase = makePhrase(
                    phrase.get("type") + "-IN", newPhraseOffset, phraseBegin, phraseBegin + tokCount - 1
                )
                # "not in" replaces the removed-in-Python-3 has_key()
                if newPhraseOffset not in phraseDict:
                    newPhrases.append(newPhrase)
                    phraseDict[newPhraseOffset] = [newPhrase]
            prevToken = token
            tokCount += 1
    return newPhrases
Ejemplo n.º 47
0
def fixIndices(phrases, tokens):
    """Correct phrase begin/end token indices to match actual token offsets.

    For each phrase, "begin" is set to the index of the token starting at
    the phrase's start offset and "end" to the index of the token ending at
    the phrase's end offset; the scan stops at the end-matching token.
    """
    fixCount = 0
    phraseCount = 0
    for phrase in phrases:
        fixed = False
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        phraseBegin = int(phrase.get("begin"))
        phraseEnd = int(phrase.get("end"))
        for i in range(len(tokens)):
            token = tokens[i]
            tokOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
            if tokOffset[0] == phraseOffset[0]:
                if phraseBegin != i:
                    phrase.set("begin", str(i))
                    fixed = True
            if tokOffset[1] == phraseOffset[1]:
                if phraseEnd != i:
                    phrase.set("end", str(i))
                    fixed = True
                break
        if fixed:
            fixCount += 1
        phraseCount += 1
Ejemplo n.º 48
0
def getEntityHeadToken(entity, tokens, tokenHeadScores):
    """Choose the syntactic head token for *entity*.

    Candidate tokens are those overlapping the entity's headOffset (or
    charOffset when no headOffset is set). A single candidate is returned
    directly; otherwise findHeadToken picks one using *tokenHeadScores*.
    Raises AssertionError if no head token can be determined.
    """
    if entity.get("headOffset") != None:
        charOffsets = Range.charOffsetToTuples(entity.get("headOffset"))
    elif entity.get("charOffset") != "":
        charOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
    else:
        charOffsets = []
    # Each entity can consist of multiple syntactic tokens, covered by its
    # charOffset-range. One of these must be chosen as the head token.
    headTokens = []  # potential head tokens
    for token in tokens:
        tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
        for offset in charOffsets:
            if Range.overlap(offset, tokenOffset):
                headTokens.append(token)
    if len(headTokens) == 1:  # An unambiguous head token was found
        selectedHeadToken = headTokens[0]
    else:  # One head token must be chosen from the candidates
        selectedHeadToken = findHeadToken(headTokens, tokenHeadScores)
    # Fix: the assert message referenced undefined 'entityElement', which
    # raised NameError instead of a useful AssertionError on failure
    assert selectedHeadToken != None, entity.get("id")
    return selectedHeadToken
Ejemplo n.º 49
0
 def getMetaMapFeatures(self, token, sentenceGraph, features):
     """Add features from MetaMap phrases that overlap *token*.

     Each overlapping phrase contributes its attributes (except charOffset
     and text) as "_metamap_*" features; the score attribute is scaled to a
     small positive weight. *features* is updated in place.
     """
     analyses = sentenceGraph.sentenceElement.find("analyses")
     if analyses == None:
         return
     metamap = analyses.find("metamap")
     if metamap == None:
         return
     tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
     ignored = set(["charOffset", "text"])
     for phrase in metamap.findall("phrase"):
         phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
         if not Range.overlap(tokenOffset, phraseOffset):
             continue
         attributes = phrase.attrib
         for name in sorted(attributes.keys()):
             if name in ignored:
                 continue
             elif name == "score":
                 # Scale the (possibly negative) score to a small weight
                 features["_metamap_score"] = 0.001 * abs(int(attributes[name]))
             else:
                 for value in attributes[name].split(","):
                     features["_metamap_" + name + "_" + value.replace(" ", "-")] = 1
Ejemplo n.º 50
0
def getEntityHeadToken(entity, tokens, tokenHeadScores):
    """Determine the head token of *entity* among *tokens*.

    Tokens overlapping the entity's headOffset (falling back to charOffset)
    are candidates; ties are resolved via findHeadToken with
    *tokenHeadScores*. Raises AssertionError when no head is found.
    """
    if entity.get("headOffset") != None:
        charOffsets = Range.charOffsetToTuples(entity.get("headOffset"))
    elif entity.get("charOffset") != "":
        charOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
    else:
        charOffsets = []
    # Each entity can consist of multiple syntactic tokens, covered by its
    # charOffset-range. One of these must be chosen as the head token.
    headTokens = [] # potential head tokens
    for token in tokens:
        tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
        for offset in charOffsets:
            if Range.overlap(offset, tokenOffset):
                headTokens.append(token)
    if len(headTokens)==1: # An unambiguous head token was found
        selectedHeadToken = headTokens[0]
    else: # One head token must be chosen from the candidates
        selectedHeadToken = findHeadToken(headTokens, tokenHeadScores)
    # Fix: assert message used undefined 'entityElement' (NameError on failure)
    assert selectedHeadToken != None, entity.get("id")
    return selectedHeadToken
Ejemplo n.º 51
0
def getHeads(corpus):
    """Count entity head texts per type across a corpus.

    Returns {entityType: {headText: count}}. The special "None" type counts
    the texts of tokens that are not the head of any entity.
    """
    corpus = ETUtils.ETFromObj(corpus)
    headDict = {}
    headDict["None"] = {}
    for sentence in corpus.getiterator("sentence"):
        # Fix: 'sentenceText' was undefined (NameError whenever an entity's
        # head differed from its full span); take it from the sentence element
        sentenceText = sentence.get("text")
        # Fix: 'tokens' was undefined (NameError at the final loop).
        # NOTE(review): assumes tokens are descendant <token> elements of the
        # sentence (analyses/tokenization) -- confirm against the schema
        tokens = sentence.getiterator("token")
        headOffsetStrings = set()
        for entity in sentence.findall("entity"):
            eType = entity.get("type")
            if eType not in headDict:
                headDict[eType] = {}
            eText = entity.get("text")
            headOffset = entity.get("headOffset")
            headOffsetStrings.add(headOffset)
            headOffset = Range.charOffsetToSingleTuple(headOffset)
            charOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
            if headOffset == charOffset:
                # Head covers the whole entity: count the entity text itself
                headDict[eType][eText] = headDict[eType].get(eText, 0) + 1
            else:
                # Slice the head text out of the sentence (inclusive end)
                headText = sentenceText[headOffset[0] - charOffset[0]:headOffset[1] - charOffset[0] + 1]
                headDict[eType][headText] = headDict[eType].get(headText, 0) + 1
        for token in tokens:
            if not token.get("charOffset") in headOffsetStrings:  # token is not the head of any entity
                headText = token.get("text")
                headDict["None"][headText] = headDict["None"].get(headText, 0) + 1

    return headDict
Ejemplo n.º 52
0
def fixAltOffsets(input, output=None):
    """Rebase entity altOffsets from document to sentence coordinates.

    For each entity that has an "altOffset" attribute, the offset is shifted
    by subtracting the start of the enclosing sentence's charOffset, making
    altOffset sentence-relative. The corpus is modified in place.

    input -- corpus file name or parsed ElementTree (anything ETUtils accepts)
    output -- optional path; when given the fixed corpus is also written there
    Returns the modified corpus ElementTree.
    """
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    sentences = [x for x in corpusRoot.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "FixAltOffsets")
    fixCount = 0
    # fix spans
    for sentence in sentences:
        counter.update(
            1, "Fixing AltOffsets for sentence (" + sentence.get("id") + "): ")
        # sentence start in document coordinates; the rebasing origin
        sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        for entity in sentence.findall("entity"):
            altOffsetString = entity.get("altOffset")
            if altOffsetString is None:
                continue
            altOffsets = Range.charOffsetToTuples(altOffsetString)
            # multi-span altOffsets are not expected in this corpus
            assert len(altOffsets) == 1, altOffsetString
            altOffsets = [(begin - sentOffset[0], end - sentOffset[0])
                          for (begin, end) in altOffsets]
            entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))
            fixCount += 1

    print >> sys.stderr, "Fixed", fixCount, "altOffsets"

    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Ejemplo n.º 53
0
def removeNamedEntityPhrases(entities, phrases, phraseDict):
    """Filter out phrases whose span coincides with a given (named) entity.

    A phrase whose charOffset string matches that of any entity marked
    given="True" is dropped from the returned list and, if present, also
    removed from phraseDict (which is keyed by offset tuples). All other
    phrases are returned in their original order.
    """
    # Character offsets of all pre-annotated ("given") entities
    givenOffsets = set(e.get("charOffset") for e in entities
                       if e.get("given") == "True")
    kept = []
    for candidate in phrases:
        offset = candidate.get("charOffset")
        if offset not in givenOffsets:
            kept.append(candidate)
            continue
        # phrase covers a named entity: purge it from the offset-keyed dict
        offsetKey = Range.charOffsetToSingleTuple(offset)
        if offsetKey in phraseDict:
            del phraseDict[offsetKey]
    return kept
Ejemplo n.º 54
0
def removeNamedEntityPhrases(entities, phrases, phraseDict):
    """Drop phrases that exactly cover a pre-annotated named entity.

    Collects the charOffset strings of all entities marked given="True";
    phrases with a matching span are excluded from the result and their
    offset-tuple keys are deleted from phraseDict. Everything else is kept
    in input order.
    """
    neSpans = set()
    for e in entities:
        if e.get("given") == "True":
            neSpans.add(e.get("charOffset"))
    retained = []
    for ph in phrases:
        span = ph.get("charOffset")
        if span in neSpans:
            # remove the phrase's entry from the offset-keyed dict, if any
            phraseDict.pop(Range.charOffsetToSingleTuple(span), None)
        else:
            retained.append(ph)
    return retained
Ejemplo n.º 55
0
 def markNamedEntities(self, entityElements):
     """Mark tokens covered by named entities.

     For every entity element, each token in self.tokensById whose
     character span overlaps one of the entity's (possibly discontinuous,
     comma-separated) charOffset spans gets the entity's id appended to
     its 'entities' list. Returns the ids of all tokens marked, with
     repeats when a token overlaps several spans.
     """
     markedTokenIds = []
     for entityElement in entityElements:
         # charOffset may contain several comma-separated "from-to" spans
         spans = []
         for spanText in entityElement.attrib["charOffset"].split(","):
             begin, end = spanText.split("-")
             spans.append((int(begin), int(end)))
         for tokenId, token in self.tokensById.iteritems():
             for span in spans:
                 if Range.overlap(span, token.charOffset):
                     token.entities.append(entityElement.attrib["id"])
                     markedTokenIds.append(token.id)
     return markedTokenIds
Ejemplo n.º 56
0
 def markNamedEntities(self, entityElements):
     """Append entity ids to every token overlapping a named entity span.

     Each entity's charOffset attribute is parsed into one or more
     (from, to) integer pairs; any token in self.tokensById whose span
     overlaps any pair is tagged with the entity's id. Returns a list of
     the ids of all tagged tokens (repeated per overlapping span).
     """
     taggedIds = []
     for element in entityElements:
         ranges = []
         # parse the comma-separated "from-to" span list
         for piece in element.attrib["charOffset"].split(","):
             start, stop = piece.split("-")
             ranges.append((int(start), int(stop)))
         for identifier, tok in self.tokensById.iteritems():
             for rng in ranges:
                 if Range.overlap(rng, tok.charOffset):
                     tok.entities.append(element.attrib["id"])
                     taggedIds.append(tok.id)
     return taggedIds
Ejemplo n.º 57
0
    def writeXMLSentence(self, examples, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=None, exampleStyle=None, structureAnalyzer=None):
        """Write classifier predictions back into a sentence element as entities.

        Detaches the sentence's analyses element, removes existing
        pair/interaction elements and non-name entities, appends one new
        entity element per predicted example, then re-attaches the analyses.

        examples -- examples for this sentence; example[0] is the example id
            and example[3] a dict with "t" (head token id), "charOffset" and
            "ptype" (phrase type)
        predictionsByExample -- maps example id -> prediction, used to set
            each new entity's type via setElementType
        sentenceObject -- wrapper exposing .sentence (XML element) and .tokens
        classSet, classIds -- label mappings forwarded to setElementType
        goldSentence, exampleStyle, structureAnalyzer -- unused here;
            presumably part of a shared writer interface (TODO confirm)
        """
        self.assertSameSentence(examples)

        sentenceElement = sentenceObject.sentence
        sentenceId = sentenceElement.get("id")
        sentenceText = sentenceElement.get("text")
        # detach analyses-element (element name varies between corpus versions)
        sentenceAnalysesElement = None
        sentenceAnalysesElement = sentenceElement.find("sentenceanalyses")
        if sentenceAnalysesElement == None:
            sentenceAnalysesElement = sentenceElement.find("analyses")
        if sentenceAnalysesElement != None:
            sentenceElement.remove(sentenceAnalysesElement)
        # remove pairs and interactions
        # NOTE(review): 'interactions' and 'nonNameEntities' below are
        # captured but never used in this method
        interactions = self.removeChildren(sentenceElement, ["pair", "interaction"])
        # remove entities; new ids continue from the highest existing one
        newEntityIdCount = IDUtils.getNextFreeId(sentenceElement.findall("entity"))
        nonNameEntities = self.removeNonNameEntities(sentenceElement)

        # add one new entity element per predicted example
        for example in examples:
            prediction = predictionsByExample[example[0]]
            entityElement = ET.Element("entity")
            #entityElement.attrib["given"] = "False"
            # resolve the head token id into the actual token element
            headToken = example[3]["t"]
            for token in sentenceObject.tokens:
                if token.get("id") == headToken:
                    headToken = token
                    break
            entityElement.set("charOffset", example[3]["charOffset"])
            entityElement.set("headOffset", headToken.get("charOffset"))
            entityElement.set("phraseType", example[3]["ptype"])
            # entity text is sliced out of the sentence by its charOffset
            entOffset = Range.charOffsetToSingleTuple(example[3]["charOffset"])
            entityElement.set("text", sentenceText[entOffset[0]:entOffset[1]])
            entityElement.set("id", sentenceId + ".e" + str(newEntityIdCount))
            self.setElementType(entityElement, prediction, classSet, classIds)
            newEntityIdCount += 1
            sentenceElement.append(entityElement)

        # re-attach the analyses-element
        if sentenceAnalysesElement != None:
            sentenceElement.append(sentenceAnalysesElement)
Ejemplo n.º 58
0
def convert(metamapEl, sentenceEl):
    """
    Convert MetaMap XML into phrase-elements.

    Walks every Utterance in metamapEl, turns each of its Phrases into a
    <phrase> element with a sentence-relative charOffset, copies attributes
    from the first Candidate (if any), and returns a new <metamap> element
    containing only the phrases that had a matched candidate. Warns on
    stderr when a sentence has more than one utterance.
    """
    result = ET.Element("metamap")  # make a new metamap element
    numUtterances = 0
    for utteranceEl in metamapEl.getiterator("Utterance"):  # one per sentence, normally
        numUtterances += 1
        uttStart = int(utteranceEl.find("UttStartPos").text)
        for phraseXml in utteranceEl.getiterator("Phrase"):
            newPhrase = ET.Element("phrase")
            begin = int(phraseXml.find("PhraseStartPos").text)
            end = begin + int(phraseXml.find("PhraseLength").text)
            # rebase the span to utterance (sentence) coordinates
            relOffset = [begin - uttStart, end - uttStart]
            newPhrase.set("charOffset", Range.tuplesToCharOffset(relOffset))
            newPhrase.set("text", phraseXml.find("PhraseText").text)
            for cand in phraseXml.getiterator("Candidate"):
                # only the first (best) candidate is used
                newPhrase.set("score", cand.find("CandidateScore").text)
                newPhrase.set("cui", cand.find("CandidateCUI").text)
                newPhrase.set("matched", cand.find("CandidateMatched").text)
                newPhrase.set("preferred", cand.find("CandidatePreferred").text)
                semTypes = set(x.text for x in cand.getiterator("SemType"))
                newPhrase.set("semTypes", ",".join(sorted(semTypes)))
                sources = set(x.text for x in cand.getiterator("Source"))
                newPhrase.set("sources", ",".join(sorted(sources)))
                break
            # include only matched phrases as new elements
            if newPhrase.get("matched") is not None:
                result.append(newPhrase)

    if numUtterances > 1:
        print >> sys.stderr, "Warning, sentence", sentenceEl.get("id"), "has", numUtterances, "utterances"
    return result
Ejemplo n.º 59
0
 def addSentence(self, sentenceGraph):
     if sentenceGraph == None:
         return
     tokens = sorted([(Range.charOffsetToSingleTuple(x.get("charOffset")),
                       x) for x in sentenceGraph.tokens])
     indexByTokenId = {
         tokens[i][1].get("id"): i
         for i in range(len(tokens))
     }
     assert len(indexByTokenId) == len(
         tokens)  # check that there were no duplicate ids
     entityById = {x.get("id"): x for x in sentenceGraph.entities}
     events = {}
     for interaction in sentenceGraph.interactions:
         e1Id = interaction.get("e1")
         e2Id = interaction.get("e2")
         e1 = entityById[e1Id]
         e2 = entityById[e2Id]
         t1 = sentenceGraph.entityHeadTokenByEntity[e1]
         t2 = sentenceGraph.entityHeadTokenByEntity[e2]
         index1 = indexByTokenId[t1.get("id")]
         index2 = indexByTokenId[t2.get("id")]
         intSpan = abs(index1 - index2)
         self.interactionSpans[intSpan] = self.interactionSpans.get(
             intSpan, 0) + 1
         self.intSpan["min"] = min(self.intSpan.get("min"), intSpan)
         self.intSpan["max"] = max(self.intSpan.get("max"), intSpan)
         if interaction.get("event") == "True":
             if e1Id not in events:
                 events[e1Id] = {"min": 9999, "max": -9999}
             events[e1Id]["min"] = min(events[e1Id]["min"], index1, index2)
             events[e1Id]["max"] = max(events[e1Id]["max"], index1, index2)
     for eventId in sorted(events.keys()):
         eventSpan = events[eventId]["max"] - events[eventId]["min"]
         self.eventSpans[eventSpan] = self.eventSpans.get(eventSpan, 0) + 1
         self.eventSpan["min"] = min(self.eventSpan.get("min"), eventSpan)
         self.eventSpan["max"] = max(self.eventSpan.get("max"), eventSpan)