def loadEventXML(path, verbose=False):
    # Group events by the text of the sentence they occur in
    xml = ETUtils.ETFromObj(path)
    sentDict = {}
    for sentence in xml.getiterator("sentence"):
        sentenceText = getText(sentence).strip()
        if sentenceText not in sentDict:
            sentDict[sentenceText] = []

    for event in xml.getiterator("event"):
        sentenceText = getText(event).strip()
        if sentenceText not in sentDict:
            sentDict[sentenceText] = []
        events = sentDict[sentenceText]

        clue = event.find("clue")
        clueTuple = getClue(clue)
        eventType = event.find("type").get("class")
        if eventType == "Protein_amino_acid_phosphorylation":
            eventType = "Phosphorylation"
        if isinstance(clueTuple, str):
            # getClue returned a bare string, i.e. a clue without a clue type
            if verbose:
                print "Event", eventType, "clue with no clueType:", ETUtils.toStr(clue)
        else:
            # clueTuple is (clueText, beginOffset, endOffset); verify the offsets
            # really point at the clue text within the sentence
            assert sentenceText[clueTuple[1]:clueTuple[2]+1] == clueTuple[0], (sentenceText, sentenceText[clueTuple[1]:clueTuple[2]+1], clueTuple)
            # Use a new name so the loop variable "event" is not shadowed
            eventTuple = (clueTuple[1], clueTuple[2], eventType, clueTuple[0])
            if eventTuple not in events:
                events.append(eventTuple)
    return sentDict
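
# A minimal usage sketch for loadEventXML. The file name "events.xml" is an
# assumption, and getText/getClue/ETUtils are the surrounding TEES-style
# helpers this code already relies on.
def printLoadedEvents(path="events.xml"):
    sentDict = loadEventXML(path, verbose=True)
    for sentText in sorted(sentDict.keys()):
        # Each event is (beginOffset, endOffset, eventType, clueText)
        for begin, end, eventType, clueText in sentDict[sentText]:
            print eventType, "'" + clueText + "'", "at", (begin, end)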
def processCorpus(input, parserName):
    # Report how entities align with parse phrases, per phrase type
    print >> sys.stderr, "Loading corpus file", input
    corpusRoot = ETUtils.ETFromObj(input).getroot()
    documents = corpusRoot.findall("document")

    counts = defaultdict(int)
    matchByType = defaultdict(lambda: [0, 0])  # phrase type -> [seen, matched]
    filteredMatchByType = defaultdict(lambda: [0, 0])
    # Renamed from "filter" so the builtin is not shadowed
    filterTypes = set(["NP", "TOK-tIN", "WHADVP", "WHNP", "TOK-tWP$", "TOK-tPRP$", "NP-IN"])

    #    # fix spans
    #    for document in documents:
    #        for sentence in document.findall("sentence"):
    #            sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
    #            for entity in sentence.findall("entity"):
    #                altOffsetString = entity.get("altOffset")
    #                if altOffsetString == None:
    #                    continue
    #                #print altOffsetString
    #                altOffsets = Range.charOffsetToTuples(altOffsetString)
    #                assert len(altOffsets) == 1
    #                for i in range(len(altOffsets)):
    #                    altOffset = altOffsets[i]
    #                    altOffsets[i] = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])
    #                entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))

    # counter = ProgressCounter(len(documents), "Documents")
    for document in documents:
        for sentence in document.findall("sentence"):
            entities = sentence.findall("entity")
            parse = ETUtils.getElementByAttrib(sentence.find("sentenceanalyses"), "parse", {"parser": parserName})
            if parse is None:
                continue
            tokenization = ETUtils.getElementByAttrib(
                sentence.find("sentenceanalyses"), "tokenization", {"tokenizer": parse.get("tokenizer")}
            )
            phrases, phraseDict = makePhrases(parse, tokenization, entities)
            phraseOffsets = phraseDict.keys()
            # phraseOffsets.sort()
            phraseNECounts = getNECounts(phrases, entities)

            for value in phraseDict.values():
                counts["phrases"] += len(value)
                for phrase in value:
                    matchByType[phrase.get("type")][0] += 1
                    if phrase.get("type") in filter:
                        filteredMatchByType[phrase.get("type")][0] += 1
                        counts["phrases-filtered"] += 1
                    if phrase.get("type").find("NP") != -1:
                        matchByType[phrase.get("type") + "_NE" + str(phraseNECounts[phrase])][0] += 1
            counts["tokens"] += len(tokenization.findall("token"))

            corefType = {}
            for interaction in sentence.findall("interaction"):
                if interaction.get("type") == "Coref":
                    corefType[interaction.get("e1")] = "Anaphora"
                    corefType[interaction.get("e2")] = "Antecedent"

            for entity in entities:
                if entity.get("given") == "True":
                    continue
                counts["entity"] += 1
                print "entity", entity.get("id")
                print ETUtils.toStr(entity)
                matches = getMatchingPhrases(entity, phraseOffsets, phraseDict)
                count = 0
                filteredCount = 0
                for phrase in matches:
                    cType = "UNKNOWN"
                    if corefType.has_key(entity.get("id")):
                        cType = corefType[entity.get("id")]
                    print "  match", count, ETUtils.toStr(phrase), "NE" + str(
                        phraseNECounts[phrase]
                    ), "ctype:" + cType, "ent:" + ETUtils.toStr(entity)
                    count += 1
                    matchByType[phrase.get("type")][1] += 1
                    matchByType[phrase.get("type") + "_" + cType][1] += 1
                    matchByType[phrase.get("type") + "_" + cType + "_NE" + str(phraseNECounts[phrase])][1] += 1
                    if phrase.get("type") in filter:
                        filteredCount += 1
                        filteredMatchByType[phrase.get("type")][1] += 1
                # Matching
                if count == 0:
                    print "  NO MATCH", ETUtils.toStr(entity)
                    counts["no-match"] += 1
                else:
                    counts["match"] += 1
                # Multimatching
                if len(matches) > 1:
                    bestMatch = selectBestMatch(entity, matches)
                    print "  MULTIMATCH(" + entity.get("charOffset") + "," + str(
                        entity.get("altOffset")
                    ) + ")", ", ".join(
                        [x.get("type") + "_" + x.get("charOffset") for x in matches]
                    ), "SEL(" + bestMatch.get(
                        "type"
                    ) + "_" + bestMatch.get(
                        "charOffset"
                    ) + ")"
                # Filtered matching
                if filteredCount == 0:
                    counts["no-match-filtered"] += 1
                else:
                    counts["match-filtered"] += 1
    print "Match"
    for key in sorted(matchByType.keys()):
        print "  ", key, " ", matchByType[key]
    print "Filtered", filteredMatchByType
    print "Counts", counts
def run(input, output=None, elementName="entity", processElement="document", splitNewlines=False, debug=False, pathnerPath=None, trovePath=None):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    
    # Write text to input file
    workdir = tempfile.mkdtemp()
    if debug:
        print >> sys.stderr, "PathNER work directory at", workdir
    
    infilePath = os.path.join(workdir, "pathner-in.txt")
    infile = codecs.open(infilePath, "wt", "utf-8")
    outfilePath = os.path.join(workdir, "pathner-out.txt")
    idCount = 0

    # Put sentences in dictionary
    sDict = {}
    sentenceHasEntities = {}
    sCount = 0
    for sentence in corpusRoot.getiterator(processElement):
        #infile.write("U" + str(idCount) + " " + sentence.get("text").replace("\n", " ").replace("\n", " ") + "\n")
        infile.write(sentence.get("text").replace("\n", " ").replace("\n", " ") + "\n")
        idCount += 1
        sDict["U" + str(sCount)] = sentence
        sentenceHasEntities["U" + str(sCount)] = False
        sCount += 1

    infile.close()
    
    # Define classpath for java
    if pathnerPath is None:
        pathnerPath = Settings.PATHNER_DIR
    libPath = "/lib/"

    if debug:
        print >> sys.stderr, "Directory of PathNER:", pathnerPath
    pathnerJarPath = pathnerPath + "/PathNER.jar"
    assert os.path.exists(pathnerJarPath), pathnerPath

    # Note: this classpath is assembled but never used; the jar is run with "-jar" below
    classPath = pathnerPath + "/bin"
    classPath += ":" + pathnerPath + libPath + "*"
    
    # Run parser
    print >> sys.stderr, "Running PathNER", pathnerJarPath
    cwd = os.getcwd()
    os.chdir(pathnerPath)

    args = Settings.JAVA.split() + ["-jar", pathnerJarPath, "--test", infilePath, "--output", outfilePath]

    print >> sys.stderr, "PathNER command:", " ".join(args)
    startTime = time.time()
    exitCode = subprocess.call(args)
    assert exitCode == 0, exitCode
    print >> sys.stderr, "PathNER time:", str(datetime.timedelta(seconds=time.time()-startTime))
    os.chdir(cwd)
    
    sentencesWithEntities = 0
    totalEntities = 0
    nonSplitCount = 0
    splitEventCount = 0
    pathnerEntityCount = 0
    removedEntityCount = 0
    
    # Use a simple approach here: read the PathNER results, then match them back into the sentences
    
    # Read PathNER results
    print >> sys.stderr, "Inserting entities"

    sentenceEntityCount = {}
    #mentionfile = codecs.open(os.path.join(workdir, "file_test_result.txt"), "rt", "utf-8")
    #outfilePath = pathnerPath + "/" + outfilePath
    print >>sys.stderr, 'Getting PathNER results from', outfilePath

    if os.path.isfile(outfilePath): #pathway mentions detected

        mentionfile = codecs.open(outfilePath, "rt", "utf-8")
        menDict = {}
        menSet = set()
        for line in mentionfile:
            #bannerId, offsets, word = line.strip().split("|", 2)
            pathNerTag, mention, pathNerId, confidence = line.strip().split("\t")
            menDict[mention] = pathNerId
            menSet.add(mention)
        mentionfile.close()

        if debug:
            print menSet
        # Count of pathway entities added
        epCount = 0
        for sentence in corpusRoot.getiterator(processElement):
            #infile.write("U" + str(idCount) + " " + sentence.get("text").replace("\n", " ").replace("\n", " ") + "\n")
            sentText = sentence.get("text").replace("\n", " ").replace("\n", " ") + "\n"
            startOffsets = []
            endOffsets = []

            bannerEntities = sentence.findall("entity")
            bannerEntityCount = 0

            for bannerEntity in bannerEntities:
                source = bannerEntity.get('source')
                text = bannerEntity.get('text')

                if source != 'BANNER':
                    print source, text

                bannerEntityCount += 1

            startOffset = 0
            endOffset = 0
            bannerEntity2removed = set()

            for mention in menSet:
                starts = [match.start() for match in re.finditer(re.escape(mention), sentText)]

                #print 'Finding PathNER mention:', mention, starts

                for startOffset in starts:
                    endOffset = startOffset + len(mention)

                    # Safeguard only; re.finditer offsets are never negative
                    if startOffset < 0:
                        continue

                    entities = makeEntityElements(int(startOffset), int(endOffset), sentence.get("text"), splitNewlines, elementName)

                    for ent in entities:
                        # Check the new entity against the existing BANNER entities for overlap
                        
                        entOffsets = ent.get("charOffset").split('-')
                        entStart = int(entOffsets[0])
                        entEnd = int(entOffsets[1])

                        for bannerEntity in bannerEntities:
                
                            bannerOffsets = bannerEntity.get('charOffset').split('-')
                            bannerStart = int(bannerOffsets[0])
                            bannerEnd = int(bannerOffsets[1])

                            if debug:
                                print 'PathNER entity:', entStart, entEnd, 'Banner entity:', bannerStart, bannerEnd

                            # Do the offsets overlap?
                            if entEnd <= bannerStart or bannerEnd <= entStart:  # not overlapping
                                continue
                            else:  # overlapping, so mark the BANNER entity for removal
                                bannerEntity2removed.add(bannerEntity)

                        bannerEntityCount += 1
                        ent.set("id", sentence.get("id") + ".e" + str(bannerEntityCount))
                        epCount += 1

                        sentence.append(ent)
                        pathnerEntityCount += 1
                        
                        if debug:
                            print 'Adding PathNER result:', mention
                            print ETUtils.toStr(sentence)
                        
            # Now actually delete the overlapping BANNER entities
            for bEntity in bannerEntity2removed:
                removedEntityCount += 1
                sentence.remove(bEntity)

                if debug:
                    print 'Removing entity', bEntity.get('text'), bEntity.get('id')
                    print ETUtils.toStr(sentence)

        print >> sys.stderr, "PathNER found", pathnerEntityCount, "entities and remove ", removedEntityCount, " overlapping BANNER entities. "
        print >> sys.stderr, "(" + str(sCount) + " sentences processed)"
        print >> sys.stderr, "New", elementName + "-elements:", totalEntities, "(Split", splitEventCount, "PathNER entities with newlines)"
    
    # Remove work directory
    if not debug:
        shutil.rmtree(workdir)
    else:
        print >> sys.stderr, "PathNER working directory for debugging at", workdir
        
    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
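
# A minimal end-to-end sketch for run(). The file names are assumptions, and
# PathNER must be installed at Settings.PATHNER_DIR (or passed via pathnerPath).
if __name__ == "__main__":
    run("corpus-in.xml", output="corpus-out.xml", splitNewlines=True, debug=True)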