Example #1
 def load(self,
          input,
          dataSetNames=None,
          corpusName=None,
          output=None,
          extensions=None):
     if isinstance(input, basestring) and input.isdigit():
         return self.downloadPubmed(input, output)
     elif isinstance(input, basestring) and (os.path.isdir(input)
                                             or input.endswith(".tar.gz")
                                             or input.endswith(".txt")
                                             or "," in input):
         return self.convert(input,
                             dataSetNames,
                             corpusName,
                             output,
                             extensions=extensions)
     elif isinstance(input, basestring) and not os.path.exists(input):
         fullPath = os.path.join(Settings.CORPUS_DIR, input)
         print >> sys.stderr, "Loading installed corpus from", fullPath
         if os.path.exists(fullPath):
             return ETUtils.ETFromObj(fullPath)
         else:
             #setPaths = [fullPath + x for x in ("-train.xml", "-devel.xml", "-test.xml")]
             pattern = input + ".+\.xml"  #"|".join([input + x for x in ("-train.xml", "-devel.xml", "-test.xml")])
             matching = Utils.InteractionXML.MergeSets.getMatchingFiles(
                 pattern, Settings.CORPUS_DIR)
             if len(matching) == 0:
                 matching = Utils.InteractionXML.MergeSets.getMatchingFiles(
                     pattern)
             if len(matching) > 0:  #any([os.path.exists(x) for x in setPaths]):
                 return Utils.InteractionXML.MergeSets.mergeSets(pattern)
             else:
                 raise Exception("Cannot find input '" + str(input) + "'")
     elif isinstance(input, dict):
         return Utils.InteractionXML.MergeSets.mergeSets(input)
     else:
         print >> sys.stderr, "Processing source as interaction XML"
         return ETUtils.ETFromObj(input)
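
A minimal usage sketch of the branching above; the paths and the GE09 corpus name are hypothetical, and loader stands for an instance of the surrounding class:

# 'loader': an instance of the surrounding class (hypothetical name)
xml = loader.load("10086389")                         # digit string: fetched from PubMed
xml = loader.load("corpus.tar.gz", output="out.xml")  # existing archive/dir/.txt: converted
xml = loader.load("GE09")                             # otherwise: resolved against Settings.CORPUS_DIR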
Example #2
def convertXML(xml, outPath):
    xml = ETUtils.ETFromObj(xml)
    corpusObj = {"name": None, "children": []}
    root = xml.getroot()
    for document in root.getiterator("document"):
        docObj = addChild(corpusObj, document)
        for sentence in document.getiterator("sentence"):
            sentObj = addChild(docObj, sentence)
            for elType in ("entity", "interaction"):
                for element in sentence.getiterator(elType):
                    addChild(sentObj, element)
    with open(outPath, "wt") as f:
        json.dump(corpusObj, f, indent=2, cls=IJSONEncoder)
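
A one-line usage sketch with a hypothetical input path:

convertXML("GE09-devel.xml", "GE09-devel.json")  # nested corpus/document/sentence objects as indented JSON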
Example #3
def validateCorpus(input, output, strict=True):
    print >> sys.stderr, "Validating XML"
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()
    
    counts = validate(corpusRoot, strict)
    print >> sys.stderr, "Corpus validated:", dict(counts)
    
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Example #4
def negateEvents(input, output=None, verbose=False):
    if not (ET.iselement(input) and input.tag == "sentence"):
        print >> sys.stderr, "Loading corpus file", input
        corpusTree = ETUtils.ETFromObj(input)
        corpusRoot = corpusTree.getroot()

    if not (ET.iselement(input) and input.tag == "sentence"):
        sentences = corpusRoot.getiterator("sentence")
    else:
        sentences = [input]
    counts = defaultdict(int)
    for sentence in sentences:
        for entity in sentence.findall("entity"):
            counts["all-entities"] += 1
            eType = entity.get("type")
            if not isNegatableEPITrigger(eType):
                counts["out-of-scope"] += 1
                continue
            eBaseType = getEPIBaseType(eType)
            eText = entity.get("text").lower()
            eNewType = determineNewType(eType, eText)

            # Insert changed charOffset
            counts["entities"] += 1
            if verbose:
                print "Entity", entity.get("id"), [entity.get("text")], [
                    eType, eBaseType, eNewType
                ],
            if eNewType != eBaseType:
                counts["negated"] += 1
                if verbose: print "NEGATED",
            if eNewType == eType:
                counts["correct"] += 1
                if verbose: print "CORRECT"
            else:
                counts["incorrect"] += 1
                if eNewType == eBaseType:
                    counts["incorrect-pos"] += 1
                else:
                    counts["incorrect-neg"] += 1
                if verbose: print "INCORRECT"
            entity.set("type", eNewType)
    if verbose:
        print counts

    if not (ET.iselement(input) and input.tag == "sentence"):
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusRoot, output)
        return corpusTree
Example #5
def makeDDI13SubmissionFile(input, output, mode="interactions", idfilter=None):
    xml = ETUtils.ETFromObj(input)
    outFile = open(output, "wt")
    for sentence in xml.getiterator("sentence"):
        sentenceId = sentence.get("id")
        if idfilter != None and idfilter not in sentenceId:
            continue
        # Output entities
        if mode == "entities":
            for entity in sentence.findall("entity"):
                if entity.get("type") != "neg":
                    outFile.write(sentenceId)
                    offsets = Range.charOffsetToTuples(
                        entity.get("charOffset"))
                    for i in range(len(offsets)):
                        offsets[i] = (offsets[i][0], offsets[i][1] - 1)
                    outFile.write(
                        "|" + Range.tuplesToCharOffset(offsets, rangeSep=";"))
                    outFile.write("|" + entity.get("text"))
                    outFile.write("|" + entity.get("type"))
                    outFile.write("\n")
        if mode == "interactions":
            # First determine which pairs interact
            intMap = defaultdict(lambda: defaultdict(lambda: None))
            for interaction in sentence.findall("interaction"):
                # Make the mapping both ways to discard edge directionality. This isn't actually needed,
                # since MultiEdgeExampleBuilder builds entity pairs in the same order as this function,
                # but it shouldn't hurt to include it, and it makes the code work regardless of pair direction.
                if interaction.get("type") != "neg" and interaction.get(
                        "given") != "True":
                    intMap[interaction.get("e1")][interaction.get(
                        "e2")] = interaction
                    intMap[interaction.get("e2")][interaction.get(
                        "e1")] = interaction
            # Then write all pairs to the output file
            entities = sentence.findall("entity")
            for i in range(0, len(entities) - 1):
                for j in range(i + 1, len(entities)):
                    eIId = entities[i].get("id")
                    eJId = entities[j].get("id")
                    outFile.write(sentenceId + "|" + eIId + "|" + eJId + "|")
                    if intMap[eIId][eJId] != None:
                        interaction = intMap[eIId][eJId]
                        assert interaction.get("type") != "neg"
                        outFile.write("1|" + interaction.get("type") + "\n")
                    else:
                        outFile.write("0|null\n")
    outFile.close()
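
The file written above is pipe-delimited; a sketch of the two row formats, with hypothetical ids and values:

# mode="entities":     sentenceId|charOffset|text|type
#   DDI-DrugBank.d0.s0|0-6|aspirin|drug
# mode="interactions": sentenceId|e1|e2|flag|type
#   DDI-DrugBank.d0.s0|DDI-DrugBank.d0.s0.e0|DDI-DrugBank.d0.s0.e1|1|effect
#   DDI-DrugBank.d0.s0|DDI-DrugBank.d0.s0.e0|DDI-DrugBank.d0.s0.e2|0|null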
Example #6
def processCorpus(inputFilename, outputFilename, rules, reverse=False):
    print >> sys.stderr, "Deleting elements, rules =", rules
    print >> sys.stderr, "Loading corpus file", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()

    countsByType = defaultdict(int)
    removeElements(corpusRoot, rules, reverse, countsByType)

    print >> sys.stderr, "Deleted elements"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ":", countsByType[k]

    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
Example #7
def loadCorpus(filename,
               parse=None,
               tokenization=None,
               removeIntersentenceInteractions=True,
               removeNameInfo=False):
    try:
        import xml.etree.cElementTree as ET
    except ImportError:
        import cElementTree as ET
    import sys, types, gzip

    if type(filename) == types.StringType:
        print >> sys.stderr, "Loading corpus file", filename
    corpusTree = ETUtils.ETFromObj(filename)
    corpusRoot = corpusTree.getroot()
    return CorpusElements(corpusRoot, parse, tokenization,
                          removeIntersentenceInteractions, corpusTree,
                          removeNameInfo)
Example #8
def getTriggers(corpus):
    """
    Returns a dictionary of "entity type"->"entity text"->"count"
    """
    corpus = ETUtils.ETFromObj(corpus)
    trigDict = {}
    for entity in corpus.getroot().getiterator("entity"):
        if entity.get("given") == "True":
            continue
        eType = entity.get("type")
        if not trigDict.has_key(eType):
            trigDict[eType] = {}
        eText = entity.get("text")
        eText = PorterStemmer.stem(eText)
        if not trigDict[eType].has_key(eText):
            trigDict[eType][eText] = 0
        trigDict[eType][eText] += 1
    return trigDict
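
A sketch of consuming the returned dictionary (the corpus path is hypothetical):

import sys

trigDict = getTriggers("GE09-train.xml")
for eType in sorted(trigDict.keys()):
    # The inner dictionary maps a stemmed trigger string to its occurrence count
    print >> sys.stderr, eType + ":", sum(trigDict[eType].values()), "trigger occurrences"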
Example #9
def mergeSets(input, corpusDir=None, output=None, allowNone=False):
    # Find the files
    if isinstance(input, dict):
        filenames = [{"path":input[x], "set":x} for x in input]
    else:
        if corpusDir == None:
            if os.path.dirname(input):
                corpusDir = os.path.dirname(input)
                input = os.path.basename(input)
            else:
                corpusDir = os.path.normpath(Settings.DATAPATH + "/corpora")
        print >> sys.stderr, "Searching for corpus files at " + corpusDir + " using pattern " + input
        filenames = [{"path":os.path.join(corpusDir, x), "set":None} for x in getMatchingFiles(input, corpusDir)]
    
    # Merge the files
    print >> sys.stderr, "Merging input files", filenames
    if len(filenames) == 0:
        if allowNone:
            print >> sys.stderr, "Nothing to merge"
            return
        else:
            raise Exception("No input files found for merging")
    newRoot = None
    counts = defaultdict(int)
    for filename in filenames:
        print >> sys.stderr, "Merging file", filename["path"]
        xml = ETUtils.ETFromObj(filename["path"]).getroot()
        if newRoot == None:
            newRoot = ET.Element("corpus", xml.attrib)
        else:
            assert newRoot.attrib == xml.attrib
        for doc in xml.iter("document"):
            assert doc.get("set") != None, doc.attrib
            if filename["set"] != None:
                assert filename["set"] == doc.get("set")
            counts["set=" + doc.get("set")] += 1
            counts["set(" + filename["path"] + ")=" + doc.get("set")] += 1
        for element in xml:
            newRoot.append(element)
    print >> sys.stderr, dict(counts)
    if output != None:
        print "Writing merged corpus to", output
        ETUtils.write(newRoot, output)
    return ET.ElementTree(newRoot)
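
mergeSets() accepts either an explicit set-to-file mapping or a filename pattern; a sketch with hypothetical paths:

# Explicit mapping; each document's "set" attribute is checked against the dictionary key
merged = mergeSets({"train": "GE09-train.xml", "devel": "GE09-devel.xml"}, output="GE09-both.xml")

# Pattern, resolved against corpusDir (or the default corpora directory under Settings.DATAPATH)
merged = mergeSets("GE09-.+\.xml", corpusDir="/data/corpora", output="GE09-all.xml")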
Example #10
def mixSets(input, output, docOrigIds, sourceSet, targetSet):
    print >> sys.stderr, "Mixing Sets", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()

    if docOrigIds != None:
        for document in corpusRoot.getiterator("document"):
            docId = document.get("pmid")
            if docId == None:
                docId = document.get("origId")
            if docId in docOrigIds:
                assert document.get("set") == sourceSet
                document.set("set", targetSet)
                docOrigIds.remove(docId)
        assert len(docOrigIds) == 0, docOrigIds

    sentenceIds = None
    if sentenceIds != None:
        for document in corpusRoot.getiterator("document"):
            removed = []
            for sentence in document.findall("sentence"):
                assert document.get("set") == sourceSet
                sentenceId = sentence.get("id")
                if sentenceId in sentenceIds:
                    document.remove(sentence)
                    removed.append(sentence)
                    sentenceIds.remove(sentenceId)
            if len(removed) > 0:
                newDoc = ET.Element("document")
                for attr in document.attrib:
                    newDoc.set(attr, document.get(attr))
                newDoc.set("id", None)
                newDoc.set("set", targetSet)
                for sentence in removed:
                    newDoc.append(sentence)
                corpusRoot.append(newDoc)
        assert len(sentenceIds) == 0, sentenceIds

        RecalculateIds.recalculateIds(corpusTree, onlyWithinSentence=False)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Example #11
def processCorpus(input, output, rules):
    print >> sys.stderr, "Deleting attributes, rules =", rules
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()

    countsByType = {}
    for key in sorted(rules.keys()):
        for attribute in rules[key]:
            countsByType[key + ":" + attribute] = 0
        removeAttributes(corpusRoot, key, rules[key], countsByType)

    print >> sys.stderr, "Removed"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ":", countsByType[k]

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Example #12
def processCorpus(input, output, rules):
    if rules == None:
        raise Exception("No mapping rules defined")
    elif isinstance(rules, basestring):
        rules = eval(rules)
    print >> sys.stderr, "Mapping attributes, rules =", rules
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()

    counts = defaultdict(int)
    for key in sorted(rules.keys()):
        mapAttributes(corpusRoot, key, rules[key], counts)

    print >> sys.stderr, "Mapped", dict(counts)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Example #13
def parseStats(input):
    print >> sys.stderr, "Loading input file", input
    inputTree = ETUtils.ETFromObj(input)
    inputRoot = inputTree.getroot()
    counts = defaultdict(int)
    for sentence in inputRoot.getiterator("sentence"):
        counts["sentence"] += 1
        analysesElement = sentence.find("sentenceanalyses")
        if analysesElement == None:
            counts["sentence-no-analyses"] += 1
            continue
        # Create parses element (if needed)
        parsesElement = analysesElement.find("parses")
        if parsesElement == None:
            counts["sentence-no-parses"] += 1
            continue
        # Loop through parses
        for parseElement in parsesElement:
            parserName = parseElement.get("parser")
            counts["parse:" + parserName] += 1
            if parseElement.get("pennstring") in ["", None]:
                counts["parse:" + parserName + "(no penn)"] += 1
            if len(parseElement.findall("dependency")) == 0:
                counts["parse:" + parserName + "(no dependencies)"] += 1
            if len(parseElement.findall("phrase")) == 0:
                counts["parse:" + parserName + "(no phrases)"] += 1
        # Tokenizations
        tokenizationsElement = analysesElement.find("tokenizations")
        if tokenizationsElement == None:
            counts["sentence-no-tokenizations"] += 1
            continue
        # Loop through tokenizations
        for tokenizationElement in tokenizationsElement:
            tokenizerName = tokenizationElement.get("tokenizer")
            counts["tokenization:" + tokenizerName] += 1
            if len(tokenizationElement.findall("token")) == 0:
                counts["tokenization:" + tokenizerName + "(no tokens)"] += 1

    print >> sys.stderr, "Parse statistics for", input
    for key in sorted(counts.keys()):
        print >> sys.stderr, " ", key + ":", counts[key]
Example #14
def visualize(inPath, outPath, sentId, parseName):
    setSVGOptions()
    
    xml = ETUtils.ETFromObj(inPath)
    sentences = {x.get("id"):x for x in xml.iter("sentence")}
    if sentId not in sentences:
        print >> sys.stderr, "Sentence id '" + sentId + "' not found"
        return
    sentence = sentences[sentId]
    parse = IXMLUtils.getParseElement(sentence, parseName)
    if not parse:
        print >> sys.stderr, "Sentence has no parse with name '" + parseName + "'"
        return
    
    tokenization = IXMLUtils.getTokenizationElement(sentence, parse.get("tokenizer"))
    graph = SentenceGraph(sentence, [x for x in tokenization.findall("token")], [x for x in parse.findall("dependency")])
    graph.mapInteractions([x for x in sentence.findall("entity")], [x for x in sentence.findall("interaction")])
    svgTokens = tokensToSVG(tokenization.findall("token"))
    svgEdges = edgesToSVG(svgTokens, graph)
    #writeSVG({x.id:x for x in svgTokens}, svgEdges, outPath)
    writeSVG(svgTokens, svgEdges, outPath)
Example #15
def catenateElements(inputs, inputDir):
    print >> sys.stderr, "##### Catenate interaction XML as elements #####"

    output = {}
    for dataSet in ("devel", "train"):
        root = ET.Element("corpus", {"source": ",".join(inputs)})
        tree = ET.ElementTree(root)
        print "Processing corpus dataset", dataSet
        output[dataSet] = tree
        for input in inputs:
            corpusPath = os.path.join(inputDir, input + "-" + dataSet + ".xml")
            print >> sys.stderr, "Catenating", corpusPath
            if not os.path.exists(corpusPath):
                print "Input", corpusPath, "not found"
                continue
            xml = ETUtils.ETFromObj(corpusPath)
            for document in xml.getiterator("document"):
                root.append(document)
        RecalculateIds.recalculateIds(tree)

    return output
Example #16
def findHeads(corpus, stringsFrom, methods, parse, tokenization):
    for m in methods:
        assert m in ["REMOVE", "SYNTAX", "DICT"]
    corpus = ETUtils.ETFromObj(corpus)
    counts = {}
    for method in methods:
        print >> sys.stderr, method, "pass"
        if method == "REMOVE":
            counts[method] = removeHeads(corpus)
        elif method == "DICT":
            counts[method] = findHeadsDictionary(corpus, stringsFrom, parse,
                                                 tokenization)
        elif method == "SYNTAX":
            counts[method] = findHeadsSyntactic(corpus, parse, tokenization)
        print >> sys.stderr, method, "pass added", counts[method][
            0], "and removed", counts[method][1], "heads"

    print >> sys.stderr, "Summary (pass/added/removed):"
    for method in methods:
        print >> sys.stderr, " ", method, "/", counts[method][0], "/", counts[
            method][1]
Example #17
def toSTFormat(input,
               output=None,
               outputTag="a2",
               useOrigIds=False,
               debug=False,
               skipArgs=[],
               validate=True,
               writeExtra=False,
               allAsRelations=False):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    nonEntitySiteCount = 0
    documents = []
    for document in corpusRoot.findall("document"):
        stDoc = Document()
        stDoc.id = document.get("pmid")
        if stDoc.id == None:
            stDoc.id = document.get("origId")
        addTextToSTDoc(stDoc, document)
        documents.append(stDoc)
        eMap = {}
        tMap = {}
        entityElementMap = {}  # for task 3
        addEntitiesToSTDoc(stDoc, document, tMap, eMap, entityElementMap,
                           useOrigIds)
        addInteractionsToSTDoc(stDoc, document, tMap, eMap, entityElementMap,
                               skipArgs, allAsRelations)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        writeSet(documents,
                 output,
                 resultFileTag=outputTag,
                 debug=debug,
                 writeExtra=writeExtra)
    return documents
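
A usage sketch with hypothetical paths, converting a corpus to BioNLP Shared Task format files:

documents = toSTFormat("GE09-devel.xml", output="/tmp/st-output", outputTag="a2")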
Example #18
def splitMergedElements(inputFilename, outputFilename=None):
    print >> sys.stderr, "##### Split elements with merged types #####"
    print >> sys.stderr, "Loading corpus", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()

    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {"entity": [0, 0], "interaction": [0, 0], "pair": [0, 0]}
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, countsByType)
    print >> sys.stderr, "Results"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ": removed", countsByType[k][
            0], "created", countsByType[k][1]

    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
Example #19
def convertAndEvaluate(xml, task, a2Tag, goldDir=None, debug=False):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(xml)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    if task == None:
        task = corpusRoot.get("source")

    print >> sys.stderr, "BioNLP Shared Task evaluation for task", task
    tempdir = tempfile.mkdtemp()
    subTasks = None
    if "." in task:
        mainTask, subTasks = task.split(".")
        subTasks = [int(x) for x in subTasks]
    if subTasks != None:
        for subTask in subTasks:
            skipArgs = []
            skipModifiers = False
            if subTask == 1:
                skipArgs = ['AtLoc', 'ToLoc', 'Site']
                skipModifiers = True
            elif subTask == 2:
                skipModifiers = True
            print >> sys.stderr, "---------------", "Converting task", task, "corpus for GENIA sub task", subTask, "---------------"
            print >> sys.stderr, "Skipping arguments:", skipArgs, " Skipping modifiers:", skipModifiers
            outDir = os.path.join(tempdir, "events-" + str(subTask))
            Utils.STFormat.ConvertXML.toSTFormat(corpusTree,
                                                 outDir,
                                                 outputTag=a2Tag,
                                                 skipArgs=skipArgs,
                                                 skipModifiers=skipModifiers)
            evaluate(outDir, mainTask + "." + str(subTask), goldDir, debug)
    else:
        outDir = os.path.join(tempdir, "events")
        print >> sys.stderr, "---------------", "Converting task", task, "corpus", "---------------"
        Utils.STFormat.ConvertXML.toSTFormat(xml, outDir, outputTag=a2Tag)
        evaluate(outDir, task, goldDir, debug)
    shutil.rmtree(tempdir)
    return xml
Example #20
def loadDrugBank(filename, preTag="{http://drugbank.ca}", verbose=False):
    data = defaultdict(lambda : defaultdict(list))
    print "Loading DrugBank XML"
    xml = ETUtils.ETFromObj(filename)
    print "Processing DrugBank XML"
    root = xml.getroot()
    assert root.tag == preTag+"drugs", root.tag
    for drug in root.findall(preTag+"drug"):
        id = drug.find(preTag+"drugbank-id").text
        name = drug.find(preTag+"name").text
        if verbose: print id, name
        assert id not in data
        data[id]["name"] = name
        # TODO: Enzymes & targets
        # TODO: hydrophobicity
        getNestedItems(drug, "synonym", data[id], preTag)
        getNestedItems(drug, "brand", data[id], preTag)
        getNestedItems(drug, "group", data[id], preTag)
        getNestedItems(drug, "category", data[id], preTag, "categories")
        interactions = drug.find(preTag+"drug-interactions").findall(preTag+"drug-interaction")
        for interaction in interactions:
            data[id]["interaction"].append( [interaction.find(preTag+"drug").text, None, interaction.find(preTag+"description").text,] )
    return data
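
The returned structure is a nested dictionary keyed by DrugBank id; a sketch with a hypothetical id:

drugs = loadDrugBank("drugbank.xml")
print drugs["DB00001"]["name"]
for partner, _, description in drugs["DB00001"]["interaction"]:
    # each entry holds the interacting drug's name, a None placeholder, and the description text
    print partner + ":", description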
Example #21
def getHeads(corpus):
    corpus = ETUtils.ETFromObj(corpus)
    headDict = {}
    headDict["None"] = {}
    for sentence in corpus.getiterator("sentence"):
        headOffsetStrings = set()
        for entity in sentence.findall("entity"):
            eType = entity.get("type")
            if not headDict.has_key(eType):
                headDict[eType] = {}
            eText = entity.get("text")
            headOffset = entity.get("headOffset")
            headOffsetStrings.add(headOffset)
            headOffset = Range.charOffsetToSingleTuple(headOffset)
            charOffset = Range.charOffsetToSingleTuple(
                entity.get("charOffset"))
            if headOffset == charOffset:
                if not headDict[eType].has_key(eText):
                    headDict[eType][eText] = 0
                headDict[eType][eText] += 1
            else:
                headText = sentenceText[headOffset[0] -
                                        charOffset[0]:headOffset[1] -
                                        charOffset[0] + 1]
                if not headDict[eType].has_key(headText):
                    headDict[eType][headText] = 0
                headDict[eType][headText] += 1
        for token in tokens:
            if token.get("charOffset") not in headOffsetStrings:  # token is not the head of any entity
                headText = token.get("text")
                if not headDict["None"].has_key(headText):
                    headDict["None"][headText] = 0
                headDict["None"][headText] += 1

    return headDict
Example #22
    def analyze(self, inputs, model=None, verbose=False):
        self._init()
        if type(inputs) in types.StringTypes or not isinstance(
                inputs, collections.Sequence):
            inputs = [inputs]
        for xml in inputs:
            print >> sys.stderr, "Analyzing", xml
            xml = ETUtils.ETFromObj(xml)

            for document in xml.getiterator("document"):
                # Collect elements into dictionaries
                entityById = {}
                for entity in document.getiterator("entity"):
                    entityById[entity.get("id")] = entity
                interactions = []
                interactionsByE1 = defaultdict(list)
                for interaction in document.getiterator("interaction"):
                    interactions.append(interaction)
                    interactionsByE1[interaction.get("e1")].append(interaction)
                siteOfTypes = self.buildSiteOfMap(interactions,
                                                  interactionsByE1, entityById)
                # Add entity elements to analysis
                for entity in document.getiterator("entity"):
                    self.addEntityElement(entity, interactionsByE1)
                # Add interaction elements to analysis
                for interaction in interactions:
                    self.addInteractionElement(interaction, entityById,
                                               siteOfTypes[interaction])
                # Calculate event definition argument limits from event instances
                for event in self.events.values():
                    event.countArguments()

        self._updateSupportingAnalyses()
        if verbose:
            print >> sys.stderr, self.toString()
        if model != None:
            self.save(model)
Example #23
def makeSubset(input, output=None, ratio=1.0, seed=0, invert=False):
    if ratio == 1.0:
        if output != None:
            shutil.copy2(input, output)
            return output
        else:
            return input
    print >> sys.stderr, "====== Making subset ======"
    print >> sys.stderr, "Subset for ", input, "ratio", ratio, "seed", seed
    xml = ETUtils.ETFromObj(input).getroot()
    count = 0
    sentCount = 0
    for document in xml.findall("document"):
        sentCount += len(document.findall("sentence"))
        count += 1
    totalFolds = min(100, count)
    selectedFolds = int(ratio * min(100, count))
    division = Core.Split.getFolds(count, totalFolds, seed)
    #print division, selectedFolds - 1
    index = 0
    removeCount = 0
    sentRemoveCount = 0
    for document in xml.findall("document"):
        removal = division[index] > selectedFolds - 1
        if invert:
            removal = not removal
        if removal:
            xml.remove(document)
            sentRemoveCount += len(document.findall("sentence"))
            removeCount += 1
        index += 1
    print >> sys.stderr, "Subset", "doc:", count, "removed:", removeCount, "sent:", sentCount, "sentremoved:", sentRemoveCount
    xml.set("subsetRatio", str(ratio))
    xml.set("subsetSeed", str(seed))
    if output != None:
        ETUtils.write(xml, output)
    return output
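
A sketch: keep a deterministic half of the documents, then get the complementary half by inverting the selection (paths hypothetical):

half = makeSubset("GE09-train.xml", "GE09-train-A.xml", ratio=0.5, seed=0)
rest = makeSubset("GE09-train.xml", "GE09-train-B.xml", ratio=0.5, seed=0, invert=True)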
Example #24
def fixAltOffsets(input, output=None):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    docCount = 0
    sentencesCreated = 0
    sentences = [x for x in corpusRoot.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "FixAltOffsets")
    fixCount = 0
    # fix spans
    for sentence in sentences:
        counter.update(
            1, "Fixing AltOffsets for sentence (" + sentence.get("id") + "): ")
        sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        for entity in sentence.findall("entity"):
            altOffsetString = entity.get("altOffset")
            if altOffsetString == None:
                continue
            #print altOffsetString
            altOffsets = Range.charOffsetToTuples(altOffsetString)
            assert len(altOffsets) == 1
            for i in range(len(altOffsets)):
                altOffset = altOffsets[i]
                altOffsets[i] = (altOffset[0] - sentOffset[0],
                                 altOffset[1] - sentOffset[0])
            entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))
            fixCount += 1

    print >> sys.stderr, "Fixed", fixCount, "altOffsets"

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Example #25
    def writeXML(self,
                 examples,
                 predictions,
                 corpus,
                 outputFile,
                 classSet=None,
                 parse=None,
                 tokenization=None,
                 goldCorpus=None,
                 exampleStyle=None,
                 structureAnalyzer=None):
        """
        Writes task 3 examples to interaction XML. Assumes task 3 classification
        is done with SVMMulticlass Classifier, used for two classes.
        """
        print >> sys.stderr, "Adding task 3 to Interaction XML"
        examples, predictions = self.loadExamples(examples, predictions)

        if type(classSet) == types.StringType:  # class names are in file
            classSet = IdSet(filename=classSet)
        classIds = None
        if classSet != None:
            classIds = classSet.getIds()

        corpusTree = ETUtils.ETFromObj(corpus)
        corpusRoot = corpusTree.getroot()

        # Determine subtask
        task3Type = None
        for example in examples:
            assert example[3].has_key("t3type")
            task3Type = example[3]["t3type"]
            break
        if task3Type == None:
            if outputFile != None:
                print >> sys.stderr, "Writing corpus to", outputFile
                ETUtils.write(corpusRoot, outputFile)
            return corpusTree
        assert task3Type in ["multiclass", "speculation", "negation"]

        # Remove the task 3 subtask information if it already exists
        for entity in corpusRoot.getiterator("entity"):
            if task3Type == "multiclass":
                entity.set("speculation", "False")
                entity.set("negation", "False")
            elif task3Type == "speculation":
                entity.set("speculation", "False")
            else:  # task3Type == "negation"
                entity.set("negation", "False")

        specMap = {}
        negMap = {}
        for example, prediction in itertools.izip(examples, predictions):
            assert example[3]["xtype"] == "task3"
            if example[3]["t3type"] == "multiclass":
                if isinstance(prediction, dict):
                    encoded = prediction["prediction"]
                    predictedModifiers = [
                        classSet.getName(i) for i in range(len(encoded))
                        if encoded[i] == 1
                    ]
                else:
                    predictedClassName = classSet.getName(prediction[0])
                    predictedModifiers = ""
                    if predictedClassName != "neg":
                        predictedModifiers = predictedClassName.split("---")
                if "negation" in predictedModifiers:
                    assert not negMap.has_key(example[3]["entity"])
                    negMap[example[3]["entity"]] = (True, prediction)
                if "speculation" in predictedModifiers:
                    assert not specMap.has_key(example[3]["entity"])
                    specMap[example[3]["entity"]] = (True, prediction)
            else:
                if example[3]["t3type"] == "speculation":
                    map = specMap
                else:
                    map = negMap
                if prediction[0] != 1:
                    assert not map.has_key(example[3]["entity"])
                    map[example[3]["entity"]] = (True, prediction)
                else:
                    assert not map.has_key(example[3]["entity"])
                    map[example[3]["entity"]] = (False, prediction)

        for entity in corpusRoot.getiterator("entity"):
            eId = entity.get("id")
            if task3Type == "multiclass":
                if specMap.has_key(eId):
                    entity.set("speculation", str(specMap[eId][0]))
                    entity.set(
                        "modConf",
                        self.getPredictionStrengthString(
                            specMap[eId][1], classSet, classIds))
                if negMap.has_key(eId):
                    entity.set("negation", str(negMap[eId][0]))
                    entity.set(
                        "modConf",
                        self.getPredictionStrengthString(
                            negMap[eId][1], classSet, classIds))
            else:
                if task3Type == "speculation":
                    if specMap.has_key(eId):
                        entity.set("speculation", str(specMap[eId][0]))
                        entity.set(
                            "specConf",
                            self.getPredictionStrengthString(
                                specMap[eId][1], classSet, classIds, [""]))
                elif task3Type == "negation":
                    if negMap.has_key(eId):
                        entity.set("negation", str(negMap[eId][0]))
                        entity.set(
                            "negConf",
                            self.getPredictionStrengthString(
                                negMap[eId][1], classSet, classIds,
                                ["", "speculation"]))

        # Write corpus
        if outputFile != None:
            print >> sys.stderr, "Writing corpus to", outputFile
            ETUtils.write(corpusRoot, outputFile)
        return corpusTree
    optparser.add_option("-d",
                         "--debug",
                         default=False,
                         action="store_true",
                         dest="debug",
                         help="Debug mode")
    optparser.add_option("-v",
                         "--validate",
                         default=None,
                         dest="validate",
                         help="validate input",
                         metavar="FILE")
    (options, args) = optparser.parse_args()

    s = StructureAnalyzer()
    if options.load:
        s.load(None, options.input)
    else:
        s.analyze(options.input.split(","))
    print >> sys.stderr, "--- Structure Analysis ----"
    print >> sys.stderr, s.toString()
    if options.validate != None:
        print >> sys.stderr, "--- Validation ----"
        xml = ETUtils.ETFromObj(options.validate)
        s.validate(xml, simulation=False, debug=options.debug)
        if options.output != None:
            ETUtils.write(xml, options.output)
    elif options.output != None:
        print >> sys.stderr, "Structure analysis saved to", options.output
        s.save(None, options.output)
Example #27
def processCorpus(input, parserName):
    print >> sys.stderr, "Loading corpus file", input
    corpusRoot = ETUtils.ETFromObj(input).getroot()
    documents = corpusRoot.findall("document")

    counts = defaultdict(int)
    matchByType = defaultdict(lambda: [0, 0])
    filteredMatchByType = defaultdict(lambda: [0, 0])
    filter = set(
        ["NP", "TOK-tIN", "WHADVP", "WHNP", "TOK-tWP$", "TOK-tPRP$", "NP-IN"])

    #    # fix spans
    #    for document in documents:
    #        for sentence in document.findall("sentence"):
    #            sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
    #            for entity in sentence.findall("entity"):
    #                altOffsetString = entity.get("altOffset")
    #                if altOffsetString == None:
    #                    continue
    #                #print altOffsetString
    #                altOffsets = Range.charOffsetToTuples(altOffsetString)
    #                assert len(altOffsets) == 1
    #                for i in range(len(altOffsets)):
    #                    altOffset = altOffsets[i]
    #                    altOffsets[i] = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])
    #                entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))

    #counter = ProgressCounter(len(documents), "Documents")
    for document in documents:
        for sentence in document.findall("sentence"):
            entities = sentence.findall("entity")
            parse = ETUtils.getElementByAttrib(
                sentence.find("sentenceanalyses"), "parse",
                {"parser": parserName})
            if parse == None:
                continue
            tokenization = ETUtils.getElementByAttrib(
                sentence.find("sentenceanalyses"), "tokenization",
                {"tokenizer": parse.get("tokenizer")})
            phrases, phraseDict = makePhrases(parse, tokenization, entities)
            phraseOffsets = phraseDict.keys()
            #phraseOffsets.sort()
            phraseNECounts = getNECounts(phrases, entities)

            for value in phraseDict.values():
                counts["phrases"] += len(value)
                for phrase in value:
                    matchByType[phrase.get("type")][0] += 1
                    if phrase.get("type") in filter:
                        filteredMatchByType[phrase.get("type")][0] += 1
                        counts["phrases-filtered"] += 1
                    if phrase.get("type").find("NP") != -1:
                        matchByType[phrase.get("type") + "_NE" +
                                    str(phraseNECounts[phrase])][0] += 1
            counts["tokens"] += len(tokenization.findall("token"))

            corefType = {}
            for interaction in sentence.findall("interaction"):
                if interaction.get("type") == "Coref":
                    corefType[interaction.get("e1")] = "Anaphora"
                    corefType[interaction.get("e2")] = "Antecedent"

            for entity in entities:
                if entity.get("given") == "True":
                    continue
                counts["entity"] += 1
                print "entity", entity.get("id")
                print ETUtils.toStr(entity)
                matches = getMatchingPhrases(entity, phraseOffsets, phraseDict)
                count = 0
                filteredCount = 0
                for phrase in matches:
                    cType = "UNKNOWN"
                    if corefType.has_key(entity.get("id")):
                        cType = corefType[entity.get("id")]
                    print "  match", count, ETUtils.toStr(phrase), "NE" + str(
                        phraseNECounts[phrase]
                    ), "ctype:" + cType, "ent:" + ETUtils.toStr(entity)
                    count += 1
                    matchByType[phrase.get("type")][1] += 1
                    matchByType[phrase.get("type") + "_" + cType][1] += 1
                    matchByType[phrase.get("type") + "_" + cType + "_NE" +
                                str(phraseNECounts[phrase])][1] += 1
                    if phrase.get("type") in filter:
                        filteredCount += 1
                        filteredMatchByType[phrase.get("type")][1] += 1
                # Matching
                if count == 0:
                    print "  NO MATCH", ETUtils.toStr(entity)
                    counts["no-match"] += 1
                else:
                    counts["match"] += 1
                # Multimatching
                if len(matches) > 1:
                    bestMatch = selectBestMatch(entity, matches)
                    print "  MULTIMATCH(" + entity.get(
                        "charOffset"
                    ) + "," + str(entity.get("altOffset")) + ")", ", ".join([
                        x.get("type") + "_" + x.get("charOffset")
                        for x in matches
                    ]), "SEL(" + bestMatch.get("type") + "_" + bestMatch.get(
                        "charOffset") + ")"
                # Filtered matching
                if filteredCount == 0: counts["no-match-filtered"] += 1
                else: counts["match-filtered"] += 1
    print "Match"
    for key in sorted(matchByType.keys()):
        print "  ", key, " ", matchByType[key]
    print "Filtered", filteredMatchByType
    print "Counts", counts
Example #28
def mainFunc(input,
             output=None,
             parseName="McCC",
             tokenizationName=None,
             newParseName=None,
             newTokenizationName=None,
             logFileName=None,
             removeOld=True):
    print >> sys.stderr, "Protein Name Splitter"
    if logFileName != None:
        print >> sys.stderr, "Writing log to", logFileName
        logFile = open(logFileName, "wt")
    else:
        logFile = None
    #if input.endswith(".gz"):
    #    inFile = gzip.GzipFile(input)
    #else:
    #    inFile = open(input)
    tree = ETUtils.ETFromObj(input)

    if tokenizationName == None:
        tokenizationName = parseName

    #tree = ElementTree.parse(inFile)
    root = tree.getroot()

    sentences = [x for x in root.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "Split Protein Names")
    counter.showMilliseconds = True
    missingTokCount = 0
    for sentence in sentences:
        sId = sentence.get("id")
        counter.update(1, "Splitting names (" + sId + "): ")

        tok = getTokenization(tokenizationName,
                              sentence,
                              sId,
                              remove=removeOld)
        if tok == None:
            missingTokCount += 1
            continue

        assert tok is not None, "Missing tokenization '%s' in sentence %s!" % (
            tokenizationName, sId)

        parse = getParse(parseName,
                         tokenizationName,
                         sentence,
                         sId,
                         remove=removeOld)
        assert parse is not None, "Missing parse '%s' in sentence %s!" % (
            parseName, sId)

        split = splitTokens(tok, sentence, logFile)

        # Default names
        if removeOld:
            if newTokenizationName == None:
                newTokenizationName = tok.get("tokenizer")
            if newParseName == None:
                newParseName = parse.get("parser")
        else:
            if newTokenizationName == None:
                newTokenizationName = "split-" + tok.get("tokenizer")
            if newParseName == None:
                newParseName = "split-" + parse.get("parser")

        # add a new tokenization with the split tokens.
        splittok = addTokenization(newTokenizationName, sentence, sId)
        addTokensToTree(split, splittok)
        for a in tok.attrib:
            if splittok.get(a) == None:
                splittok.set(a, tok.get(a))
        #splittok.set("split-")

        # make a mapping from original to split token ids. Store the
        # head token when given.
        tokenIdMap = {}
        for t in split:
            if t.head:
                head = t.head
                # traverse
                while head.head is not None:
                    assert head.head != t, "Cyclic heads"
                    head = head.head

                # should match (nah, punctuation problems)
                # assert t.origId not in tokenIdMap or tokenIdMap[t.origId] == head.id, "Head conflict"
                tokenIdMap[t.origId] = head.id
            else:
                # only allow overwrite of existing entry if the current token
                # is not punctuation.
                if t.origId not in tokenIdMap or not t.isPunct():
                    tokenIdMap[t.origId] = t.id

        # make a copy of the specified parse that refers to the split tokens
        # instead of the originals.
        newparse = addParse(newParseName, newTokenizationName, sentence, sId)
        for a in parse.attrib:
            if newparse.get(a) == None:
                newparse.set(a, parse.get(a))
        newparse.set("ProteinNameSplitter", "True")
        splittok.set("ProteinNameSplitter", "True")

        depSeqId = 0  #1
        for d in parse.getiterator("dependency"):
            t1, t2, dType = d.get("t1"), d.get("t2"), d.get("type")
            assert t1 in tokenIdMap and t2 in tokenIdMap, "INTERNAL ERROR"

            dep = ElementTree.SubElement(newparse, "dependency")
            dep.set("t1", tokenIdMap[t1])
            dep.set("t2", tokenIdMap[t2])
            dep.set("type", dType)
            dep.set("id", "sd_%d" % depSeqId)
            depSeqId += 1

        # Add in new dependencies between the split parts.
        for t in [tok for tok in split if tok.head is not None]:
            dep = ElementTree.SubElement(newparse, "dependency")
            dep.set("t1", t.head.id)
            dep.set("t2", t.id)
            dep.set("type", t.depType)
            dep.set("split", "PNS")
            dep.set("id", "spd_%d" % depSeqId)
            depSeqId += 1

        for phrase in parse.getiterator("phrase"):
            newparse.append(phrase)

            # debugging
            #print >> sys.stderr, "NEW DEP IN", sId

    print >> sys.stderr, "Tokenization missing from", missingTokCount, "sentences"

    #indent(root)
    if logFile != None:
        logFile.close()

    # debugging
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(tree, output)
    return tree
Example #29
def mergeSentences(input, output, verbose=False):
    print >> sys.stderr, "Merging sentences into documents"
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()

    counts = defaultdict(int)
    for document in corpusRoot.findall("document"):
        counts["documents"] += 1
        # Check that the document has only sentence elements as children
        children = [x for x in document]
        docChildTypes = sorted(set([x.tag for x in children]))
        if len(docChildTypes) == 0:
            counts["documents-with-no-sentences"] += 1
            continue
        elif len(docChildTypes) > 1 or docChildTypes[0] != "sentence":
            raise Exception("Document '" + str(document.get("id")) +
                            "' has non-sentence children: " +
                            str(docChildTypes))
        # Process all the child sentence elements
        docId = document.get("id")
        interactions = []
        entities = []
        entityById = {}
        interactionById = {}
        combinedText = ""
        calculatedOffset = (0, 0)
        for sentence in children:
            document.remove(sentence)
            sentenceText = sentence.get("head", "") + sentence.get("text", "") + sentence.get("tail", "")
            sentOffset = sentence.get("charOffset")
            if sentence == children[0]:
                noDefinedOffsets = sentOffset == None
            elif (sentOffset == None) != noDefinedOffsets:
                raise Exception("Only some sentences in document '" + docId +
                                "' have defined offsets")
            if sentOffset == None:
                if sentence != children[-1]:
                    sentenceText = sentenceText + " "
                calculatedOffset = (calculatedOffset[1],
                                    calculatedOffset[1] + len(sentenceText))
                sentOffset = calculatedOffset
            else:
                sentOffset = Range.charOffsetToSingleTuple(sentOffset)
            combinedText += sentenceText
            # Collect and update the entity elements
            for entity in sentence.findall("entity"):
                # Map sentence-level entity offsets to document level
                for offsetKey in ("charOffset", "headOffset"):
                    if entity.get(offsetKey) != None:
                        offset = Range.charOffsetToTuples(
                            entity.get(offsetKey))
                        for i in range(len(offset)):
                            offset[i] = (offset[i][0] + sentOffset[0],
                                         offset[i][1] + sentOffset[0])
                        entity.set(offsetKey, Range.tuplesToCharOffset(offset))
                # Compare mapped offsets to origOffset, if available
                if entity.get("origOffset") != None:
                    if entity.get("charOffset") != entity.get("origOffset"):
                        raise Exception(
                            "Document '" + str(document.get("id")) +
                            "' entity '" + str(entity.get("id")) +
                            "' new charOffset differs from origOffset: " +
                            str([
                                entity.get("charOffset"),
                                entity.get("origOffset")
                            ]))
                    counts["checked-origOffsets"] += 1
                    del entity.attrib["origOffset"]
                assert entity.get("id") not in entityById
                entityById[entity.get("id")] = entity  # For re-mapping the interaction 'e1' and 'e2' attributes
                entities.append(entity)
                counts["moved-entities"] += 1
            # Collect and update the interaction elements
            for interaction in sentence.findall("interaction"):
                assert interaction.get("id") not in interactionById
                interactionById[interaction.get("id")] = interaction  # For re-mapping the interaction 'siteOf' attributes
                interactions.append(interaction)
                counts["moved-interactions"] += 1
        # Check that the combined sentence text matches the document text, if available
        if document.get("text") != None and document.get(
                "text") != combinedText:
            if combinedText == document.get(
                    "text")[0:len(combinedText)] and document.get(
                        "text")[len(combinedText):].strip() == "":
                if verbose:
                    print >> sys.stderr, "Warning, document '" + document.get(
                        "id"
                    ) + "' text has trailing whitespace not included in the combined sentence text"
                combinedText = document.get("text")
                counts["missing-trailing-whitespace"] += 1
            else:
                raise Exception(
                    "Document '" + str(document.get("id")) +
                    "' text differs from combined sentence text: " +
                    str([document.get("text"), combinedText]))
            counts["checked-document-texts"] += 1
        # Check that the entities' texts match the document text
        for entity in entities:
            offset = Range.charOffsetToTuples(entity.get("charOffset"))
            if len(offset) == 1:  # Compare only continous entities
                if not Range.contains((0, len(combinedText)), offset[0]):
                    raise Exception(
                        "Document '" + str(document.get("id")) + "' entity '" +
                        str(entity.get("id")) +
                        "' offset is not contained in combined sentence text: "
                        + str([
                            entity.attrib, offset, [0, len(combinedText)],
                            combinedText
                        ]))
                combTextSpan = combinedText[offset[0][0]:offset[0][1]]
                if entity.get("text") != combTextSpan:
                    raise Exception(
                        "Document '" + str(document.get("id")) + "' entity '" +
                        str(entity.get("id")) +
                        "' text does not match combined sentence text: " +
                        str([entity.get("text"), combTextSpan]))
                counts["checked-charOffsets"] += 1
        # Set the combined text as the document text
        document.set("text", combinedText)
        # Update entity and interaction ids (not done earlier so that possible error messages will refer to original ids, also because of siteOf-remapping)
        for i in range(len(entities)):
            entities[i].set("id", docId + ".e" + str(i))  # Update the id for the document level
        for i in range(len(interactions)):
            interactions[i].set("id", docId + ".i" + str(i))  # Update the id for the document level
        # Update interaction e1 and e2 ids (cannot be done earlier because interactions may refer to entities from multiple sentences)
        for i in range(len(interactions)):
            interaction = interactions[i]
            for entKey in ("e1", "e2"):
                interaction.set(entKey,
                                entityById[interaction.get(entKey)].get("id"))
            if interaction.get("siteOf") != None:
                interaction.set(
                    "siteOf",
                    interactionById[interaction.get("siteOf")].get("id"))
        # Add the entity and interaction elements to the document
        document.extend(entities)
        document.extend(interactions)
    print >> sys.stderr, "Counts:", dict(counts)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Example #30
def run(input,
        output=None,
        elementName="entity",
        processElement="document",
        splitNewlines=False,
        debug=False,
        bannerPath=None,
        trovePath=None):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    # Write text to input file
    workdir = tempfile.mkdtemp()
    if debug:
        print >> sys.stderr, "BANNER work directory at", workdir
    infile = codecs.open(os.path.join(workdir, "input.txt"), "wt", "utf-8")
    idCount = 0
    for sentence in corpusRoot.getiterator(processElement):
        infile.write("U" + str(idCount) + " " +
                     sentence.get("text").replace("\n", " ").replace("\r", " ") + "\n")
        idCount += 1
    infile.close()

    # Define the classpath for Java
    if bannerPath == None:
        bannerPath = Settings.BANNER_DIR
    libPath = "/lib/"
    assert os.path.exists(bannerPath + libPath + "banner.jar"), bannerPath
    oldVersion = True
    classPath = bannerPath + "/bin"
    # A "uima" directory in the lib path indicates the newer BANNER version
    for filename in os.listdir(bannerPath + libPath):
        if filename == "uima":
            oldVersion = False
    classPath += ":" + bannerPath + libPath + "*"  # include every jar in the lib directory
    if oldVersion:
        if trovePath == None:
            trovePath = Settings.JAVA_TROVE_PATH
        assert os.path.exists(trovePath), trovePath
        classPath += ":" + trovePath  # ":/usr/share/java/trove.jar"
        print >> sys.stderr, "Trove library at", trovePath

    config = makeConfigXML(workdir, bannerPath, oldVersion)

    # Run parser
    print >> sys.stderr, "Running BANNER", bannerPath
    cwd = os.getcwd()
    os.chdir(bannerPath)
    if oldVersion:  # old version
        args = Settings.JAVA.split() + [
            "-cp", classPath, "banner.eval.TestModel", config
        ]
    else:
        args = Settings.JAVA.split() + [
            "-cp", classPath, "banner.eval.BANNER", "test", config
        ]
    print >> sys.stderr, "BANNER command:", " ".join(args)
    startTime = time.time()
    exitCode = subprocess.call(args)
    assert exitCode == 0, exitCode
    print >> sys.stderr, "BANNER time:", str(
        datetime.timedelta(seconds=time.time() - startTime))
    os.chdir(cwd)

    # Put sentences in dictionary
    sDict = {}
    sentenceHasEntities = {}
    sCount = 0
    for sentence in corpusRoot.getiterator(processElement):
        sDict["U" + str(sCount)] = sentence
        sentenceHasEntities["U" + str(sCount)] = False
        sCount += 1

    sentencesWithEntities = 0
    totalEntities = 0
    nonSplitCount = 0
    splitEventCount = 0

    # TODO: mention.txt appears to contain predicted entities directly
    # To be able to feed BANNER documents (or poorly chopped sentences)
    # one should probably remove newlines, as BANNER separates its input
    # on newlines. Replacing all \r and \n characters should preserve the
    # character offsets.
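    # (a character-for-character substitution such as
    # text.replace("\n", " ").replace("\r", " ") keeps len(text), and with it
    # every character offset, unchanged)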

    # Read BANNER results
    print >> sys.stderr, "Inserting entities"
    if oldVersion:
        outfile = codecs.open(os.path.join(workdir, "output.txt"), "rt",
                              "utf-8")
        idfile = codecs.open(os.path.join(workdir, "ids.txt"), "rt", "utf-8")
        # Add output to sentences
        for line in outfile:
            bannerId = idfile.readline().strip()
            sentence = sDict[bannerId]

            # Find or create container elements
            sentenceId = sentence.get("id")

            sText = sentence.get("text")
            start = 0
            entityCount = 0
            beginOffset = None
            # Add tokens
            splits = line.strip().split()
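            # Each whitespace-separated item has the form "tokenText|tag",
            # where "O" marks a token outside any entity; any other tag is
            # treated as part of an entity span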
            for split in splits:
                tokenText, tag = split.rsplit("|", 1)
                # Determine offsets by aligning BANNER-generated tokens to original text
                cStart = sText.find(tokenText, start)
                assert cStart != -1, (tokenText, tag, sText, line)
                cEnd = cStart + len(tokenText) - 1
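                # cEnd is the *inclusive* end offset; makeEntityElements and
                # the charOffset attribute use inclusive end offsets here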
                start = cStart + len(tokenText)

                if tag == "O":
                    if beginOffset != None:
                        # An entity span just ended: build the element(s),
                        # splitting at newlines if splitNewlines is set
                        entities = makeEntityElements(beginOffset, prevEnd,
                                                      sText, splitNewlines,
                                                      elementName)
                        assert len(entities) > 0
                        nonSplitCount += 1
                        if len(entities) > 1:
                            splitEventCount += 1
                        for ent in entities:
                            ent.set("id", sentenceId + ".e" + str(entityCount))
                            sentence.append(ent)
                            if not sentenceHasEntities[bannerId]:
                                sentencesWithEntities += 1
                                sentenceHasEntities[bannerId] = True
                            totalEntities += 1
                            entityCount += 1
                        beginOffset = None
                else:
                    if beginOffset == None:
                        beginOffset = cStart
                prevEnd = cEnd
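                # prevEnd carries the inclusive end of the latest token into
                # the next iteration, so an "O" tag can close the preceding
                # entity span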
        outfile.close()
        idfile.close()
    else:
        sentenceEntityCount = {}
        mentionfile = codecs.open(os.path.join(workdir, "mention.txt"), "rt",
                                  "utf-8")
        for line in mentionfile:
            bannerId, offsets, word = line.strip().split("|", 2)
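            # mention.txt lines have the form "id|start end|mention text";
            # the offsets refer to a whitespace-less version of the sentence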
            offsets = offsets.split()
            sentence = sDict[bannerId]
            # Map whitespace-less offsets back to offsets in the original text
            offsetMap = getWhiteSpaceLessStringMap(sentence.get("text"))
            offsets[0], offsets[1] = fixWhiteSpaceLessOffset(
                word, sentence.get("text"), int(offsets[0]), int(offsets[1]),
                offsetMap)
            entities = makeEntityElements(int(offsets[0]), int(offsets[1]),
                                          sentence.get("text"), splitNewlines,
                                          elementName)
            entityText = "\n".join([x.get("text") for x in entities])
            assert entityText == word, (entityText, word, bannerId, offsets,
                                        sentence.get("id"),
                                        sentence.get("text"))
            assert len(entities) > 0, (line.strip(), sentence.get("text"))
            nonSplitCount += 1
            if len(entities) > 1:
                splitEventCount += 1
            if bannerId not in sentenceEntityCount:
                sentenceEntityCount[bannerId] = 0
            for ent in entities:
                ent.set(
                    "id",
                    sentence.get("id") + ".e" +
                    str(sentenceEntityCount[bannerId]))
                sentence.append(ent)
                if not sentenceHasEntities[bannerId]:
                    sentencesWithEntities += 1
                    sentenceHasEntities[bannerId] = True
                totalEntities += 1
                sentenceEntityCount[bannerId] += 1
        mentionfile.close()

    print >> sys.stderr, "BANNER found", nonSplitCount, "entities in", sentencesWithEntities, processElement + "-elements",
    print >> sys.stderr, "(" + str(sCount) + " sentences processed)"
    print >> sys.stderr, "New", elementName + "-elements:", totalEntities, "(Split", splitEventCount, "BANNER entities with newlines)"

    # Remove work directory
    if not debug:
        shutil.rmtree(workdir)
    else:
        print >> sys.stderr, "BANNER working directory for debugging at", workdir

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
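
A hypothetical invocation of this wrapper (the file names are placeholders; in
practice bannerPath defaults to Settings.BANNER_DIR, and trovePath to
Settings.JAVA_TROVE_PATH when the old BANNER version is detected):

# Hypothetical usage sketch, not part of the original module
corpusTree = run("corpus-input.xml",
                 output="corpus-with-banner.xml",
                 elementName="entity",
                 processElement="sentence",
                 splitNewlines=True,
                 debug=False)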