Example #1
0
def catenate(input1, input2, output):
    print >> sys.stderr, "##### Catenate interaction XML #####"
    c1 = RecalculateIds.recalculateIds(input1, None, False, 0)
    numDocs = len(c1.getroot().findall("document"))
    print >> sys.stderr, "Documents in input 1:", numDocs
    c2 = RecalculateIds.recalculateIds(input2, None, False, numDocs)

    print >> sys.stderr, "Appending documents"
    c1Root = c1.getroot()
    for document in c2.getroot().findall("document"):
        c1Root.append(document)

    print >> sys.stderr, "Validating ids"
    ids = set()
    for element in c1Root.getiterator("entity"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    for element in c1Root.getiterator("interaction"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    for element in c1Root.getiterator("sentence"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    for element in c1Root.getiterator("document"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(c1Root, output)
    return c1
Example #2
0
def removeUnconnectedEntities(input, output=None):
    input = ETUtils.ETFromObj(input)
    root = input.getroot()
    removed = 0
    preserved = 0
    for document in root.findall("document"):
        sentMap = {}  # allow for intersentence interactions
        for sentence in document.findall("sentence"):
            sentMap[sentence.get("id")] = sentence
        connected = set()
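        # Collect the ids of all entities that take part in at least one interaction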
        for interaction in document.getiterator("interaction"):
            connected.add(interaction.get("e1"))
            connected.add(interaction.get("e2"))
        entities = []
        for entity in document.getiterator("entity"):
            entities.append(entity)
        for entity in entities:
            if entity.get("isName") == "True":  # never remove named entities
                continue
            eId = entity.get("id")
            if eId not in connected:
                if eId.find(".s") != -1:  # sentence level entity
                    sentMap[eId.rsplit(".", 1)[0]].remove(entity)
                else:  # document level entity
                    document.remove(entity)
                removed += 1
            else:
                preserved += 1

    print >> sys.stderr, "Removed", removed, "entities, preserved", preserved, "entities"

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(root, output)
    return input
Example #3
0
def removeUnconnectedEntities(input, output=None):
    input = ETUtils.ETFromObj(input)
    root = input.getroot()
    removed = 0
    preserved = 0
    for document in root.findall("document"):
        sentMap = {} # allow for intersentence interactions
        for sentence in document.findall("sentence"):
            sentMap[sentence.get("id")] = sentence
        connected = set()
        for interaction in document.getiterator("interaction"):
            connected.add(interaction.get("e1"))
            connected.add(interaction.get("e2"))
        entities = []
        for entity in document.getiterator("entity"):
            entities.append(entity)
        for entity in entities:
            if entity.get("isName") == "True": # never remove named entities
                continue
            eId = entity.get("id")
            if eId not in connected:
                if eId.find(".s") != -1: # sentence level entity
                    sentMap[eId.rsplit(".", 1)[0]].remove(entity)
                else: # document level entity
                    document.remove(entity)
                removed += 1
            else:
                preserved += 1
    
    print >> sys.stderr, "Removed", removed, "entities, preserved", preserved, "entities"
    
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(root, output)
    return input
Example #4
0
def mergeAll(input, output=None, debug=False, iterate=False):
    if iterate:
        origItems = defaultdict(int)
        removedItems = defaultdict(int)
        for docSentences in SentenceElements.getCorpusIterator(input, output):
            entitiesByType, duplicatesRemovedByType = mergeDuplicateEntities(
                docSentences, debug)
            for key in entitiesByType:
                origItems[key] += entitiesByType[key]
            for key in duplicatesRemovedByType:
                removedItems[key] += duplicatesRemovedByType[key]
            interactionsByType, duplicatesRemovedByType = mergeDuplicateInteractions(
                docSentences, debug)
            for key in interactionsByType:
                origItems[key] += interactionsByType[key]
            for key in duplicatesRemovedByType:
                removedItems[key] += duplicatesRemovedByType[key]
        printStats(origItems, removedItems)
        return None
    else:
        corpusElements = CorpusElements.loadCorpus(
            input, removeIntersentenceInteractions=False)
        print >> sys.stderr, "Merging duplicate entities"
        entitiesByType, duplicatesRemovedByType = mergeDuplicateEntities(
            corpusElements.sentences, debug)
        printStats(entitiesByType, duplicatesRemovedByType)
        print >> sys.stderr, "Merging duplicate interactions"
        interactionsByType, duplicatesRemovedByType = mergeDuplicateInteractions(
            corpusElements.sentences, debug)
        printStats(interactionsByType, duplicatesRemovedByType)
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusElements.rootElement, output)
        return corpusElements
Example #5
0
def convert(input, output=None, outputRoot=None):
    print >> sys.stderr, "##### Convert PMC to Interaction XML #####"
    
    print >> sys.stderr, "Loading corpus", input
    pmcTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    pmcRoot = pmcTree.getroot()
        
    includeElements = [
        "front",
        "article-meta",
        "title-group",
        "article-title",
        "abstract",
        "body",
        "sec",
        "p",
        "title"]
    collapseElements = [
        "front",
        "article-meta",
        "title-group",
        "p"]
    
    if outputRoot == None:
        outputRoot = ET.Element("corpus")
        outputRoot.set("source", "PMC")
    
    outputRoot.append(addElements(pmcRoot, includeElements, collapseElements))
    
    outputTree = ET.ElementTree(outputRoot)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(outputTree, output)
    return outputTree
Example #6
0
 def run(cls,inFile,multiplier=1.0,outFile=None,targetLabel="neg", binary=False):
     """inFile can be a string with file name (.xml or .xml.gz) or an ElementTree or an Element or an open input stream
     multiplier adjusts the level of boosting the non-negative predictions, it is a real number (0,inf)
     multiplier 1.0 does nothing, <1.0 decreases negative class confidence, >1.0 increases negative class confidence
     the root of the modified tree is returned and, if outFile is a string, written out to outFile as well"""
     print >> sys.stderr, "##### Recall adjust with multiplier " + str(multiplier)[:5] + " #####"
     tree=ETUtils.ETFromObj(inFile)
     if not ET.iselement(tree):
         assert isinstance(tree,ET.ElementTree)
         root=tree.getroot()
     else:
         root = tree
     
     if multiplier != -1:
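         # Binary mode needs the per-class prediction weight ranges of the entities;
         # if none are found, the adjustment is skipped.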
         if binary:
             print >> sys.stderr, "Recall binary mode"
             classRanges = getClassRanges(root.getiterator("entity"))
             assert len(classRanges.keys()) in [0,2]
             if len(classRanges.keys()) == 0:
                 print >> sys.stderr, "Warning, recall adjustment skipped because no prediction weights found"
         else:
             print >> sys.stderr, "Recall multiclass mode"
             classRanges = None
         for entityNode in root.getiterator("entity"):
             adjustEntity(entityNode,targetLabel,multiplier,classRanges)
     if outFile:
         ETUtils.write(root,outFile)
     return tree
Example #7
0
def catenate(input1, input2, output):
    print >> sys.stderr, "##### Catenate interaction XML #####"
    c1 = RecalculateIds.recalculateIds(input1, None, False, 0)
    numDocs = len(c1.getroot().findall("document"))
    print >> sys.stderr, "Documents in input 1:", numDocs
    c2 = RecalculateIds.recalculateIds(input2, None, False, numDocs)
    
    print >> sys.stderr, "Appending documents"
    c1Root = c1.getroot()
    for document in c2.getroot().findall("document"):
        c1Root.append(document)
    
    print >> sys.stderr, "Validating ids"
    ids = set()
    for element in c1Root.getiterator("entity"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    for element in c1Root.getiterator("interaction"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    for element in c1Root.getiterator("sentence"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    for element in c1Root.getiterator("document"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(c1Root, output)
    return c1
Example #8
0
def convert(input, output=None, outputRoot=None):
    print >> sys.stderr, "##### Convert PMC to Interaction XML #####"

    print >> sys.stderr, "Loading corpus", input
    pmcTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    pmcRoot = pmcTree.getroot()

    includeElements = [
        "front", "article-meta", "title-group", "article-title", "abstract",
        "body", "sec", "p", "title"
    ]
    collapseElements = ["front", "article-meta", "title-group", "p"]

    if outputRoot == None:
        outputRoot = ET.Element("corpus")
        outputRoot.set("source", "PMC")

    outputRoot.append(addElements(pmcRoot, includeElements, collapseElements))

    outputTree = ET.ElementTree(outputRoot)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(outputTree, output)
    return outputTree
Example #9
0
def processCorpus(inputFilename, outputFilename, rules):
    print >> sys.stderr, "Loading corpus file", inputFilename
    if inputFilename.rsplit(".",1)[-1] == "gz":
        import gzip
        corpusTree = ET.parse(gzip.open(inputFilename))
    else:
        corpusTree = ET.parse(inputFilename)
    corpusRoot = corpusTree.getroot()
    
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {}
    for k in sorted(rules.keys()):
        countsByType[k] = 0
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, rules, countsByType)
    print >> sys.stderr, "Removed"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ":", countsByType[k]
    
    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
Example #10
0
def fixAltOffsets(input, output=None):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    
    docCount = 0
    sentencesCreated = 0
    sentences = [x for x in corpusRoot.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "FixAltOffsets")
    fixCount = 0
    # fix spans
    for sentence in sentences:
        counter.update(1, "Fixing AltOffsets for sentence ("+sentence.get("id")+"): ")
        sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
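        # altOffset values are document-level offsets; subtracting the sentence start
        # offset below makes them relative to the sentence.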
        for entity in sentence.findall("entity"):
            altOffsetString = entity.get("altOffset")
            if altOffsetString == None:
                continue
            #print altOffsetString
            altOffsets = Range.charOffsetToTuples(altOffsetString)
            assert len(altOffsets) == 1
            for i in range(len(altOffsets)):
                altOffset = altOffsets[i] 
                altOffsets[i] = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])
            entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))
            fixCount += 1
        
    print >> sys.stderr, "Fixed", fixCount, "altOffsets"
        
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Example #11
0
def extractTask2(inputfile, outputfile, inverse):
    if inverse:
        print >> sys.stderr, "Extracting task2 information from", inputfile, "to", outputfile
    else:
        print >> sys.stderr, "Removing task2 information from", inputfile, "and saving to", outputfile
    corpusRoot = getCorpus(inputfile)
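    # With inverse=False the task 2 annotation ('Entity' type entities and the
    # interactions referring to them) is stripped from the corpus; with inverse=True
    # only the task 2 annotation is kept.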
    for sentence in corpusRoot.getiterator("sentence"):
        task2EntityIds = set()
        if not inverse:
            for entity in sentence.findall("entity"):
                if entity.get("type") == "Entity":
                    task2EntityIds.add(entity.get("id"))
                if entity.get("type") in ["Entity", "neg"]:
                    sentence.remove(entity)
            for interaction in sentence.findall("interaction"):
                if interaction.get("type") in ["Site","CSite","AtLoc","ToLoc","neg"]:
                    sentence.remove(interaction)
                elif interaction.get("e1") in task2EntityIds or interaction.get("e2") in task2EntityIds:
                    sentence.remove(interaction) # remove Theme/Cause interactions referring to t2 entities
        else:
            for entity in sentence.findall("entity"):
                if entity.get("type") == "Entity":
                    task2EntityIds.add(entity.get("id"))
                if entity.get("type") != "Entity":
                    sentence.remove(entity)
            for interaction in sentence.findall("interaction"):
                if interaction.get("type") not in ["Site","CSite","AtLoc","ToLoc"]:
                    sentence.remove(interaction)
            analysesElement = sentence.find("sentenceanalyses")
            if analysesElement != None:
                sentence.remove(analysesElement)

    ETUtils.write(corpusRoot, outputfile)
Example #12
0
def makeSubset(filename, output, ratio, seed):
    if ratio == 1.0:
        return filename
    totalFolds = 100
    selectedFolds = int(ratio * 100.0)
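    # Documents are split pseudo-randomly into 100 folds; only documents falling
    # into the first 'selectedFolds' folds are kept in the subset.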
    print >> sys.stderr, "====== Making subset ======"
    print >> sys.stderr, "Subset for file", filename, "ratio", ratio, "seed", seed
    import cElementTreeUtils as ETUtils
    import Core.Split
    xml = ETUtils.ETFromObj(filename).getroot()
    count = 0
    sentCount = 0
    for document in xml.findall("document"):
        sentCount += len(document.findall("sentence"))
        count += 1
    division = Core.Split.getFolds(count, totalFolds, seed)
    #print division, selectedFolds - 1
    index = 0
    removeCount = 0
    sentRemoveCount = 0
    for document in xml.findall("document"):
        if division[index] > selectedFolds - 1:
            xml.remove(document)
            sentRemoveCount += len(document.findall("sentence"))
            removeCount += 1
        index += 1
    print "Subset", "doc:", count, "removed:", removeCount, "sent:", sentCount, "sentremoved:", sentRemoveCount
    ETUtils.write(xml, output)
    return output
Example #13
0
def mergeAll(input, output=None, debug=False):
    corpusElements = CorpusElements.loadCorpus(input, removeIntersentenceInteractions=False)
    mergeDuplicateEntities(corpusElements, debug)
    mergeDuplicateInteractions(corpusElements, debug)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusElements.rootElement, output)
    return corpusElements
Example #14
0
def writeTask3ToInteractionXML(examples, predictions, corpusElements, outputFileName, task3Type):
    import sys
    print >> sys.stderr, "Adding task 3 to Interaction XML"
    try:
        import xml.etree.cElementTree as ET
    except ImportError:
        import cElementTree as ET
    import cElementTreeUtils as ETUtils
    
    assert task3Type == "speculation" or task3Type == "negation"
    
    if type(predictions) == types.StringType:
        print >> sys.stderr, "Reading predictions from", predictions
        predictions = loadPredictions(predictions)
    if type(examples) == types.StringType:
        print >> sys.stderr, "Reading examples from", examples
        examples = readExamples(examples, False)
    
    corpusTree = ETUtils.ETFromObj(corpusElements)
    corpusRoot = corpusTree.getroot()
    
    # Remove the task 3 subtask information if it already exists
    for entity in corpusRoot.getiterator("entity"):
        if task3Type == "speculation":
            entity.set("speculation", "False")
        else: # task3Type == "negation"
            entity.set("negation", "False")
    
    specMap = {}
    negMap = {}
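    # Record the entities for which the classifier made a non-negative
    # (class != 1) speculation or negation prediction.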
    for example, prediction in itertools.izip(examples, predictions):
        assert example[3]["xtype"] == "task3"
        if example[3]["t3type"] == "speculation":
            map = specMap
        else:
            map = negMap
        if prediction[0] != 1:
            assert not map.has_key(example[3]["entity"])
            map[example[3]["entity"]] = True
    
    for entity in corpusRoot.getiterator("entity"):
        if task3Type == "speculation":
            if specMap.has_key(entity.get("id")):
                entity.set("speculation", "True")
            else:
                entity.set("speculation", "False")
        elif task3Type == "negation":
            if negMap.has_key(entity.get("id")):
                entity.set("negation", "True")
            else:
                entity.set("negation", "False")
    
    # Write corpus
    if outputFileName != None:
        print >> sys.stderr, "Writing corpus to", outputFileName
        ETUtils.write(corpusRoot, outputFileName)
    return corpusTree
Example #15
0
def addMTMX(input, mtmxDir, output):
    from collections import defaultdict
    # read interaction XML
    print "Reading interaction XML"
    counts = defaultdict(int)
    xml = ETUtils.ETFromObj(input).getroot()
    docById = {}
    for document in xml.getiterator("document"):
        docId = document.get("origId")
        assert docId not in docById
        docById[docId] = document
        counts["document"] += 1
    for entity in xml.getiterator("entity"):
        counts["entity"] += 1
    
    # read MTMX files
    print "Processing MTMX"
    for filename in sorted(os.listdir(mtmxDir)):
        if filename.endswith(".xml"):
            print filename,
            fileId = filename.split("_")[0]
            if fileId not in docById:
                print "skipped"
                continue
            else:
                print "processing"
            doc = docById[fileId]
            entityByOrigId = {}
            for entity in doc.getiterator("entity"):
                assert entity.get("origId") not in entityByOrigId, entity.get("origId")
                entityByOrigId[entity.get("origId")] = entity
            mtmx = ETUtils.ETFromObj(os.path.join(mtmxDir, filename)).getroot()
            for phrase in mtmx.getiterator("PHRASE"):
                if phrase.get("ID") in entityByOrigId:
                    entity = entityByOrigId[phrase.get("ID")]
                    mapCount = 0
                    for map in phrase.getiterator("MAP"):
                        if (map.get("NAME").lower() == entity.get("text").lower()) or (map.get("NAME_SHORT").lower() == entity.get("text").lower()):
                            if entity.get("mtmxProb") != None:
                                if int(entity.get("mtmxProb")) > int(map.get("PROB")):
                                    break
                                else:
                                    counts["mapped-multi"] += 1
                                    counts["mapped-multi-"+str(mapCount)] += 1
                                    #print filename, phrase.get("ID")
                            else:
                                counts["mapped-at-least-once"] += 1
                            entity.set("mtmxProb", str(map.get("PROB")))
                            entity.set("mtmxCui", str(map.get("CUI")))
                            entity.set("mtmxName", str(map.get("NAME")))
                            entity.set("mtmxNameShort", str(map.get("NAME_SHORT")))
                            entity.set("mtmxSemTypes", str(map.get("SEMTYPES")))
                            counts["mappings"] += 1
                            mapCount += 1
    print counts
    ETUtils.write(xml, output)
Example #16
0
def processCorpus(input, outDir, stem, tail, mergedSets=[], saveCombined=False, verbose=False):
    newCorpora = {}
    print >> sys.stderr, "Loading corpus file", input
    corpusRoot = ETUtils.ETFromObj(input).getroot()
    
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {}
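    # Distribute each document into a per-set corpus element according to its
    # 'set' attribute.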
    for document in documents:
        counter.update()
        docSet = document.get("set")
        if docSet == None:
            if verbose: print >> sys.stderr, "Warning, no set defined for document", document.get("id")
            if not countsByType.has_key("No set"):
                countsByType["No set"] = 0
            countsByType["No set"] += 1
            continue
        elif not newCorpora.has_key(docSet):
            newCorpora[docSet] = ET.Element("corpus")
            for k, v in corpusRoot.attrib.iteritems():
                newCorpora[docSet].set(k, v)
            countsByType[docSet] = 0
        newCorpora[docSet].append(document)
        countsByType[docSet] += 1
        
    # Make merged sets
    for mergedSet in mergedSets:
        tag = "-and-".join(sorted(mergedSet))
        if not newCorpora.has_key(tag):
            newCorpora[tag] = ET.Element("corpus")
            for k, v in corpusRoot.attrib.iteritems():
                newCorpora[tag].set(k, v)
            countsByType[tag] = 0    
        for componentSet in mergedSet:
            for element in newCorpora[componentSet].findall("document"):
                newCorpora[tag].append(element)
                countsByType[tag] += 1
        
    print >> sys.stderr, "Documents per set"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + str(k) + ":", countsByType[k]
    
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    
    print >> sys.stderr, "Writing output files to directory", outDir
    if saveCombined:
        print >> sys.stderr, "Saving combined input to", stem + tail
        ETUtils.write(corpusRoot, stem + tail)
    else:
        print >> sys.stderr, "Combined input not saved"
    for docSet in sorted(newCorpora.keys()):
        outFilename = os.path.join(outDir, stem + "-" + docSet + tail)
        print >> sys.stderr, "Writing set", docSet, "to", outFilename
        ETUtils.write(newCorpora[docSet], outFilename)
Example #17
0
def findHeads(input, parse, tokenization=None, output=None, removeExisting=True, iterate=False):
    if iterate:
        from Utils.ProgressCounter import ProgressCounter
        import InteractionXML.SentenceElements as SentenceElements
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        print >> sys.stderr, "Removing existing head offsets"
        removeCount = 0
        counter = ProgressCounter(None, "Find heads")
        counter.showMilliseconds = True
        for sentences in SentenceElements.getCorpusIterator(input, output, parse, tokenization):
            for sentence in sentences:
                if removeExisting:
                    for e in sentence.sentence.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
                graph = SentenceGraph.SentenceGraph(sentence.sentence, sentence.tokens, sentence.dependencies)
                graph.mapInteractions(sentence.entities, sentence.interactions)
                # Make sure every parse gets head scores
                #if graph.tokenHeadScores == None:
                #    graph.getTokenHeadScores()
            counter.update(len(sentences), "Finding heads ("+sentences[-1].sentence.get("id")+"): ")                
        print >> sys.stderr, "Removed head offsets from", removeCount, "entities"    
    else:
        xml = ETUtils.ETFromObj(input)
        if removeExisting:
            print >> sys.stderr, "Removing existing head offsets"
            removeCount = 0
            xml = ETUtils.ETFromObj(input)
            for d in xml.getroot().findall("document"):
                for s in d.findall("sentence"):
                    for e in s.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
            print >> sys.stderr, "Removed head offsets from", removeCount, "entities"
        
        # SentenceGraph automatically calculates head offsets and adds them to entities if they are missing
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        corpusElements = SentenceGraph.loadCorpus(xml, parse, tokenization)
        
        # Make sure every parse gets head scores
        for sentence in corpusElements.sentences:
            if sentence.sentenceGraph == None:
                continue
            if sentence.sentenceGraph.tokenHeadScores == None:
                sentence.sentenceGraph.getTokenHeadScores()
        
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusElements.rootElement, output)
        return xml
Example #18
0
def makeConfigXML(workdir, bannerDir, oldVersion=True):
    conf = ET.Element("banner-configuration")
    banner = ET.SubElement(conf, "banner")
    eval = ET.SubElement(banner, "eval")
    datasetName = ET.SubElement(eval, "datasetName").text = "banner.eval.dataset.BC2GMDataset"
    # Dataset
    dataset = ET.SubElement(eval, "dataset")
    ET.SubElement(dataset, "sentenceFilename").text = workdir + "/input.txt"
    ET.SubElement(dataset, "mentionTestFilename").text = workdir + "/empty.eval"
    ET.SubElement(dataset, "mentionAlternateFilename").text = workdir + "/empty.eval"
    codecs.open(os.path.join(workdir, "empty.eval"), "wt", "utf-8").close()
    # More eval level stuff
    ET.SubElement(eval, "idInputFilename").text = workdir + "/ids.txt"
    ET.SubElement(eval, "rawInputFilename").text = workdir + "/raw.txt"
    ET.SubElement(eval, "trainingInputFilename").text = workdir + "/training.txt"
    ET.SubElement(eval, "outputFilename").text = workdir + "/output.txt"
    codecs.open(os.path.join(workdir, "output.txt"), "wt", "utf-8").close()
    ET.SubElement(eval, "inContextAnalysisFilename").text = workdir + "/contextAnalysis.html"
    ET.SubElement(eval, "mentionFilename").text = workdir + "/mention.txt"
    ET.SubElement(eval, "modelFilename").text = bannerDir + "/output/model_BC2GM.bin"
    ET.SubElement(eval, "lemmatiserDataDirectory").text = bannerDir + "/nlpdata/lemmatiser"
    ET.SubElement(eval, "posTaggerDataDirectory").text = bannerDir + "/nlpdata/tagger"
    ET.SubElement(eval, "posTagger").text = "dragon.nlp.tool.HeppleTagger"
    ET.SubElement(eval, "tokenizer").text = "banner.tokenization.SimpleTokenizer"
    ET.SubElement(eval, "useParenthesisPostProcessing").text = "true"
    ET.SubElement(eval, "useLocalAbbreviationPostProcessing").text = "true"
    ET.SubElement(eval, "useNumericNormalization").text = "true"
    ET.SubElement(eval, "tagFormat").text = "IOB"
    ET.SubElement(eval, "crfOrder").text = "2"
    if not oldVersion:
        ET.SubElement(eval, "mentionTypes").text = "Required"
        ET.SubElement(eval, "sameTypeOverlapOption").text = "Exception"
        ET.SubElement(eval, "differentTypeOverlapOption").text = "Exception"
    ET.SubElement(eval, "dictionaryTagger").text = "banner.tagging.dictionary.DictionaryTagger"
    # End eval element
    tagging = ET.SubElement(banner, "tagging") 
    dictionary = ET.SubElement(tagging, "dictionary")
    dictionaryTagger = ET.SubElement(dictionary, "DictionaryTagger")
    ET.SubElement(dictionaryTagger, "filterContainedMentions").text = "true"
    ET.SubElement(dictionaryTagger, "normalizeMixedCase").text = "false"
    ET.SubElement(dictionaryTagger, "normalizeDigits").text = "false"
    ET.SubElement(dictionaryTagger, "canonize").text = "false"
    ET.SubElement(dictionaryTagger, "generate2PartVariations").text = "true"
    ET.SubElement(dictionaryTagger, "dropEndParentheticals").text = "false"
    ET.SubElement(dictionaryTagger, "dictionaryFile").text = bannerDir + "/dict/single.txt"
    ET.SubElement(dictionaryTagger, "dictionaryType").text = "GENE"
    # Write to file
    filename = workdir + "/banner_config.xml"
    ETUtils.write(conf, filename)
    return filename
Example #19
0
def negateEvents(input, output=None, verbose=False):
    if not (ET.iselement(input) and input.tag == "sentence"):
        print >> sys.stderr, "Loading corpus file", input
        corpusTree = ETUtils.ETFromObj(input)
        corpusRoot = corpusTree.getroot()

    if not (ET.iselement(input) and input.tag == "sentence"):
        sentences = corpusRoot.getiterator("sentence")
    else:
        sentences = [input]
    counts = defaultdict(int)
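    # For each negatable EPI trigger entity, re-derive the type from the entity text
    # and compare it with the existing annotation.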
    for sentence in sentences:
        for entity in sentence.findall("entity"):
            counts["all-entities"] += 1
            eType = entity.get("type")
            if not isNegatableEPITrigger(eType):
                counts["out-of-scope"] += 1
                continue
            eBaseType = getEPIBaseType(eType)
            eText = entity.get("text").lower()
            eNewType = determineNewType(eType, eText)

            # Record statistics and update the entity type
            counts["entities"] += 1
            if verbose:
                print "Entity", entity.get("id"), [entity.get("text")], [
                    eType, eBaseType, eNewType
                ],
            if eNewType != eBaseType:
                counts["negated"] += 1
                if verbose: print "NEGATED",
            if eNewType == eType:
                counts["correct"] += 1
                if verbose: print "CORRECT"
            else:
                counts["incorrect"] += 1
                if eNewType == eBaseType:
                    counts["incorrect-pos"] += 1
                else:
                    counts["incorrect-neg"] += 1
                if verbose: print "INCORRECT"
            entity.set("type", eNewType)
    if verbose:
        print counts

    if not (ET.iselement(input) and input.tag == "sentence"):
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusRoot, output)
        return corpusTree
Example #20
0
def interface(optionArgs=sys.argv[1:]):
    """
    The function to handle the command-line interface.
    """
    from optparse import OptionParser

    op = OptionParser(usage="%prog [options]\nGenia shared task specific pruning of invalid nodes and edges.")
    op.add_option("-i", "--infile",
                  dest="infile",
                  help="Input file (gifxml)",
                  metavar="FILE")
    op.add_option("-o", "--outfile",
                  dest="outfile",
                  help="Output file (gifxml)",
                  metavar="FILE")
    op.add_option("-c", "--cycles",
                  dest="cycles",
                  help="Remove cycles (requires the presence of 'predictions' attribute in 'interaction' elements)",
                  default=False,
                  action="store_true")
    (options, args) = op.parse_args(optionArgs)

    quit = False
    if not options.infile:
        print "Please specify the input file."
        quit = True
#    if not options.outfile:
#        print "Please specify the output file."
#        quit = True
    if quit:
        op.print_help()
        return(False)

    corpus = ETUtils.ETFromObj(options.infile)
    cycleBrokenCount = 0
    skipCount = 0
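    # Prune invalid nodes and edges from each sentence; optionally break cycles
    # using the 'predictions' attribute of interaction elements.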
    for document in corpus.getroot().findall('document'):
        for sentence in document.findall("sentence"):
            #sys.stderr.write("Pruning document %s\n"%document.attrib['id'])
            pruner = Pruner(sentence)
            pruner.analyse()
            if options.cycles:
                cycleBrokenCount += pruner.analyseCycles()
            pruner.prune()
    sys.stderr.write("File pruned, broke " + str(cycleBrokenCount) + " cycles\n" )
    if skipCount > 0:
        sys.stderr.write("Pruning skipped " + str(skipCount) + " sentences\n" )
    if options.outfile:
        ETUtils.write(corpus, options.outfile)
    return corpus
Example #21
0
def negateEvents(input, output=None, verbose=False):
    if not (ET.iselement(input) and input.tag == "sentence"):
        print >> sys.stderr, "Loading corpus file", input
        corpusTree = ETUtils.ETFromObj(input)
        corpusRoot = corpusTree.getroot()
    
    if not (ET.iselement(input) and input.tag == "sentence"):
        sentences = corpusRoot.getiterator("sentence")
    else:
        sentences = [input]
    counts = defaultdict(int)
    for sentence in sentences:
        for entity in sentence.findall("entity"):
            counts["all-entities"] += 1
            eType = entity.get("type")
            if not isNegatableEPITrigger(eType):
                counts["out-of-scope"] += 1
                continue
            eBaseType = getEPIBaseType(eType)
            eText = entity.get("text").lower()
            eNewType = determineNewType(eType, eText)
        
            # Record statistics and update the entity type
            counts["entities"] += 1
            if verbose:
                print "Entity", entity.get("id"), [entity.get("text")], [eType, eBaseType, eNewType],
            if eNewType != eBaseType:
                counts["negated"] += 1
                if verbose: print "NEGATED",
            if eNewType == eType:
                counts["correct"] += 1
                if verbose: print "CORRECT"
            else:
                counts["incorrect"] += 1
                if eNewType == eBaseType:
                    counts["incorrect-pos"] += 1
                else:
                    counts["incorrect-neg"] += 1
                if verbose: print "INCORRECT"
            entity.set("type", eNewType)
    if verbose:
        print counts
    
    if not (ET.iselement(input) and input.tag == "sentence"):
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusRoot, output)
        return corpusTree                    
Example #22
0
def insertTask2(inputfile, task2file, outputfile):
    print >> sys.stderr, "Adding task2 information from", task2file, "to", inputfile, "and saving to", outputfile
    t2Root = getCorpus(task2file)
    noT2Root = getCorpus(inputfile)
    
    sentMap = {}
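    # Map each task 2 sentence by id so that its entities and interactions can be
    # copied into the matching sentences of the corpus that lacks them.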
    for sentence in t2Root.getiterator("sentence"):
        sentMap[sentence.get("id")] = sentence
    
    for sentence in noT2Root.getiterator("sentence"):
        for entity in sentMap[sentence.get("id")].findall("entity"):
            sentence.append(entity)
        for interaction in sentMap[sentence.get("id")].findall("interaction"):
            sentence.append(interaction)
    
    ETUtils.write(noT2Root, outputfile)
Example #23
0
def convert(srFiles, xmlFileName, outdir, corpusName, idByNorText=False):
    print >> sys.stderr, "Loading Static Relations"
    events = {}
    for srFile in srFiles:
        readEventsFromSR(srFile[0], srFile[1], events, idByNorText=idByNorText)
    
    if xmlFileName != None:
        xmlEvents = {}
        dataSets = {}
        srTexts = {} # original, unnormalized sentence texts from the SR corpus
        eventsToXML(events, xmlEvents, dataSets, srTexts)
        
        print >> sys.stderr, "Loading XML"
        xml = ETUtils.ETFromObj(xmlFileName)
        print >> sys.stderr, "Inserting XML events"
        insertEvents(xmlEvents, dataSets, srTexts, xml, corpusName)
        ETUtils.write(xml, outdir+corpusName+"-srevents.xml")
        # update pre-existing parses
        print >> sys.stderr, "Head Detection"
        xml = FindHeads.findHeads(xml, "split-mccc-preparsed", tokenization=None, output=outdir+corpusName+"-heads.xml", removeExisting=True)
        print >> sys.stderr, "Dividing into sets"
        InteractionXML.DivideSets.processCorpus(xml, outdir, corpusName + "-", ".xml", [("devel", "train")])
        print >> sys.stderr, "Converting back"
        STConvert.toSTFormat(outdir + corpusName + "-devel.xml", outdir + corpusName + "-stformat-devel", outputTag="rel", task=2, debug=True, validate=False)
        STConvert.toSTFormat(outdir + corpusName + "-train.xml", outdir + corpusName + "-stformat-train", outputTag="rel", task=2, debug=True, validate=False)
    else:
        xml = eventsToNewXML(events)
        xmlTree = ET.ElementTree(xml)
        ETUtils.write(xml, outdir+corpusName+"-srevents.xml")
        xml = xmlTree
        # Parse
        bigfileName = outdir+corpusName
        print >> sys.stderr, "Parsing"
        Tools.CharniakJohnsonParser.parse(xml, bigfileName+"-parsed.xml", tokenizationName="PARSED_TEXT", parseName="McClosky", requireEntities=True, timeout=60)
        print >> sys.stderr, "Stanford Conversion"
        Tools.StanfordParser.convertXML("McClosky", xml, bigfileName+"-stanford.xml")
        print >> sys.stderr, "Protein Name Splitting"
        splitTarget = "McClosky"
        xml = ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-"+splitTarget, "split-"+splitTarget)
        print >> sys.stderr, "Head Detection"
        xml = FindHeads.findHeads(xml, "split-McClosky", tokenization=None, output=bigfileName+".xml", removeExisting=True)
        print >> sys.stderr, "Dividing into sets"
        InteractionXML.DivideSets.processCorpus(xml, outdir, "SRNE-", ".xml")
Example #24
0
def splitMergedElements(inputFilename, outputFilename=None):
    print >> sys.stderr, "##### Split elements with merged types #####"
    print >> sys.stderr, "Loading corpus", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()
    
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {"entity":[0,0], "interaction":[0,0], "pair":[0,0]}
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, countsByType)
    print >> sys.stderr, "Results"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ": removed", countsByType[k][0], "created", countsByType[k][1]
    
    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
Example #25
0
def mixSets(input, output, docOrigIds, sourceSet, targetSet):
    print >> sys.stderr, "Mixing Sets", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()
    
    if docOrigIds != None:
        for document in corpusRoot.getiterator("document"):
            if document.get("pmid") in docOrigIds:
                assert document.get("set") == sourceSet
                document.set("set", targetSet)
                docOrigIds.remove(document.get("pmid"))
        assert len(docOrigIds) == 0, docOrigIds
    
    sentenceIds = None
    if sentenceIds != None:
        for document in corpusRoot.getiterator("document"):
            removed = []
            for sentence in document.findall("sentence"):
                assert document.get("set") == sourceSet
                sentenceId = sentence.get("id")
                if sentenceId in sentenceIds:
                    removed.append(sentence)
                    document.remove(sentence)
                    sentenceIds.remove(sentenceId)
            if len(removed) > 0:
                newDoc = ET.Element("document")
                for attr in document.attrib:
                    newDoc.set(attr, document.get(attr))
                newDoc.set("id", None)
                newDoc.set("set", targetSet)
                for sentence in removed:
                    newDoc.append(sentence)
                corpusRoot.append(newDoc)
        assert len(sentenceIds) == 0
    
        RecalculateIds.recalculateIds(corpusTree, onlyWithinSentence=False)
             
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Example #26
0
def mixSets(input, output, docOrigIds, sourceSet, targetSet):
    print >> sys.stderr, "Mixing Sets", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()

    if docOrigIds != None:
        for document in corpusRoot.getiterator("document"):
            if document.get("pmid") in docOrigIds:
                assert document.get("set") == sourceSet
                document.set("set", targetSet)
                docOrigIds.remove(document.get("pmid"))
        assert len(docOrigIds) == 0, docOrigIds

    sentenceIds = None
    if sentenceIds != None:
        for document in corpusRoot.getiterator("document"):
            removed = []
            for sentence in document.findall("sentence"):
                assert document.get("set") == sourceSet
                sentenceId = sentence.get("id")
                if sentenceId in sentenceIds:
                    removed.append(sentence)
                    document.remove(sentence)
                    sentenceIds.remove(sentenceId)
            if len(removed) > 0:
                newDoc = ET.Element("document")
                for attr in document.attrib:
                    newDoc.set(attr, document.get(attr))
                newDoc.set("id", None)
                newDoc.set("set", targetSet)
                for sentence in removed:
                    newDoc.append(sentence)
                corpusRoot.append(newDoc)
        assert len(sentenceIds) == 0

        RecalculateIds.recalculateIds(corpusTree, onlyWithinSentence=False)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Example #27
0
def splitMergedElements(inputFilename, outputFilename=None):
    print >> sys.stderr, "##### Split elements with merged types #####"
    print >> sys.stderr, "Loading corpus", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()

    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {"entity": [0, 0], "interaction": [0, 0], "pair": [0, 0]}
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, countsByType)
    print >> sys.stderr, "Results"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ": removed", countsByType[k][
            0], "created", countsByType[k][1]

    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
Example #28
0
def processCorpus(inputFilename, outputFilename, rules):
    print >> sys.stderr, "Loading corpus file", inputFilename
    if inputFilename.rsplit(".", 1)[-1] == "gz":
        import gzip
        corpusTree = ET.parse(gzip.open(inputFilename))
    else:
        corpusTree = ET.parse(inputFilename)
    corpusRoot = corpusTree.getroot()

    countsByType = {}
    for key in sorted(rules.keys()):
        for attribute in rules[key]:
            countsByType[key + ":" + attribute] = 0
        removeAttributes(corpusRoot, key, rules[key], countsByType)

    print >> sys.stderr, "Removed"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ":", countsByType[k]

    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
Example #29
0
def processCorpus(inputFilename, outputFilename, rules):
    print >> sys.stderr, "Loading corpus file", inputFilename
    if inputFilename.rsplit(".",1)[-1] == "gz":
        import gzip
        corpusTree = ET.parse(gzip.open(inputFilename))
    else:
        corpusTree = ET.parse(inputFilename)
    corpusRoot = corpusTree.getroot()
    
    countsByType = {}
    for key in sorted(rules.keys()):
        for attribute in rules[key]:
            countsByType[key + ":" + attribute] = 0
        removeAttributes(corpusRoot, key, rules[key], countsByType)
    
    print >> sys.stderr, "Removed"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ":", countsByType[k]
    
    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
Example #30
0
def processCorpus(inputFilename, outputFilename, rules):
    print >> sys.stderr, "Loading corpus file", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()
    
    for eType in rules.keys():
        for attrRule in rules[eType].keys():
            rules[eType][attrRule] = rules[eType][attrRule].split("|")
    
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = defaultdict(int)
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, rules, countsByType)
    print >> sys.stderr, "Removed"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ":", countsByType[k]
    
    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
Example #31
0
def processCorpus(inputFilename, outputFilename, rules):
    print >> sys.stderr, "Loading corpus file", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()

    for eType in rules.keys():
        for attrRule in rules[eType].keys():
            rules[eType][attrRule] = rules[eType][attrRule].split("|")

    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = defaultdict(int)
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, rules, countsByType)
    print >> sys.stderr, "Removed"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ":", countsByType[k]

    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
Example #32
0
                         metavar="FILE")
    optparser.add_option("-o",
                         "--output",
                         default=None,
                         dest="output",
                         help="",
                         metavar="FILE")
    (options, args) = optparser.parse_args()

    print >> sys.stderr, "Loading input file", options.input
    sourceTree = ET.parse(options.input)
    sourceRoot = sourceTree.getroot()

    print >> sys.stderr, "Merging named entity types"
    entities = sourceRoot.getiterator("entity")
    mergedByType = {}
    for entity in entities:
        if entity.attrib.has_key(
                "isName") and entity.attrib["isName"] == "True":
            if not mergedByType.has_key(entity.attrib["type"]):
                mergedByType[entity.attrib["type"]] = 0
            mergedByType[entity.attrib["type"]] += 1
            entity.attrib["type"] = "Gene/protein/RNA"

    print >> sys.stderr, "Merged:"
    for k in sorted(mergedByType.keys()):
        print >> sys.stderr, "  " + k + ": " + str(mergedByType[k])

    print >> sys.stderr, "Writing output", options.output
    ETUtils.write(sourceRoot, options.output)
Example #33
0
    corpusElement = ET.Element("corpus")
    corpusElement.attrib["source"] = "GENIA"
    totalSentences = 0
    documentsWithSentences = []
    for documentElement in documentElements:
        parseGraphs = documentElement.attrib["parseGraphs"]
        del documentElement.attrib["parseGraphs"]
        sentenceCount = 0
        for parseGraph in parseGraphs:
            parseGraph.writeToInteractionXML(documentElement, sentenceCount)
            sentenceCount += 1
        if sentenceCount > 0:
            documentsWithSentences.append(documentElement)
        totalSentences += sentenceCount
    
    visibleSet = Split.getSample(len(documentsWithSentences), options.visibleSet, 0)
    visibleSetDocuments = 0
    visibleSetSentences = 0
    for i in range(len(documentsWithSentences)):
        if visibleSet[i] == 0:
            documentElement = documentsWithSentences[i]
            corpusElement.append(documentElement)
            visibleSetDocuments += 1
            visibleSetSentences += len(documentElement.findall("sentence"))
    ETUtils.write(corpusElement, options.output)
    print >> sys.stderr, "Total:", str(len(documentElements)) + " documents"
    print >> sys.stderr, "Total:", str(len(documentsWithSentences)) + " documents with sentences"
    print >> sys.stderr, "Total:", str(totalSentences) + " sentences"
    print >> sys.stderr, "Visible Set:", str(visibleSetDocuments) + " documents"
    print >> sys.stderr, "Visible Set:", str(visibleSetSentences) + " sentences"
Example #34
0
def run(input, output=None, elementName="entity", processElement="document", splitNewlines=False, debug=False, bannerPath=None, trovePath=None):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    
    # Write text to input file
    workdir = tempfile.mkdtemp()
    if debug:
        print >> sys.stderr, "BANNER work directory at", workdir
    infile = codecs.open(os.path.join(workdir, "input.txt"), "wt", "utf-8")
    idCount = 0
    for sentence in corpusRoot.getiterator(processElement):
        infile.write("U" + str(idCount) + " " + sentence.get("text").replace("\n", " ").replace("\n", " ") + "\n")
        idCount += 1
    infile.close()
    
    # Define classpath for java
    if bannerPath == None:
        bannerPath = Settings.BANNER_DIR
    if trovePath == None:
        trovePath = Settings.JAVA_TROVE_PATH
    libPath = "/lib/"
#    if not os.path.exists(bannerPath + libPath):
#        libPath = "/libs/"
#        assert os.path.exists(bannerPath + libPath)
    assert os.path.exists(bannerPath + libPath + "banner.jar"), bannerPath
    assert os.path.exists(trovePath), trovePath
    oldVersion = True
    classPath = bannerPath + "/bin"
    for filename in os.listdir(bannerPath + libPath):
        #if filename.endswith(".jar"):
        #    classPath += ":" + bannerPath + libPath + filename
        if filename == "uima":
            oldVersion = False
    classPath += ":" + bannerPath + libPath + "*"
#    classPath += ":" + bannerPath + libPath + "banner.jar"
#    classPath += ":" + bannerPath + libPath + "dragontool.jar"
#    classPath += ":" + bannerPath + libPath + "heptag.jar"
#    classPath += ":" + bannerPath + libPath + "commons-collections-3.2.1.jar"
#    classPath += ":" + bannerPath + libPath + "commons-configuration-1.6.jar"
#    classPath += ":" + bannerPath + libPath + "commons-lang-2.4.jar"
#    classPath += ":" + bannerPath + libPath + "mallet.jar"
#    classPath += ":" + bannerPath + libPath + "commons-logging-1.1.1.jar"
    if oldVersion:
        classPath += ":" + trovePath # ":/usr/share/java/trove.jar"
        print >> sys.stderr, "Trove library at", trovePath
    
    config = makeConfigXML(workdir, bannerPath, oldVersion)
    
    # Run parser
    print >> sys.stderr, "Running BANNER", bannerPath
    cwd = os.getcwd()
    os.chdir(bannerPath)
    if oldVersion: # old version
        args = ["java", "-cp", classPath, "banner.eval.TestModel", config]
    else:
        args = ["java", "-cp", classPath, "banner.eval.BANNER", "test", config]
    print >> sys.stderr, "BANNER command:", " ".join(args)
    startTime = time.time()
    exitCode = subprocess.call(args)
    assert exitCode == 0, exitCode
    print >> sys.stderr, "BANNER time:", str(datetime.timedelta(seconds=time.time()-startTime))
    os.chdir(cwd)
    
    # Put sentences in dictionary
    sDict = {}
    sentenceHasEntities = {}
    sCount = 0
    for sentence in corpusRoot.getiterator(processElement):
        sDict["U" + str(sCount)] = sentence
        sentenceHasEntities["U" + str(sCount)] = False
        sCount += 1
    
    sentencesWithEntities = 0
    totalEntities = 0
    nonSplitCount = 0
    splitEventCount = 0
    
    # TODO: mention.txt appears to contain predicted entities directly
    # To be able to feed BANNER documents (or poorly chopped sentences)
    # one should probably remove newlines, as BANNER separates its input
    # on newlines. Replacing all \r and \n characters should preserve the
    # character offsets.
    
    # Read BANNER results
    print >> sys.stderr, "Inserting entities"
    if oldVersion:
        outfile = codecs.open(os.path.join(workdir, "output.txt"), "rt", "utf-8")
        idfile = codecs.open(os.path.join(workdir, "ids.txt"), "rt", "utf-8")
        # Add output to sentences
        for line in outfile:
            bannerId = idfile.readline().strip()
            sentence = sDict[bannerId]
            
            # Find or create container elements
            sentenceId = sentence.get("id")
            
            sText = sentence.get("text")
            start = 0
            entityCount = 0
            beginOffset = None
            # Add tokens
            splits = line.strip().split()
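            # Each token is of the form text|tag; an 'O' tag closes any open entity
            # span, while any other tag starts or extends one.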
            for split in splits:
                tokenText, tag = split.rsplit("|", 1)
                # Determine offsets by aligning BANNER-generated tokens to original text
                cStart = sText.find(tokenText, start)
                assert cStart != -1, (tokenText, tag, sText, line)
                cEnd = cStart + len(tokenText) - 1
                start = cStart + len(tokenText)
                
                if tag == "O":
                    if beginOffset != None:
                        ## Make element
                        #ent = ET.Element(elementName)
                        #ent.set("id", sentenceId + ".e" + str(entityCount))
                        #ent.set("charOffset", str(beginOffset) + "-" + str(prevEnd))
                        #ent.set("type", "Protein")
                        #ent.set("isName", "True")
                        #ent.set("source", "BANNER")
                        #ent.set("text", sText[beginOffset:prevEnd+1])
                        entities = makeEntityElements(beginOffset, prevEnd, sText, splitNewlines, elementName)
                        assert len(entities) > 0
                        nonSplitCount += 1
                        if len(entities) > 1:
                            splitEventCount += 1
                        for ent in entities:
                            ent.set("id", sentenceId + ".e" + str(entityCount))
                            sentence.append(ent)
                            if not sentenceHasEntities[bannerId]:
                                sentencesWithEntities += 1
                                sentenceHasEntities[bannerId] = True
                            totalEntities += 1
                            entityCount += 1
                        beginOffset = None
                else:
                    if beginOffset == None:
                        beginOffset = cStart
                prevEnd = cEnd
        outfile.close()
        idfile.close()
    else:
        sentenceEntityCount = {}
        mentionfile = codecs.open(os.path.join(workdir, "mention.txt"), "rt", "utf-8")
        for line in mentionfile:
            bannerId, offsets, word = line.strip().split("|")
            offsets = offsets.split()
            sentence = sDict[bannerId]
            offsets[0], offsets[1] = fixOffset(line.strip(), word, int(offsets[0]), int(offsets[1]), sentence.get("text"))
            entities = makeEntityElements(int(offsets[0]), int(offsets[1]), sentence.get("text"), splitNewlines, elementName)
            entityText = "\n".join([x.get("text") for x in entities])
            assert entityText == word, (entityText, word, bannerId, offsets, sentence.get("id"), sentence.get("text"))
            assert len(entities) > 0, (line.strip(), sentence.get("text"))
            nonSplitCount += 1
            if len(entities) > 1:
                splitEventCount += 1
            if bannerId not in sentenceEntityCount:
                sentenceEntityCount[bannerId] = 0
            for ent in entities:
                ent.set("id", sentence.get("id") + ".e" + str(sentenceEntityCount[bannerId]))
                sentence.append(ent)
                if not sentenceHasEntities[bannerId]:
                    sentencesWithEntities += 1
                    sentenceHasEntities[bannerId] = True
                totalEntities += 1
                sentenceEntityCount[bannerId] += 1
        mentionfile.close()
    
    print >> sys.stderr, "BANNER found", nonSplitCount, "entities in", sentencesWithEntities, processElement + "-elements"
    print >> sys.stderr, "New", elementName + "-elements:", totalEntities, "(Split", splitEventCount, "BANNER entities with newlines)"
    
    # Remove work directory
    if not debug:
        shutil.rmtree(workdir)
    else:
        print >> sys.stderr, "BANNER working directory for debugging at", workdir
        
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
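
The TODO comment in the example above notes that BANNER splits its input on newlines, so embedded line breaks have to be neutralized without disturbing character offsets. A minimal sketch of that idea (an assumption, not part of the original pipeline):

def flattenForBanner(sText):
    # Replace line-break characters in place with spaces so that the string
    # length, and therefore every character offset, stays unchanged.
    flat = sText.replace("\r", " ").replace("\n", " ")
    assert len(flat) == len(sText)
    return flat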
Example #35
0
def writeSVG(svgTokens, svgEdges, fileName):
    svgElement = makeSVG(svgTokens, svgEdges)
    ETUtils.write(svgElement, fileName)
    return svgElement
Example #36
0
try:
    import psyco
    psyco.full()
    print >> sys.stderr, "Found Psyco, using"
except ImportError:
    print >> sys.stderr, "Psyco not installed"

assert(os.path.exists(sys.argv[1]))
corpusTree = ETUtils.ETFromObj(sys.argv[1])
corpusRoot = corpusTree.getroot()

resultRoot = ET.Element("root")
specElement = ET.Element("speculation")
resultRoot.append(specElement)
negElement = ET.Element("negation")
resultRoot.append(negElement)

for sentence in corpusRoot.getiterator("sentence"):
    inSpec = False
    inNeg = False
    for entity in sentence.findall("entity"):
        if entity.get("speculation") == "True" and not inSpec:
            specElement.append(sentence)
            inSpec = True
        if entity.get("negation") == "True" and not inNeg:
            negElement.append(sentence)
            inNeg = True
        if inSpec and inNeg:
            break

ETUtils.write(resultRoot, sys.argv[2])
Example #37
0
def convert(datasets, analysisTags, analysisPath, corpusName):
    global moveBI

    bigfileName = corpusName + "-" + "-and-".join([x[0] for x in datasets])
    documents = []
    for pair in datasets:
        print >> sys.stderr, "Reading", pair[0], "set,",
        sitesAreArguments = False
        if corpusName == "EPI":
            sitesAreArguments = True
        docs = ST.loadSet(pair[1],
                          pair[0],
                          "a2",
                          sitesAreArguments=sitesAreArguments)
        print >> sys.stderr, len(docs), "documents"
        documents.extend(docs)

    print >> sys.stderr, "Resolving equivalences"
    STFormat.Equiv.process(documents)

    print >> sys.stderr, "Checking data validity"
    for doc in documents:
        STFormat.Validate.validate(doc.events,
                                   simulation=True,
                                   verbose=True,
                                   docId=doc.id)
    print >> sys.stderr, "Writing all documents to geniaformat"
    ST.writeSet(documents,
                "all-geniaformat",
                resultFileTag="a2",
                debug=False,
                task=2,
                validate=False)

    print >> sys.stderr, "Converting to", bigfileName + "-documents.xml"
    xml = STConvert.toInteractionXML(documents, corpusName,
                                     bigfileName + "-documents.xml")

    if corpusName == "BI":
        InteractionXML.MixSets.mixSets(xml, None, set(moveBI), "train",
                                       "devel")

    for pair in datasets:
        if True:  #corpusName != "BI":
            print >> sys.stderr, "Adding analyses for set", pair[0]
            addAnalyses(xml, analysisTags[pair[0]], analysisPath, bigfileName)
    ETUtils.write(xml, bigfileName + "-sentences.xml")
    processParses(corpusName, xml)

    # Write out converted data
    ETUtils.write(xml, bigfileName + ".xml")
    InteractionXML.MergeDuplicateEntities.mergeAll(xml,
                                                   bigfileName + "-nodup.xml")
    for sourceTag in ["", "-nodup"]:
        print >> sys.stderr, "Dividing into sets"
        InteractionXML.DivideSets.processCorpus(
            bigfileName + sourceTag + ".xml", "./", corpusName + "-",
            sourceTag + ".xml", [("devel", "train")])
        if "devel" in [x[0] for x in datasets]:
            print >> sys.stderr, "Converting back"
            STConvert.toSTFormat(corpusName + "-devel" + sourceTag + ".xml",
                                 "roundtrip/" + corpusName + "-devel" +
                                 sourceTag + "-task2",
                                 outputTag="a2",
                                 task=2)
            STConvert.toSTFormat(corpusName + "-devel" + sourceTag + ".xml",
                                 "roundtrip/" + corpusName + "-devel" +
                                 sourceTag + "-task1",
                                 outputTag="a2",
                                 task=1)
            if corpusName == "GE":
                print >> sys.stderr, "Evaluating task 2 back-conversion"
                BioNLP11GeniaTools.evaluate("roundtrip/" + corpusName +
                                            "-devel" + sourceTag + "-task2",
                                            task=2,
                                            verbose=True,
                                            debug=False)
                print >> sys.stderr, "Evaluating task 1 back-conversion"
                BioNLP11GeniaTools.evaluate("roundtrip/" + corpusName +
                                            "-devel" + sourceTag + "-task1",
                                            task=1,
                                            verbose=True,
                                            debug=False)
            elif corpusName in ["BI", "BB"]:
                print >> sys.stderr, "Evaluating task 2 back-conversion"
                BioNLP11GeniaTools.evaluateBX(
                    "roundtrip/" + corpusName + "-devel" + sourceTag +
                    "-task2", corpusName)
                print >> sys.stderr, "Evaluating task 1 back-conversion"
                BioNLP11GeniaTools.evaluateBX(
                    "roundtrip/" + corpusName + "-devel" + sourceTag +
                    "-task1", corpusName)
            print >> sys.stderr, "Creating empty devel set"
            deletionRules = {"interaction": {}, "entity": {"isName": "False"}}
            InteractionXML.DeleteElements.processCorpus(
                corpusName + "-devel" + sourceTag + ".xml",
                corpusName + "-devel" + sourceTag + "-empty.xml",
                deletionRules)
Example #38
0
    optparser.add_option("-o", "--output", default=None, dest="output", help="The file to which the new XML structure is saved. If None, will be the same as target.", metavar="FILE")
    (options, args) = optparser.parse_args()

    print >> sys.stderr, "Loading input file", options.input
    sourceTree = ET.parse(options.input)
    sourceRoot = sourceTree.getroot()
    
    print >> sys.stderr, "Removing dependencies"
    parsesElements = sourceRoot.getiterator("parses")
    for parsesElement in parsesElements:
        for parseElement in parsesElement.findall("parse"):
            dependencies = parseElement.findall("dependency")
            toRemove = [False] * len(dependencies)
            for i in range(0, len(dependencies)-1):
                for j in range(i+1, len(dependencies)):
                    di = dependencies[i]
                    dj = dependencies[j]
                    if di.attrib["type"] == dj.attrib["type"] and di.attrib["t1"] == dj.attrib["t1"] and di.attrib["t2"] == dj.attrib["t2"]:
                        toRemove[j] = True
            count = 0
            for i in range(0, len(dependencies)):
                if toRemove[i]:
                    parseElement.remove(dependencies[i])
                    count += 1
            print >> sys.stderr, "Parse:", parseElement.attrib["parser"], "Removed:", count
    
    print >> sys.stderr, "Writing output", options.output
    ETUtils.write(sourceRoot, options.output)
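
The duplicate check above compares every pair of dependencies, which is quadratic in the number of dependencies per parse. A hedged alternative sketch (not the original code) that drops duplicates in a single pass by keying on the (type, t1, t2) triple:

def removeDuplicateDependencies(parseElement):
    # Keep only the first dependency seen for each (type, t1, t2) combination.
    seen = set()
    removed = 0
    for dependency in parseElement.findall("dependency"):
        key = (dependency.get("type"), dependency.get("t1"), dependency.get("t2"))
        if key in seen:
            parseElement.remove(dependency)
            removed += 1
        else:
            seen.add(key)
    return removed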


Example #39
0
def convertXML(parser, input, output, debug=False, reparse=False):
    global stanfordParserDir, stanfordParserArgs
    print >> sys.stderr, "Running Stanford conversion"
    print >> sys.stderr, "Stanford tools at:", stanfordParserDir
    print >> sys.stderr, "Stanford tools arguments:", " ".join(stanfordParserArgs)
    parseTimeStamp = time.strftime("%d.%m.%y %H:%M:%S")
    print >> sys.stderr, "Stanford time stamp:", parseTimeStamp
    
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    
    workdir = tempfile.mkdtemp()
    if debug:
        print >> sys.stderr, "Stanford parser workdir", workdir
    stanfordInput = os.path.join(workdir, "input")
    stanfordInputFile = codecs.open(stanfordInput, "wt", "utf-8")
    
    # Put penn tree lines in input file
    existingCount = 0
    for sentence in corpusRoot.getiterator("sentence"):
        if sentence.find("sentenceanalyses") != None: # old format
            sentenceAnalyses = setDefaultElement(sentence, "sentenceanalyses")
            parses = setDefaultElement(sentenceAnalyses, "parses")
            parse = getElementByAttrib(parses, "parse", {"parser":parser})
        else:
            analyses = setDefaultElement(sentence, "analyses")
            parse = getElementByAttrib(analyses, "parse", {"parser":parser})
        if parse == None:
            continue
        if len(parse.findall("dependency")) > 0:
            if reparse: # remove existing stanford conversion
                for dep in parse.findall("dependency"):
                    parse.remove(dep)
                del parse.attrib["stanford"]
            else: # don't reparse
                existingCount += 1
                continue
        pennTree = parse.get("pennstring")
        if pennTree == None or pennTree == "":
            continue
        stanfordInputFile.write(pennTree + "\n")
    stanfordInputFile.close()
    if existingCount != 0:
        print >> sys.stderr, "Skipping", existingCount, "already converted sentences."
    
    # Run Stanford parser
    stanfordOutput = runSentenceProcess(runStanford, stanfordParserDir, stanfordInput, 
                                        workdir, True, "StanfordParser", 
                                        "Stanford Conversion", timeout=600,
                                        outputArgs={"encoding":"latin1", "errors":"replace"})   
    #stanfordOutputFile = codecs.open(stanfordOutput, "rt", "utf-8")
    stanfordOutputFile = codecs.open(stanfordOutput, "rt", "latin1", "replace")
    
    # Get output and insert dependencies
    noDepCount = 0
    failCount = 0
    sentenceCount = 0
    for sentence in corpusRoot.getiterator("sentence"):
        # Get parse
        if sentence.find("sentenceanalyses") != None: # old format
            sentenceAnalyses = setDefaultElement(sentence, "sentenceanalyses")
            parses = setDefaultElement(sentenceAnalyses, "parses")
            parse = getElementByAttrib(parses, "parse", {"parser":parser})
        else:
            analyses = setDefaultElement(sentence, "analyses")
            parse = getElementByAttrib(analyses, "parse", {"parser":parser})
        if parse == None:
            if sentence.find("sentenceanalyses") != None: # old format: add the placeholder under parses
                parse = ET.SubElement(parses, "parse")
            else:
                parse = ET.SubElement(analyses, "parse")
            parse.set("parser", "None")
        if reparse:
            assert len(parse.findall("dependency")) == 0
        elif len(parse.findall("dependency")) > 0: # don't reparse
            continue
        pennTree = parse.get("pennstring")
        if pennTree == None or pennTree == "":
            parse.set("stanford", "no_penn")
            continue
        parse.set("stanfordSource", "TEES") # parser was run through this wrapper
        parse.set("stanfordDate", parseTimeStamp) # links the parse to the log file
        # Get tokens
        if sentence.find("analyses") != None:
            tokenization = getElementByAttrib(sentence.find("analyses"), "tokenization", {"tokenizer":parse.get("tokenizer")})
        else:
            tokenization = getElementByAttrib(sentence.find("sentenceanalyses").find("tokenizations"), "tokenization", {"tokenizer":parse.get("tokenizer")})
        assert tokenization != None
        count = 0
        tokenByIndex = {}
        for token in tokenization.findall("token"):
            tokenByIndex[count] = token
            count += 1
        # Insert dependencies
        deps = addDependencies(stanfordOutputFile, parse, tokenByIndex, sentence.get("id"))
        if len(deps) == 0:
            parse.set("stanford", "no_dependencies")
            noDepCount += 1
            if parse.get("stanfordAlignmentError") != None:
                failCount += 1
        else:
            parse.set("stanford", "ok")
            if parse.get("stanfordAlignmentError") != None:
                failCount += 1
                parse.set("stanford", "partial")
        sentenceCount += 1
    stanfordOutputFile.close()
    # Remove work directory
    if not debug:
        shutil.rmtree(workdir)
        
    print >> sys.stderr, "Stanford conversion was done for", sentenceCount, "sentences,", noDepCount, "had no dependencies,", failCount, "failed"
    
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
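
The conversion above relies on the helpers setDefaultElement and getElementByAttrib, which are not shown in this example. A minimal sketch of what they are assumed to do (get-or-create a named child element, and find a child whose attributes match a dictionary):

import xml.etree.ElementTree as ET

def setDefaultElement(parent, name):
    # Return the first child element with the given tag, creating it if missing.
    element = parent.find(name)
    if element is None:
        element = ET.SubElement(parent, name)
    return element

def getElementByAttrib(parent, tag, attribDict):
    # Return the first element with the given tag whose attributes all match attribDict.
    for element in parent.getiterator(tag):
        if all(element.get(k) == v for k, v in attribDict.items()):
            return element
    return None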
Example #40
0
def convertDownloaded(outdir,
                      corpus,
                      files,
                      intermediateFiles=True,
                      evaluate=True):
    global moveBI
    workdir = outdir + "/conversion/" + corpus
    if os.path.exists(workdir):
        shutil.rmtree(workdir)
    os.makedirs(workdir)

    print >> sys.stderr, "---------------", "Converting to XML", "---------------"
    # All datasets are processed as one XML, to ensure all the steps (parse modification etc.) are
    # applied equally
    datasets = ["devel", "train", "test"]
    bigfileName = os.path.join(outdir, corpus + "-" + "-and-".join(datasets))
    documents = []
    for setName in datasets:
        sourceFile = files[corpus + "_" + setName.upper()]
        print >> sys.stderr, "Reading", setName, "set from", sourceFile, "temp at ",
        sitesAreArguments = False
        if corpus == "EPI":
            sitesAreArguments = True
        docs = ST.loadSet(sourceFile,
                          setName,
                          "a2",
                          sitesAreArguments=sitesAreArguments)
        print >> sys.stderr, "Read", len(docs), "documents"
        documents.extend(docs)

    print >> sys.stderr, "Resolving equivalences"
    STFormat.Equiv.process(documents)

    if evaluate:
        print >> sys.stderr, "Checking data validity"
        for doc in documents:
            STFormat.Validate.validate(doc.events,
                                       simulation=True,
                                       verbose=True,
                                       docId=doc.id)
        print >> sys.stderr, "Writing all documents to geniaformat"
        ST.writeSet(documents,
                    os.path.join(workdir, "all-geniaformat"),
                    resultFileTag="a2",
                    debug=False,
                    task=2,
                    validate=False)

    if intermediateFiles:
        print >> sys.stderr, "Converting to XML, writing combined corpus to", bigfileName + "-documents.xml"
        xml = STConvert.toInteractionXML(documents, corpus,
                                         bigfileName + "-documents.xml")
    else:
        print >> sys.stderr, "Converting to XML"
        xml = STConvert.toInteractionXML(documents, corpus, None)

    if corpus == "BI":
        InteractionXML.MixSets.mixSets(xml, None, set(moveBI), "train",
                                       "devel")

    addAnalyses(xml, corpus, datasets, files, bigfileName)
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName + "-sentences.xml"
        ETUtils.write(xml, bigfileName + "-sentences.xml")
    processParses(xml)

    print >> sys.stderr, "---------------", "Writing corpora", "---------------"
    # Write out converted data
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName + ".xml"
        ETUtils.write(xml, bigfileName + ".xml")
    print >> sys.stderr, "Dividing into sets"
    InteractionXML.DivideSets.processCorpus(xml, outdir, corpus, ".xml")

    if evaluate and "devel" in datasets:
        print >> sys.stderr, "---------------", "Evaluating conversion", "---------------"
        print >> sys.stderr, "Converting back"
        STConvert.toSTFormat(os.path.join(outdir, corpus + "-devel.xml"),
                             workdir + "/roundtrip/" + corpus + "-devel" +
                             "-task1",
                             outputTag="a2",
                             task=1)
        STConvert.toSTFormat(os.path.join(outdir, corpus + "-devel.xml"),
                             workdir + "/roundtrip/" + corpus + "-devel" +
                             "-task2",
                             outputTag="a2",
                             task=2)
        print >> sys.stderr, "Evaluating task 1 back-conversion"
        BioNLP11GeniaTools.evaluate(
            workdir + "/roundtrip/" + corpus + "-devel" + "-task1",
            corpus + ".1")
        print >> sys.stderr, "Evaluating task 2 back-conversion"
        BioNLP11GeniaTools.evaluate(
            workdir + "/roundtrip/" + corpus + "-devel" + "-task2",
            corpus + ".2")
        print >> sys.stderr, "Note! Evaluation of Task 2 back-conversion can be less than 100% due to site-argument mapping"
Example #41
0
def mainFunc(input,
             output,
             parseName,
             tokenizationName=None,
             newParseName=None,
             newTokenizationName=None,
             logFileName=None,
             removeOld=True):
    print >> sys.stderr, "Protein Name Splitter"
    if logFileName != None:
        print >> sys.stderr, "Writing log to", logFileName
        logFile = open(logFileName, "wt")
    else:
        logFile = None
    #if input.endswith(".gz"):
    #    inFile = gzip.GzipFile(input)
    #else:
    #    inFile = open(input)
    tree = ETUtils.ETFromObj(input)

    if tokenizationName == None:
        tokenizationName = parseName

    #tree = ElementTree.parse(inFile)
    root = tree.getroot()

    sentences = [x for x in root.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "Split Protein Names")
    counter.showMilliseconds = True
    missingTokCount = 0
    for sentence in sentences:
        sId = sentence.get("id")
        counter.update(1, "Splitting names (" + sId + "): ")

        tok = getTokenization(tokenizationName,
                              sentence,
                              sId,
                              remove=removeOld)
        if tok == None:
            missingTokCount += 1
            continue

        assert tok is not None, "Missing tokenization '%s' in sentence %s!" % (
            tokenizationName, sId)

        parse = getParse(parseName,
                         tokenizationName,
                         sentence,
                         sId,
                         remove=removeOld)
        assert parse is not None, "Missing parse '%s' in sentence %s!" % (
            parseName, sId)

        split = splitTokens(tok, sentence, logFile)

        # Default names
        if removeOld:
            if newTokenizationName == None:
                newTokenizationName = tok.get("tokenizer")
            if newParseName == None:
                newParseName = parse.get("parser")
        else:
            if newTokenizationName == None:
                newTokenizationName = "split-" + tok.get("tokenizer")
            if newParseName == None:
                newParseName = "split-" + parse.get("parser")

        # add a new tokenization with the split tokens.
        splittok = addTokenization(newTokenizationName, sentence, sId)
        addTokensToTree(split, splittok)
        for a in tok.attrib:
            if splittok.get(a) == None:
                splittok.set(a, tok.get(a))
        #splittok.set("split-")

        # make a mapping from original to split token ids. Store the
        # head token when given.
        tokenIdMap = {}
        for t in split:
            if t.head:
                head = t.head
                # traverse
                while head.head is not None:
                    assert head.head != t, "Cyclic heads"
                    head = head.head

                # should match (nah, punctuation problems)
                # assert t.origId not in tokenIdMap or tokenIdMap[t.origId] == head.id, "Head conflict"
                tokenIdMap[t.origId] = head.id
            else:
                # only allow overwrite of existing entry if the current token
                # is not punctuation.
                if t.origId not in tokenIdMap or not t.isPunct():
                    tokenIdMap[t.origId] = t.id

        # make a copy of the specified parse that refers to the split tokens
        # instead of the originals.
        newparse = addParse(newParseName, newTokenizationName, sentence, sId)
        for a in parse.attrib:
            if newparse.get(a) == None:
                newparse.set(a, parse.get(a))
        newparse.set("ProteinNameSplitter", "True")
        splittok.set("ProteinNameSplitter", "True")

        depSeqId = 1
        for d in parse.getiterator("dependency"):
            t1, t2, dType = d.get("t1"), d.get("t2"), d.get("type")
            assert t1 in tokenIdMap and t2 in tokenIdMap, "INTERNAL ERROR"

            dep = ElementTree.SubElement(newparse, "dependency")
            dep.set("t1", tokenIdMap[t1])
            dep.set("t2", tokenIdMap[t2])
            dep.set("type", dType)
            dep.set("id", "split_%d" % depSeqId)
            depSeqId += 1

        # Add in new dependencies between the split parts.
        for t in [tok for tok in split if tok.head is not None]:
            dep = ElementTree.SubElement(newparse, "dependency")
            dep.set("t1", t.head.id)
            dep.set("t2", t.id)
            dep.set("type", t.depType)
            dep.set("split", "PNS")

        for phrase in parse.getiterator("phrase"):
            newparse.append(phrase)

            # debugging
            #print >> sys.stderr, "NEW DEP IN", sId

    print >> sys.stderr, "Tokenization missing from", missingTokCount, "sentences"

    #indent(root)
    if logFile != None:
        logFile.close()

    # debugging
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(tree, output)
    return tree
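
When a token is split, dependencies that referred to the original token are re-attached to the head token of its split group, found by following the head links upward. A condensed sketch of that resolution step, mirroring the loop above:

def resolveHead(token):
    # Follow head links until a token without a head is reached.
    head = token
    while head.head is not None:
        assert head.head is not token, "Cyclic heads"
        head = head.head
    return head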
Example #42
0
def makeSentences(input, tokenizationPath, output=None, removeText=False):
    """
    Divide text in the "text" attributes of document and section 
    elements into sentence elements. These sentence elements are
    inserted into their respective parent elements.
    """
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    
    print >> sys.stderr, "Inserting tokenizations from", tokenizationPath
    if tokenizationPath.find(".tar.gz") != -1:
        tarFilePath, tokenizationPath = tokenizationPath.split(".tar.gz")
        tarFilePath += ".tar.gz"
        tarFile = tarfile.open(tarFilePath)
        if tokenizationPath[0] == "/":
            tokenizationPath = tokenizationPath[1:]
    else:
        tarFile = None
    
    docCount = 0
    docsWithSentences = 0
    sentencesCreated = 0
    sourceElements = [x for x in corpusRoot.getiterator("document")] + [x for x in corpusRoot.getiterator("section")]
    counter = ProgressCounter(len(sourceElements), "Sentence Splitting")
    for document in sourceElements:
        docCount += 1
        counter.update(1, "Splitting Documents ("+document.get("id")+"/" + document.get("pmid") + "): ")
        docId = document.get("id")
        if docId == None:
            docId = "CORPUS.d" + str(docCount)
        if document.find("sentence") == None: # no existing sentence split                
            text = document.get("text")
            if text == None or text.strip() == "":
                continue
            
            newFile = os.path.join(tokenizationPath, document.get("pmid") + ".tok")
            f = openFile(newFile, tarFile)
            if f == None: # file with BioNLP'11 extension not found, try BioNLP'09 extension
                oldFile = os.path.join(tokenizationPath, document.get("pmid") + ".tokenized")
                f = openFile(oldFile, tarFile)
                if f == None: # no tokenization found
                    continue
            sentencesCreated += alignSentences(document, f.readlines())
            f.close()
    
            # Remove original text
            if removeText:
                del document.attrib["text"]
            # Move elements from document element to sentences
            moveElements(document)
            docsWithSentences += 1
        else:
            docsWithSentences += 1
    
    if tarFile != None:
        tarFile.close()
    print >> sys.stderr, "Sentence splitting created", sentencesCreated, "sentences"
    print >> sys.stderr, docsWithSentences, "/", docCount, "documents have sentences"
        
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
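
Both this example and Example #49 read per-document files either from the plain filesystem or from inside a .tar.gz archive through an openFile helper (imported from SentenceSplitter in Example #49). A rough sketch of the assumed behavior, returning None when the file cannot be found:

import os
import codecs

def openFile(path, tarFile=None):
    # Open 'path' from inside the given tar archive, or from the filesystem.
    if tarFile is not None:
        try:
            return tarFile.extractfile(tarFile.getmember(path))
        except KeyError:
            return None
    if os.path.exists(path):
        return codecs.open(path, "rt", "utf-8")
    return None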
Example #43
0
def buildExamples(corpusDir, outPath):
    # define shortcuts for commonly used files
    PARSE = "stanford-newMC-intra"  #"split-Charniak-Lease"
    TOK = "split-McClosky"
    CORPUS_DIR = corpusDir

    # xml files without heads
    BI_DEVEL_FILE = CORPUS_DIR + "/bioinfer.devel.refRem-eqRem-negRem-metaRes-anonRes.merged.gold.gif.xml"
    BI_TEST_FILE = CORPUS_DIR + "/bioinfer.test.refRem-eqRem-negRem-metaRes-anonRes.merged.gold.gif.xml"
    BI_TRAIN_FILE = CORPUS_DIR + "/bioinfer.train.refRem-eqRem-negRem-metaRes-anonRes.merged.gold.gif.xml"
    BI_TRAIN_AND_DEVEL_FILE = CORPUS_DIR + "/bioinfer.train+devel.refRem-eqRem-negRem-metaRes-anonRes.merged.gold.gif.xml"

    # xml files with head tokens
    TEST_FILE = outPath + "/bioinfer-test-" + PARSE + ".xml"
    DEVEL_FILE = outPath + "/bioinfer-devel-" + PARSE + ".xml"
    TRAIN_FILE = outPath + "/bioinfer-train-" + PARSE + ".xml"
    TRAIN_AND_DEVEL_FILE = outPath + "/bioinfer-train-and-devel-" + PARSE + ".xml"
    WORKDIR = outPath

    # Find heads
    sys.path.append("..")
    import Core.SentenceGraph as SentenceGraph
    import cElementTreeUtils as ETUtils
    if not os.path.exists(TEST_FILE):
        c = SentenceGraph.loadCorpus(BI_TEST_FILE, PARSE, TOK)
        ETUtils.write(c.rootElement, TEST_FILE)
    if not os.path.exists(DEVEL_FILE):
        c = SentenceGraph.loadCorpus(BI_DEVEL_FILE, PARSE, TOK)
        ETUtils.write(c.rootElement, DEVEL_FILE)
    if not os.path.exists(TRAIN_FILE):
        c = SentenceGraph.loadCorpus(BI_TRAIN_FILE, PARSE, TOK)
        ETUtils.write(c.rootElement, TRAIN_FILE)
    if not os.path.exists(TRAIN_AND_DEVEL_FILE):
        c = SentenceGraph.loadCorpus(BI_TRAIN_AND_DEVEL_FILE, PARSE, TOK)
        ETUtils.write(c.rootElement, TRAIN_AND_DEVEL_FILE)

    ###############################################################################
    # Trigger example generation
    ###############################################################################
    print >> sys.stderr, "Trigger examples for parse", TOK
    if not os.path.exists("gazetteer-train-" + TOK):
        Gazetteer.run(TRAIN_FILE, "gazetteer-train-" + TOK, TOK)
    if not os.path.exists("gazetteer-train-and-devel-" + TOK):
        Gazetteer.run(TRAIN_AND_DEVEL_FILE, "gazetteer-train-and-devel-" + TOK,
                      TOK)
    # generate the files for the old charniak
    if not os.path.exists("trigger-train-examples-" + PARSE):
        GeneralEntityTypeRecognizerGztr.run(TRAIN_FILE,
                                            "trigger-train-examples-" + PARSE,
                                            PARSE, TOK, "style:typed",
                                            "bioinfer-trigger-ids",
                                            "gazetteer-train-" + TOK)
    if not os.path.exists("trigger-devel-examples-" + PARSE):
        GeneralEntityTypeRecognizerGztr.run(DEVEL_FILE,
                                            "trigger-devel-examples-" + PARSE,
                                            PARSE, TOK, "style:typed",
                                            "bioinfer-trigger-ids",
                                            "gazetteer-train-" + TOK)
    if not os.path.exists("trigger-train-and-devel-examples-" + PARSE):
        GeneralEntityTypeRecognizerGztr.run(
            TRAIN_AND_DEVEL_FILE, "trigger-train-and-devel-examples-" + PARSE,
            PARSE, TOK, "style:typed", "bioinfer-trigger-ids",
            "gazetteer-train-and-devel-" + TOK)
    if not os.path.exists("trigger-test-examples-" + PARSE):
        GeneralEntityTypeRecognizerGztr.run(TEST_FILE,
                                            "trigger-test-examples-" + PARSE,
                                            PARSE, TOK, "style:typed",
                                            "bioinfer-trigger-ids",
                                            "gazetteer-train-and-devel-" + TOK)

    ###############################################################################
    # Edge example generation
    ###############################################################################
    print >> sys.stderr, "Edge examples for parse", PARSE
    EDGE_FEATURE_PARAMS = "style:typed,directed,no_linear,entities,noMasking,maxFeatures,bioinfer_limits"

    if not os.path.exists("edge-train-examples-" + PARSE):
        MultiEdgeExampleBuilder.run(TRAIN_FILE, "edge-train-examples-" + PARSE,
                                    PARSE, TOK, EDGE_FEATURE_PARAMS,
                                    "bioinfer-edge-ids")
    if not os.path.exists("edge-devel-examples-" + PARSE):
        MultiEdgeExampleBuilder.run(DEVEL_FILE, "edge-devel-examples-" + PARSE,
                                    PARSE, TOK, EDGE_FEATURE_PARAMS,
                                    "bioinfer-edge-ids")
    if not os.path.exists("edge-train-and-devel-examples-" + PARSE):
        MultiEdgeExampleBuilder.run(TRAIN_AND_DEVEL_FILE,
                                    "edge-train-and-devel-examples-" + PARSE,
                                    PARSE, TOK, EDGE_FEATURE_PARAMS,
                                    "bioinfer-edge-ids")
    # NOTE! These TEST examples will be based on gold standard triggers!
    if not os.path.exists("edge-test-examples-" + PARSE):
        MultiEdgeExampleBuilder.run(TEST_FILE, "edge-test-examples-" + PARSE,
                                    PARSE, TOK, EDGE_FEATURE_PARAMS,
                                    "bioinfer-edge-ids")
Example #44
0
        for pW,cls in prds:
            result[idx][cls]=pW/float(total)
            assert result[idx][cls]<=1.0 and result[idx][cls]>=0.0,"%f/%f=%f"%(pW,total,result[idx][cls])
    for d in result:
        d["neg"]=d.get("neg",0.0)*boost
    return result

if __name__=="__main__":
    desc="Weighted combination of several trigger word recognizers"
    parser = OptionParser(description=desc)
    parser.add_option("--lambda",dest="l",action="store",default=None,type="float",help="The mixing weight of predictions1 with predictions2. A number between 0 and 1. No default.")
    parser.add_option("--b1",dest="b1",action="store",default=None,type="float",help="Recall boost of file1")
    parser.add_option("--b2",dest="b2",action="store",default=None,type="float",help="Recall boost of file2")
    

    (options, args) = parser.parse_args()

    if options.l==None:
        print >> sys.stderr, "You need to give a lambda"
        sys.exit(1)

    tree1=ET.parse(args[0]).getroot()
    tree2=ET.parse(args[1]).getroot()
    assert len(tree1)==len(tree2)
    for docIdx in range(len(tree1)):
        assert len(tree1[docIdx])==len(tree2[docIdx])
        for sIdx in range(len(tree1[docIdx])):
            newSNode=merge2sents(tree1[docIdx][sIdx],tree2[docIdx][sIdx],options.l,options.b1,options.b2)
            tree1[docIdx][sIdx]=newSNode
    ETUtils.write(tree1,sys.stdout)
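
merge2sents is not shown in this example; from the fragment at the top, each token appears to carry a normalized class-probability dictionary, and the --lambda option presumably mixes the two recognizers' distributions linearly, with --b1/--b2 boosting the "neg" class of each input. A hypothetical sketch of such a mixing step (the function name and exact weighting are assumptions):

def mixDistributions(dist1, dist2, l, boost1=1.0, boost2=1.0):
    # l * dist1 + (1 - l) * dist2 over the union of class labels,
    # after boosting the negative class of each input distribution.
    d1 = dict(dist1)
    d2 = dict(dist2)
    d1["neg"] = d1.get("neg", 0.0) * boost1
    d2["neg"] = d2.get("neg", 0.0) * boost2
    classes = set(d1) | set(d2)
    return dict((cls, l * d1.get(cls, 0.0) + (1.0 - l) * d2.get(cls, 0.0))
                for cls in classes)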
Example #45
0
 def writeXML(self, examples, predictions, corpus, outputFile, classSet=None, parse=None, tokenization=None, goldCorpus=None):
     #print >> sys.stderr, "Writing output to Interaction XML"
     corpus = self.loadCorpus(corpus, parse, tokenization)
     if goldCorpus != None:
         goldCorpus = self.loadCorpus(goldCorpus, parse, tokenization)
     examples, predictions = self.loadExamples(examples, predictions)
     
     if type(classSet) == types.StringType: # class names are in file
         classSet = IdSet(filename=classSet)
     classIds = None
     if classSet != None:
         classIds = classSet.getIds()
         
     #counter = ProgressCounter(len(corpus.sentences), "Write Examples")
             
     exampleQueue = [] # One sentence's examples
     predictionsByExample = {}
     currentMajorId = None
     prevMajorIds = set()
     processedSentenceIds = set()
     xType = None
     
     count = 0
     for example in examples:
         count += 1
     assert count > 0
     progress = ProgressCounter(count, "Write Examples")
     
     for example, prediction in itertools.izip_longest(examples, predictions):
         assert example != None
         assert prediction != None
         majorId, minorId = example[0].rsplit(".x", 1)
         #if currentMajorId == "GENIA.d114.s9": print "Start"
         if majorId != currentMajorId: # new sentence
             if currentMajorId != None:
                 #if currentMajorId == "GENIA.d114.s9": print "JAA"
                 processedSentenceIds.add(currentMajorId)
                 sentenceObject = corpus.sentencesById[currentMajorId]
                 goldSentence = None
                 if goldCorpus != None:
                     goldSentence = goldCorpus.sentencesById[currentMajorId]
                 self.writeXMLSentence(exampleQueue, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=goldSentence) # process queue
                 progress.update(len(exampleQueue), "Writing examples ("+exampleQueue[-1][0]+"): ")
             exampleQueue = []
             predictionsByExample = {}
             prevMajorIds.add(currentMajorId)
             assert majorId not in prevMajorIds, majorId
             currentMajorId = majorId 
         exampleQueue.append(example) # queue example
         predictionsByExample[example[0]] = prediction
         assert example[3]["xtype"] == self.xType, str(example[3]["xtype"]) + "/" + str(self.xType)
     
     # Process what is still in queue
     if currentMajorId != None:
         processedSentenceIds.add(currentMajorId)
         sentenceObject = corpus.sentencesById[currentMajorId]
         goldSentence = None
         if goldCorpus != None:
             goldSentence = goldCorpus.sentencesById[currentMajorId]
         self.writeXMLSentence(exampleQueue, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=goldSentence) # process queue
         progress.update(len(exampleQueue), "Writing examples ("+exampleQueue[-1][0]+"): ")
         exampleQueue = []
         predictionsByExample = {}
     
     # Process sentences with no examples (e.g. to clear interactions)
     for sentenceId in sorted(corpus.sentencesById.keys()):
         if sentenceId not in processedSentenceIds:
             sentenceObject = corpus.sentencesById[sentenceId]
             goldSentence = None
             if goldCorpus != None:
                 goldSentence = goldCorpus.sentencesById[currentMajorId]
             self.writeXMLSentence([], {}, sentenceObject, classSet, classIds, goldSentence=goldSentence)
     
     # Print statistics
     if len(self.counts) > 0:
         print >> sys.stderr, self.counts
         self.counts = defaultdict(int)
 
     # Write corpus
     if outputFile != None:
         print >> sys.stderr, "Writing corpus to", outputFile
         ETUtils.write(corpus.rootElement, outputFile)
     return corpus.tree
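
Examples are grouped back into sentences by splitting the example id on its last ".x" component: everything before it is the sentence ("major") id and everything after it the per-sentence example counter. A small illustration of the id convention used above:

exampleId = "GENIA.d114.s9.x3"
majorId, minorId = exampleId.rsplit(".x", 1)
assert majorId == "GENIA.d114.s9" and minorId == "3"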
Example #46
0
def tokenize(input,
             output=None,
             tokenizationName="GeniaTagger-3.0.1",
             extraFields=[]):  #["base", "chunk", "NE"]):
    global geniaTaggerDir

    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    # Write text to input file
    workdir = tempfile.mkdtemp()
    infile = codecs.open(os.path.join(workdir, "tagger-input.txt"), "wt",
                         "utf-8")
    numCorpusSentences = 0
    for sentence in corpusRoot.getiterator("sentence"):
        infile.write(sentence.get("text") + "\n")
        numCorpusSentences += 1
    infile.close()

    # Run tagger
    cwd = os.getcwd()
    os.chdir(geniaTaggerDir)
    args = [geniaTaggerDir + "/geniatagger"]
    #args += [ "<", os.path.join(workdir, "tagger-input.txt")]
    #args += [ ">", os.path.join(workdir, "tagger-output.txt")]
    #subprocess.call(args,
    process = subprocess.Popen(
        args,
        stdin=codecs.open(os.path.join(workdir, "tagger-input.txt"), "rt",
                          "utf-8"),
        stdout=codecs.open(os.path.join(workdir, "tagger-output.txt"), "wt",
                           "utf-8"))
    waitForProcess(process, numCorpusSentences, True,
                   os.path.join(workdir, "tagger-output.txt"), "GeniaTagger",
                   "Tokenizing Sentences")
    os.chdir(cwd)

    # Read tokenization
    outfile = codecs.open(os.path.join(workdir, "tagger-output.txt"), "rt",
                          "utf-8")
    # Add output to sentences
    for sentence in corpusRoot.getiterator("sentence"):
        # Find or create container elements
        sentenceAnalyses = sentence.find("sentenceAnalyses")
        if sentenceAnalyses == None:
            sentenceAnalyses = ET.Element("sentenceAnalyses")
            sentence.append(sentenceAnalyses)
        tokenizations = sentenceAnalyses.find("tokenizations")
        if tokenizations == None:
            tokenizations = ET.Element("tokenizations")
            sentenceAnalyses.append(tokenizations)
        prevTokenizationIndex = 0
        for prevTokenization in tokenizations.findall("tokenization"):
            assert prevTokenization.get("tokenizer") != tokenizationName
            prevTokenizationIndex += 1
        tokenization = ET.Element("tokenization")
        tokenization.set("tokenizer", tokenizationName)
        tokenizations.insert(prevTokenizationIndex, tokenization)

        sText = sentence.get("text")
        start = 0
        tokenCount = 0
        line = outfile.readline()
        while line.strip() != "":
            # Add tokens
            splits = line.strip().split("\t")
            # Determine offsets
            cStart = sText.find(splits[0], start)
            if cStart == -1:
                if splits[0] == "``":
                    splits[0] = "\""
                if splits[0] == "''":
                    splits[0] = "\""
                cStart = sText.find(splits[0], start)
            assert cStart != -1, (sentence.get("id"), sText, line, tokenCount)
            cEnd = cStart + len(splits[0])
            start = cStart + len(splits[0])
            # Make element
            token = ET.Element("token")
            token.set("id", "gt_" + str(tokenCount + 1))
            token.set("text", splits[0])
            if "base" in extraFields:
                token.set("base", splits[1])
            token.set("POS", splits[2])
            if "chunk" in extraFields:
                token.set("chunk", splits[3])
            if "NE" in extraFields:
                token.set("NE", splits[4])
            token.set("charOffset",
                      str(cStart) + "-" + str(cEnd - 1))  # NOTE: check
            tokenization.append(token)
            tokenCount += 1
            line = outfile.readline()

    outfile.close()
    # Remove work directory
    shutil.rmtree(workdir)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
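
The charOffset written above uses an inclusive end position (cEnd - 1), matching how offsets are used elsewhere in this XML format (for example the sText[beginOffset:prevEnd+1] slice in the BANNER example). A small sketch of writing and reading that convention:

text = "protein kinase"
token = "kinase"
cStart = text.find(token)
cEnd = cStart + len(token) - 1              # inclusive end offset
charOffset = str(cStart) + "-" + str(cEnd)  # "8-13"
begin, end = [int(x) for x in charOffset.split("-")]
assert text[begin:end + 1] == token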
Example #47
0
                selection = sentence.attrib["origId"] in idList
                if options.invert:
                    selection = not selection
                assert (keep == None or keep == selection)
                keep = selection
            if not keep:
                corpusRoot.remove(document)
                removedDocuments += 1
                removedSentences += len(sentences)
            else:
                keptDocuments += 1
                keptSentences += len(sentences)
        else:
            selection = documentSets[i] != 0
            if options.invert:
                selection = not selection
            if selection:
                corpusRoot.remove(document)
                removedDocuments += 1
                removedSentences += len(sentences)
            else:
                keptDocuments += 1
                keptSentences += len(sentences)

    print >> sys.stderr, "Corpus:", keptDocuments + removedDocuments, "documents,", keptSentences + removedSentences, "sentences."
    print >> sys.stderr, "Removed:", removedDocuments, "documents,", removedSentences, "sentences."
    print >> sys.stderr, "Subset:", keptDocuments, "documents,", keptSentences, "sentences."

    print >> sys.stderr, "Writing subset to", options.output
    ETUtils.write(corpusRoot, options.output)
Example #48
0
def makeSentences(input,
                  output=None,
                  removeText=False,
                  postProcess=True,
                  debug=False):
    """
    Run GENIA Sentence Splitter
    
    Divide text in the "text" attributes of document and section 
    elements into sentence elements. These sentence elements are
    inserted into their respective parent elements.
    """
    global sentenceSplitterDir

    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    print >> sys.stderr, "Running GENIA Sentence Splitter", Settings.GENIA_SENTENCE_SPLITTER_DIR,
    if postProcess:
        print >> sys.stderr, "(Using post-processing)"
    else:
        print >> sys.stderr, "(No post-processing)"
    docCount = 0
    sentencesCreated = 0
    redivideCount = 0
    sourceElements = [x for x in corpusRoot.getiterator("document")
                      ] + [x for x in corpusRoot.getiterator("section")]
    counter = ProgressCounter(len(sourceElements), "GeniaSentenceSplitter")
    counter.showMilliseconds = True
    # Create working directory
    workdir = tempfile.mkdtemp()
    for document in sourceElements:
        counter.update(1, "Splitting Documents (" + document.get("id") + "): ")
        docId = document.get("id")
        if docId == None:
            docId = "CORPUS.d" + str(docCount)
        docTag = "-" + str(docCount)
        assert document.find("sentence") == None
        text = document.get("text")
        if text == None or text.strip() == "":
            continue
        #print type(text)
        # Write text to workfile
        #workdir = tempfile.mkdtemp()
        workfile = codecs.open(
            os.path.join(workdir, "sentence-splitter-input.txt" + docTag),
            "wt", "utf-8")
        # From http://themoritzfamily.com/python-encodings-and-unicode.html
        # "You have to be careful with the codecs module. Whatever you pass to it must be a Unicode
        # object otherwise it will try to automatically decode the byte stream as ASCII"
        # However, the unicode errors here were simply due to STTools reading unicode ST-format as ASCII,
        # thus creating an ASCII interaction XML, which then triggered the unicode error here. So, at this
        # point we should be able to safely write(text), as the output file is unicode, and reading with
        # the correct codec is taken care of earlier in the pipeline.
        workfile.write(text)  #.encode("utf-8"))
        workfile.close()
        # Run sentence splitter
        assert os.path.exists(
            Settings.GENIA_SENTENCE_SPLITTER_DIR +
            "/run_geniass.sh"), Settings.GENIA_SENTENCE_SPLITTER_DIR
        args = [
            Settings.GENIA_SENTENCE_SPLITTER_DIR + "/run_geniass.sh",
            os.path.join(workdir, "sentence-splitter-input.txt" + docTag),
            os.path.join(workdir, "sentence-splitter-output.txt" + docTag),
            Settings.RUBY_PATH
        ]
        #p = subprocess.call(args)
        p = subprocess.Popen(args,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        stdout, stderr = p.communicate()
        if stdout != "":
            print >> sys.stderr, stdout
        if stderr != 'Extracting events.roading model file.\nstart classification.\n':
            print >> sys.stderr, stderr
        #print "stdout<", p.stdout.readlines(), ">"
        #print "stderr<", p.stderr.readlines(), ">"
        if postProcess:
            ppIn = codecs.open(
                os.path.join(workdir, "sentence-splitter-output.txt" + docTag),
                "rt", "utf-8")
            ppOut = codecs.open(
                os.path.join(
                    workdir,
                    "sentence-splitter-output-postprocessed.txt" + docTag),
                "wt", "utf-8")
            subprocess.call(os.path.join(Settings.GENIA_SENTENCE_SPLITTER_DIR,
                                         "geniass-postproc.pl"),
                            stdin=ppIn,
                            stdout=ppOut)
            ppIn.close()
            ppOut.close()
            # Read split sentences
            workfile = codecs.open(
                os.path.join(
                    workdir,
                    "sentence-splitter-output-postprocessed.txt" + docTag),
                "rt", "utf-8")
        else:
            workfile = codecs.open(
                os.path.join(workdir, "sentence-splitter-output.txt" + docTag),
                "rt", "utf-8")
        start = 0  # sentences are consecutively aligned to the text for charOffsets
        sentenceCount = 0
        #text = text.replace("\n", " ") # should stop sentence splitter from crashing.
        #text = text.replace("  ", " ") # should stop sentence splitter from crashing.
        #alignmentText = text.replace("\n", " ").replace("\r", " ")
        #docTokens = reWhiteSpace.split(text)
        docIndex = 0
        sentenceBeginIndex = -1
        prevSentence = None
        prevEndIndex = None
        emptySentenceCount = 0
        prevText = None
        for sText in workfile.readlines():
            sText = sText.strip()  # The text of the sentence
            if sText == "":
                emptySentenceCount += 1
                continue

            for i in range(len(sText)):
                if sText[i].isspace():
                    assert sText[i] not in ["\n", "\r"]
                    continue
                while text[docIndex].isspace():
                    if text[docIndex] in ["\n", "\r"
                                          ] and sentenceBeginIndex != -1:
                        redivideCount += 1
                        prevSentence = makeSentence(text, sentenceBeginIndex,
                                                    docIndex - 1, prevSentence,
                                                    prevEndIndex)
                        prevSentence.set("id",
                                         docId + ".s" + str(sentenceCount))
                        prevSentence.set("redevided", "True")
                        sentencesCreated += 1
                        sentenceCount += 1
                        prevEndIndex = docIndex - 1
                        sentenceBeginIndex = -1
                        document.append(prevSentence)
                    docIndex += 1
                assert sText[i] == text[docIndex], (
                    text, sText, prevText, sText[i:i + 10],
                    text[docIndex:docIndex + 10], (i, docIndex),
                    sentenceBeginIndex)  # tokens[i].isspace() == False
                if sentenceBeginIndex == -1:
                    sentenceBeginIndex = docIndex
                docIndex += 1
                prevText = sText
            if sentenceBeginIndex != -1:
                prevSentence = makeSentence(text, sentenceBeginIndex,
                                            docIndex - 1, prevSentence,
                                            prevEndIndex)
                prevSentence.set("id", docId + ".s" + str(sentenceCount))
                prevEndIndex = docIndex - 1
                sentenceBeginIndex = -1
                sentencesCreated += 1
                sentenceCount += 1
                document.append(prevSentence)
        # Add possible tail for last sentence
        if prevEndIndex < len(text) - 1 and prevSentence != None:
            assert prevSentence.get("tail") == None, prevSentence.get("tail")
            prevSentence.set("tail", text[prevEndIndex + 1:])

        if emptySentenceCount > 0:
            print >> sys.stderr, "Warning,", emptySentenceCount, "empty sentences in", document.get(
                "id")
        # Remove original text
        if removeText:
            del document.attrib["text"]
        # Move elements from document element to sentences
        moveElements(document)
        docCount += 1

    print >> sys.stderr, "Sentence splitting created", sentencesCreated, "sentences"
    print >> sys.stderr, "Redivided", redivideCount, "sentences"

    if debug:
        print >> sys.stderr, "Work directory preserved for debugging at", workdir
    else:
        # Remove work directory
        shutil.rmtree(workdir)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
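
The alignment loop above walks each split sentence and the original document text in parallel, skipping whitespace on both sides, so every sentence receives character offsets into the unmodified document text. A simplified sketch of the same idea (without the newline-redivision logic of the original):

def alignSentence(docText, sentText, docIndex):
    # Advance docIndex over the original text until every non-whitespace
    # character of sentText has been matched; return the inclusive begin/end
    # offsets of the sentence and the index to continue from.
    begin = None
    for char in sentText:
        if char.isspace():
            continue
        while docText[docIndex].isspace():
            docIndex += 1
        assert docText[docIndex] == char
        if begin is None:
            begin = docIndex
        docIndex += 1
    return begin, docIndex - 1, docIndex

begin, end, nextIndex = alignSentence("A cat.  It ran.", "A cat.", 0)
assert (begin, end) == (0, 5)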
Example #49
0
def insertParses(input, parsePath, output=None, parseName="McCC", extraAttributes={}):
    import tarfile
    from SentenceSplitter import openFile
    """
    Divide text in the "text" attributes of document and section 
    elements into sentence elements. These sentence elements are
    inserted into their respective parent elements.
    """  
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    
    print >> sys.stderr, "Inserting parses from", parsePath
    if parsePath.find(".tar.gz") != -1:
        tarFilePath, parsePath = parsePath.split(".tar.gz")
        tarFilePath += ".tar.gz"
        tarFile = tarfile.open(tarFilePath)
        if parsePath[0] == "/":
            parsePath = parsePath[1:]
    else:
        tarFile = None
    
    docCount = 0
    failCount = 0
    sentenceCount = 0
    docsWithStanford = 0
    sentencesCreated = 0
    sourceElements = [x for x in corpusRoot.getiterator("document")] + [x for x in corpusRoot.getiterator("section")]
    counter = ProgressCounter(len(sourceElements), "McCC Parse Insertion")
    for document in sourceElements:
        docCount += 1
        docId = document.get("id")
        if docId == None:
            docId = "CORPUS.d" + str(docCount)
        
        f = openFile(os.path.join(parsePath, document.get("pmid") + ".sd"), tarFile)
        if f == None: # file with BioNLP'11 extension not found, try BioNLP'09 extension
            f = openFile(os.path.join(parsePath, document.get("pmid") + ".dep"), tarFile)
        if f != None:
            sentences = document.findall("sentence")
            # TODO: Following for-loop is the same as when used with a real parser, and should
            # be moved to its own function.
            for sentence in sentences:
                sentenceCount += 1
                counter.update(0, "Processing Documents ("+sentence.get("id")+"/" + document.get("pmid") + "): ")
                if not insertParse(sentence, f, parseName, extraAttributes={}):
                    failCount += 1
            f.close()
        counter.update(1, "Processing Documents ("+document.get("id")+"/" + document.get("pmid") + "): ")
    
    if tarFile != None:
        tarFile.close()
    #print >> sys.stderr, "Sentence splitting created", sentencesCreated, "sentences"
    #print >> sys.stderr, docsWithSentences, "/", docCount, "documents have stanford parses"

    print >> sys.stderr, "Stanford conversion was inserted to", sentenceCount, "sentences,", failCount, "failed"
        
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Example #50
0
    tags = ["e1", "e2"]
    for sentence in corpusElements.sentences:
        counter.update(
            1,
            "Resolving chains for (" + sentence.sentence.attrib["id"] + "): ")
        identityChainDict = {}
        tokenHeadScores = sentence.sentenceGraph.getTokenHeadScores()
        for interaction in sentence.interactions:
            if interaction.attrib["type"] == "identity":
                e1 = sentence.entitiesById[interaction.attrib["e1"]]
                e2 = sentence.entitiesById[interaction.attrib["e2"]]
                t1 = sentence.sentenceGraph.entityHeadTokenByEntity[e1]
                t2 = sentence.sentenceGraph.entityHeadTokenByEntity[e2]
                if tokenHeadScores[t2] > tokenHeadScores[t1]:
                    identityChainDict[
                        interaction.attrib["e1"]] = interaction.attrib["e2"]
                else:
                    identityChainDict[
                        interaction.attrib["e2"]] = interaction.attrib["e1"]
        for interaction in sentence.interactions:
            if interaction.attrib["type"] != "identity":
                for tag in tags:
                    id = interaction.attrib[tag]
                    while identityChainDict.has_key(id):
                        id = identityChainDict[id]
                    if id != interaction.attrib[tag]:
                        interaction.attrib[tag] = id

    print >> sys.stderr, "Writing output", options.output
    ETUtils.write(corpusElements.rootElement, options.output)
Example #51
0
                selection = sentence.attrib["origId"] in idList
                if options.invert:
                    selection = not selection
                assert keep == None or keep == selection
                keep = selection
            if not keep:
                corpusRoot.remove(document)
                removedDocuments += 1
                removedSentences += len(sentences)
            else:
                keptDocuments += 1
                keptSentences += len(sentences)
        else:
            selection = documentSets[i] != 0
            if options.invert:
                selection = not selection
            if selection:
                corpusRoot.remove(document)
                removedDocuments += 1
                removedSentences += len(sentences)
            else:
                keptDocuments += 1
                keptSentences += len(sentences)

    print >>sys.stderr, "Corpus:", keptDocuments + removedDocuments, "documents,", keptSentences + removedSentences, "sentences."
    print >>sys.stderr, "Removed:", removedDocuments, "documents,", removedSentences, "sentences."
    print >>sys.stderr, "Subset:", keptDocuments, "documents,", keptSentences, "sentences."

    print >>sys.stderr, "Writing subset to", options.output
    ETUtils.write(corpusRoot, options.output)
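
The id-list branch above marks a document for keeping when its sentences' origId values are in the list, and --invert flips that decision before documents are removed or counted. A toy sketch of the same keep/invert decision on made-up records:

# Toy sketch of the keep/invert selection used in the id-list branch above
# (the id list and document records are made up for illustration).
idList = set(["PMID1", "PMID3"])
documents = [{"origId": "PMID1"}, {"origId": "PMID2"}, {"origId": "PMID3"}]
invert = False

kept, removed = [], []
for document in documents:
    selection = document["origId"] in idList
    if invert:
        selection = not selection
    (kept if selection else removed).append(document)
print len(kept), "kept,", len(removed), "removed"  # 2 kept, 1 removed
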
Example #52
0
def interface(optionArgs=sys.argv[1:]):
    """
    The function to handle the command-line interface.
    """
    from optparse import OptionParser

    op = OptionParser(
        usage="%prog [options]\nGenia shared task specific unflattening.")
    op.add_option("-i",
                  "--infile",
                  dest="infile",
                  help="Input file (gifxml)",
                  metavar="FILE")
    op.add_option("-o",
                  "--outfile",
                  dest="outfile",
                  help="Output file (gifxml)",
                  metavar="FILE")
    op.add_option(
        "-p",
        "--perfect",
        dest="perfect",
        help="Process only those event which can be perfectly solved",
        action="store_true",
        default=False)
    op.add_option("-a",
                  "--parse",
                  dest="parse",
                  help="Parse to be used",
                  metavar="PARSE")
    op.add_option("-t",
                  "--tokens",
                  dest="tokens",
                  help="Tokens to be used",
                  metavar="TOKENS")
    (options, args) = op.parse_args(optionArgs)

    quit = False
    if not options.infile:
        print "Please specify the input file."
        quit = True


#    if not options.outfile:
#        print "Please specify the output file."
#        quit = True
    if not options.parse:
        print "Please specify the parse."
        quit = True
    if not options.tokens:
        print "Please specify the tokenisation."
        quit = True
    if quit:
        op.print_help()
        return False

    corpus = ETUtils.ETFromObj(options.infile)
    documents = corpus.getroot().findall('document')
    counter = ProgressCounter(len(documents), "Unflatten")
    for document in documents:
        counter.update(1, "Unflattening (" + document.get("id") + "): ")
        #sys.stderr.write("Unflattening document %s\n"%document.attrib['id'])
        unflattener = Unflattener(document, options.perfect, options.tokens,
                                  options.parse)
        #if len(unflattener.tokens) == 0:
        #    continue
        unflattener.analyse()
        unflattener.unflatten()
    #indent(corpus.getroot())
    if options.outfile:
        ETUtils.write(corpus, options.outfile)
    return corpus
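
Because interface() takes its argument list as a parameter (defaulting to sys.argv[1:]), the unflattener can also be driven directly from another script. A hedged usage sketch follows; the file names and the parse/tokenization identifiers are placeholders and must match what is actually present in the corpus.

# Hypothetical invocation; file names and parse/tokenization names are placeholders.
corpus = interface([
    "-i", "devel-events.xml",        # input gifxml corpus
    "-o", "devel-unflattened.xml",   # optional output file
    "-a", "split-McClosky",          # parse assumed to exist in the corpus
    "-t", "split-McClosky",          # tokenization assumed to exist in the corpus
    "-p",                            # only unflatten perfectly solvable events
])
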
Example #53
0
def makeSentences(input, output=None, removeText=False, postProcess=True, debug=False):
    """
    Run GENIA Sentence Splitter
    
    Divide text in the "text" attributes of document and section 
    elements into sentence elements. These sentence elements are
    inserted into their respective parent elements.
    """
    global sentenceSplitterDir
    
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    
    print >> sys.stderr, "Running GENIA Sentence Splitter", Settings.GENIA_SENTENCE_SPLITTER_DIR,
    if postProcess:
        print >> sys.stderr, "(Using post-processing)"
    else:
        print >> sys.stderr, "(No post-processing)"
    docCount = 0
    sentencesCreated = 0
    redivideCount = 0
    sourceElements = [x for x in corpusRoot.getiterator("document")] + [x for x in corpusRoot.getiterator("section")]
    counter = ProgressCounter(len(sourceElements), "GeniaSentenceSplitter")
    counter.showMilliseconds = True
    # Create working directory
    workdir = tempfile.mkdtemp()
    for document in sourceElements:
        counter.update(1, "Splitting Documents ("+document.get("id")+"): ")
        docId = document.get("id")
        if docId == None:
            docId = "CORPUS.d" + str(docCount)
        docTag = "-" + str(docCount)
        assert document.find("sentence") == None
        text = document.get("text")
        if text == None or text.strip() == "":
            continue
        #print type(text)
        # Write text to workfile
        #workdir = tempfile.mkdtemp()
        workfile = codecs.open(os.path.join(workdir, "sentence-splitter-input.txt"+docTag), "wt", "utf-8")
        # From http://themoritzfamily.com/python-encodings-and-unicode.html
        # "You have to be careful with the codecs module. Whatever you pass to it must be a Unicode 
        # object otherwise it will try to automatically decode the byte stream as ASCII"
        # However, the unicode errors here were simply due to STTools reading unicode ST-format as ASCII,
        # thus creating an ASCII interaction XML, which then triggered here the unicode error. So, at this
        # point we should be able to safely write(text), as the output file is unicode, and reading with
        # the correct codec is taken care of earlier in the pipeline.
        workfile.write(text) #.encode("utf-8"))
        workfile.close()
        # Run sentence splitter
        assert os.path.exists(Settings.GENIA_SENTENCE_SPLITTER_DIR + "/run_geniass.sh"), Settings.GENIA_SENTENCE_SPLITTER_DIR
        args = [Settings.GENIA_SENTENCE_SPLITTER_DIR + "/run_geniass.sh", os.path.join(workdir, "sentence-splitter-input.txt"+docTag), os.path.join(workdir, "sentence-splitter-output.txt"+docTag), Settings.RUBY_PATH]
        #p = subprocess.call(args)
        p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = p.communicate()
        if stdout != "":
            print >> sys.stderr, stdout
        if stderr != 'Extracting events.roading model file.\nstart classification.\n':
            print >> sys.stderr, stderr
        #print "stdout<", p.stdout.readlines(), ">"
        #print "stderr<", p.stderr.readlines(), ">"
        if postProcess:
            ppIn = codecs.open(os.path.join(workdir, "sentence-splitter-output.txt"+docTag), "rt", "utf-8")
            ppOut = codecs.open(os.path.join(workdir, "sentence-splitter-output-postprocessed.txt"+docTag), "wt", "utf-8")
            subprocess.call(os.path.join(Settings.GENIA_SENTENCE_SPLITTER_DIR, "geniass-postproc.pl"), stdin=ppIn, stdout=ppOut)
            ppIn.close()
            ppOut.close()
            # Read split sentences
            workfile = codecs.open(os.path.join(workdir, "sentence-splitter-output-postprocessed.txt"+docTag), "rt", "utf-8")
        else:
            workfile = codecs.open(os.path.join(workdir, "sentence-splitter-output.txt"+docTag), "rt", "utf-8")
        start = 0 # sentences are consecutively aligned to the text for charOffsets
        sentenceCount = 0
        #text = text.replace("\n", " ") # should stop sentence splitter from crashing.
        #text = text.replace("  ", " ") # should stop sentence splitter from crashing.
        #alignmentText = text.replace("\n", " ").replace("\r", " ")
        #docTokens = reWhiteSpace.split(text)
        docIndex = 0
        sentenceBeginIndex = -1
        prevSentence = None
        prevEndIndex = None
        emptySentenceCount = 0
        prevText = None
        for sText in workfile.readlines():
            sText = sText.strip() # The text of the sentence
            if sText == "":
                emptySentenceCount += 1
                continue

            for i in range(len(sText)):
                if sText[i].isspace():
                    assert sText[i] not in ["\n", "\r"]
                    continue
                while text[docIndex].isspace():
                    if text[docIndex] in ["\n", "\r"] and sentenceBeginIndex != -1:
                        redivideCount += 1
                        prevSentence = makeSentence(text, sentenceBeginIndex, docIndex-1, prevSentence, prevEndIndex)
                        prevSentence.set("id", docId + ".s" + str(sentenceCount))
                        prevSentence.set("redevided", "True")
                        sentencesCreated += 1
                        sentenceCount += 1
                        prevEndIndex = docIndex-1
                        sentenceBeginIndex = -1
                        document.append(prevSentence)
                    docIndex += 1
                assert sText[i] == text[docIndex], (text, sText, prevText, sText[i:i+10], text[docIndex:docIndex+10], (i, docIndex), sentenceBeginIndex) # tokens[i].isspace() == False
                if sentenceBeginIndex == -1:
                    sentenceBeginIndex = docIndex
                docIndex += 1
                prevText = sText
            if sentenceBeginIndex != -1:
                prevSentence = makeSentence(text, sentenceBeginIndex, docIndex-1, prevSentence, prevEndIndex)
                prevSentence.set("id", docId + ".s" + str(sentenceCount))
                prevEndIndex = docIndex-1
                sentenceBeginIndex = -1
                sentencesCreated += 1
                sentenceCount += 1
                document.append(prevSentence)
        # Add possible tail for last sentence
        if prevEndIndex < len(text) - 1 and prevSentence != None:
            assert prevSentence.get("tail") == None, prevSentence.get("tail")
            prevSentence.set("tail", text[prevEndIndex+1:])
            
        if emptySentenceCount > 0:
            print >> sys.stderr, "Warning,", emptySentenceCount, "empty sentences in", document.get("id") 
        # Remove original text
        if removeText:
            del document["text"]
        # Move elements from document element to sentences
        moveElements(document)
        docCount += 1
    
    print >> sys.stderr, "Sentence splitting created", sentencesCreated, "sentences"
    print >> sys.stderr, "Redivided", redivideCount, "sentences"
    
    if debug:
        print >> sys.stderr, "Work directory preserved for debugging at", workdir
    else:
        # Remove work directory
        shutil.rmtree(workdir)
        
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
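
A minimal usage sketch for makeSentences; the input and output paths are placeholders, and Settings.GENIA_SENTENCE_SPLITTER_DIR and Settings.RUBY_PATH are assumed to point at a working GENIA sentence splitter installation, as the function requires.

# Hypothetical call; "corpus-input.xml" and "corpus-sentences.xml" are placeholder paths.
corpusTree = makeSentences("corpus-input.xml",
                           output="corpus-sentences.xml",
                           removeText=False,   # keep the original document-level text
                           postProcess=True,   # run geniass-postproc.pl on the splitter output
                           debug=False)        # remove the temporary work directory when done
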
Example #54
0
EDGE_TEST_EXAMPLE_FILE="edge-test-examples-"+PARSE
EDGE_CLASS_NAMES="bioinfer-edge-ids.class_names"

EDGE_FEATURE_PARAMS="style:typed,directed,no_linear,entities,noMasking,maxFeatures,bioinfer_limits"

if True:
    ###############################################################################
    # Head token detection
    ###############################################################################
    
    # Find heads
    sys.path.append("..")
    import Core.SentenceGraph as SentenceGraph
    import cElementTreeUtils as ETUtils
    
    ETUtils.write(SentenceGraph.loadCorpus(BI_TEST_FILE, PARSE, TOK).rootElement, TEST_FILE)
    ETUtils.write(SentenceGraph.loadCorpus(BI_DEVEL_FILE, PARSE, TOK).rootElement, DEVEL_FILE)
    ETUtils.write(SentenceGraph.loadCorpus(BI_TRAIN_FILE, PARSE, TOK).rootElement, TRAIN_FILE)
    ETUtils.write(SentenceGraph.loadCorpus(BI_TRAIN_AND_DEVEL_FILE, PARSE, TOK).rootElement, TRAIN_AND_DEVEL_FILE)
    
    ###############################################################################
    # Trigger example generation
    ###############################################################################
    print >> sys.stderr, "Trigger examples for parse", PARSE
    Gazetteer.run(TRAIN_FILE, "gazetteer-train-"+TOK, TOK)
    Gazetteer.run(TRAIN_AND_DEVEL_FILE, "gazetteer-train-and-devel-"+TOK, TOK)
    # Generate example files
    GeneralEntityTypeRecognizerGztr.run(TRAIN_FILE, TRIGGER_TRAIN_EXAMPLE_FILE, PARSE, TOK, "style:typed", "bioinfer-trigger-ids", "gazetteer-train-"+TOK)
    GeneralEntityTypeRecognizerGztr.run(DEVEL_FILE, TRIGGER_DEVEL_EXAMPLE_FILE, PARSE, TOK, "style:typed", "bioinfer-trigger-ids", "gazetteer-train-"+TOK)
    GeneralEntityTypeRecognizerGztr.run(TRAIN_AND_DEVEL_FILE, TRIGGER_TRAIN_AND_DEVEL_EXAMPLE_FILE, PARSE, TOK, "style:typed", "bioinfer-trigger-ids", "gazetteer-train-and-devel-"+TOK)
    GeneralEntityTypeRecognizerGztr.run(TEST_FILE, TRIGGER_TEST_EXAMPLE_FILE, PARSE, TOK, "style:typed", "bioinfer-trigger-ids", "gazetteer-train-and-devel-"+TOK)