Example #1
def processCorpus(inputFilename, outputFilename, rules):
    print >> sys.stderr, "Deleting elements, rules =", rules
    print >> sys.stderr, "Loading corpus file", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()
    
    for eType in rules.keys():
        for attrRule in rules[eType].keys():
            rules[eType][attrRule] = rules[eType][attrRule].split("|")
    
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = defaultdict(int)
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, rules, countsByType)
    print >> sys.stderr, "Deleted elements"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ":", countsByType[k]
    
    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
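All of these examples follow the same load/transform/write pattern: read the corpus with ETUtils.ETFromObj (which accepts a file name, an ElementTree, an Element or an open stream), modify the tree in place, and call ETUtils.write only when an output target was given. A minimal, self-contained sketch of that pattern, using the standard library as a stand-in for ETUtils:

import sys
import xml.etree.ElementTree as ET

def process_corpus(input_filename, output_filename=None):
    # stand-in for ETUtils.ETFromObj(...) when the input is a plain file path
    tree = ET.parse(input_filename)
    root = tree.getroot()
    for document in root.findall("document"):
        # placeholder transformation; the real examples delete, merge or relabel elements
        document.set("processed", "True")
    if output_filename is not None:
        # stand-in for ETUtils.write(root, output_filename)
        sys.stderr.write("Writing output to %s\n" % output_filename)
        tree.write(output_filename, encoding="utf-8")
    return tree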
Example #2
def mergeAll(input, output=None, debug=False, iterate=False):
    if iterate:
        origItems = defaultdict(int)
        removedItems = defaultdict(int)
        for docSentences in SentenceElements.getCorpusIterator(input, output):
            entitiesByType, duplicatesRemovedByType = mergeDuplicateEntities(
                docSentences, debug)
            for key in entitiesByType:
                origItems[key] += entitiesByType[key]
            for key in duplicatesRemovedByType:
                removedItems[key] += duplicatesRemovedByType[key]
            interactionsByType, duplicatesRemovedByType = mergeDuplicateInteractions(
                docSentences, debug)
            for key in interactionsByType:
                origItems[key] += interactionsByType[key]
            for key in duplicatesRemovedByType:
                removedItems[key] += duplicatesRemovedByType[key]
        printStats(origItems, removedItems)
        return None
    else:
        corpusElements = CorpusElements.loadCorpus(
            input, removeIntersentenceInteractions=False)
        print >> sys.stderr, "Merging duplicate entities"
        entitiesByType, duplicatesRemovedByType = mergeDuplicateEntities(
            corpusElements.sentences, debug)
        printStats(entitiesByType, duplicatesRemovedByType)
        print >> sys.stderr, "Merging duplicate interactions"
        interactionsByType, duplicatesRemovedByType = mergeDuplicateInteractions(
            corpusElements.sentences, debug)
        printStats(interactionsByType, duplicatesRemovedByType)
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusElements.rootElement, output)
        return corpusElements
Example #3
 def classify(self, data, model, output, parse=None, task=None, goldData=None, workDir=None, fromStep=None, omitSteps=None, validate=False):
     model = self.openModel(model, "r")
     self.enterState(self.STATE_CLASSIFY)
     self.setWorkDir(workDir)
     if workDir == None:
         self.setTempWorkDir()
     model = self.openModel(model, "r")
     if parse == None: parse = self.getStr(self.tag+"parse", model)
     workOutputTag = os.path.join(self.workDir, os.path.basename(output) + "-")
     xml = self.classifyToXML(data, model, None, workOutputTag, 
         model.get(self.tag+"classifier-model", defaultIfNotExist=None), goldData, parse, float(model.getStr("recallAdjustParameter", defaultIfNotExist=1.0)))
     if (validate):
         self.structureAnalyzer.load(model)
         self.structureAnalyzer.validate(xml)
         ETUtils.write(xml, output+"-pred.xml.gz")
     else:
         shutil.copy2(workOutputTag+self.tag+"pred.xml.gz", output+"-pred.xml.gz")
     EvaluateInteractionXML.run(self.evaluator, xml, data, parse)
     stParams = self.getBioNLPSharedTaskParams(self.bioNLPSTParams, model)
     if stParams["convert"]: #self.useBioNLPSTFormat:
         extension = ".zip" if (stParams["convert"] == "zip") else ".tar.gz" 
         Utils.STFormat.ConvertXML.toSTFormat(xml, output+"-events" + extension, outputTag=stParams["a2Tag"], writeExtra=(stParams["scores"] == True))
         if stParams["evaluate"]: #self.stEvaluator != None:
             if task == None: 
                 task = self.getStr(self.tag+"task", model)
             self.stEvaluator.evaluate(output+"-events" + extension, task)
     self.deleteTempWorkDir()
     self.exitState()
Example #4
def catenateElements(inputs, output):
    print >> sys.stderr, "##### Catenate interaction XML as elements #####"
    input1, input2 = inputs # expect exactly two input corpora
    c1 = RecalculateIds.recalculateIds(input1, None, False, 0)
    numDocs = len(c1.getroot().findall("document"))
    print >> sys.stderr, "Documents in input 1:", numDocs
    c2 = RecalculateIds.recalculateIds(input2, None, False, numDocs)
    
    print >> sys.stderr, "Appending documents"
    c1Root = c1.getroot()
    for document in c2.getroot().findall("document"):
        c1Root.append(document)
    
    print >> sys.stderr, "Validating ids"
    ids = set()
    for element in c1Root.getiterator("entity"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    for element in c1Root.getiterator("interaction"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    for element in c1Root.getiterator("sentence"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    for element in c1Root.getiterator("document"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(c1Root, output)
    return c1
Example #5
def removeUnconnectedEntities(input, output=None):
    input = ETUtils.ETFromObj(input)
    root = input.getroot()
    removed = 0
    preserved = 0
    for document in root.findall("document"):
        sentMap = {} # allow for intersentence interactions
        for sentence in document.findall("sentence"):
            sentMap[sentence.get("id")] = sentence
        connected = set()
        for interaction in document.getiterator("interaction"):
            connected.add(interaction.get("e1"))
            connected.add(interaction.get("e2"))
        entities = []
        for entity in document.getiterator("entity"):
            entities.append(entity)
        for entity in entities:
            if entity.get("isName") == "True": # never remove named entities
                continue
            eId = entity.get("id")
            if eId not in connected:
                if eId.find(".s") != -1: # sentence level entity
                    sentMap[eId.rsplit(".", 1)[0]].remove(entity)
                else: # document level entity
                    document.remove(entity)
                removed += 1
            else:
                preserved += 1
    
    print >> sys.stderr, "Removed", removed, "entities, preserved", preserved, "entities"
    
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(root, output)
    return input
Example #6
def removeUnconnectedEntities(input, output=None):
    input = ETUtils.ETFromObj(input)
    root = input.getroot()
    removed = 0
    preserved = 0
    for document in root.findall("document"):
        sentMap = {}  # allow for intersentence interactions
        for sentence in document.findall("sentence"):
            sentMap[sentence.get("id")] = sentence
        connected = set()
        for interaction in document.getiterator("interaction"):
            connected.add(interaction.get("e1"))
            connected.add(interaction.get("e2"))
        entities = []
        for entity in document.getiterator("entity"):
            entities.append(entity)
        for entity in entities:
            if entity.get("given") == "True":  # never remove named entities
                continue
            eId = entity.get("id")
            if eId not in connected:
                if eId.find(".s") != -1:  # sentence level entity
                    sentMap[eId.rsplit(".", 1)[0]].remove(entity)
                else:  # document level entity
                    document.remove(entity)
                removed += 1
            else:
                preserved += 1

    print >> sys.stderr, "Removed", removed, "entities, preserved", preserved, "entities"

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(root, output)
    return input
Example #7
def processCorpus(inPath, outPath, sourceSet, newSets, seed=1):
    print >> sys.stderr, "Loading corpus file", inPath
    corpusTree = ETUtils.ETFromObj(inPath)
    corpusRoot = corpusTree.getroot()

    rand = random.Random(seed)
    documents = corpusRoot.findall("document")
    counts = {"old": defaultdict(int), "new": defaultdict(int)}
    for document in documents:
        counts["old"][document.get("set")] += 1
        if sourceSet != None and document.get("set") != sourceSet:
            counts["new"][document.get("set")] += 1
            continue
        value = rand.random()
        document.set("setValue", str(value))
        document.set("origSet", document.get("set", ""))
        for setName, cutoff in newSets:
            if value <= cutoff:
                document.set("set", setName)
                break
        counts["new"][document.get("set")] += 1
    #for key in counts:
    #    counts[key] = dict(counts[key])
    print "MakeSets result:", "old=" + str(dict(
        counts["old"])) + ", new=" + str(dict(counts["new"]))
    if outPath != None:
        ETUtils.write(corpusRoot, outPath)
    return corpusTree
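The loop above assigns each document to the first entry of newSets whose cumulative cutoff is not exceeded by a fresh random value; newSets is a list of (setName, cutoff) pairs such as [("train", 0.7), ("devel", 1.0)], the form used by the SemEval conversion further down. A self-contained sketch of just that assignment step, with illustrative names not taken from the listing:

import random

def assign_sets(n_docs, new_sets, seed=1):
    # new_sets: list of (setName, cumulativeCutoff) pairs, e.g. [("train", 0.7), ("devel", 1.0)]
    rng = random.Random(seed)
    assignments = []
    for _ in range(n_docs):
        value = rng.random()
        for set_name, cutoff in new_sets:
            if value <= cutoff:
                assignments.append(set_name)
                break
    return assignments

# roughly 70% "train" and 30% "devel" for the cutoffs above
print(assign_sets(20, [("train", 0.7), ("devel", 1.0)], seed=1))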
Example #8
 def run(cls,inFile,multiplier=1.0,outFile=None,targetLabel="neg", binary=False):
     """inFile can be a string with file name (.xml or .xml.gz) or an ElementTree or an Element or an open input stream
     multiplier adjusts the level of boosting the non-negative predictions, it is a real number (0,inf)
     multiplier 1.0 does nothing, <1.0 decreases negative class confidence, >1.0 increases negative class confidence
     the root of the modified tree is returned and, if outFile is a string, written out to outFile as well"""
     print >> sys.stderr, "##### Recall adjust with multiplier " + str(multiplier)[:5] + " #####"
     tree=ETUtils.ETFromObj(inFile)
     if not ET.iselement(tree):
         assert isinstance(tree,ET.ElementTree)
         root=tree.getroot()
     else:
         root = tree
     
     if multiplier != -1:
         if binary:
             print >> sys.stderr, "Recall binary mode"
             classRanges = getClassRanges(root.getiterator("entity"))
             assert len(classRanges.keys()) in [0,2]
             if len(classRanges.keys()) == 0:
                 print >> sys.stderr, "Warning, recall adjustment skipped because no prediction weights found"
         else:
             print >> sys.stderr, "Recall multiclass mode"
             classRanges = None
         for entityNode in root.getiterator("entity"):
             adjustEntity(entityNode,targetLabel,multiplier,classRanges)
     if outFile:
         ETUtils.write(root,outFile)
     return tree
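The docstring above states what the multiplier does, but the per-entity update happens in adjustEntity, which is not shown here. A self-contained sketch of the multiclass idea, assuming each entity carries a mapping of class names to confidence scores (that attribute layout is an assumption for illustration, not taken from the listing):

def adjust_negative_confidence(predictions, multiplier, target_label="neg"):
    # multiplier 1.0 is a no-op; < 1.0 lowers the negative-class confidence so that
    # non-negative classes win more often, > 1.0 raises it (per the docstring above)
    adjusted = dict(predictions)
    if target_label in adjusted:
        adjusted[target_label] *= multiplier
    return adjusted

print(adjust_negative_confidence({"neg": 1.2, "Protein": 1.0}, 0.7))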
Example #9
def makeSubset(input, output=None, ratio=1.0, seed=0):
    if ratio == 1.0:
        if output != None:
            shutil.copy2(input, output)
            return output
        else:
            return input
    totalFolds = 100
    selectedFolds = int(ratio * 100.0)
    print >>sys.stderr, "====== Making subset ======"
    print >>sys.stderr, "Subset for ", input, "ratio", ratio, "seed", seed
    xml = ETUtils.ETFromObj(input).getroot()
    count = 0
    sentCount = 0
    for document in xml.findall("document"):
        sentCount += len(document.findall("sentence"))
        count += 1
    division = Core.Split.getFolds(count, totalFolds, seed)
    # print division, selectedFolds - 1
    index = 0
    removeCount = 0
    sentRemoveCount = 0
    for document in xml.findall("document"):
        if division[index] > selectedFolds - 1:
            xml.remove(document)
            sentRemoveCount += len(document.findall("sentence"))
            removeCount += 1
        index += 1
    print >>sys.stderr, "Subset", "doc:", count, "removed:", removeCount, "sent:", sentCount, "sentremoved:", sentRemoveCount
    xml.set("subsetRatio", str(ratio))
    xml.set("subsetSeed", str(seed))
    if output != None:
        ETUtils.write(xml, output)
    return output
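makeSubset keeps the first int(ratio * 100) of the 100 folds that Core.Split.getFolds distributes the documents over. A self-contained sketch of that selection, assuming getFolds(n, k, seed) returns one fold index in range(k) per document (the exact scheme used by Core.Split is not shown above):

import random

def get_folds(n_items, n_folds, seed):
    # assumption: one pseudo-random fold index per item, seeded for reproducibility
    rng = random.Random(seed)
    return [rng.randrange(n_folds) for _ in range(n_items)]

def subset_indices(n_items, ratio, seed=0, total_folds=100):
    selected_folds = int(ratio * total_folds)
    division = get_folds(n_items, total_folds, seed)
    # keep a document when its fold index falls below the cutoff, mirroring the
    # division[index] > selectedFolds - 1 removal test above
    return [i for i in range(n_items) if division[i] <= selected_folds - 1]

print(subset_indices(10, 0.3, seed=0))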
Example #10
def processCorpus(inPath, outPath, sourceSet, newSets, seed=1):
    print >> sys.stderr, "Loading corpus file", inPath
    corpusTree = ETUtils.ETFromObj(inPath)
    corpusRoot = corpusTree.getroot()
    
    rand = random.Random(seed)
    documents = corpusRoot.findall("document")
    counts = {"old":defaultdict(int), "new":defaultdict(int)}
    for document in documents:
        counts["old"][document.get("set")] += 1
        if sourceSet != None and document.get("set") != sourceSet:
            counts["new"][document.get("set")] += 1
            continue
        value = rand.random()
        document.set("setValue", str(value))
        document.set("origSet", document.get("set", ""))
        for setName, cutoff in newSets:
            if value <= cutoff:
                document.set("set", setName)
                break
        counts["new"][document.get("set")] += 1
    #for key in counts:
    #    counts[key] = dict(counts[key])
    print "MakeSets result:", "old=" + str(dict(counts["old"])) + ", new=" + str(dict(counts["new"]))
    if outPath != None:
        ETUtils.write(corpusRoot, outPath)
    return corpusTree
Example #11
def fixAltOffsets(input, output=None):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    
    docCount = 0
    sentencesCreated = 0
    sentences = [x for x in corpusRoot.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "FixAltOffsets")
    fixCount = 0
    # fix spans
    for sentence in sentences:
        counter.update(1, "Fixing AltOffsets for sentence ("+sentence.get("id")+"): ")
        sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        for entity in sentence.findall("entity"):
            altOffsetString = entity.get("altOffset")
            if altOffsetString == None:
                continue
            #print altOffsetString
            altOffsets = Range.charOffsetToTuples(altOffsetString)
            assert len(altOffsets) == 1
            for i in range(len(altOffsets)):
                altOffset = altOffsets[i] 
                altOffsets[i] = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])
            entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))
            fixCount += 1
        
    print >> sys.stderr, "Fixed", fixCount, "altOffsets"
        
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Example #12
def catenateElements(inputs, output):
    print >> sys.stderr, "##### Catenate interaction XML as elements #####"
    input1, input2 = inputs # expect exactly two input corpora
    c1 = RecalculateIds.recalculateIds(input1, None, False, 0)
    numDocs = len(c1.getroot().findall("document"))
    print >> sys.stderr, "Documents in input 1:", numDocs
    c2 = RecalculateIds.recalculateIds(input2, None, False, numDocs)

    print >> sys.stderr, "Appending documents"
    c1Root = c1.getroot()
    for document in c2.getroot().findall("document"):
        c1Root.append(document)

    print >> sys.stderr, "Validating ids"
    ids = set()
    for element in c1Root.getiterator("entity"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    for element in c1Root.getiterator("interaction"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    for element in c1Root.getiterator("sentence"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    for element in c1Root.getiterator("document"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(c1Root, output)
    return c1
Example #13
def makeSubset(input, output=None, ratio=1.0, seed=0):
    if ratio == 1.0:
        if output != None:
            shutil.copy2(input, output)
            return output
        else:
            return input
    totalFolds = 100
    selectedFolds = int(ratio * 100.0)
    print >> sys.stderr, "====== Making subset ======"
    print >> sys.stderr, "Subset for ", input, "ratio", ratio, "seed", seed
    xml = ETUtils.ETFromObj(input).getroot()
    count = 0
    sentCount = 0
    for document in xml.findall("document"):
        sentCount += len(document.findall("sentence"))
        count += 1
    division = Core.Split.getFolds(count, totalFolds, seed)
    #print division, selectedFolds - 1
    index = 0
    removeCount = 0
    sentRemoveCount = 0
    for document in xml.findall("document"):
        if division[index] > selectedFolds - 1:
            xml.remove(document)
            sentRemoveCount += len(document.findall("sentence"))
            removeCount += 1
        index += 1
    print >> sys.stderr, "Subset", "doc:", count, "removed:", removeCount, "sent:", sentCount, "sentremoved:", sentRemoveCount
    xml.set("subsetRatio", str(ratio))
    xml.set("subsetSeed", str(seed))
    if output != None:
        ETUtils.write(xml, output)
    return output
Example #14
 def run(cls,inFile,multiplier=1.0,outFile=None,targetLabel="neg", binary=False):
     """inFile can be a string with file name (.xml or .xml.gz) or an ElementTree or an Element or an open input stream
     multiplier adjusts the level of boosting the non-negative predictions, it is a real number (0,inf)
     multiplier 1.0 does nothing, <1.0 decreases negative class confidence, >1.0 increases negative class confidence
     the root of the modified tree is returned and, if outFile is a string, written out to outFile as well"""
     print >> sys.stderr, "##### Recall adjust with multiplier " + str(multiplier)[:5] + " #####"
     tree=ETUtils.ETFromObj(inFile)
     if not ET.iselement(tree):
         assert isinstance(tree,ET.ElementTree)
         root=tree.getroot()
     else:
         root = tree
     
     if multiplier != -1:
         if binary:
             print >> sys.stderr, "Recall binary mode"
             classRanges = getClassRanges(root.getiterator("entity"))
             assert len(classRanges.keys()) in [0,2]
             if len(classRanges.keys()) == 0:
                 print >> sys.stderr, "Warning, recall adjustment skipped because no prediction weights found"
         else:
             print >> sys.stderr, "Recall multiclass mode"
             classRanges = None
         for entityNode in root.getiterator("entity"):
             adjustEntity(entityNode,targetLabel,multiplier,classRanges)
     if outFile:
         ETUtils.write(root,outFile)
     return tree
Example #15
 def parse(self,
           input,
           output=None,
           tokenizationName=None,
           parseName="McCC",
           requireEntities=False,
           skipIds=[],
           skipParsed=True,
           timeout=600,
           makePhraseElements=True,
           debug=False,
           pathParser=None,
           pathBioModel="AUTO",
           addTimeStamp=True):
     print >> sys.stderr, "BLLIP parser"
     corpusTree, corpusRoot = self.getCorpus(input)
     workdir = tempfile.mkdtemp()
     infileName, numCorpusSentences = self.makeInputFile(
         workdir, corpusRoot, requireEntities, skipIds, skipParsed,
         tokenizationName, debug)
     bllipOutput = self.runProcess(infileName, workdir, pathParser,
                                   pathBioModel, tokenizationName, timeout)
     self.insertPennTrees(bllipOutput, corpusRoot, parseName,
                          requireEntities, skipIds, skipParsed)
     if output != None:
         print >> sys.stderr, "Writing output to", output
         ETUtils.write(corpusRoot, output)
     # Remove work directory
     if not debug:
         shutil.rmtree(workdir)
     else:
         print >> sys.stderr, "Parser IO files at", workdir
     return corpusTree
Example #16
def processCorpus(inputFilename, outputFilename, rules):
    print >> sys.stderr, "Deleting elements, rules =", rules
    print >> sys.stderr, "Loading corpus file", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()

    for eType in rules.keys():
        for attrRule in rules[eType].keys():
            if type(rules[eType][attrRule]) in types.StringTypes:
                rules[eType][attrRule] = rules[eType][attrRule].split("|")

    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = defaultdict(int)
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, rules, countsByType)
    print >> sys.stderr, "Deleted elements"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ":", countsByType[k]

    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
Example #17
def mergeAll(input, output=None, debug=False, iterate=False):
    if iterate:
        origItems = defaultdict(int)
        removedItems = defaultdict(int)
        for docSentences in SentenceElements.getCorpusIterator(input, output):
            entitiesByType, duplicatesRemovedByType = mergeDuplicateEntities(docSentences, debug)
            for key in entitiesByType: origItems[key] += entitiesByType[key]
            for key in duplicatesRemovedByType: removedItems[key] += duplicatesRemovedByType[key]
            interactionsByType, duplicatesRemovedByType = mergeDuplicateInteractions(docSentences, debug)
            for key in interactionsByType: origItems[key] += interactionsByType[key]
            for key in duplicatesRemovedByType: removedItems[key] += duplicatesRemovedByType[key]
        printStats(origItems, removedItems)
        return None
    else:
        corpusElements = CorpusElements.loadCorpus(input, removeIntersentenceInteractions=False)
        print >> sys.stderr, "Merging duplicate entities"
        entitiesByType, duplicatesRemovedByType = mergeDuplicateEntities(corpusElements.sentences, debug)
        printStats(entitiesByType, duplicatesRemovedByType)
        print >> sys.stderr, "Merging duplicate interactions"
        interactionsByType, duplicatesRemovedByType = mergeDuplicateInteractions(corpusElements.sentences, debug)
        printStats(interactionsByType, duplicatesRemovedByType)
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusElements.rootElement, output)
        return corpusElements
Example #18
 def parse(self, parserName, input, output=None, debug=False, reparse=False, stanfordParserDir=None, stanfordParserArgs=None, action="convert", outputFormat=None, memory=None):
     #global stanfordParserDir, stanfordParserArgs
     assert action in ("convert", "penn", "dep")
     if stanfordParserDir == None:
         stanfordParserDir = Settings.STANFORD_PARSER_DIR
     # Run the parser process
     corpusTree, corpusRoot = self.getCorpus(input)
     workdir = tempfile.mkdtemp()
     inPath = self.makeInputFile(corpusRoot, workdir, parserName, reparse, action, debug)
     outPath = self.runProcess(stanfordParserArgs, stanfordParserDir, inPath, workdir, action, outputFormat, memory)
     self.printStderr(outPath)
     # Insert the parses    
     if action in ("convert", "dep"):
         #self.insertDependencyParses(outPath, corpusRoot, parserName, {"stanford-mode":action}, addTimeStamp=True, skipExtra=0, removeExisting=True)
         self.insertStanfordDependencyParses(outPath, corpusRoot, parserName, skipParsed=reparse, removeExisting=reparse)
     elif action == "penn":
         self.insertPennTrees(outPath, corpusRoot, parserName)
     # Remove work directory
     if not debug:
         shutil.rmtree(workdir)
     else:
         print >> sys.stderr, "Parser IO files at", workdir
     # Write the output XML file
     if output != None:
         print >> sys.stderr, "Writing output to", output
         ETUtils.write(corpusRoot, output)
     return corpusTree
Example #19
def process(input, output=None):
    download("/tmp/extract", "/tmp/download")
    specAnn = readResources("/tmp/extract")
    insertElements(input.getroot(), specAnn)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(input.getroot(), output)
    return input
Example #20
def test(extractPath, downloadPath, inCorpusPath, outCorpusPath):
    download(extractPath, downloadPath)
    specAnn = readResources(extractPath)
    inCorpus = ETUtils.ETFromObj(inCorpusPath)
    insertElements(inCorpus.getroot(), specAnn)
    ETUtils.write(inCorpus.getroot(), outCorpusPath)

#process("/tmp/extract", "/tmp/download", "/home/jari/Dropbox/data/BioNLP16/corpora/BB_EVENT_16-devel.xml", "/tmp/ner.xml")
Example #21
def process(input, output=None):
    download("/tmp/extract", "/tmp/download")
    specAnn = readResources("/tmp/extract")
    insertElements(input.getroot(), specAnn)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(input.getroot(), output)
    return input
Example #22
 def insertParses(self, parseDir, input, output=None, parseName="McCC", extensions=None, subDirs=None, debug=False, skipParsed=False, docMatchKeys=None, conllFormat=None, splitting=True, unescapeFormats="AUTO", tokenMerging=True, extMap=None, sdFailedFormat="empty", origIdType=None, posTags=None):
     corpusTree, corpusRoot = self.getCorpus(input)
     if not os.path.exists(parseDir):
         raise Exception("Cannot find parse input '" + str(parseDir) + "'")
     if not os.path.isdir(parseDir):
         raise Exception("Parse input '" + str(parseDir) + "' is not a directory")
     if extensions == None:
         extensions = self.allExt
     elif isinstance(extensions, basestring):
         extensions = extensions.split(",")
     extensions = [x for x in extensions if x in self.allExt]
     unescapeFormats = self.getUnescapeFormats(unescapeFormats)
     if docMatchKeys == None:
         docMatchKeys = ["origId", "pmid", "id"]
     elif isinstance(docMatchKeys, basestring):
         docMatchKeys = docMatchKeys.split(",")
     print >> sys.stderr, "Inserting parses from file types:", extensions
     counts = defaultdict(int)
     files = self.getParseFiles(parseDir, extensions, subDirs, counts, extMap=extMap, origIdType=origIdType)
     typeCounts = {x:defaultdict(int) for x in extensions}
     # Make document elements if needed
     documents = [x for x in corpusRoot.findall("document")]
     if len(documents) == 0:
         typeCounts["document-generation"] = defaultdict(int)
         documents = self.prepareDocuments(corpusRoot, files)
     counter = ProgressCounter(len(files), "Parse Insertion")
     # Insert parses and make sentence elements if needed
     typeCounts["sentence-splitting"] = defaultdict(int)
     print >> sys.stderr, "Inserting parses for", len(files), "out of total", len(documents), "documents"
     for document in documents:
         counts["document"] += 1
         matchFound = False
         for docMatchValue in [document.get(x) for x in docMatchKeys if document.get(x) != None]:
             if docMatchValue in files:
                 if matchFound:
                     raise Exception("Multiple matching parses for document " + str(document.attrib) + " using keys " + str(docMatchKeys))
                 matchFound = True
                 counter.update(1, "Inserting parses for (" + document.get("id") + "/" + str(docMatchValue) + "): ")
                 counts["document-match"] += 1
                 for ext in extensions:
                     if ext not in files[docMatchValue]:
                         continue
                     counts[ext + "-match"] += 1
                     sentences = [x for x in self.getSentences(document, skipParsed=skipParsed)]
                     self.insertParse(document, sentences, ext, files[docMatchValue][ext], parseName, splitting, typeCounts, conllFormat, unescapeFormats=unescapeFormats, tokenMerging=tokenMerging, sdFailedFormat=sdFailedFormat, posTags=posTags)
         if not matchFound:
             counts["document-no-match"] += 1
     if len(typeCounts["sentence-splitting"]) > 0:
         print >> sys.stderr, "Sentence Splitting Counts", dict(typeCounts["sentence-splitting"])
     print >> sys.stderr, "Counts", dict(counts)
     for ext in extensions:
         if len(typeCounts[ext]) > 0:
             print >> sys.stderr, "Counts for type '" + ext + "':", dict(typeCounts[ext])
     # Write the output XML file
     if output != None:
         print >> sys.stderr, "Writing output to", output
         ETUtils.write(corpusRoot, output)
     return corpusTree
Example #23
def processCorpus(input, outDir, stem=None, tail=".xml", mergedSets=[], saveCombined=False, verbose=False):
    newCorpora = {}
    print >> sys.stderr, "Loading corpus file", input
    corpusRoot = ETUtils.ETFromObj(input).getroot()
    
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {}
    for document in documents:
        counter.update()
        docSet = document.get("set")
        if docSet == None:
            if verbose: print >> sys.stderr, "Warning, no set defined for document", document.get("id")
            if not countsByType.has_key("No set"):
                countsByType["No set"] = 0
            countsByType["No set"] += 1
            continue
        elif not newCorpora.has_key(docSet):
            newCorpora[docSet] = ET.Element("corpus")
            for k, v in corpusRoot.attrib.iteritems():
                newCorpora[docSet].set(k, v)
            countsByType[docSet] = 0
        newCorpora[docSet].append(document)
        countsByType[docSet] += 1
        
    # Make merged sets
    for mergedSet in mergedSets:
        tag = "-and-".join(sorted(mergedSet))
        if not newCorpora.has_key(tag):
            newCorpora[tag] = ET.Element("corpus")
            for k, v in corpusRoot.attrib.iteritems():
                newCorpora[tag].set(k, v)
            countsByType[tag] = 0    
        for componentSet in mergedSet:
            for element in newCorpora[componentSet].findall("document"):
                newCorpora[tag].append(element)
                countsByType[tag] += 1
        
    print >> sys.stderr, "Documents per set"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + str(k) + ":", countsByType[k]
    
    if stem == None:
        outDir, stem = os.path.dirname(outDir), os.path.basename(outDir)
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    
    print >> sys.stderr, "Writing output files to directory", outDir
    if saveCombined:
        print >> sys.stderr, "Saving combined input to", stem + tail
        ETUtils.write(corpusRoot, stem + tail)
    else:
        print >> sys.stderr, "Combined input not saved"
    for docSet in sorted(newCorpora.keys()):
        outFilename = os.path.join(outDir, stem + "-" + docSet + tail)
        print >> sys.stderr, "Writing set", docSet, "to", outFilename
        ETUtils.write(newCorpora[docSet], outFilename)
Example #24
def addMTMX(input, mtmxDir, output=None):
    from collections import defaultdict
    # read interaction XML
    print "Reading interaction XML"
    counts = defaultdict(int)
    xml = ETUtils.ETFromObj(input).getroot()
    docById = {}
    for document in xml.getiterator("document"):
        docId = document.get("origId")
        assert docId not in docById
        docById[docId] = document
        counts["document"] += 1
    for entity in xml.getiterator("entity"):
        counts["entity"] += 1
    
    # read MTMX files
    print "Processing MTMX"
    for filename in sorted(os.listdir(mtmxDir)):
        if filename.endswith(".xml"):
            print >> sys.stderr, filename,
            fileId = filename.split("_")[0]
            if fileId not in docById:
                print >> sys.stderr, "skipped"
                continue
            else:
                print >> sys.stderr, "processing"
            doc = docById[fileId]
            entityByOrigId = {}
            for entity in doc.getiterator("entity"):
                assert entity.get("origId") not in entityByOrigId, entity.get("origId")
                entityByOrigId[entity.get("origId")] = entity
            mtmx = ETUtils.ETFromObj(os.path.join(mtmxDir, filename)).getroot()
            for phrase in mtmx.getiterator("PHRASE"):
                if phrase.get("ID") in entityByOrigId:
                    entity = entityByOrigId[phrase.get("ID")]
                    mapCount = 0
                    for map in phrase.getiterator("MAP"):
                        if (map.get("NAME").lower() == entity.get("text").lower()) or (map.get("NAME_SHORT").lower() == entity.get("text").lower()):
                            if entity.get("mtmxProb") != None:
                                if int(entity.get("mtmxProb")) > int(map.get("PROB")):
                                    break
                                else:
                                    counts["mapped-multi"] += 1
                                    counts["mapped-multi-"+str(mapCount)] += 1
                                    #print filename, phrase.get("ID")
                            else:
                                counts["mapped-at-least-once"] += 1
                            entity.set("mtmxProb", str(map.get("PROB")))
                            entity.set("mtmxCui", str(map.get("CUI")))
                            entity.set("mtmxName", str(map.get("NAME")))
                            entity.set("mtmxNameShort", str(map.get("NAME_SHORT")))
                            entity.set("mtmxSemTypes", str(map.get("SEMTYPES")))
                            counts["mappings"] += 1
                            mapCount += 1
    print >> sys.stderr, counts
    if output != None:
        ETUtils.write(xml, output)
Example #25
def test(extractPath, downloadPath, inCorpusPath, outCorpusPath):
    download(extractPath, downloadPath)
    specAnn = readResources(extractPath)
    inCorpus = ETUtils.ETFromObj(inCorpusPath)
    insertElements(inCorpus.getroot(), specAnn)
    ETUtils.write(inCorpus.getroot(), outCorpusPath)


#process("/tmp/extract", "/tmp/download", "/home/jari/Dropbox/data/BioNLP16/corpora/BB_EVENT_16-devel.xml", "/tmp/ner.xml")
Example #26
def convertDDI13(outDir, downloadDir=None, datasets=["DDI13_TRAIN", "DDI13_TEST_TASK_9.1", "DDI13_TEST_TASK_9.2"], redownload=False, insertParses=True, parse=False, makeIntermediateFiles=True, debug=False):
    cwd = os.getcwd()
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI13-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'13 corpus", "======================="
    
    tempdir = tempfile.mkdtemp()
    downloaded = downloadFiles(downloadDir, tempdir, redownload)
    
    for dataset in datasets:       
        corpusTree = getCorpusXML()
        xml = corpusTree.getroot()
        print >> sys.stderr, "Merging input XMLs"
        assert downloaded[dataset] != None
        combineXML(xml, "train", downloaded[dataset], subDirs=["DrugBank", "MedLine", "NER"])
        print >> sys.stderr, "Processing elements"
        processElements(xml)
        
        if dataset == "DDI13_TRAIN":
            print >> sys.stderr, "Dividing training set into folds"
            divideSets(xml, "train", 10)
        else:
            for doc in xml.getiterator("document"):
                doc.set("set", "test")

        if parse:
            print >> sys.stderr, "Parsing"
            parseXML(corpusTree, os.path.join(tempdir, "parsing"), debug)
        elif insertParses:
            assert parse == False
            print >> sys.stderr, "Inserting McCC parses"
            Tools.BLLIPParser.insertParses(corpusTree, downloaded[dataset + "_TEES_PARSES"], None, extraAttributes={"source":"TEES"})
            print >> sys.stderr, "Inserting Stanford conversions"
            Tools.StanfordParser.insertParses(corpusTree, downloaded[dataset + "_TEES_PARSES"], None, extraAttributes={"stanfordSource":"TEES"})
        # Check what was produced by the conversion
        print >> sys.stderr, "---------------", "Corpus Structure Analysis", "---------------"
        analyzer = StructureAnalyzer()
        analyzer.analyze([xml])
        print >> sys.stderr, analyzer.toString()
        if "9.1" in dataset:
            outFileName = os.path.join(outDir, "DDI13-test-task9.1.xml")
        elif "9.2" in dataset:
            outFileName = os.path.join(outDir, "DDI13-test-task9.2.xml")
        else:
            outFileName = os.path.join(outDir, "DDI13-train.xml")
        print >> sys.stderr, "Writing output to", outFileName
        ETUtils.write(xml, outFileName)
    
    Stream.closeLog(logFileName)
    if not debug and tempdir != None:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)
Example #27
def findHeads(input, parse, tokenization=None, output=None, removeExisting=True, iterate=False):
    if iterate:
        from Utils.ProgressCounter import ProgressCounter
        import InteractionXML.SentenceElements as SentenceElements
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        print >> sys.stderr, "Removing existing head offsets"
        removeCount = 0
        counter = ProgressCounter(None, "Find heads")
        counter.showMilliseconds = True
        for sentences in SentenceElements.getCorpusIterator(input, output, parse, tokenization):
            for sentence in sentences:
                if removeExisting:
                    for e in sentence.sentence.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
                graph = SentenceGraph.SentenceGraph(sentence.sentence, sentence.tokens, sentence.dependencies)
                graph.mapInteractions(sentence.entities, sentence.interactions)
                # Make sure every parse gets head scores
                #if graph.tokenHeadScores == None:
                #    graph.getTokenHeadScores()
            counter.update(len(sentences), "Finding heads ("+sentences[-1].sentence.get("id")+"): ")                
        print >> sys.stderr, "Removed head offsets from", removeCount, "entities"    
    else:
        xml = ETUtils.ETFromObj(input)
        if removeExisting:
            print >> sys.stderr, "Removing existing head offsets"
            removeCount = 0
            xml = ETUtils.ETFromObj(input)
            for d in xml.getroot().findall("document"):
                for s in d.findall("sentence"):
                    for e in s.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
            print >> sys.stderr, "Removed head offsets from", removeCount, "entities"
        
        # SentenceGraph automatically calculates head offsets and adds them to entities if they are missing
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        corpusElements = SentenceGraph.loadCorpus(xml, parse, tokenization)
        
        # Make sure every parse gets head scores
        for sentence in corpusElements.sentences:
            if sentence.sentenceGraph == None:
                continue
            if sentence.sentenceGraph.tokenHeadScores == None:
                sentence.sentenceGraph.getTokenHeadScores()
        
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusElements.rootElement, output)
        return xml
Example #28
def validateCorpus(input, output, strict=True):
    print >> sys.stderr, "Validating XML"
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()
    
    counts = validate(corpusRoot, strict)
    print >> sys.stderr, "Corpus validated:", dict(counts)
    
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Example #29
def makeConfigXML(workdir, bannerDir, oldVersion=True):
    conf = ET.Element("banner-configuration")
    banner = ET.SubElement(conf, "banner")
    eval = ET.SubElement(banner, "eval")
    datasetName = ET.SubElement(eval, "datasetName").text = "banner.eval.dataset.BC2GMDataset"
    # Dataset
    dataset = ET.SubElement(eval, "dataset")
    ET.SubElement(dataset, "sentenceFilename").text = workdir + "/input.txt"
    ET.SubElement(dataset, "mentionTestFilename").text = workdir + "/empty.eval"
    ET.SubElement(dataset, "mentionAlternateFilename").text = workdir + "/empty.eval"
    codecs.open(os.path.join(workdir, "empty.eval"), "wt", "utf-8").close()
    # More eval level stuff
    ET.SubElement(eval, "idInputFilename").text = workdir + "/ids.txt"
    ET.SubElement(eval, "rawInputFilename").text = workdir + "/raw.txt"
    ET.SubElement(eval, "trainingInputFilename").text = workdir + "/training.txt"
    ET.SubElement(eval, "outputFilename").text = workdir + "/output.txt"
    codecs.open(os.path.join(workdir, "output.txt"), "wt", "utf-8").close()
    ET.SubElement(eval, "inContextAnalysisFilename").text = workdir + "/contextAnalysis.html"
    ET.SubElement(eval, "mentionFilename").text = workdir + "/mention.txt"
    ET.SubElement(eval, "modelFilename").text = bannerDir + "/output/model_BC2GM.bin"
    ET.SubElement(eval, "lemmatiserDataDirectory").text = bannerDir + "/nlpdata/lemmatiser"
    ET.SubElement(eval, "posTaggerDataDirectory").text = bannerDir + "/nlpdata/tagger"
    ET.SubElement(eval, "posTagger").text = "dragon.nlp.tool.HeppleTagger"
    ET.SubElement(eval, "tokenizer").text = "banner.tokenization.SimpleTokenizer"
    ET.SubElement(eval, "useParenthesisPostProcessing").text = "true"
    ET.SubElement(eval, "useLocalAbbreviationPostProcessing").text = "true"
    ET.SubElement(eval, "useNumericNormalization").text = "true"
    ET.SubElement(eval, "tagFormat").text = "IOB"
    ET.SubElement(eval, "crfOrder").text = "2"
    if not oldVersion:
        ET.SubElement(eval, "mentionTypes").text = "Required"
        ET.SubElement(eval, "sameTypeOverlapOption").text = "Exception"
        ET.SubElement(eval, "differentTypeOverlapOption").text = "Exception"
    ET.SubElement(eval, "dictionaryTagger").text = "banner.tagging.dictionary.DictionaryTagger"
    # End eval element
    tagging = ET.SubElement(banner, "tagging") 
    dictionary = ET.SubElement(tagging, "dictionary")
    dictionaryTagger = ET.SubElement(dictionary, "DictionaryTagger")
    ET.SubElement(dictionaryTagger, "filterContainedMentions").text = "true"
    ET.SubElement(dictionaryTagger, "normalizeMixedCase").text = "false"
    ET.SubElement(dictionaryTagger, "normalizeDigits").text = "false"
    ET.SubElement(dictionaryTagger, "canonize").text = "false"
    ET.SubElement(dictionaryTagger, "generate2PartVariations").text = "true"
    ET.SubElement(dictionaryTagger, "dropEndParentheticals").text = "false"
    ET.SubElement(dictionaryTagger, "dictionaryFile").text = bannerDir + "/dict/single.txt"
    ET.SubElement(dictionaryTagger, "dictionaryType").text = "GENE"
    # Write to file
    filename = workdir + "/banner_config.xml"
    ETUtils.write(conf, filename)
    return filename
Example #30
def negateEvents(input, output=None, verbose=False):
    if not (ET.iselement(input) and input.tag == "sentence"):
        print >> sys.stderr, "Loading corpus file", input
        corpusTree = ETUtils.ETFromObj(input)
        corpusRoot = corpusTree.getroot()

    if not (ET.iselement(input) and input.tag == "sentence"):
        sentences = corpusRoot.getiterator("sentence")
    else:
        sentences = [input]
    counts = defaultdict(int)
    for sentence in sentences:
        for entity in sentence.findall("entity"):
            counts["all-entities"] += 1
            eType = entity.get("type")
            if not isNegatableEPITrigger(eType):
                counts["out-of-scope"] += 1
                continue
            eBaseType = getEPIBaseType(eType)
            eText = entity.get("text").lower()
            eNewType = determineNewType(eType, eText)

            # Insert changed charOffset
            counts["entities"] += 1
            if verbose:
                print "Entity", entity.get("id"), [entity.get("text")], [
                    eType, eBaseType, eNewType
                ],
            if eNewType != eBaseType:
                counts["negated"] += 1
                if verbose: print "NEGATED",
            if eNewType == eType:
                counts["correct"] += 1
                if verbose: print "CORRECT"
            else:
                counts["incorrect"] += 1
                if eNewType == eBaseType:
                    counts["incorrect-pos"] += 1
                else:
                    counts["incorrect-neg"] += 1
                if verbose: print "INCORRECT"
            entity.set("type", eNewType)
    if verbose:
        print counts

    if not (ET.iselement(input) and input.tag == "sentence"):
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusRoot, output)
        return corpusTree
Example #31
 def classify(self,
              data,
              model,
              output,
              parse=None,
              task=None,
              goldData=None,
              workDir=None,
              fromStep=None,
              omitSteps=None,
              validate=False):
     model = self.openModel(model, "r")
     self.enterState(self.STATE_CLASSIFY)
     self.setWorkDir(workDir)
     if workDir == None:
         self.setTempWorkDir()
     model = self.openModel(model, "r")
     if parse == None: parse = self.getStr(self.tag + "parse", model)
     workOutputTag = os.path.join(self.workDir,
                                  os.path.basename(output) + "-")
     xml = self.classifyToXML(
         data, model, None, workOutputTag,
         model.get(self.tag + "classifier-model", defaultIfNotExist=None),
         goldData, parse,
         float(model.getStr("recallAdjustParameter",
                            defaultIfNotExist=1.0)))
     if (validate):
         self.structureAnalyzer.load(model)
         self.structureAnalyzer.validate(xml)
         ETUtils.write(xml, output + "-pred.xml.gz")
     else:
         shutil.copy2(workOutputTag + self.tag + "pred.xml.gz",
                      output + "-pred.xml.gz")
     EvaluateInteractionXML.run(self.evaluator, xml, data, parse)
     stParams = self.getBioNLPSharedTaskParams(self.bioNLPSTParams, model)
     if stParams["convert"]:  #self.useBioNLPSTFormat:
         extension = ".zip" if (stParams["convert"] == "zip") else ".tar.gz"
         Utils.STFormat.ConvertXML.toSTFormat(
             xml,
             output + "-events" + extension,
             outputTag=stParams["a2Tag"],
             writeExtra=(stParams["scores"] == True))
         if stParams["evaluate"]:  #self.stEvaluator != None:
             if task == None:
                 task = self.getStr(self.tag + "task", model)
             self.stEvaluator.evaluate(output + "-events" + extension, task)
     self.deleteTempWorkDir()
     self.exitState()
Example #32
def negateEvents(input, output=None, verbose=False):
    if not (ET.iselement(input) and input.tag == "sentence"):
        print >> sys.stderr, "Loading corpus file", input
        corpusTree = ETUtils.ETFromObj(input)
        corpusRoot = corpusTree.getroot()
    
    if not (ET.iselement(input) and input.tag == "sentence"):
        sentences = corpusRoot.getiterator("sentence")
    else:
        sentences = [input]
    counts = defaultdict(int)
    for sentence in sentences:
        for entity in sentence.findall("entity"):
            counts["all-entities"] += 1
            eType = entity.get("type")
            if not isNegatableEPITrigger(eType):
                counts["out-of-scope"] += 1
                continue
            eBaseType = getEPIBaseType(eType)
            eText = entity.get("text").lower()
            eNewType = determineNewType(eType, eText)
        
            # Insert changed charOffset
            counts["entities"] += 1
            if verbose:
                print "Entity", entity.get("id"), [entity.get("text")], [eType, eBaseType, eNewType],
            if eNewType != eBaseType:
                counts["negated"] += 1
                if verbose: print "NEGATED",
            if eNewType == eType:
                counts["correct"] += 1
                if verbose: print "CORRECT"
            else:
                counts["incorrect"] += 1
                if eNewType == eBaseType:
                    counts["incorrect-pos"] += 1
                else:
                    counts["incorrect-neg"] += 1
                if verbose: print "INCORRECT"
            entity.set("type", eNewType)
    if verbose:
        print counts
    
    if not (ET.iselement(input) and input.tag == "sentence"):
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusRoot, output)
        return corpusTree                    
Example #33
 def parse(self, input, output=None, tokenizationName=None, parseName="McCC", requireEntities=False, skipIds=[], skipParsed=True, timeout=600, makePhraseElements=True, debug=False, pathParser=None, pathBioModel="AUTO", addTimeStamp=True):
     print >> sys.stderr, "BLLIP parser"
     corpusTree, corpusRoot = self.getCorpus(input)
     workdir = tempfile.mkdtemp()
     infileName, numCorpusSentences = self.makeInputFile(workdir, corpusRoot, requireEntities, skipIds, skipParsed, tokenizationName, debug)
     bllipOutput = self.runProcess(infileName, workdir, pathParser, pathBioModel, tokenizationName, timeout)        
     self.insertPennTrees(bllipOutput, corpusRoot, parseName, requireEntities, skipIds, skipParsed)
     if output != None:
         print >> sys.stderr, "Writing output to", output
         ETUtils.write(corpusRoot, output)
     # Remove work directory
     if not debug:
         shutil.rmtree(workdir)
     else:
         print >> sys.stderr, "Parser IO files at", workdir
     return corpusTree
Example #34
def mergeCorpora(corpusIds, outputId, inputDir, outDir):
    merged = Catenate.catenateElements(corpusIds, inputDir)
    for dataSet in ("devel", "train"):
        renameElements(merged[dataSet].getroot(), {"Localization":"Lives_In", 
                             "Host":"Habitat", 
                             "HostPart":"Habitat",
                             "Food":"Habitat",
                             "Soil":"Habitat",
                             "Medical":"Habitat",
                             "Water":"Habitat",
                             "Bacterium":"Bacteria"})
        DeleteElements.removeElements(merged[dataSet].getroot(), {"interaction":{"type":"PartOf"}})
        if outDir != None:
            outPath = os.path.join(outDir, outputId + "-" + dataSet + ".xml")
            print "Writing set", dataSet, "to", outPath
            ETUtils.write(merged[dataSet].getroot(), outPath)
Example #35
def processCorpus(inputFilename, outputFilename, rules, reverse=False):
    print >> sys.stderr, "Deleting elements, rules =", rules
    print >> sys.stderr, "Loading corpus file", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()
    
    countsByType = defaultdict(int)
    removeElements(corpusRoot, rules, reverse, countsByType)
    
    print >> sys.stderr, "Deleted elements"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ":", countsByType[k]
    
    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
Example #36
def processCorpus(inputFilename, outputFilename, rules, reverse=False):
    print >> sys.stderr, "Deleting elements, rules =", rules
    print >> sys.stderr, "Loading corpus file", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()

    countsByType = defaultdict(int)
    removeElements(corpusRoot, rules, reverse, countsByType)

    print >> sys.stderr, "Deleted elements"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ":", countsByType[k]

    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
Example #37
def mergeSets(input, corpusDir=None, output=None, allowNone=False):
    # Find the files
    if isinstance(input, dict):
        filenames = [{"path":input[x], "set":x} for x in input]
    else:
        if corpusDir == None:
            if os.path.dirname(input):
                corpusDir = os.path.dirname(input)
                input = os.path.basename(input)
            else:
                corpusDir = os.path.normpath(Settings.DATAPATH + "/corpora")
        print >> sys.stderr, "Searching for corpus files at " + corpusDir + " using pattern " + input
        filenames = [{"path":os.path.join(corpusDir, x), "set":None} for x in getMatchingFiles(input, corpusDir)]
    
    # Merge the files
    print >> sys.stderr, "Merging input files", filenames
    if len(filenames) == 0:
        if allowNone:
            print >> sys.stderr, "Nothing to merge"
            return
        else:
            raise Exception("No input files found for merging")
    newRoot = None
    counts = defaultdict(int)
    for filename in filenames:
        print >> sys.stderr, "Merging file", filename["path"]
        xml = ETUtils.ETFromObj(filename["path"]).getroot()
        if newRoot == None:
            newRoot = ET.Element("corpus", xml.attrib)
        else:
            assert newRoot.attrib == xml.attrib
        for doc in xml.iter("document"):
            assert doc.get("set") != None, doc.attrib
            if filename["set"] != None:
                assert filename["set"] == doc.get("set")
            counts["set=" + doc.get("set")] += 1
            counts["set(" + filename["path"] + ")=" + doc.get("set")] += 1
        for element in xml:
            newRoot.append(element)
    print >> sys.stderr, dict(counts)
    if output != None:
        print "Writing merged corpus to", output
        ETUtils.write(newRoot, output)
    return ET.ElementTree(newRoot)
Example #38
def mixSets(input, output, docOrigIds, sourceSet, targetSet):
    print >> sys.stderr, "Mixing Sets", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()

    if docOrigIds != None:
        for document in corpusRoot.getiterator("document"):
            docId = document.get("pmid")
            if docId == None:
                docId = document.get("origId")
            if docId in docOrigIds:
                assert document.get("set") == sourceSet
                document.set("set", targetSet)
                docOrigIds.remove(docId)
        assert len(docOrigIds) == 0, docOrigIds

    sentenceIds = None
    if sentenceIds != None:
        for document in corpusRoot.getiterator("document"):
            removed = []
            for sentence in document.findall("sentence"):
                assert document.get("set") == sourceSet
                sentenceId = sentence.get("id")
                if sentenceId in sentenceIds:
                    document.remove(sentence)
                    removed.append(sentence)
                    sentenceIds.remove(sentenceId)
            if len(removed) > 0:
                newDoc = ET.Element("document")
                for attr in document.attrib:
                    newDoc.set(attr, document.get(attr))
                newDoc.set("id", None)
                newDoc.set("set", targetSet)
                for sentence in removed:
                    newDoc.append(sentence)
                corpusRoot.append(newDoc)
        assert len(sentenceIds) == 0, sentenceIds

        RecalculateIds.recalculateIds(corpusTree, onlyWithinSentence=False)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
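A minimal call sketch for mixSets, with hypothetical document IDs. docOrigIds must support membership tests, remove() and len(), so a set works; note that the function empties it as it moves the matching documents from sourceSet to targetSet:

# Hypothetical PubMed IDs; every listed document must currently be in the "train" set.
idsToMove = set(["10022882", "10048027"])
mixSets("corpus.xml", "corpus-remixed.xml", idsToMove, "train", "devel")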
Exemple #39
0
def convert(inPath, outDir, corpusId, directed, negatives, preprocess, preprocessorParameters=None, debug=False, clear=False, constParser="BLLIP-BIO", depParser="STANFORD-CONVERT", logging=True):
    assert negatives in ("INCLUDE", "SKIP", "REVERSE_POS")
    # Download the corpus if needed
    if inPath == None:
        if not hasattr(Settings, "SE10T8_CORPUS"):
            SemEval2010Task8Tools.install()
        inPath = Settings.SE10T8_CORPUS
    assert os.path.exists(inPath)
    # Prepare the output directory
    if not os.path.exists(outDir):
        print "Making output directory", outDir
        os.makedirs(outDir)
    elif clear:
        print "Removing output directory", outDir
        shutil.rmtree(outDir)
    # Start logging
    if logging:
        Stream.openLog(os.path.join(outDir, "log.txt"), clear=clear)
    # Read and process the corpus files
    archive = zipfile.ZipFile(inPath, 'r')
    usedIds = set()
    tree = None
    for fileName, setName in [("SemEval2010_task8_all_data/SemEval2010_task8_training/TRAIN_FILE.TXT", "train"),\
                              ("SemEval2010_task8_all_data/SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT", "test")]:
        print "Processing file", fileName, "as set", setName
        f = archive.open(fileName)
        tree = processLines(f.readlines(), setName, directed=directed, negatives=negatives, usedIds=usedIds, tree=tree, corpusId=corpusId)
        f.close()
    # Divide the training set into training and development sets
    MakeSets.processCorpus(tree, None, "train", [("train", 0.7), ("devel", 1.0)], 1)
    # Write out the converted corpus
    convertedPath = os.path.join(outDir, corpusId + "-converted.xml")
    ETUtils.write(tree.getroot(), convertedPath)
    # Preprocess the converted corpus
    if preprocess:
        outPath = os.path.join(outDir, corpusId + ".xml")
        preprocessor = Preprocessor(constParser, depParser)
        preprocessor.setArgForAllSteps("debug", debug)
        preprocessor.stepArgs("CONVERT")["corpusName"] = corpusId
        preprocessor.process(convertedPath, outPath, preprocessorParameters, omitSteps=["SPLIT-SENTENCES", "NER", "SPLIT-NAMES"])
    # Stop logging
    if logging:
        Stream.closeLog(os.path.join(outDir, "log.txt"))
Exemple #40
0
 def parse(self, parserName, input, output=None, debug=False, reparse=False, syntaxNetDir=None, modelDir=None):
     # Run the parser process
     if syntaxNetDir == None:
         syntaxNetDir = Settings.SYNTAXNET_DIR
     corpusTree, corpusRoot = self.getCorpus(input)
     workdir = tempfile.mkdtemp()
     inPath = self.makeInputFile(corpusRoot, workdir)
     outPath = ProcessUtils.runSentenceProcess(self.run, syntaxNetDir, inPath, workdir, True, "SyntaxNetParser", "Parsing", processArgs={"modelDir":modelDir})
     self.insertCoNLLParses(outPath, corpusRoot, parserName, unescaping=True, conllFormat="conllx")
     # Remove work directory
     if not debug:
         shutil.rmtree(workdir)
     else:
         print >> sys.stderr, "Parser IO files at", workdir
     # Write the output XML file
     if output != None:
         print >> sys.stderr, "Writing output to", output
         ETUtils.write(corpusRoot, output)
     return corpusTree
Exemple #41
0
def mixSets(input, output, docOrigIds, sourceSet, targetSet):
    print >> sys.stderr, "Mixing Sets", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()
    
    if docOrigIds != None:
        for document in corpusRoot.getiterator("document"):
            docId = document.get("pmid")
            if docId == None:
                docId = document.get("origId")
            if docId in docOrigIds:
                assert document.get("set") == sourceSet
                document.set("set", targetSet)
                docOrigIds.remove(docId)
        assert len(docOrigIds) == 0, docOrigIds
    
    sentenceIds = None
    if sentenceIds != None:
        for document in corpusRoot.getiterator("document"):
            removed = []
            for sentence in document.findall("sentence"):
                assert document.get("set") == sourceSet
                sentenceId = sentence.get("id")
                if sentenceId in sentenceIds:
                    document.remove(sentence)
                    removed.append(sentence)
                    sentenceIds.remove(sentenceId)
            if len(removed) > 0:
                newDoc = ET.Element("document")
                for attr in document.attrib:
                    newDoc.set(attr, document.get(attr))
                newDoc.set("id", None)
                newDoc.set("set", targetSet)
                for sentence in removed:
                    newDoc.append(sentence)
                corpusRoot.append(newDoc)
        assert len(sentenceIds) == 0, sentenceIds
    
        RecalculateIds.recalculateIds(corpusTree, onlyWithinSentence=False)
             
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Exemple #42
0
def processCorpus(input, output, rules):
    print >> sys.stderr, "Deleting attributes, rules =", rules
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()
    
    countsByType = {}
    for key in sorted(rules.keys()):
        for attribute in rules[key]:
            countsByType[key + ":" + attribute] = 0
        removeAttributes(corpusRoot, key, rules[key], countsByType)
    
    print >> sys.stderr, "Removed"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ":", countsByType[k]
    
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
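A minimal usage sketch for this attribute-deleting variant, with hypothetical tag and attribute names. From the counting loop above, each rule maps an element tag to the list of attribute names that removeAttributes (not shown here) should strip from matching elements:

# Hypothetical tags/attributes; removeAttributes does the actual stripping.
processCorpus("corpus.xml", "corpus-stripped.xml",
              {"entity": ["altOffset", "origOffset"], "interaction": ["conf"]})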
Exemple #43
0
def processCorpus(input, output, rules):
    print >> sys.stderr, "Deleting attributes, rules =", rules
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()

    countsByType = {}
    for key in sorted(rules.keys()):
        for attribute in rules[key]:
            countsByType[key + ":" + attribute] = 0
        removeAttributes(corpusRoot, key, rules[key], countsByType)

    print >> sys.stderr, "Removed"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ":", countsByType[k]

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Exemple #44
0
def processCorpus(input, output, rules):
    if rules == None:
        raise Exception("No mapping rules defined")
    elif isinstance(rules, basestring):
        rules = eval(rules)
    print >> sys.stderr, "Mapping attributes, rules =", rules
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()

    counts = defaultdict(int)
    for key in sorted(rules.keys()):
        mapAttributes(corpusRoot, key, rules[key], counts)

    print >> sys.stderr, "Mapped", dict(counts)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
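A sketch of how this mapping variant might be called. String rules are eval'd into a dict, so the rules can be given either as a Python dict or as its string form; the exact shape of each per-tag rule is defined by mapAttributes (not shown), so the nested mapping below is only an assumption:

# Equivalent calls; the inner {"type": {"Protein": "Gene"}} rule shape is assumed,
# since mapAttributes is not included in this snippet.
processCorpus("corpus.xml", "corpus-mapped.xml", {"entity": {"type": {"Protein": "Gene"}}})
processCorpus("corpus.xml", "corpus-mapped.xml", "{'entity': {'type': {'Protein': 'Gene'}}}")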
Exemple #45
0
def processCorpus(input, output, rules):
    if rules == None:
        raise Exception("No mapping rules defined")
    elif isinstance(rules, basestring):
        rules = eval(rules)
    print >> sys.stderr, "Mapping attributes, rules =", rules
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()
    
    counts = defaultdict(int)
    for key in sorted(rules.keys()):
        mapAttributes(corpusRoot, key, rules[key], counts)
    
    print >> sys.stderr, "Mapped", dict(counts)
    
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Exemple #46
0
def toInteractionXML(documents, corpusName="CORPUS", output=None):
    corpusRoot = ET.Element("corpus")
    corpusRoot.set("source", corpusName)
    docCounter = 0
    for doc in documents:
        docEl = addDocumentElement(doc, corpusRoot, docCounter, corpusName)
        docCounter += 1
        # prepare mapping structures
        tMap = {}
        eventMap = {}
        for event in doc.events:
            eventMap[event.id] = event
        # write elements
        addEntityElements(doc, docEl, tMap, eventMap)
        addInteractionElements(doc, docEl, tMap)
        addParseElements(doc, docEl)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return ET.ElementTree(corpusRoot)
Exemple #47
0
def splitMergedElements(inputFilename, outputFilename=None):
    print >> sys.stderr, "##### Split elements with merged types #####"
    print >> sys.stderr, "Loading corpus", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()
    
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {"entity":[0,0], "interaction":[0,0], "pair":[0,0]}
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, countsByType)
    print >> sys.stderr, "Results"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ": removed", countsByType[k][0], "created", countsByType[k][1]
    
    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
Exemple #48
0
def toInteractionXML(documents, corpusName="CORPUS", output=None):
    corpusRoot = ET.Element("corpus")
    corpusRoot.set("source", corpusName)
    docCounter = 0
    for doc in documents:
        docEl = addDocumentElement(doc, corpusRoot, docCounter, corpusName)
        docCounter += 1
        # prepare mapping structures
        tMap = {}
        eventMap = {}
        for event in doc.events:
            eventMap[event.id] = event
        # write elements
        addEntityElements(doc, docEl, tMap, eventMap)
        addInteractionElements(doc, docEl, tMap)
        addParseElements(doc, docEl)
    
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return ET.ElementTree(corpusRoot)
Exemple #49
0
def splitMergedElements(inputFilename, outputFilename=None):
    print >> sys.stderr, "##### Split elements with merged types #####"
    print >> sys.stderr, "Loading corpus", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()

    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {"entity": [0, 0], "interaction": [0, 0], "pair": [0, 0]}
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, countsByType)
    print >> sys.stderr, "Results"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ": removed", countsByType[k][
            0], "created", countsByType[k][1]

    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
Exemple #50
0
def processCorpus(inputFilename, outputFilename, rules):
    print >> sys.stderr, "Loading corpus file", inputFilename
    if inputFilename.rsplit(".", 1)[-1] == "gz":
        import gzip
        corpusTree = ET.parse(gzip.open(inputFilename))
    else:
        corpusTree = ET.parse(inputFilename)
    corpusRoot = corpusTree.getroot()

    countsByType = {}
    for key in sorted(rules.keys()):
        for attribute in rules[key]:
            countsByType[key + ":" + attribute] = 0
        removeAttributes(corpusRoot, key, rules[key], countsByType)

    print >> sys.stderr, "Removed"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ":", countsByType[k]

    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
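Unlike the variants above, this version detects a ".gz" suffix and reads the corpus through gzip before stripping attributes. A minimal call, with hypothetical file and attribute names:

# Hypothetical names; gzipped input is handled transparently by the suffix check above.
processCorpus("corpus.xml.gz", "corpus-stripped.xml", {"entity": ["predicted"]})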
Exemple #51
0
def fixAltOffsets(input, output=None):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    docCount = 0
    sentencesCreated = 0
    sentences = [x for x in corpusRoot.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "FixAltOffsets")
    fixCount = 0
    # fix spans
    for sentence in sentences:
        counter.update(
            1, "Fixing AltOffsets for sentence (" + sentence.get("id") + "): ")
        sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        for entity in sentence.findall("entity"):
            altOffsetString = entity.get("altOffset")
            if altOffsetString == None:
                continue
            #print altOffsetString
            altOffsets = Range.charOffsetToTuples(altOffsetString)
            assert len(altOffsets) == 1
            for i in range(len(altOffsets)):
                altOffset = altOffsets[i]
                altOffsets[i] = (altOffset[0] - sentOffset[0],
                                 altOffset[1] - sentOffset[0])
            entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))
            fixCount += 1

    print >> sys.stderr, "Fixed", fixCount, "altOffsets"

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
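A worked example of the arithmetic performed above, assuming altOffset values are document-level while the fixed values should be sentence-relative: subtracting the sentence start converts between the two.

# Hypothetical offsets: a sentence spanning document characters 100-180
# and an entity whose altOffset is 120-127 at document level.
sentOffset = (100, 180)
altOffset = (120, 127)
fixed = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])
assert fixed == (20, 27)  # the rewritten, sentence-relative altOffset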
Exemple #52
0
def mergeSentences(input, output, verbose=False):
    print >> sys.stderr, "Merging sentences into documents"
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()

    counts = defaultdict(int)
    for document in corpusRoot.findall("document"):
        counts["documents"] += 1
        # Check that the entity has only sentence elements as children
        children = [x for x in document]
        docChildTypes = sorted(set([x.tag for x in children]))
        if len(docChildTypes) == 0:
            counts["documents-with-no-sentences"] += 1
            continue
        elif len(docChildTypes) > 1 or docChildTypes[0] != "sentence":
            raise Exception("Document '" + str(document.get("id")) +
                            "' has non-sentence children: " +
                            str(docChildTypes))
        # Process all the child sentence elements
        docId = document.get("id")
        interactions = []
        entities = []
        entityById = {}
        interactionById = {}
        combinedText = ""
        calculatedOffset = (0, 0)
        for sentence in children:
            document.remove(sentence)
            sentenceText = sentence.get("head", "") + sentence.get(
                "text", "") + sentence.get("tail", "")
            sentOffset = sentence.get("charOffset")
            if sentence == children[0]:
                noDefinedOffsets = sentOffset == None
            elif (sentOffset == None) != noDefinedOffsets:
                raise Exception("Only some sentences in document '" + docId +
                                "' have defined offsets")
            if sentOffset == None:
                if sentence != children[-1]:
                    sentenceText = sentenceText + " "
                calculatedOffset = (calculatedOffset[1],
                                    calculatedOffset[1] + len(sentenceText))
                sentOffset = calculatedOffset
            else:
                sentOffset = Range.charOffsetToSingleTuple(sentOffset)
            combinedText += sentenceText
            # Collect and update the entity elements
            for entity in sentence.findall("entity"):
                # Map sentence-level entity offsets to document level
                for offsetKey in ("charOffset", "headOffset"):
                    if entity.get(offsetKey) != None:
                        offset = Range.charOffsetToTuples(
                            entity.get(offsetKey))
                        for i in range(len(offset)):
                            offset[i] = (offset[i][0] + sentOffset[0],
                                         offset[i][1] + sentOffset[0])
                        entity.set(offsetKey, Range.tuplesToCharOffset(offset))
                # Compare mapped offsets to origOffset, if available
                if entity.get("origOffset") != None:
                    if entity.get("charOffset") != entity.get("origOffset"):
                        raise Exception(
                            "Document '" + str(document.get("id")) +
                            "' entity '" + str(entity.get("id")) +
                            "' new charOffset differs from origOffset: " +
                            str([
                                entity.get("charOffset"),
                                entity.get("origOffset")
                            ]))
                    counts["checked-origOffsets"] += 1
                    del entity.attrib["origOffset"]
                assert entity.get("id") not in entityById
                entityById[entity.get(
                    "id"
                )] = entity  # For re-mapping the interaction 'e1' and 'e2' attributes
                entities.append(entity)
                counts["moved-entities"] += 1
            # Collect and update the interaction elements
            for interaction in sentence.findall("interaction"):
                assert interaction.get("id") not in interactionById
                interactionById[interaction.get(
                    "id"
                )] = interaction  # For re-mapping the interaction 'siteOf' attributes
                interactions.append(interaction)
                counts["moved-interactions"] += 1
        # Check that the combined sentence text matches the document text, if available
        if document.get("text") != None and document.get(
                "text") != combinedText:
            if combinedText == document.get(
                    "text")[0:len(combinedText)] and document.get(
                        "text")[len(combinedText):].strip() == "":
                if verbose:
                    print >> sys.stderr, "Warning, document '" + document.get(
                        "id"
                    ) + "' text has trailing whitespace not included in the combined sentence text"
                combinedText = document.get("text")
                counts["missing-trailing-whitespace"] += 1
            else:
                raise Exception(
                    "Document '" + str(document.get("id")) +
                    "' text differs from combined sentence text: " +
                    str([document.get("text"), combinedText]))
            counts["checked-document-texts"] += 1
        # Check that the entities' texts match the document text
        for entity in entities:
            offset = Range.charOffsetToTuples(entity.get("charOffset"))
            if len(offset) == 1:  # Compare only continous entities
                if not Range.contains((0, len(combinedText)), offset[0]):
                    raise Exception(
                        "Document '" + str(document.get("id")) + "' entity '" +
                        str(entity.get("id")) +
                        "' offset is not contained in combined sentence text: "
                        + str([
                            entity.attrib, offset, [0, len(combinedText)],
                            combinedText
                        ]))
                combTextSpan = combinedText[offset[0][0]:offset[0][1]]
                if entity.get("text") != combTextSpan:
                    raise Exception(
                        "Document '" + str(document.get("id")) + "' entity '" +
                        str(entity.get("id")) +
                        "' text does not match combined sentence text: " +
                        str([entity.get("text"), combTextSpan]))
                counts["checked-charOffsets"] += 1
        # Set the combined text as the document text
        document.set("text", combinedText)
        # Update entity and interaction ids (not done earlier so that possible error messages will refer to original ids, also because of siteOf-remapping)
        for i in range(len(entities)):
            entities[i].set("id", docId + ".e" + str(i))  # Update the id for the document level
        for i in range(len(interactions)):
            interactions[i].set("id", docId + ".i" + str(i))  # Update the id for the document level
        # Update interaction e1 and e2 ids (cannot be done earlier because interactions may refer to entities from multiple sentences)
        for i in range(len(interactions)):
            interaction = interactions[i]
            for entKey in ("e1", "e2"):
                interaction.set(entKey,
                                entityById[interaction.get(entKey)].get("id"))
            if interaction.get("siteOf") != None:
                interaction.set(
                    "siteOf",
                    interactionById[interaction.get("siteOf")].get("id"))
        # Add the entity and interaction elements to the document
        document.extend(entities)
        document.extend(interactions)
    print >> sys.stderr, "Counts:", dict(counts)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
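The entity offset remapping above is the inverse of the altOffset fix earlier: sentence-relative offsets are shifted up by the sentence's start position to become document-level offsets. A small illustration with hypothetical numbers:

# Hypothetical values: a sentence starting at document character 250 and an
# entity at sentence-relative characters 10-17.
sentOffset = (250, 330)
entityOffset = [(10, 17)]
docLevel = [(o[0] + sentOffset[0], o[1] + sentOffset[0]) for o in entityOffset]
assert docLevel == [(260, 267)]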
Exemple #53
0
    def writeXML(self,
                 examples,
                 predictions,
                 corpus,
                 outputFile,
                 classSet=None,
                 parse=None,
                 tokenization=None,
                 goldCorpus=None,
                 exampleStyle=None,
                 structureAnalyzer=None):
        """
        Writes task 3 examples to interaction XML. Assumes task 3 classification
        is done with SVMMulticlass Classifier, used for two classes.
        """
        print >> sys.stderr, "Adding task 3 to Interaction XML"
        examples, predictions = self.loadExamples(examples, predictions)

        if type(classSet) == types.StringType:  # class names are in file
            classSet = IdSet(filename=classSet)
        classIds = None
        if classSet != None:
            classIds = classSet.getIds()

        corpusTree = ETUtils.ETFromObj(corpus)
        corpusRoot = corpusTree.getroot()

        # Determine subtask
        task3Type = None
        for example in examples:
            assert example[3].has_key("t3type")
            task3Type = example[3]["t3type"]
            break
        if task3Type == None:
            if outputFile != None:
                print >> sys.stderr, "Writing corpus to", outputFile
                ETUtils.write(corpusRoot, outputFile)
            return corpusTree
        assert task3Type in ["multiclass", "speculation", "negation"]

        # Remove the task 3 subtask information if it already exists
        for entity in corpusRoot.getiterator("entity"):
            if task3Type == "multiclass":
                entity.set("speculation", "False")
                entity.set("negation", "False")
            elif task3Type == "speculation":
                entity.set("speculation", "False")
            else:  # task3Type == "negation"
                entity.set("negation", "False")

        specMap = {}
        negMap = {}
        for example, prediction in itertools.izip(examples, predictions):
            assert example[3]["xtype"] == "task3"
            if example[3]["t3type"] == "multiclass":
                if isinstance(prediction, dict):
                    encoded = prediction["prediction"]
                    predictedModifiers = [
                        classSet.getName(i) for i in range(len(encoded))
                        if encoded[i] == 1
                    ]
                else:
                    predictedClassName = classSet.getName(prediction[0])
                    predictedModifiers = ""
                    if predictedClassName != "neg":
                        predictedModifiers = predictedClassName.split("---")
                if "negation" in predictedModifiers:
                    assert not negMap.has_key(example[3]["entity"])
                    negMap[example[3]["entity"]] = (True, prediction)
                if "speculation" in predictedModifiers:
                    assert not specMap.has_key(example[3]["entity"])
                    specMap[example[3]["entity"]] = (True, prediction)
            else:
                if example[3]["t3type"] == "speculation":
                    map = specMap
                else:
                    map = negMap
                if prediction[0] != 1:
                    assert not map.has_key(example[3]["entity"])
                    map[example[3]["entity"]] = (True, prediction)
                else:
                    assert not map.has_key(example[3]["entity"])
                    map[example[3]["entity"]] = (False, prediction)

        for entity in corpusRoot.getiterator("entity"):
            eId = entity.get("id")
            if task3Type == "multiclass":
                if specMap.has_key(eId):
                    entity.set("speculation", str(specMap[eId][0]))
                    entity.set(
                        "modConf",
                        self.getPredictionStrengthString(
                            specMap[eId][1], classSet, classIds))
                if negMap.has_key(eId):
                    entity.set("negation", str(negMap[eId][0]))
                    entity.set(
                        "modConf",
                        self.getPredictionStrengthString(
                            negMap[eId][1], classSet, classIds))
            else:
                if task3Type == "speculation":
                    if specMap.has_key(eId):
                        entity.set("speculation", str(specMap[eId][0]))
                        entity.set(
                            "specConf",
                            self.getPredictionStrengthString(
                                specMap[eId][1], classSet, classIds, [""]))
                elif task3Type == "negation":
                    if negMap.has_key(eId):
                        entity.set("negation", str(negMap[eId][0]))
                        entity.set(
                            "negConf",
                            self.getPredictionStrengthString(
                                negMap[eId][1], classSet, classIds,
                                ["", "speculation"]))

        # Write corpus
        if outputFile != None:
            print >> sys.stderr, "Writing corpus to", outputFile
            ETUtils.write(corpusRoot, outputFile)
        return corpusTree
    optparser.add_option("-d",
                         "--debug",
                         default=False,
                         action="store_true",
                         dest="debug",
                         help="Debug mode")
    optparser.add_option("-v",
                         "--validate",
                         default=None,
                         dest="validate",
                         help="validate input",
                         metavar="FILE")
    (options, args) = optparser.parse_args()

    s = StructureAnalyzer()
    if options.load:
        s.load(None, options.input)
    else:
        s.analyze(options.input.split(","))
    print >> sys.stderr, "--- Structure Analysis ----"
    print >> sys.stderr, s.toString()
    if options.validate != None:
        print >> sys.stderr, "--- Validation ----"
        xml = ETUtils.ETFromObj(options.validate)
        s.validate(xml, simulation=False, debug=options.debug)
        if options.output != None:
            ETUtils.write(xml, options.output)
    elif options.output != None:
        print >> sys.stderr, "Structure analysis saved to", options.output
        s.save(None, options.output)
Exemple #55
0
def mainFunc(input,
             output=None,
             parseName="McCC",
             tokenizationName=None,
             newParseName=None,
             newTokenizationName=None,
             logFileName=None,
             removeOld=True):
    print >> sys.stderr, "Protein Name Splitter"
    if logFileName != None:
        print >> sys.stderr, "Writing log to", logFileName
        logFile = open(logFileName, "wt")
    else:
        logFile = None
    #if input.endswith(".gz"):
    #    inFile = gzip.GzipFile(input)
    #else:
    #    inFile = open(input)
    tree = ETUtils.ETFromObj(input)

    if tokenizationName == None:
        tokenizationName = parseName

    #tree = ElementTree.parse(inFile)
    root = tree.getroot()

    sentences = [x for x in root.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "Split Protein Names")
    counter.showMilliseconds = True
    missingTokCount = 0
    for sentence in sentences:
        sId = sentence.get("id")
        counter.update(1, "Splitting names (" + sId + "): ")

        tok = getTokenization(tokenizationName,
                              sentence,
                              sId,
                              remove=removeOld)
        if tok == None:
            missingTokCount += 1
            continue

        assert tok is not None, "Missing tokenization '%s' in sentence %s!" % (
            tokenizationName, sId)

        parse = getParse(parseName,
                         tokenizationName,
                         sentence,
                         sId,
                         remove=removeOld)
        assert parse is not None, "Missing parse '%s' in sentence %s!" % (
            parseName, sId)

        split = splitTokens(tok, sentence, logFile)

        # Default names
        if removeOld:
            if newTokenizationName == None:
                newTokenizationName = tok.get("tokenizer")
            if newParseName == None:
                newParseName = parse.get("parser")
        else:
            if newTokenizationName == None:
                newTokenizationName = "split-" + tok.get("tokenizer")
            if newParseName == None:
                newParseName = "split-" + parse.get("parser")

        # add a new tokenization with the split tokens.
        splittok = addTokenization(newTokenizationName, sentence, sId)
        addTokensToTree(split, splittok)
        for a in tok.attrib:
            if splittok.get(a) == None:
                splittok.set(a, tok.get(a))
        #splittok.set("split-")

        # make a mapping from original to split token ids. Store the
        # head token when given.
        tokenIdMap = {}
        for t in split:
            if t.head:
                head = t.head
                # traverse
                while head.head is not None:
                    assert head.head != t, "Cyclic heads"
                    head = head.head

                # should match (nah, punctuation problems)
                # assert t.origId not in tokenIdMap or tokenIdMap[t.origId] == head.id, "Head conflict"
                tokenIdMap[t.origId] = head.id
            else:
                # only allow overwrite of existing entry if the current token
                # is not punctuation.
                if t.origId not in tokenIdMap or not t.isPunct():
                    tokenIdMap[t.origId] = t.id

        # make a copy of the specified parse that refers to the split tokens
        # instead of the originals.
        newparse = addParse(newParseName, newTokenizationName, sentence, sId)
        for a in parse.attrib:
            if newparse.get(a) == None:
                newparse.set(a, parse.get(a))
        newparse.set("ProteinNameSplitter", "True")
        splittok.set("ProteinNameSplitter", "True")

        depSeqId = 0  #1
        for d in parse.getiterator("dependency"):
            t1, t2, dType = d.get("t1"), d.get("t2"), d.get("type")
            assert t1 in tokenIdMap and t2 in tokenIdMap, "INTERNAL ERROR"

            dep = ElementTree.SubElement(newparse, "dependency")
            dep.set("t1", tokenIdMap[t1])
            dep.set("t2", tokenIdMap[t2])
            dep.set("type", dType)
            dep.set("id", "sd_%d" % depSeqId)
            depSeqId += 1

        # Add in new dependencies between the split parts.
        for t in [tok for tok in split if tok.head is not None]:
            dep = ElementTree.SubElement(newparse, "dependency")
            dep.set("t1", t.head.id)
            dep.set("t2", t.id)
            dep.set("type", t.depType)
            dep.set("split", "PNS")
            dep.set("id", "spd_%d" % depSeqId)
            depSeqId += 1

        for phrase in parse.getiterator("phrase"):
            newparse.append(phrase)

            # debugging
            #print >> sys.stderr, "NEW DEP IN", sId

    print >> sys.stderr, "Tokenization missing from", missingTokCount, "sentences"

    #indent(root)
    if logFile != None:
        logFile.close()

    # debugging
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(tree, output)
    return tree
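A minimal invocation sketch for the protein name splitter, with hypothetical file names; parseName and tokenizationName must match a parse and tokenization already present in the input corpus.

# Hypothetical file names; with removeOld=True (the default) the split
# tokenization and parse reuse the old names, otherwise they get a "split-" prefix.
mainFunc("corpus-parsed.xml", output="corpus-split.xml", parseName="McCC",
         logFileName="protein-name-splitter.log")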