Example #1
def processCorpus(inPath, outPath, sourceSet, newSets, seed=1):
    print >> sys.stderr, "Loading corpus file", inPath
    corpusTree = ETUtils.ETFromObj(inPath)
    corpusRoot = corpusTree.getroot()

    rand = random.Random(seed)
    documents = corpusRoot.findall("document")
    counts = {"old": defaultdict(int), "new": defaultdict(int)}
    for document in documents:
        counts["old"][document.get("set")] += 1
        if sourceSet != None and document.get("set") != sourceSet:
            counts["new"][document.get("set")] += 1
            continue
        value = rand.random()
        document.set("setValue", str(value))
        document.set("origSet", document.get("set", ""))
        for setName, cutoff in newSets:
            if value <= cutoff:
                document.set("set", setName)
                break
        counts["new"][document.get("set")] += 1
    #for key in counts:
    #    counts[key] = dict(counts[key])
    print "MakeSets result:", "old=" + str(dict(
        counts["old"])) + ", new=" + str(dict(counts["new"]))
    if outPath != None:
        ETUtils.write(corpusRoot, outPath)
    return corpusTree
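A minimal usage sketch for the example above (the file names, and calling the function directly from this module, are assumptions): newSets lists (setName, cutoff) pairs that are checked in order against one random draw per document, so the cutoffs are cumulative.

# Hypothetical call: reassign documents currently in the "train" set so that
# roughly 80% stay in "train" and the rest move to "devel".
processCorpus("corpus.xml", "corpus-resplit.xml",
              sourceSet="train",
              newSets=[("train", 0.8), ("devel", 1.0)],
              seed=1)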
Example #2
 def parse(self, parserName, input, output=None, debug=False, reparse=False, stanfordParserDir=None, stanfordParserArgs=None, action="convert", outputFormat=None, memory=None):
     #global stanfordParserDir, stanfordParserArgs
     assert action in ("convert", "penn", "dep")
     if stanfordParserDir == None:
         stanfordParserDir = Settings.STANFORD_PARSER_DIR
     # Run the parser process
     corpusTree, corpusRoot = self.getCorpus(input)
     workdir = tempfile.mkdtemp()
     inPath = self.makeInputFile(corpusRoot, workdir, parserName, reparse, action, debug)
     outPath = self.runProcess(stanfordParserArgs, stanfordParserDir, inPath, workdir, action, outputFormat, memory)
     self.printStderr(outPath)
     # Insert the parses    
     if action in ("convert", "dep"):
         #self.insertDependencyParses(outPath, corpusRoot, parserName, {"stanford-mode":action}, addTimeStamp=True, skipExtra=0, removeExisting=True)
         self.insertStanfordDependencyParses(outPath, corpusRoot, parserName, skipParsed=reparse, removeExisting=reparse)
     elif action == "penn":
         self.insertPennTrees(outPath, corpusRoot, parserName)
     # Remove work directory
     if not debug:
         shutil.rmtree(workdir)
     else:
         print >> sys.stderr, "Parser IO files at", workdir
     # Write the output XML file
     if output != None:
         print >> sys.stderr, "Writing output to", output
         ETUtils.write(corpusRoot, output)
     return corpusTree
Example #3
def removeUnconnectedEntities(input, output=None):
    input = ETUtils.ETFromObj(input)
    root = input.getroot()
    removed = 0
    preserved = 0
    for document in root.findall("document"):
        sentMap = {}  # allow for intersentence interactions
        for sentence in document.findall("sentence"):
            sentMap[sentence.get("id")] = sentence
        connected = set()
        for interaction in document.getiterator("interaction"):
            connected.add(interaction.get("e1"))
            connected.add(interaction.get("e2"))
        entities = []
        for entity in document.getiterator("entity"):
            entities.append(entity)
        for entity in entities:
            if entity.get("given") == "True":  # never remove named entities
                continue
            eId = entity.get("id")
            if eId not in connected:
                if eId.find(".s") != -1:  # sentence level entity
                    sentMap[eId.rsplit(".", 1)[0]].remove(entity)
                else:  # document level entity
                    document.remove(entity)
                removed += 1
            else:
                preserved += 1

    print >> sys.stderr, "Removed", removed, "entities, preserved", preserved, "entities"

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(root, output)
    return input
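A minimal usage sketch with placeholder paths, assuming a direct call from this module: only entities that are not marked given="True" and that take part in no interaction are removed.

# Hypothetical call: prune unconnected predicted entities and write the result.
prunedTree = removeUnconnectedEntities("predictions.xml.gz", "predictions-pruned.xml.gz")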
Example #4
def findHeadsSyntactic(corpus, parse, tokenization):
    """
    Determine the head token for a named entity or trigger. The head token is the token closest
    to the root for the subtree of the dependency parse spanned by the text of the element.
    
    @param entityElement: a semantic node (trigger or named entity)
    @type entityElement: cElementTree.Element
    @param verbose: Print selected head tokens on screen
    @param verbose: boolean
    """
    counts = [0,0]
    sentences = [x for x in corpus.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "SYNTAX")
    for sentence in sentences:
        counter.update()
        tokElement = ETUtils.getElementByAttrib(sentence, "sentenceanalyses/tokenizations/tokenization", {"tokenizer":tokenization})
        parseElement = ETUtils.getElementByAttrib(sentence, "sentenceanalyses/parses/parse", {"parser":parse})
        if tokElement == None or parseElement == None:
            print >> sys.stderr, "Warning, sentence", sentence.get("id"), "missing parse or tokenization"
            continue
        tokens = tokElement.findall("token")
        tokenHeadScores = getTokenHeadScores(tokens, parseElement.findall("dependency"), sentenceId=sentence.get("id"))
        for entity in sentence.findall("entity"):
            if entity.get("headOffset") == None:
                headToken = getEntityHeadToken(entity, tokens, tokenHeadScores)
                # The ElementTree entity-element is modified by setting the headOffset attribute
                entity.set("headOffset", headToken.get("charOffset"))
                entity.set("headMethod", "Syntax")
                entity.set("headString", headToken.get("text"))
                counts[0] += 1
    return counts
Example #5
def catenateElements(inputs, output):
    print >> sys.stderr, "##### Catenate interaction XML as elements #####"
    input1, input2 = inputs  # the two corpora to catenate
    c1 = RecalculateIds.recalculateIds(input1, None, False, 0)
    numDocs = len(c1.getroot().findall("document"))
    print >> sys.stderr, "Documents in input 1:", numDocs
    c2 = RecalculateIds.recalculateIds(input2, None, False, numDocs)

    print >> sys.stderr, "Appending documents"
    c1Root = c1.getroot()
    for document in c2.getroot().findall("document"):
        c1Root.append(document)

    print >> sys.stderr, "Validating ids"
    ids = set()
    for element in c1Root.getiterator("entity"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    for element in c1Root.getiterator("interaction"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    for element in c1Root.getiterator("sentence"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    for element in c1Root.getiterator("document"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(c1Root, output)
    return c1
Example #6
def mergeAll(input, output=None, debug=False, iterate=False):
    if iterate:
        origItems = defaultdict(int)
        removedItems = defaultdict(int)
        for docSentences in SentenceElements.getCorpusIterator(input, output):
            entitiesByType, duplicatesRemovedByType = mergeDuplicateEntities(
                docSentences, debug)
            for key in entitiesByType:
                origItems[key] += entitiesByType[key]
            for key in duplicatesRemovedByType:
                removedItems[key] += duplicatesRemovedByType[key]
            interactionsByType, duplicatesRemovedByType = mergeDuplicateInteractions(
                docSentences, debug)
            for key in interactionsByType:
                origItems[key] += interactionsByType[key]
            for key in duplicatesRemovedByType:
                removedItems[key] += duplicatesRemovedByType[key]
        printStats(origItems, removedItems)
        return None
    else:
        corpusElements = CorpusElements.loadCorpus(
            input, removeIntersentenceInteractions=False)
        print >> sys.stderr, "Merging duplicate entities"
        entitiesByType, duplicatesRemovedByType = mergeDuplicateEntities(
            corpusElements.sentences, debug)
        printStats(entitiesByType, duplicatesRemovedByType)
        print >> sys.stderr, "Merging duplicate interactions"
        interactionsByType, duplicatesRemovedByType = mergeDuplicateInteractions(
            corpusElements.sentences, debug)
        printStats(interactionsByType, duplicatesRemovedByType)
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusElements.rootElement, output)
        return corpusElements
Example #7
 def run(cls,inFile,multiplier=1.0,outFile=None,targetLabel="neg", binary=False):
     """inFile can be a string with file name (.xml or .xml.gz) or an ElementTree or an Element or an open input stream
     multiplier adjusts the level of boosting the non-negative predictions, it is a real number (0,inf)
     multiplier 1.0 does nothing, <1.0 decreases negative class confidence, >1.0 increases negative class confidence
     the root of the modified tree is returned and, if outFile is a string, written out to outFile as well"""
     print >> sys.stderr, "##### Recall adjust with multiplier " + str(multiplier)[:5] + " #####"
     tree=ETUtils.ETFromObj(inFile)
     if not ET.iselement(tree):
         assert isinstance(tree,ET.ElementTree)
         root=tree.getroot()
     else:
         root = tree
     
     if multiplier != -1:
         if binary:
             print >> sys.stderr, "Recall binary mode"
             classRanges = getClassRanges(root.getiterator("entity"))
             assert len(classRanges.keys()) in [0,2]
             if len(classRanges.keys()) == 0:
                 print >> sys.stderr, "Warning, recall adjustment skipped because no prediction weights found"
         else:
             print >> sys.stderr, "Recall multiclass mode"
             classRanges = None
         for entityNode in root.getiterator("entity"):
             adjustEntity(entityNode,targetLabel,multiplier,classRanges)
     if outFile:
         ETUtils.write(root,outFile)
     return tree
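A minimal usage sketch of the classmethod above; the class name RecallAdjust is an assumption taken from the banner message, and the file names are placeholders. Per the docstring, a multiplier below 1.0 lowers the "neg" class confidence, which raises recall for the remaining classes.

# Hypothetical call: soften negative predictions and write the adjusted corpus.
adjustedTree = RecallAdjust.run("classified.xml.gz", multiplier=0.7,
                                outFile="classified-adjusted.xml.gz",
                                targetLabel="neg", binary=False)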
Example #8
 def classify(self, data, model, output, parse=None, task=None, goldData=None, workDir=None, fromStep=None, omitSteps=None, validate=False):
     model = self.openModel(model, "r")
     self.enterState(self.STATE_CLASSIFY)
     self.setWorkDir(workDir)
     if workDir == None:
         self.setTempWorkDir()
     model = self.openModel(model, "r")
     if parse == None: parse = self.getStr(self.tag+"parse", model)
     workOutputTag = os.path.join(self.workDir, os.path.basename(output) + "-")
     xml = self.classifyToXML(data, model, None, workOutputTag, 
         model.get(self.tag+"classifier-model", defaultIfNotExist=None), goldData, parse, float(model.getStr("recallAdjustParameter", defaultIfNotExist=1.0)))
     if (validate):
         self.structureAnalyzer.load(model)
         self.structureAnalyzer.validate(xml)
         ETUtils.write(xml, output+"-pred.xml.gz")
     else:
         shutil.copy2(workOutputTag+self.tag+"pred.xml.gz", output+"-pred.xml.gz")
     EvaluateInteractionXML.run(self.evaluator, xml, data, parse)
     stParams = self.getBioNLPSharedTaskParams(self.bioNLPSTParams, model)
     if stParams["convert"]: #self.useBioNLPSTFormat:
         extension = ".zip" if (stParams["convert"] == "zip") else ".tar.gz" 
         Utils.STFormat.ConvertXML.toSTFormat(xml, output+"-events" + extension, outputTag=stParams["a2Tag"], writeExtra=(stParams["scores"] == True))
         if stParams["evaluate"]: #self.stEvaluator != None:
             if task == None: 
                 task = self.getStr(self.tag+"task", model)
             self.stEvaluator.evaluate(output+"-events" + extension, task)
     self.deleteTempWorkDir()
     self.exitState()
Example #9
def catenateElements(inputs, output):
    print >> sys.stderr, "##### Catenate interaction XML as elements #####"
    input1, input2 = inputs  # the two corpora to catenate
    c1 = RecalculateIds.recalculateIds(input1, None, False, 0)
    numDocs = len(c1.getroot().findall("document"))
    print >> sys.stderr, "Documents in input 1:", numDocs
    c2 = RecalculateIds.recalculateIds(input2, None, False, numDocs)
    
    print >> sys.stderr, "Appending documents"
    c1Root = c1.getroot()
    for document in c2.getroot().findall("document"):
        c1Root.append(document)
    
    print >> sys.stderr, "Validating ids"
    ids = set()
    for element in c1Root.getiterator("entity"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    for element in c1Root.getiterator("interaction"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    for element in c1Root.getiterator("sentence"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    for element in c1Root.getiterator("document"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(c1Root, output)
    return c1
Example #10
def mergeAll(input, output=None, debug=False, iterate=False):
    if iterate:
        origItems = defaultdict(int)
        removedItems = defaultdict(int)
        for docSentences in SentenceElements.getCorpusIterator(input, output):
            entitiesByType, duplicatesRemovedByType = mergeDuplicateEntities(docSentences, debug)
            for key in entitiesByType: origItems[key] += entitiesByType[key]
            for key in duplicatesRemovedByType: removedItems[key] += duplicatesRemovedByType[key]
            interactionsByType, duplicatesRemovedByType = mergeDuplicateInteractions(docSentences, debug)
            for key in interactionsByType: origItems[key] += interactionsByType[key]
            for key in duplicatesRemovedByType: removedItems[key] += duplicatesRemovedByType[key]
        printStats(origItems, removedItems)
        return None
    else:
        corpusElements = CorpusElements.loadCorpus(input, removeIntersentenceInteractions=False)
        print >> sys.stderr, "Merging duplicate entities"
        entitiesByType, duplicatesRemovedByType = mergeDuplicateEntities(corpusElements.sentences, debug)
        printStats(entitiesByType, duplicatesRemovedByType)
        print >> sys.stderr, "Merging duplicate interactions"
        interactionsByType, duplicatesRemovedByType = mergeDuplicateInteractions(corpusElements.sentences, debug)
        printStats(interactionsByType, duplicatesRemovedByType)
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusElements.rootElement, output)
        return corpusElements
Example #11
 def parse(self,
           input,
           output=None,
           tokenizationName=None,
           parseName="McCC",
           requireEntities=False,
           skipIds=[],
           skipParsed=True,
           timeout=600,
           makePhraseElements=True,
           debug=False,
           pathParser=None,
           pathBioModel="AUTO",
           addTimeStamp=True):
     print >> sys.stderr, "BLLIP parser"
     corpusTree, corpusRoot = self.getCorpus(input)
     workdir = tempfile.mkdtemp()
     infileName, numCorpusSentences = self.makeInputFile(
         workdir, corpusRoot, requireEntities, skipIds, skipParsed,
         tokenizationName, debug)
     bllipOutput = self.runProcess(infileName, workdir, pathParser,
                                   pathBioModel, tokenizationName, timeout)
     self.insertPennTrees(bllipOutput, corpusRoot, parseName,
                          requireEntities, skipIds, skipParsed)
     if output != None:
         print >> sys.stderr, "Writing output to", output
         ETUtils.write(corpusRoot, output)
     # Remove work directory
     if not debug:
         shutil.rmtree(workdir)
     else:
         print >> sys.stderr, "Parser IO files at", workdir
     return corpusTree
Example #12
def processCorpus(inputFilename, outputFilename, rules):
    print >> sys.stderr, "Deleting elements, rules =", rules
    print >> sys.stderr, "Loading corpus file", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()

    for eType in rules.keys():
        for attrRule in rules[eType].keys():
            if type(rules[eType][attrRule]) in types.StringTypes:
                rules[eType][attrRule] = rules[eType][attrRule].split("|")

    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = defaultdict(int)
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, rules, countsByType)
    print >> sys.stderr, "Deleted elements"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ":", countsByType[k]

    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
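A minimal sketch of the expected rules shape, assuming a direct call from this module: rules maps an element type to attribute filters, and a pipe-separated value string is split into a list of accepted alternatives before processSentence applies it.

# Hypothetical rules: target "entity" elements whose "type" attribute is Protein or Gene.
rules = {"entity": {"type": "Protein|Gene"}}
processCorpus("corpus.xml", "corpus-filtered.xml", rules)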
Example #13
def fixAltOffsets(input, output=None):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    
    docCount = 0
    sentencesCreated = 0
    sentences = [x for x in corpusRoot.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "FixAltOffsets")
    fixCount = 0
    # fix spans
    for sentence in sentences:
        counter.update(1, "Fixing AltOffsets for sentence ("+sentence.get("id")+"): ")
        sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        for entity in sentence.findall("entity"):
            altOffsetString = entity.get("altOffset")
            if altOffsetString == None:
                continue
            #print altOffsetString
            altOffsets = Range.charOffsetToTuples(altOffsetString)
            assert len(altOffsets) == 1
            for i in range(len(altOffsets)):
                altOffset = altOffsets[i] 
                altOffsets[i] = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])
            entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))
            fixCount += 1
        
    print >> sys.stderr, "Fixed", fixCount, "altOffsets"
        
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Example #14
def loadEventXML(path, verbose=False):
    xml = ETUtils.ETFromObj(path)
    sentDict = {}
    for sentence in xml.getiterator("sentence"):
        sentenceText = getText(sentence).strip()
        if not sentDict.has_key(sentenceText):
            sentDict[sentenceText] = []

    for event in xml.getiterator("event"):
        sentenceText = getText(event).strip()
        if not sentDict.has_key(sentenceText):
            sentDict[sentenceText] = []
        events = sentDict[sentenceText]
        
        clue = event.find("clue")
        clueTuple = getClue(clue)
        eventType = event.find("type").get("class")
        if eventType == "Protein_amino_acid_phosphorylation":
            eventType = "Phosphorylation"
        if type(clueTuple) == types.StringType:
            if verbose: print "Event", eventType, "clue with no clueType:", ETUtils.toStr(clue)
        else:
            assert sentenceText[clueTuple[1]:clueTuple[2]+1] == clueTuple[0], (sentenceText, sentenceText[clueTuple[1]:clueTuple[2]+1], clueTuple)
            event = (clueTuple[1], clueTuple[2], eventType, clueTuple[0])
            if event not in events:
                events.append(event)
    return sentDict
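A minimal usage sketch with a placeholder path: the returned dict maps sentence text to (clueStart, clueEnd, eventType, clueText) tuples.

# Hypothetical usage: iterate the clue spans collected per sentence.
sentDict = loadEventXML("events.xml", verbose=False)
for sentenceText, events in sentDict.items():
    for clueStart, clueEnd, eventType, clueText in events:
        pass  # e.g. align the clue span against another annotation layer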
Example #15
 def run(cls,inFile,multiplier=1.0,outFile=None,targetLabel="neg", binary=False):
     """inFile can be a string with file name (.xml or .xml.gz) or an ElementTree or an Element or an open input stream
     multiplier adjusts the level of boosting the non-negative predictions, it is a real number (0,inf)
     multiplier 1.0 does nothing, <1.0 decreases negative class confidence, >1.0 increases negative class confidence
     the root of the modified tree is returned and, if outFile is a string, written out to outFile as well"""
     print >> sys.stderr, "##### Recall adjust with multiplier " + str(multiplier)[:5] + " #####"
     tree=ETUtils.ETFromObj(inFile)
     if not ET.iselement(tree):
         assert isinstance(tree,ET.ElementTree)
         root=tree.getroot()
     else:
         root = tree
     
     if multiplier != -1:
         if binary:
             print >> sys.stderr, "Recall binary mode"
             classRanges = getClassRanges(root.getiterator("entity"))
             assert len(classRanges.keys()) in [0,2]
             if len(classRanges.keys()) == 0:
                 print >> sys.stderr, "Warning, recall adjustment skipped because no prediction weights found"
         else:
             print >> sys.stderr, "Recall multiclass mode"
             classRanges = None
         for entityNode in root.getiterator("entity"):
             adjustEntity(entityNode,targetLabel,multiplier,classRanges)
     if outFile:
         ETUtils.write(root,outFile)
     return tree
Example #16
def makeSubset(input, output=None, ratio=1.0, seed=0):
    if ratio == 1.0:
        if output != None:
            shutil.copy2(input, output)
            return output
        else:
            return input
    totalFolds = 100
    selectedFolds = int(ratio * 100.0)
    print >>sys.stderr, "====== Making subset ======"
    print >>sys.stderr, "Subset for ", input, "ratio", ratio, "seed", seed
    xml = ETUtils.ETFromObj(input).getroot()
    count = 0
    sentCount = 0
    for document in xml.findall("document"):
        sentCount += len(document.findall("sentence"))
        count += 1
    division = Core.Split.getFolds(count, totalFolds, seed)
    # print division, selectedFolds - 1
    index = 0
    removeCount = 0
    sentRemoveCount = 0
    for document in xml.findall("document"):
        if division[index] > selectedFolds - 1:
            xml.remove(document)
            sentRemoveCount += len(document.findall("sentence"))
            removeCount += 1
        index += 1
    print >>sys.stderr, "Subset", "doc:", count, "removed:", removeCount, "sent:", sentCount, "sentremoved:", sentRemoveCount
    xml.set("subsetRatio", str(ratio))
    xml.set("subsetSeed", str(seed))
    if output != None:
        ETUtils.write(xml, output)
    return output
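A minimal usage sketch with placeholder paths: the documents are divided into 100 pseudo-random folds and those falling outside the first ratio*100 folds are removed, so the same seed always selects the same subset.

# Hypothetical call: keep roughly 30% of the documents, reproducibly.
subsetPath = makeSubset("corpus.xml.gz", "corpus-30.xml.gz", ratio=0.3, seed=0)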
Example #17
def makeSubset(input, output=None, ratio=1.0, seed=0):
    if ratio == 1.0:
        if output != None:
            shutil.copy2(input, output)
            return output
        else:
            return input
    totalFolds = 100
    selectedFolds = int(ratio * 100.0)
    print >> sys.stderr, "====== Making subset ======"
    print >> sys.stderr, "Subset for ", input, "ratio", ratio, "seed", seed
    xml = ETUtils.ETFromObj(input).getroot()
    count = 0
    sentCount = 0
    for document in xml.findall("document"):
        sentCount += len(document.findall("sentence"))
        count += 1
    division = Core.Split.getFolds(count, totalFolds, seed)
    #print division, selectedFolds - 1
    index = 0
    removeCount = 0
    sentRemoveCount = 0
    for document in xml.findall("document"):
        if division[index] > selectedFolds - 1:
            xml.remove(document)
            sentRemoveCount += len(document.findall("sentence"))
            removeCount += 1
        index += 1
    print >> sys.stderr, "Subset", "doc:", count, "removed:", removeCount, "sent:", sentCount, "sentremoved:", sentRemoveCount
    xml.set("subsetRatio", str(ratio))
    xml.set("subsetSeed", str(seed))
    if output != None:
        ETUtils.write(xml, output)
    return output
Example #18
def removeUnconnectedEntities(input, output=None):
    input = ETUtils.ETFromObj(input)
    root = input.getroot()
    removed = 0
    preserved = 0
    for document in root.findall("document"):
        sentMap = {} # allow for intersentence interactions
        for sentence in document.findall("sentence"):
            sentMap[sentence.get("id")] = sentence
        connected = set()
        for interaction in document.getiterator("interaction"):
            connected.add(interaction.get("e1"))
            connected.add(interaction.get("e2"))
        entities = []
        for entity in document.getiterator("entity"):
            entities.append(entity)
        for entity in entities:
            if entity.get("isName") == "True": # never remove named entities
                continue
            eId = entity.get("id")
            if eId not in connected:
                if eId.find(".s") != -1: # sentence level entity
                    sentMap[eId.rsplit(".", 1)[0]].remove(entity)
                else: # document level entity
                    document.remove(entity)
                removed += 1
            else:
                preserved += 1
    
    print >> sys.stderr, "Removed", removed, "entities, preserved", preserved, "entities"
    
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(root, output)
    return input
Example #19
def processCorpus(inPath, outPath, sourceSet, newSets, seed=1):
    print >> sys.stderr, "Loading corpus file", inPath
    corpusTree = ETUtils.ETFromObj(inPath)
    corpusRoot = corpusTree.getroot()
    
    rand = random.Random(seed)
    documents = corpusRoot.findall("document")
    counts = {"old":defaultdict(int), "new":defaultdict(int)}
    for document in documents:
        counts["old"][document.get("set")] += 1
        if sourceSet != None and document.get("set") != sourceSet:
            counts["new"][document.get("set")] += 1
            continue
        value = rand.random()
        document.set("setValue", str(value))
        document.set("origSet", document.get("set", ""))
        for setName, cutoff in newSets:
            if value <= cutoff:
                document.set("set", setName)
                break
        counts["new"][document.get("set")] += 1
    #for key in counts:
    #    counts[key] = dict(counts[key])
    print "MakeSets result:", "old=" + str(dict(counts["old"])) + ", new=" + str(dict(counts["new"]))
    if outPath != None:
        ETUtils.write(corpusRoot, outPath)
    return corpusTree
Example #20
def processCorpus(inputFilename, outputFilename, rules):
    print >> sys.stderr, "Deleting elements, rules =", rules
    print >> sys.stderr, "Loading corpus file", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()
    
    for eType in rules.keys():
        for attrRule in rules[eType].keys():
            rules[eType][attrRule] = rules[eType][attrRule].split("|")
    
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = defaultdict(int)
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, rules, countsByType)
    print >> sys.stderr, "Deleted elements"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ":", countsByType[k]
    
    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
Example #21
def process(input, output=None):
    download("/tmp/extract", "/tmp/download")
    specAnn = readResources("/tmp/extract")
    insertElements(input.getroot(), specAnn)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(input.getroot(), output)
    return input
Example #22
def test(extractPath, downloadPath, inCorpusPath, outCorpusPath):
    download(extractPath, downloadPath)
    specAnn = readResources(extractPath)
    inCorpus = ETUtils.ETFromObj(inCorpusPath)
    insertElements(inCorpus.getroot(), specAnn)
    ETUtils.write(inCorpus.getroot(), outCorpusPath)

#process("/tmp/extract", "/tmp/download", "/home/jari/Dropbox/data/BioNLP16/corpora/BB_EVENT_16-devel.xml", "/tmp/ner.xml")
Example #23
def process(input, output=None):
    download("/tmp/extract", "/tmp/download")
    specAnn = readResources("/tmp/extract")
    insertElements(input.getroot(), specAnn)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(input.getroot(), output)
    return input
Example #24
 def insertParses(self, parseDir, input, output=None, parseName="McCC", extensions=None, subDirs=None, debug=False, skipParsed=False, docMatchKeys=None, conllFormat=None, splitting=True, unescapeFormats="AUTO", tokenMerging=True, extMap=None, sdFailedFormat="empty", origIdType=None, posTags=None):
     corpusTree, corpusRoot = self.getCorpus(input)
     if not os.path.exists(parseDir):
         raise Exception("Cannot find parse input '" + str(parseDir) + "'")
     if not os.path.isdir(parseDir):
         raise Exception("Parse input '" + str(parseDir) + "' is not a directory")
     if extensions == None:
         extensions = self.allExt
     elif isinstance(extensions, basestring):
         extensions = extensions.split(",")
     extensions = [x for x in extensions if x in self.allExt]
     unescapeFormats = self.getUnescapeFormats(unescapeFormats)
     if docMatchKeys == None:
         docMatchKeys = ["origId", "pmid", "id"]
     elif isinstance(docMatchKeys, basestring):
         docMatchKeys = docMatchKeys.split(",")
     print >> sys.stderr, "Inserting parses from file types:", extensions
     counts = defaultdict(int)
     files = self.getParseFiles(parseDir, extensions, subDirs, counts, extMap=extMap, origIdType=origIdType)
     typeCounts = {x:defaultdict(int) for x in extensions}
     # Make document elements if needed
     documents = [x for x in corpusRoot.findall("document")]
     if len(documents) == 0:
         typeCounts["document-generation"] = defaultdict(int)
         documents = self.prepareDocuments(corpusRoot, files)
     counter = ProgressCounter(len(files), "Parse Insertion")
     # Insert parses and make sentence elements if needed
     typeCounts["sentence-splitting"] = defaultdict(int)
     print >> sys.stderr, "Inserting parses for", len(files), "out of total", len(documents), "documents"
     for document in documents:
         counts["document"] += 1
         matchFound = False
         for docMatchValue in [document.get(x) for x in docMatchKeys if document.get(x) != None]:
             if docMatchValue in files:
                 if matchFound:
                     raise Exception("Multiple matching parses for document " + str(document.attrib) + " using keys " + str(docMatchKeys))
                 matchFound = True
                 counter.update(1, "Inserting parses for (" + document.get("id") + "/" + str(docMatchValue) + "): ")
                 counts["document-match"] += 1
                 for ext in extensions:
                     if ext not in files[docMatchValue]:
                         continue
                     counts[ext + "-match"] += 1
                     sentences = [x for x in self.getSentences(document, skipParsed=skipParsed)]
                     self.insertParse(document, sentences, ext, files[docMatchValue][ext], parseName, splitting, typeCounts, conllFormat, unescapeFormats=unescapeFormats, tokenMerging=tokenMerging, sdFailedFormat=sdFailedFormat, posTags=posTags)
         if not matchFound:
             counts["document-no-match"] += 1
     if len(typeCounts["sentence-splitting"]) > 0:
         print >> sys.stderr, "Sentence Splitting Counts", dict(typeCounts["sentence-splitting"])
     print >> sys.stderr, "Counts", dict(counts)
     for ext in extensions:
         if len(typeCounts[ext]) > 0:
             print >> sys.stderr, "Counts for type '" + ext + "':", dict(typeCounts[ext])
     # Write the output XML file
     if output != None:
         print >> sys.stderr, "Writing output to", output
         ETUtils.write(corpusRoot, output)
     return corpusTree
Example #25
def test(extractPath, downloadPath, inCorpusPath, outCorpusPath):
    download(extractPath, downloadPath)
    specAnn = readResources(extractPath)
    inCorpus = ETUtils.ETFromObj(inCorpusPath)
    insertElements(inCorpus.getroot(), specAnn)
    ETUtils.write(inCorpus.getroot(), outCorpusPath)


#process("/tmp/extract", "/tmp/download", "/home/jari/Dropbox/data/BioNLP16/corpora/BB_EVENT_16-devel.xml", "/tmp/ner.xml")
Example #26
def processCorpus(input, outDir, stem=None, tail=".xml", mergedSets=[], saveCombined=False, verbose=False):
    newCorpora = {}
    print >> sys.stderr, "Loading corpus file", input
    corpusRoot = ETUtils.ETFromObj(input).getroot()
    
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {}
    for document in documents:
        counter.update()
        docSet = document.get("set")
        if docSet == None:
            if verbose: print >> sys.stderr, "Warning, no set defined for document", document.get("id")
            if not countsByType.has_key("No set"):
                countsByType["No set"] = 0
            countsByType["No set"] += 1
            continue
        elif not newCorpora.has_key(docSet):
            newCorpora[docSet] = ET.Element("corpus")
            for k, v in corpusRoot.attrib.iteritems():
                newCorpora[docSet].set(k, v)
            countsByType[docSet] = 0
        newCorpora[docSet].append(document)
        countsByType[docSet] += 1
        
    # Make merged sets
    for mergedSet in mergedSets:
        tag = "-and-".join(sorted(mergedSet))
        if not newCorpora.has_key(tag):
            newCorpora[tag] = ET.Element("corpus")
            for k, v in corpusRoot.attrib.iteritems():
                newCorpora[tag].set(k, v)
            countsByType[tag] = 0    
        for componentSet in mergedSet:
            for element in newCorpora[componentSet].findall("document"):
                newCorpora[tag].append(element)
                countsByType[tag] += 1
        
    print >> sys.stderr, "Documents per set"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + str(k) + ":", countsByType[k]
    
    if stem == None:
        outDir, stem = os.path.dirname(outDir), os.path.basename(outDir)
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    
    print >> sys.stderr, "Writing output files to directory", outDir
    if saveCombined:
        print >> sys.stderr, "Saving combined input to", stem + tail
        ETUtils.write(corpusRoot, stem + tail)
    else:
        print >> sys.stderr, "Combined input not saved"
    for docSet in sorted(newCorpora.keys()):
        outFilename = os.path.join(outDir, stem + "-" + docSet + tail)
        print >> sys.stderr, "Writing set", docSet, "to", outFilename
        ETUtils.write(newCorpora[docSet], outFilename)
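A minimal usage sketch with placeholder paths: one file per document "set" attribute is written under outDir as stem + "-" + set + tail, and each entry of mergedSets additionally produces a combined file for those sets.

# Hypothetical call: write GE11-train.xml, GE11-devel.xml, etc. into /tmp/GE11-sets,
# plus one merged file containing both the train and devel documents.
processCorpus("GE11.xml", "/tmp/GE11-sets", stem="GE11", tail=".xml",
              mergedSets=[["train", "devel"]], saveCombined=False)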
Example #27
def addMTMX(input, mtmxDir, output=None):
    from collections import defaultdict
    # read interaction XML
    print "Reading interaction XML"
    counts = defaultdict(int)
    xml = ETUtils.ETFromObj(input).getroot()
    docById = {}
    for document in xml.getiterator("document"):
        docId = document.get("origId")
        assert docId not in docById
        docById[docId] = document
        counts["document"] += 1
    for entity in xml.getiterator("entity"):
        counts["entity"] += 1
    
    # read MTMX files
    print "Processing MTMX"
    for filename in sorted(os.listdir(mtmxDir)):
        if filename.endswith(".xml"):
            print >> sys.stderr, filename,
            fileId = filename.split("_")[0]
            if fileId not in docById:
                print >> sys.stderr, "skipped"
                continue
            else:
                print >> sys.stderr, "processing"
            doc = docById[fileId]
            entityByOrigId = {}
            for entity in doc.getiterator("entity"):
                assert entity.get("origId") not in entityByOrigId, entity.get("origId")
                entityByOrigId[entity.get("origId")] = entity
            mtmx = ETUtils.ETFromObj(os.path.join(mtmxDir, filename)).getroot()
            for phrase in mtmx.getiterator("PHRASE"):
                if phrase.get("ID") in entityByOrigId:
                    entity = entityByOrigId[phrase.get("ID")]
                    mapCount = 0
                    for map in phrase.getiterator("MAP"):
                        if (map.get("NAME").lower() == entity.get("text").lower()) or (map.get("NAME_SHORT").lower() == entity.get("text").lower()):
                            if entity.get("mtmxProb") != None:
                                if int(entity.get("mtmxProb")) > int(map.get("PROB")):
                                    break
                                else:
                                    counts["mapped-multi"] += 1
                                    counts["mapped-multi-"+str(mapCount)] += 1
                                    #print filename, phrase.get("ID")
                            else:
                                counts["mapped-at-least-once"] += 1
                            entity.set("mtmxProb", str(map.get("PROB")))
                            entity.set("mtmxCui", str(map.get("CUI")))
                            entity.set("mtmxName", str(map.get("NAME")))
                            entity.set("mtmxNameShort", str(map.get("NAME_SHORT")))
                            entity.set("mtmxSemTypes", str(map.get("SEMTYPES")))
                            counts["mappings"] += 1
                            mapCount += 1
    print >> sys.stderr, counts
    if output != None:
        ETUtils.write(xml, output)
Example #28
def convertDDI13(outDir, downloadDir=None, datasets=["DDI13_TRAIN", "DDI13_TEST_TASK_9.1", "DDI13_TEST_TASK_9.2"], redownload=False, insertParses=True, parse=False, makeIntermediateFiles=True, debug=False):
    cwd = os.getcwd()
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI13-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'13 corpus", "======================="
    
    tempdir = tempfile.mkdtemp()
    downloaded = downloadFiles(downloadDir, tempdir, redownload)
    
    for dataset in datasets:       
        corpusTree = getCorpusXML()
        xml = corpusTree.getroot()
        print >> sys.stderr, "Merging input XMLs"
        assert downloaded[dataset] != None
        combineXML(xml, "train", downloaded[dataset], subDirs=["DrugBank", "MedLine", "NER"])
        print >> sys.stderr, "Processing elements"
        processElements(xml)
        
        if dataset == "DDI13_TRAIN":
            print >> sys.stderr, "Dividing training set into folds"
            divideSets(xml, "train", 10)
        else:
            for doc in xml.getiterator("document"):
                doc.set("set", "test")

        if parse:
            print >> sys.stderr, "Parsing"
            parseXML(corpusTree, os.path.join(tempdir, "parsing"), debug)
        elif insertParses:
            assert parse == False
            print >> sys.stderr, "Inserting McCC parses"
            Tools.BLLIPParser.insertParses(corpusTree, downloaded[dataset + "_TEES_PARSES"], None, extraAttributes={"source":"TEES"})
            print >> sys.stderr, "Inserting Stanford conversions"
            Tools.StanfordParser.insertParses(corpusTree, downloaded[dataset + "_TEES_PARSES"], None, extraAttributes={"stanfordSource":"TEES"})
        # Check what was produced by the conversion
        print >> sys.stderr, "---------------", "Corpus Structure Analysis", "---------------"
        analyzer = StructureAnalyzer()
        analyzer.analyze([xml])
        print >> sys.stderr, analyzer.toString()
        if "9.1" in dataset:
            outFileName = os.path.join(outDir, "DDI13-test-task9.1.xml")
        elif "9.2" in dataset:
            outFileName = os.path.join(outDir, "DDI13-test-task9.2.xml")
        else:
            outFileName = os.path.join(outDir, "DDI13-train.xml")
        print >> sys.stderr, "Writing output to", outFileName
        ETUtils.write(xml, outFileName)
    
    Stream.closeLog(logFileName)
    if not debug and tempdir != None:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)
Example #29
def findHeads(input, parse, tokenization=None, output=None, removeExisting=True, iterate=False):
    if iterate:
        from Utils.ProgressCounter import ProgressCounter
        import InteractionXML.SentenceElements as SentenceElements
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        print >> sys.stderr, "Removing existing head offsets"
        removeCount = 0
        counter = ProgressCounter(None, "Find heads")
        counter.showMilliseconds = True
        for sentences in SentenceElements.getCorpusIterator(input, output, parse, tokenization):
            for sentence in sentences:
                if removeExisting:
                    for e in sentence.sentence.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
                graph = SentenceGraph.SentenceGraph(sentence.sentence, sentence.tokens, sentence.dependencies)
                graph.mapInteractions(sentence.entities, sentence.interactions)
                # Make sure every parse gets head scores
                #if graph.tokenHeadScores == None:
                #    graph.getTokenHeadScores()
            counter.update(len(sentences), "Finding heads ("+sentences[-1].sentence.get("id")+"): ")                
        print >> sys.stderr, "Removed head offsets from", removeCount, "entities"    
    else:
        xml = ETUtils.ETFromObj(input)
        if removeExisting:
            print >> sys.stderr, "Removing existing head offsets"
            removeCount = 0
            xml = ETUtils.ETFromObj(input)
            for d in xml.getroot().findall("document"):
                for s in d.findall("sentence"):
                    for e in s.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
            print >> sys.stderr, "Removed head offsets from", removeCount, "entities"
        
        # SentenceGraph automatically calculates head offsets and adds them to entities if they are missing
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        corpusElements = SentenceGraph.loadCorpus(xml, parse, tokenization)
        
        # Make sure every parse gets head scores
        for sentence in corpusElements.sentences:
            if sentence.sentenceGraph == None:
                continue
            if sentence.sentenceGraph.tokenHeadScores == None:
                sentence.sentenceGraph.getTokenHeadScores()
        
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusElements.rootElement, output)
        return xml
Example #30
def validateCorpus(input, output, strict=True):
    print >> sys.stderr, "Validating XML"
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()
    
    counts = validate(corpusRoot, strict)
    print >> sys.stderr, "Corpus validated:", dict(counts)
    
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Example #31
def getCorpusIterator(input,
                      output,
                      parse,
                      tokenization=None,
                      removeNameInfo=False,
                      removeIntersentenceInteractions=True):
    import Utils.ElementTreeUtils as ETUtils
    from Utils.InteractionXML.SentenceElements import SentenceElements
    #import xml.etree.cElementTree as ElementTree

    if output != None:
        etWriter = ETUtils.ETWriter(output)
    for eTuple in ETUtils.ETIteratorFromObj(input, ("start", "end")):
        element = eTuple[1]
        if eTuple[0] in ["end", "memory"] and element.tag == "document":
            sentences = []
            for sentenceElement in element.findall("sentence"):
                #print ElementTree.tostring(sentenceElement)
                sentence = SentenceElements(sentenceElement, parse, tokenization,
                                            removeIntersentenceInteractions=removeIntersentenceInteractions)
                if len(sentence.tokens) == 0:  # or len(sentence.dependencies) == 0:
                    sentence.sentenceGraph = None
                else:
                    # Construct the basic SentenceGraph (only syntactic information)
                    graph = SentenceGraph(sentence.sentence, sentence.tokens,
                                          sentence.dependencies)
                    # Add semantic information, i.e. the interactions
                    graph.mapInteractions(sentence.entities,
                                          sentence.interactions)
                    graph.interSentenceInteractions = sentence.interSentenceInteractions
                    #duplicateInteractionEdgesRemoved += graph.duplicateInteractionEdgesRemoved
                    sentence.sentenceGraph = graph
                    graph.parseElement = sentence.parseElement
                    graph.documentElement = element
                sentences.append(sentence)
            yield sentences
            if output != None:
                etWriter.write(element)
        elif element.tag == "corpus" and output != None:
            if eTuple[0] == "start":
                etWriter.begin(element)
            else:
                etWriter.end(element)
        if eTuple[0] == "end" and element.tag in ["document", "corpus"]:
            element.clear()
    if output != None:
        etWriter.close()
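A minimal usage sketch with placeholder paths and parse name: the iterator yields one document's sentences at a time, each with a prebuilt sentenceGraph (or None when the sentence has no tokens), and streams the documents back out when an output target is given.

# Hypothetical iteration: process the corpus document by document without
# loading it fully into memory.
for docSentences in getCorpusIterator("corpus.xml.gz", "corpus-out.xml.gz",
                                      parse="McCC", tokenization=None):
    for sentence in docSentences:
        if sentence.sentenceGraph == None:
            continue
        graph = sentence.sentenceGraph  # syntactic graph with mapped interactions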
Example #32
def negateEvents(input, output=None, verbose=False):
    if not (ET.iselement(input) and input.tag == "sentence"):
        print >> sys.stderr, "Loading corpus file", input
        corpusTree = ETUtils.ETFromObj(input)
        corpusRoot = corpusTree.getroot()

    if not (ET.iselement(input) and input.tag == "sentence"):
        sentences = corpusRoot.getiterator("sentence")
    else:
        sentences = [input]
    counts = defaultdict(int)
    for sentence in sentences:
        for entity in sentence.findall("entity"):
            counts["all-entities"] += 1
            eType = entity.get("type")
            if not isNegatableEPITrigger(eType):
                counts["out-of-scope"] += 1
                continue
            eBaseType = getEPIBaseType(eType)
            eText = entity.get("text").lower()
            eNewType = determineNewType(eType, eText)

            # Insert changed charOffset
            counts["entities"] += 1
            if verbose:
                print "Entity", entity.get("id"), [entity.get("text")], [
                    eType, eBaseType, eNewType
                ],
            if eNewType != eBaseType:
                counts["negated"] += 1
                if verbose: print "NEGATED",
            if eNewType == eType:
                counts["correct"] += 1
                if verbose: print "CORRECT"
            else:
                counts["incorrect"] += 1
                if eNewType == eBaseType:
                    counts["incorrect-pos"] += 1
                else:
                    counts["incorrect-neg"] += 1
                if verbose: print "INCORRECT"
            entity.set("type", eNewType)
    if verbose:
        print counts

    if not (ET.iselement(input) and input.tag == "sentence"):
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusRoot, output)
        return corpusTree
Example #33
def makeConfigXML(workdir, bannerDir, oldVersion=True):
    conf = ET.Element("banner-configuration")
    banner = ET.SubElement(conf, "banner")
    eval = ET.SubElement(banner, "eval")
    ET.SubElement(eval, "datasetName").text = "banner.eval.dataset.BC2GMDataset"
    # Dataset
    dataset = ET.SubElement(eval, "dataset")
    ET.SubElement(dataset, "sentenceFilename").text = workdir + "/input.txt"
    ET.SubElement(dataset, "mentionTestFilename").text = workdir + "/empty.eval"
    ET.SubElement(dataset, "mentionAlternateFilename").text = workdir + "/empty.eval"
    codecs.open(os.path.join(workdir, "empty.eval"), "wt", "utf-8").close()
    # More eval level stuff
    ET.SubElement(eval, "idInputFilename").text = workdir + "/ids.txt"
    ET.SubElement(eval, "rawInputFilename").text = workdir + "/raw.txt"
    ET.SubElement(eval, "trainingInputFilename").text = workdir + "/training.txt"
    ET.SubElement(eval, "outputFilename").text = workdir + "/output.txt"
    codecs.open(os.path.join(workdir, "output.txt"), "wt", "utf-8").close()
    ET.SubElement(eval, "inContextAnalysisFilename").text = workdir + "/contextAnalysis.html"
    ET.SubElement(eval, "mentionFilename").text = workdir + "/mention.txt"
    ET.SubElement(eval, "modelFilename").text = bannerDir + "/output/model_BC2GM.bin"
    ET.SubElement(eval, "lemmatiserDataDirectory").text = bannerDir + "/nlpdata/lemmatiser"
    ET.SubElement(eval, "posTaggerDataDirectory").text = bannerDir + "/nlpdata/tagger"
    ET.SubElement(eval, "posTagger").text = "dragon.nlp.tool.HeppleTagger"
    ET.SubElement(eval, "tokenizer").text = "banner.tokenization.SimpleTokenizer"
    ET.SubElement(eval, "useParenthesisPostProcessing").text = "true"
    ET.SubElement(eval, "useLocalAbbreviationPostProcessing").text = "true"
    ET.SubElement(eval, "useNumericNormalization").text = "true"
    ET.SubElement(eval, "tagFormat").text = "IOB"
    ET.SubElement(eval, "crfOrder").text = "2"
    if not oldVersion:
        ET.SubElement(eval, "mentionTypes").text = "Required"
        ET.SubElement(eval, "sameTypeOverlapOption").text = "Exception"
        ET.SubElement(eval, "differentTypeOverlapOption").text = "Exception"
    ET.SubElement(eval, "dictionaryTagger").text = "banner.tagging.dictionary.DictionaryTagger"
    # End eval element
    tagging = ET.SubElement(banner, "tagging") 
    dictionary = ET.SubElement(tagging, "dictionary")
    dictionaryTagger = ET.SubElement(dictionary, "DictionaryTagger")
    ET.SubElement(dictionaryTagger, "filterContainedMentions").text = "true"
    ET.SubElement(dictionaryTagger, "normalizeMixedCase").text = "false"
    ET.SubElement(dictionaryTagger, "normalizeDigits").text = "false"
    ET.SubElement(dictionaryTagger, "canonize").text = "false"
    ET.SubElement(dictionaryTagger, "generate2PartVariations").text = "true"
    ET.SubElement(dictionaryTagger, "dropEndParentheticals").text = "false"
    ET.SubElement(dictionaryTagger, "dictionaryFile").text = bannerDir + "/dict/single.txt"
    ET.SubElement(dictionaryTagger, "dictionaryType").text = "GENE"
    # Write the configuration to a file in the working directory
    filename = os.path.join(workdir, "banner_config.xml")
    ETUtils.write(conf, filename)
    return filename
Example #34
def getSubset(input,
              output=None,
              fraction=1.0,
              seed=0,
              ids=None,
              attributes=None,
              invert=False,
              targetElementTag="document"):
    distribution = None
    if ids == None and attributes == None:
        print >> sys.stderr, "No id-file, using pseudorandom distribution"
        distribution = getSample(
            getElementCounts(input, [targetElementTag])[targetElementTag],
            fraction, seed)
    elif attributes != None:
        print >> sys.stderr, "Selecting subset with attributes:", attributes
        for key in attributes:
            assert type(attributes[key]) in (types.ListType,
                                             types.TupleType), attributes

    counts = defaultdict(int)

    outWriter = None
    if output != None:
        outWriter = ETUtils.ETWriter(output)
    targetElementCount = 0
    skip = False
    for event in ETUtils.ETIteratorFromObj(input, ("start", "end")):
        if event[0] == "start":
            if event[1].tag == targetElementTag:
                skip = select(targetElementCount, distribution, event[1], ids,
                              attributes, invert)
                targetElementCount += 1
            if not skip:
                if outWriter != None:
                    outWriter.begin(event[1])
                counts[event[1].tag + ":kept"] += 1
            else:
                counts[event[1].tag + ":removed"] += 1
        elif event[0] == "end":
            if not skip:
                if outWriter != None:
                    outWriter.end(event[1])
            if event[1].tag == targetElementTag:
                skip = False
    if output != None:
        outWriter.close()
        ETUtils.encodeNewlines(output)

    print >> sys.stderr, "Subset for " + str(input) + ": " + str(counts)
Example #35
 def classify(self,
              data,
              model,
              output,
              parse=None,
              task=None,
              goldData=None,
              workDir=None,
              fromStep=None,
              omitSteps=None,
              validate=False):
     model = self.openModel(model, "r")
     self.enterState(self.STATE_CLASSIFY)
     self.setWorkDir(workDir)
     if workDir == None:
         self.setTempWorkDir()
     model = self.openModel(model, "r")
     if parse == None: parse = self.getStr(self.tag + "parse", model)
     workOutputTag = os.path.join(self.workDir,
                                  os.path.basename(output) + "-")
     xml = self.classifyToXML(
         data, model, None, workOutputTag,
         model.get(self.tag + "classifier-model", defaultIfNotExist=None),
         goldData, parse,
         float(model.getStr("recallAdjustParameter",
                            defaultIfNotExist=1.0)))
     if (validate):
         self.structureAnalyzer.load(model)
         self.structureAnalyzer.validate(xml)
         ETUtils.write(xml, output + "-pred.xml.gz")
     else:
         shutil.copy2(workOutputTag + self.tag + "pred.xml.gz",
                      output + "-pred.xml.gz")
     EvaluateInteractionXML.run(self.evaluator, xml, data, parse)
     stParams = self.getBioNLPSharedTaskParams(self.bioNLPSTParams, model)
     if stParams["convert"]:  #self.useBioNLPSTFormat:
         extension = ".zip" if (stParams["convert"] == "zip") else ".tar.gz"
         Utils.STFormat.ConvertXML.toSTFormat(
             xml,
             output + "-events" + extension,
             outputTag=stParams["a2Tag"],
             writeExtra=(stParams["scores"] == True))
         if stParams["evaluate"]:  #self.stEvaluator != None:
             if task == None:
                 task = self.getStr(self.tag + "task", model)
             self.stEvaluator.evaluate(output + "-events" + extension, task)
     self.deleteTempWorkDir()
     self.exitState()
Example #36
def negateEvents(input, output=None, verbose=False):
    if not (ET.iselement(input) and input.tag == "sentence"):
        print >> sys.stderr, "Loading corpus file", input
        corpusTree = ETUtils.ETFromObj(input)
        corpusRoot = corpusTree.getroot()
    
    if not (ET.iselement(input) and input.tag == "sentence"):
        sentences = corpusRoot.getiterator("sentence")
    else:
        sentences = [input]
    counts = defaultdict(int)
    for sentence in sentences:
        for entity in sentence.findall("entity"):
            counts["all-entities"] += 1
            eType = entity.get("type")
            if not isNegatableEPITrigger(eType):
                counts["out-of-scope"] += 1
                continue
            eBaseType = getEPIBaseType(eType)
            eText = entity.get("text").lower()
            eNewType = determineNewType(eType, eText)
        
            # Insert changed charOffset
            counts["entities"] += 1
            if verbose:
                print "Entity", entity.get("id"), [entity.get("text")], [eType, eBaseType, eNewType],
            if eNewType != eBaseType:
                counts["negated"] += 1
                if verbose: print "NEGATED",
            if eNewType == eType:
                counts["correct"] += 1
                if verbose: print "CORRECT"
            else:
                counts["incorrect"] += 1
                if eNewType == eBaseType:
                    counts["incorrect-pos"] += 1
                else:
                    counts["incorrect-neg"] += 1
                if verbose: print "INCORRECT"
            entity.set("type", eNewType)
    if verbose:
        print counts
    
    if not (ET.iselement(input) and input.tag == "sentence"):
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusRoot, output)
        return corpusTree                    
Exemple #37
0
def loadDrugBank(filename, preTag="{http://drugbank.ca}", verbose=False):
    data = defaultdict(lambda: defaultdict(list))
    print "Loading DrugBank XML from", filename
    xml = ETUtils.ETFromObj(filename)
    print "Processing DrugBank XML"
    root = xml.getroot()
    assert root.tag == preTag + "drugs", root.tag
    for drug in root.findall(preTag + "drug"):
        id = drug.find(preTag + "drugbank-id").text
        name = drug.find(preTag + "name").text
        if verbose: print id, name
        assert id not in data
        data[id]["name"] = name
        data[id]["id"] = id
        # TODO: Enzymes & targets
        # TODO: hydrophobicity
        getNestedItems(drug, "synonym", data[id], preTag)
        getNestedItems(drug, "brand", data[id], preTag)
        getNestedItems(drug, "group", data[id], preTag)
        getNestedItems(drug, "category", data[id], preTag, "categories")
        interactions = drug.find(preTag + "drug-interactions").findall(
            preTag + "drug-interaction")
        for interaction in interactions:
            data[id]["interaction"].append([
                interaction.find(preTag + "drug").text,
                interaction.find(preTag + "name").text,
                interaction.find(preTag + "description").text,
            ])
    return data
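A minimal usage sketch for loadDrugBank, using a hypothetical local path to a DrugBank XML dump; it only illustrates the nested dictionary structure built above.

# Usage sketch: "drugbank.xml" is a hypothetical path.
drugs = loadDrugBank("drugbank.xml")
for drugId in sorted(drugs.keys())[:5]:
    entry = drugs[drugId]
    print entry["id"], entry["name"], "interactions:", len(entry["interaction"])
    for partnerId, partnerName, description in entry["interaction"][:2]:
        print "   ", partnerId, partnerName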
Exemple #38
0
def loadEventXML(path, verbose=False):
    xml = ETUtils.ETFromObj(path)
    sentDict = {}
    for sentence in xml.getiterator("sentence"):
        sentenceText = getText(sentence).strip()
        if not sentDict.has_key(sentenceText):
            sentDict[sentenceText] = []

    for event in xml.getiterator("event"):
        sentenceText = getText(event).strip()
        if not sentDict.has_key(sentenceText):
            sentDict[sentenceText] = []
        events = sentDict[sentenceText]
        
        clue = event.find("clue")
        clueTuple = getClue(clue)
        eventType = event.find("type").get("class")
        if eventType == "Protein_amino_acid_phosphorylation":
            eventType = "Phosphorylation"
        if type(clueTuple) == types.StringType:
            if verbose: print "Event", eventType, "clue with no clueType:", ETUtils.toStr(clue)
        else:
            assert sentenceText[clueTuple[1]:clueTuple[2]+1] == clueTuple[0], (sentenceText, sentenceText[clueTuple[1]:clueTuple[2]+1], clueTuple)
            event = (clueTuple[1], clueTuple[2], eventType, clueTuple[0])
            if event not in events:
                events.append(event)
    return sentDict
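A small usage sketch, with a hypothetical input path, showing the structure loadEventXML returns: sentence text mapped to (clueBegin, clueEnd, eventType, clueText) tuples.

# Usage sketch: "events.xml" is a hypothetical path.
sentDict = loadEventXML("events.xml")
for sentenceText, events in sentDict.iteritems():
    for begin, end, eventType, clueText in events:
        print eventType, [clueText], "at", (begin, end)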
Exemple #39
0
def addSets(corpus, xml, evalStandardDownloadPath, evalStandardPackageDir="ppi-eval-standard"):
    #evalStandardExtractPath = os.path.join(tempfile.gettempdir(), "PPIEvalStandard")
    evalStandardPath = os.path.join(tempfile.gettempdir(), evalStandardPackageDir)
    if not os.path.exists(evalStandardPath):
        print >> sys.stderr, "Extracting evaluation standard from", evalStandardDownloadPath
        Utils.Download.extractPackage(evalStandardDownloadPath, tempfile.gettempdir())
    print >> sys.stderr, "Using extracted evaluation standard at", evalStandardPath
    assert os.path.exists(evalStandardPath)
    docSets = {}
    for dataSet in "train", "test":
        dataSetXMLPath = os.path.join(evalStandardPath, dataSet, corpus + "-" + dataSet + ".xml")
        print >> sys.stderr, "Loading evaluation standard XML from", dataSetXMLPath
        dataSetXML = ETUtils.ETFromObj(dataSetXMLPath)
        for document in dataSetXML.getroot().findall("document"):
            assert document.get("id") not in docSets
            docSets[document.get("id")] = {"set":dataSet, "element":document}
    print >> sys.stderr, "Assigning sets"
    counts = defaultdict(int)
    for document in xml.findall("document"):
        counts["documents"] += 1
        docId = document.get("id")
        if docId in docSets:
            counts["documents-in-eval-standard"] += 1
            document.set("set", docSets[docId]["set"])
            if document.get("origId") != None and docSets[docId]["element"].get("origId") != None:
                assert document.get("origId") == docSets[docId]["element"].get("origId"), docId
                counts["documents-match-by-origId"] += 1
            counts["eval-standard-set:" + docSets[docId]["set"]] += 1
        else:
            print >> sys.stderr, "Warning, removing document", document.get("id"), "which is not included in the PPI evaluation standard"
            counts["missing-from-eval-standard"] += 1
            xml.remove(document)
    print >> sys.stderr, "PPI Evaluation Standard sets for corpus", corpus, "documents:", dict(counts)
    return xml
Exemple #40
0
def convertCorpus(corpus, outDir=None, downloadDir=None, redownload=False, removeAnalyses=True, develFraction=0.3, logPath=None):
    assert corpus in PPI_CORPORA
    if logPath == "AUTO":
        logPath = outDir + "/conversion/" + corpus + "-conversion-log.txt" if outDir != None else None
    if logPath:
        Stream.openLog(logPath)
    print >> sys.stderr, "==========", "Converting PPI corpus", corpus, "=========="
    downloaded = downloadCorpus(corpus, outDir, downloadDir, redownload)
    print >> sys.stderr, "---------------", "Updating Interaction XML format", "---------------"
    print >> sys.stderr, "Loading", downloaded[corpus + "_LEARNING_FORMAT"]
    xml = ETUtils.ETFromObj(downloaded[corpus + "_LEARNING_FORMAT"])
    root = xml.getroot()
    updateXML(root, removeAnalyses)
    print >> sys.stderr, "---------------", "Adding sets from the PPI evaluation standard", "---------------"
    addSets(corpus, root, downloaded["PPI_EVALUATION_STANDARD"])
    if develFraction > 0.0:
        print >> sys.stderr, "---------------", "Generating devel set", "---------------"
        MakeSets.processCorpus(xml, None, "train", [("devel", develFraction), ("train", 1.0)], 1)
    if outDir != None:
        print >> sys.stderr, "---------------", "Writing corpus", "---------------"
        #if intermediateFiles:
        #print >> sys.stderr, "Writing combined corpus"
        #ETUtils.write(xml, os.path.join(outDir, corpus + ".xml"))
        print >> sys.stderr, "Dividing into sets"
        Utils.InteractionXML.DivideSets.processCorpus(xml, outDir, corpus, ".xml")
    
    if logPath != None:
        Stream.closeLog(logPath)
    return xml  
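A usage sketch for convertCorpus, assuming "AIMed" is one of the names in PPI_CORPORA and using hypothetical directories; with logPath="AUTO" the conversion log is written under outDir.

# Usage sketch: the corpus name and directories are assumptions.
convertCorpus("AIMed", outDir="/tmp/corpora", downloadDir="/tmp/downloads", logPath="AUTO")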
Exemple #41
0
def mergeCorpora(corpusIds, outputId, inputDir, outDir):
    merged = Catenate.catenateElements(corpusIds, inputDir)
    for dataSet in ("devel", "train"):
        renameElements(merged[dataSet].getroot(), {"Localization":"Lives_In", 
                             "Host":"Habitat", 
                             "HostPart":"Habitat",
                             "Food":"Habitat",
                             "Soil":"Habitat",
                             "Medical":"Habitat",
                             "Water":"Habitat",
                             "Bacterium":"Bacteria"})
        DeleteElements.removeElements(merged[dataSet].getroot(), {"interaction":{"type":"PartOf"}})
        if outDir != None:
            outPath = os.path.join(outDir, outputId + "-" + dataSet + ".xml")
            print "Writing set", dataSet, "to", outPath
            ETUtils.write(merged[dataSet].getroot(), outPath)
Exemple #42
0
def makeDDISubmissionFile(input, output):
    xml = ETUtils.ETFromObj(input)
    outFile = open(output, "wt")
    for sentence in xml.getiterator("sentence"):
        # First determine which pairs interact
        intMap = defaultdict(lambda: defaultdict(lambda: None))
        for interaction in sentence.findall("interaction"):
            # Map in both directions so that edge directionality is ignored. This isn't strictly
            # needed, since MultiEdgeExampleBuilder builds entity pairs in the same order as this
            # function, but it does no harm and makes the lookup work regardless of pair direction.
            if interaction.get("type") != "neg":
                intMap[interaction.get("e1")][interaction.get(
                    "e2")] = interaction
                intMap[interaction.get("e2")][interaction.get(
                    "e1")] = interaction
        # Then write all pairs to the output file
        entities = sentence.findall("entity")
        for i in range(0, len(entities) - 1):
            for j in range(i + 1, len(entities)):
                eIId = entities[i].get("id")
                eJId = entities[j].get("id")
                outFile.write(eIId + "\t" + eJId + "\t")
                if intMap[eIId][eJId] != None:
                    outFile.write("1\n")
                else:
                    outFile.write("0\n")
Exemple #43
0
def loadDocs(inDir, idStart=0):
    print "Loading documents from", inDir
    sentences = {"positive": [], "negative": []}
    docCounts = {}
    docById = {}
    documents = []
    for filename in sorted(os.listdir(inDir)):
        if filename.endswith(".xml"):
            print "Reading", filename,
            xml = ETUtils.ETFromObj(os.path.join(inDir, filename))
            for document in xml.getiterator("document"):
                counts = [0, 0]
                for sentence in document.findall("sentence"):
                    #sentence.set("document.get("origId") + "." + sentence.get("origId"))
                    truePairs = False
                    for pair in sentence.findall("pair"):
                        if pair.get("interaction") == "true":
                            truePairs = True
                            break
                    if truePairs:
                        counts[0] += 1
                        sentences["positive"].append(sentence)
                    else:
                        counts[1] += 1
                        sentences["negative"].append(sentence)
                assert document.get("id") not in docCounts
                docCounts[document.get("id")] = counts
                docById[document.get("id")] = document
                documents.append(document)
                print counts,
                #print ETUtils.toStr(document)
            print
    print "Positive sentences:", len(sentences["positive"])
    print "Negative sentences:", len(sentences["negative"])
    return documents, docById, docCounts
Exemple #44
0
def getEmptyCorpus(xml, deletionRules=None, removeNames=False):
    """
    A convenience function for getting an empty corpus, useful for testing for information leaks
    in the event extraction process.
    """
    if type(xml) in types.StringTypes:
        # XML is read from disk, so it's a new copy and can be safely modified
        xml = ETUtils.ETFromObj(xml)
    else:
        # XML is already an object in memory. To prevent problems with other users of it, a copy
        # is created before deleting elements.
        xml = copy.deepcopy(xml)
    if deletionRules == None:  # use default rules for BioNLP Shared Task
        # We remove all interactions, and all entities that are not named entities. This leaves only
        # the gold standard protein/gene names
        if removeNames:
            deletionRules = {"interaction": {}, "entity": {}}
        else:
            deletionRules = {
                "interaction": {},
                "entity": {
                    "given": (None, "False")
                }
            }
    # Remove elements and return the emptied XML
    return processCorpus(xml, None, deletionRules)
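A brief usage sketch (hypothetical file name): with the default rules the gold protein/gene names survive, while removeNames=True strips the entities as well.

# Usage sketch: "GE11-devel.xml" is a hypothetical corpus file.
emptiedXML = getEmptyCorpus("GE11-devel.xml")                         # keep given entities
fullyEmptiedXML = getEmptyCorpus("GE11-devel.xml", removeNames=True)  # remove everything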
Exemple #45
0
 def parse(self, input, output=None, tokenizationName=None, parseName="McCC", requireEntities=False, skipIds=[], skipParsed=True, timeout=600, makePhraseElements=True, debug=False, pathParser=None, pathBioModel="AUTO", addTimeStamp=True):
     print >> sys.stderr, "BLLIP parser"
     corpusTree, corpusRoot = self.getCorpus(input)
     workdir = tempfile.mkdtemp()
     infileName, numCorpusSentences = self.makeInputFile(workdir, corpusRoot, requireEntities, skipIds, skipParsed, tokenizationName, debug)
     bllipOutput = self.runProcess(infileName, workdir, pathParser, pathBioModel, tokenizationName, timeout)        
     self.insertPennTrees(bllipOutput, corpusRoot, parseName, requireEntities, skipIds, skipParsed)
     if output != None:
         print >> sys.stderr, "Writing output to", output
         ETUtils.write(corpusRoot, output)
     # Remove work directory
     if not debug:
         shutil.rmtree(workdir)
     else:
         print >> sys.stderr, "Parser IO files at", workdir
     return corpusTree
Exemple #46
0
    def analyze(self, inputs, model=None):
        self._init()
        if type(inputs) in types.StringTypes:
            inputs = [inputs]
        for xml in inputs:
            print >> sys.stderr, "Analyzing", xml
            xml = ETUtils.ETFromObj(xml)

            for document in xml.getiterator("document"):
                # Collect elements into dictionaries
                entityById = {}
                for entity in document.getiterator("entity"):
                    entityById[entity.get("id")] = entity
                interactions = []
                interactionsByE1 = defaultdict(list)
                for interaction in document.getiterator("interaction"):
                    interactions.append(interaction)
                    interactionsByE1[interaction.get("e1")].append(interaction)
                siteOfTypes = self.buildSiteOfMap(interactions,
                                                  interactionsByE1, entityById)
                # Add entity elements to analysis
                for entity in document.getiterator("entity"):
                    self.addEntityElement(entity, interactionsByE1)
                # Add interaction elements to analysis
                for interaction in interactions:
                    self.addInteractionElement(interaction, entityById,
                                               siteOfTypes[interaction])
                # Calculate event definition argument limits from event instances
                for event in self.events.values():
                    event.countArguments()

        self._updateSupportingAnalyses()
        if model != None:
            self.save(model)
Exemple #47
0
def processCorpus(input, attrs=["text"]):
    print attrs
    print >> sys.stderr, "Loading corpus file", input
    corpusRoot = ETUtils.ETFromObj(input).getroot()

    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {}
    interactors = {}
    for document in documents:
        entDict = {}
        for entity in document.getiterator("entity"):
            entDict[entity.get("id")] = entity
        for interaction in document.getiterator("interaction"):
            e1 = entDict[interaction.get("e1")]
            e2 = entDict[interaction.get("e2")]
            # form identifier tuples
            e1Tuple = []
            for attr in attrs:
                e1Tuple.append(e1.get(attr))
            e1Tuple = tuple(e1Tuple)
            e2Tuple = []
            for attr in attrs:
                e2Tuple.append(e2.get(attr))
            e2Tuple = tuple(e2Tuple)
            interactors = [e1Tuple, e2Tuple]
            #interactors.sort()
            print interactors
Exemple #48
0
def combineXML(corpusXML, setName, dataDirs, subDirs=["DrugBank", "MedLine"]):
    # Add all documents into one XML
    ids = {}
    if isinstance(dataDirs, basestring):
        dataDirs = [dataDirs]  # allow a single directory to be given as a string
    for dataDir in dataDirs:
        if dataDir.startswith(".") or dataDir.startswith("_"):
            continue
        for subDir in [""] + subDirs:
            inDir = dataDir + "/" + subDir
            if "/." in dataDir or "/_" in dataDir:  # attempt to survive the junk directories
                continue
            if os.path.exists(inDir):
                for filename in sorted(os.listdir(inDir)):
                    if filename.endswith(".xml"):
                        print >> sys.stderr, "Reading", filename
                        xml = ETUtils.ETFromObj(os.path.join(inDir, filename))
                        document = xml.getroot()
                        assert document.tag == "document"
                        assert document.get("id") not in ids, (
                            document.get("id"), os.path.join(inDir, filename),
                            ids[document.get("id")])
                        ids[document.get("id")] = os.path.join(inDir, filename)
                        document.set("source", os.path.join(subDir, filename))
                        if setName != None:
                            document.set("set", setName)
                        corpusXML.append(document)
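A usage sketch with a hypothetical directory layout: per-document XML files from the DrugBank and MedLine subdirectories are appended into a single corpus element.

# Usage sketch: the directory and corpus source name below are assumptions.
import xml.etree.cElementTree as ET  # same ET alias as the surrounding examples
corpusXML = ET.Element("corpus", {"source": "DDI13"})
combineXML(corpusXML, "train", ["/data/DDI13/Train"], subDirs=["DrugBank", "MedLine"])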
Exemple #49
0
 def convert(self, input, dataSetNames=None, corpusName=None, output=None):
     if isinstance(input, basestring) and (os.path.isdir(input)
                                           or input.endswith(".tar.gz")
                                           or input.endswith(".txt")
                                           or "," in input):
         print >> sys.stderr, "Converting ST-format to Interaction XML"
         # Get input file (or files)
         dataSetDirs = input
         documents = []
         if type(dataSetDirs) in types.StringTypes:
             dataSetDirs = dataSetDirs.split(",")
         # Get the list of "train", "devel" etc names for these sets
         if dataSetNames == None:
             dataSetNames = []
         elif type(dataSetNames) in types.StringTypes:
             dataSetNames = dataSetNames.split(",")
         # Convert all input files into one corpus
         for dataSetDir, dataSetName in itertools.izip_longest(
                 dataSetDirs, dataSetNames, fillvalue=None):
             print >> sys.stderr, "Reading", dataSetDir, "set,",
             docs = Utils.STFormat.STTools.loadSet(dataSetDir, dataSetName)
             print >> sys.stderr, len(docs), "documents"
             documents.extend(docs)
         print >> sys.stderr, "Resolving equivalences"
         Utils.STFormat.Equiv.process(documents)
         if corpusName == None:
             corpusName = "TEES"
         self.xml = Utils.STFormat.ConvertXML.toInteractionXML(
             documents, corpusName, output)
     else:
         print >> sys.stderr, "Processing source as interaction XML"
         self.xml = ETUtils.ETFromObj(input)
     return self.xml
Exemple #50
0
def process(input, output=None, preprocess=True, debug=False):
    """
    Run MetaMap.
    """    
    counter = ProgressCounter(id="MetaMap")
    
    # Create working directory
    workdir = tempfile.mkdtemp()
    
    outWriter = None
    if output != None:
        outWriter = ETUtils.ETWriter(output)
    
    # Loop iteratively over elements
    skip = False
    for event, element in ETUtils.ETIteratorFromObj(input, ("start", "end")):
        if event == "start": # element start message, element may not be fully read yet
            if element.tag == "sentence":
                sentence = element
                counter.update(1, "Processing MetaMap ("+sentence.get("id")+"): ")
                # Run metamap for the sentence element
            elif element.tag == "metamap": # skip the metamap element to remove the original one
                skip = True
            if not skip and output != None:
                outWriter.begin(element)
        
        elif event == "end": # element is fully read in memory
            if not skip and output != None:
                outWriter.end(element)

            if element.tag == "metamap":
                skip = False # write elements again after this one
                if preprocess:
                    element = convert(element, sentence)
                outWriter.write(element) # insert the new metamap element into the output stream
        
    if output != None:
        print >> sys.stderr, "Writing output to", output
        outWriter.close()
        ETUtils.encodeNewlines(output)

    if debug:
        print >> sys.stderr, "Work directory preserved for debugging at", workdir
    else:
        shutil.rmtree(workdir)

    return output
Exemple #51
0
def processCorpus(inputFilename, outputFilename, rules, reverse=False):
    print >> sys.stderr, "Deleting elements, rules =", rules
    print >> sys.stderr, "Loading corpus file", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()
    
    countsByType = defaultdict(int)
    removeElements(corpusRoot, rules, reverse, countsByType)
    
    print >> sys.stderr, "Deleted elements"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ":", countsByType[k]
    
    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
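A usage sketch with hypothetical paths, reusing the rule format shown in getEmptyCorpus above: delete every interaction and every entity whose "given" attribute is missing or "False".

# Usage sketch: file names are hypothetical.
processCorpus("corpus.xml", "corpus-stripped.xml",
              {"interaction": {}, "entity": {"given": (None, "False")}})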
Exemple #52
0
 def parse(self, parserName, input, output=None, debug=False, reparse=False, syntaxNetDir=None, modelDir=None):
     # Run the parser process
     if syntaxNetDir == None:
         syntaxNetDir = Settings.SYNTAXNET_DIR
     corpusTree, corpusRoot = self.getCorpus(input)
     workdir = tempfile.mkdtemp()
     inPath = self.makeInputFile(corpusRoot, workdir)
     outPath = ProcessUtils.runSentenceProcess(self.run, syntaxNetDir, inPath, workdir, True, "SyntaxNetParser", "Parsing", processArgs={"modelDir":modelDir})
     self.insertCoNLLParses(outPath, corpusRoot, parserName, unescaping=True, conllFormat="conllx")
     # Remove work directory
     if not debug:
         shutil.rmtree(workdir)
     else:
         print >> sys.stderr, "Parser IO files at", workdir
     # Write the output XML file
     if output != None:
         print >> sys.stderr, "Writing output to", output
         ETUtils.write(corpusRoot, output)
     return corpusTree
Exemple #53
0
def mixSets(input, output, docOrigIds, sourceSet, targetSet):
    print >> sys.stderr, "Mixing Sets", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()
    
    if docOrigIds != None:
        for document in corpusRoot.getiterator("document"):
            docId = document.get("pmid")
            if docId == None:
                docId = document.get("origId")
            if docId in docOrigIds:
                assert document.get("set") == sourceSet
                document.set("set", targetSet)
                docOrigIds.remove(docId)
        assert len(docOrigIds) == 0, docOrigIds
    
    sentenceIds = None
    if sentenceIds != None:
        for document in corpusRoot.getiterator("document"):
            removed = []
            for sentence in document.findall("sentence"):
                assert document.get("set") == sourceSet
                sentenceId = sentence.get("id")
                if sentenceId in sentenceIds:
                    document.remove(sentence)
                    removed.append(sentence)
                    sentenceIds.remove(sentenceId)
            if len(removed) > 0:
                newDoc = ET.Element("document")
                for attr in document.attrib:
                    newDoc.set(attr, document.get(attr))
                newDoc.set("id", None)
                newDoc.set("set", targetSet)
                for sentence in removed:
                    newDoc.append(sentence)
                corpusRoot.append(newDoc)
        assert len(sentenceIds) == 0, sentenceIds
    
        RecalculateIds.recalculateIds(corpusTree, onlyWithinSentence=False)
             
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
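A usage sketch with hypothetical paths and document IDs: the listed documents are looked up by their "pmid" (or "origId") attribute and moved from the source set to the target set.

# Usage sketch: the paths and IDs below are hypothetical.
mixSets("GE11.xml", "GE11-mixed.xml", set(["10075739", "10080948"]), "train", "devel")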
Exemple #54
0
def convert(inPath, outDir, corpusId, directed, negatives, preprocess, preprocessorParameters=None, debug=False, clear=False, constParser="BLLIP-BIO", depParser="STANFORD-CONVERT", logging=True):
    assert negatives in ("INCLUDE", "SKIP", "REVERSE_POS")
    # Download the corpus if needed
    if inPath == None:
        if not hasattr(Settings, "SE10T8_CORPUS"):
            SemEval2010Task8Tools.install()
        inPath = Settings.SE10T8_CORPUS
    assert os.path.exists(inPath)
    # Prepare the output directory
    if not os.path.exists(outDir):
        print "Making output directory", outDir
        os.makedirs(outDir)
    elif clear:
        print "Removing output directory", outDir
        shutil.rmtree(outDir)
    # Start logging
    if logging:
        Stream.openLog(os.path.join(outDir, "log.txt"), clear=clear)
    # Read and process the corpus files
    archive = zipfile.ZipFile(inPath, 'r')
    usedIds = set()
    tree = None
    for fileName, setName in [("SemEval2010_task8_all_data/SemEval2010_task8_training/TRAIN_FILE.TXT", "train"),\
                              ("SemEval2010_task8_all_data/SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT", "test")]:
        print "Processing file", fileName, "as set", setName
        f = archive.open(fileName)
        tree = processLines(f.readlines(), setName, directed=directed, negatives=negatives, usedIds=usedIds, tree=tree, corpusId=corpusId)
        f.close()
    # Divide the training set into training and development sets
    MakeSets.processCorpus(tree, None, "train", [("train", 0.7), ("devel", 1.0)], 1)
    # Write out the converted corpus
    convertedPath = os.path.join(outDir, corpusId + "-converted.xml")
    ETUtils.write(tree.getroot(), convertedPath)
    # Preprocess the converted corpus
    if preprocess:
        outPath = os.path.join(outDir, corpusId + ".xml")
        preprocessor = Preprocessor(constParser, depParser)
        preprocessor.setArgForAllSteps("debug", debug)
        preprocessor.stepArgs("CONVERT")["corpusName"] = corpusId
        preprocessor.process(convertedPath, outPath, preprocessorParameters, omitSteps=["SPLIT-SENTENCES", "NER", "SPLIT-NAMES"])
    # Stop logging
    if logging:
        Stream.closeLog(os.path.join(outDir, "log.txt"))
Exemple #55
0
def processCorpus(input, output, rules):
    if rules == None:
        raise Exception("No mapping rules defined")
    elif isinstance(rules, basestring):
        rules = eval(rules)
    print >> sys.stderr, "Mapping attributes, rules =", rules
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()
    
    counts = defaultdict(int)
    for key in sorted(rules.keys()):
        mapAttributes(corpusRoot, key, rules[key], counts)
    
    print >> sys.stderr, "Mapped", dict(counts)
    
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree