Example #1
0
def processParses(xml, splitTarget="McCC"):
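    # Post-process the parses in an Interaction XML tree: split multi-token
    # protein names in the parse named by splitTarget, then mark the
    # syntactic head token of each entity.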
    print >> sys.stderr, "Protein Name Splitting"
    #ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-"+splitTarget, "split-"+splitTarget)
    ProteinNameSplitter.mainFunc(xml, None, splitTarget, removeOld=True)
    print >> sys.stderr, "Head Detection"
    #xml = FindHeads.findHeads(xml, "split-"+splitTarget, tokenization=None, output=None, removeExisting=True)
    xml = FindHeads.findHeads(xml, splitTarget, tokenization=None, output=None, removeExisting=True)
Example #2
0
def processParses(xml, splitTarget="McCC"):
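    # Split protein names in the splitTarget parse, then detect entity heads.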
    print >> sys.stderr, "---------------", "Protein Name Splitting", "---------------"
    #ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-"+splitTarget, "split-"+splitTarget)
    ProteinNameSplitter.mainFunc(xml, None, splitTarget, removeOld=True)
    print >> sys.stderr, "---------------", "Head Detection", "---------------"
    #xml = FindHeads.findHeads(xml, "split-"+splitTarget, tokenization=None, output=None, removeExisting=True)
    xml = FindHeads.findHeads(xml, splitTarget, tokenization=None, output=None, removeExisting=True)
Example #3
0
def convertDDI(outDir, downloadDir=None, redownload=False, makeIntermediateFiles=True, debug=False):
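    # Convert the DDI'11 (DrugDDI) corpus to Interaction XML: download the
    # original distribution, divide the training data into train and devel
    # sets, merge in the test set, add MTMX annotations and TEES-preparsed
    # analyses, and finally write one XML file per dataset.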
    cwd = os.getcwd()
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI11-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'11 corpus", "======================="
    corpusDir = outDir + "/DDI11-original"
    Utils.Download.downloadAndExtract(Settings.URL["DDI11_CORPUS"], corpusDir, downloadDir)
    
    bigfileName = os.path.join(outDir, "DDI11")
    #oldXML = ETUtils.ETFromObj(bigfileName+".xml")
    trainUnified = corpusDir + "/train"
    trainMTMX = corpusDir + "/train_MTMX"
    testUnified = corpusDir + "/test"
    testMTMX = corpusDir + "/test_MTMX"
    
    # Load main documents
    tempdir = tempfile.mkdtemp()
    print >> sys.stderr, "Temporary files directory at", tempdir
    documents, docById, docCounts = loadDocs(trainUnified)
    # Divide training data into a train and devel set
    sortedDocCounts = sorted(docCounts.iteritems(), key=lambda (k,v): (v,k), reverse=True)
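    # Walk the documents, sorted by their count pair, in groups of four:
    # three go to "train" and the fourth to "devel". The commented-out
    # variants preserve an earlier 2/1/1 train/devel/test division.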
    datasetCounts = {"train":[0,0], "devel":[0,0], "test":[0,0]}
    for i in range(0, len(sortedDocCounts)-3, 4):
        for j in [0,1]:
            docById[sortedDocCounts[i+j][0]].set("set", "train")
            datasetCounts["train"][0] += sortedDocCounts[i+j][1][0]
            datasetCounts["train"][1] += sortedDocCounts[i+j][1][1]
        docById[sortedDocCounts[i+2][0]].set("set", "train") #docById[sortedDocCounts[i+2][0]].set("set", "devel")
        docById[sortedDocCounts[i+3][0]].set("set", "devel") #docById[sortedDocCounts[i+3][0]].set("set", "test")
        datasetCounts["train"][0] += sortedDocCounts[i+2][1][0] #datasetCounts["devel"][0] += sortedDocCounts[i+2][1][0]
        datasetCounts["train"][1] += sortedDocCounts[i+2][1][1] #datasetCounts["devel"][1] += sortedDocCounts[i+2][1][1]
        datasetCounts["devel"][0] += sortedDocCounts[i+3][1][0] #datasetCounts["test"][0] += sortedDocCounts[i+3][1][0]
        datasetCounts["devel"][1] += sortedDocCounts[i+3][1][1] #datasetCounts["test"][1] += sortedDocCounts[i+3][1][1]
    for document in documents: # leftovers that did not divide evenly go to train
        if document.get("set") == None:
            document.set("set", "train")
    # Print division results
    print >> sys.stderr, datasetCounts
    for key in datasetCounts.keys():
        if datasetCounts[key][1] != 0:
            print key, datasetCounts[key][0] / float(datasetCounts[key][1])
        else:
            print key, datasetCounts[key][0], "/", float(datasetCounts[key][1])
    # Some of the train and test ids overlap. Let's change the train set ids, because test set ones are needed
    # for the final evaluation.
    changeIdCount = 1000
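    # Give each listed document a new id starting from DrugDDI.d1000 and
    # rewrite every attribute value that embeds the old id.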
    for trainId in ['DrugDDI.d312', 'DrugDDI.d316', 'DrugDDI.d332', 'DrugDDI.d334', 'DrugDDI.d337', 
                    'DrugDDI.d342', 'DrugDDI.d349', 'DrugDDI.d354', 'DrugDDI.d373', 'DrugDDI.d379', 
                    'DrugDDI.d383', 'DrugDDI.d388', 'DrugDDI.d392', 'DrugDDI.d396', 'DrugDDI.d398', 
                    'DrugDDI.d409', 'DrugDDI.d411', 'DrugDDI.d415', 'DrugDDI.d425', 'DrugDDI.d430', 
                    'DrugDDI.d433', 'DrugDDI.d448', 'DrugDDI.d450', 'DrugDDI.d452', 'DrugDDI.d462', 
                    'DrugDDI.d467', 'DrugDDI.d470', 'DrugDDI.d474', 'DrugDDI.d480', 'DrugDDI.d482', 
                    'DrugDDI.d485', 'DrugDDI.d492', 'DrugDDI.d494', 'DrugDDI.d496', 'DrugDDI.d498', 
                    'DrugDDI.d500', 'DrugDDI.d503', 'DrugDDI.d506', 'DrugDDI.d518', 'DrugDDI.d523', 
                    'DrugDDI.d528', 'DrugDDI.d535', 'DrugDDI.d539', 'DrugDDI.d552', 'DrugDDI.d554', 
                    'DrugDDI.d558', 'DrugDDI.d561', 'DrugDDI.d570', 'DrugDDI.d578']:
        newId = "DrugDDI.d" + str(changeIdCount)
        print >> sys.stderr, "Changing train/devel id", trainId, "to", newId
        for element in docById[trainId].getiterator():
            for attrName, attrValue in element.attrib.iteritems():
                if trainId in attrValue:
                    element.set(attrName, attrValue.replace(trainId, newId))
        docById[newId] = docById[trainId]
        del docById[trainId]
        changeIdCount += 1
    # If test set exists, load it, too
    if testUnified != None:
        testDocuments, testDocById, testDocCounts = loadDocs(testUnified)
        for document in testDocuments:
            document.set("set", "test")
        documents = documents + testDocuments
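        # Gather any ids that still collide with the test set so a failing
        # assert below can report all of them at once.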
        overlappingIds = []
        for key in docById:
            if key in testDocById:
                overlappingIds.append(key)
        for key in docById:
            assert key not in testDocById, (key, docById[key].get("origId"), testDocById[key].get("origId"), sorted(docById.keys()), sorted(testDocById.keys()), sorted(overlappingIds))
        docById.update(testDocById)
    
    # Add all documents into one XML
    xmlTree = ET.ElementTree(ET.Element("corpus"))
    root = xmlTree.getroot()
    root.set("source", "DDI11")
    for document in documents:
        root.append(document)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents-notfixed.xml")
    xml = xmlTree
    print >> sys.stderr, "Fixing DDI XML"
    fixEntities(xml)
    convertToInteractions(xml)
    # Add MTMX
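    # (MTMX files carry MetaMap-derived UMLS concept annotations distributed with the corpus)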
    if trainMTMX != None:
        inDir = Utils.Download.getTopDir(tempdir, Utils.Download.downloadAndExtract(trainMTMX, tempdir, outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if testMTMX != None:
        inDir = Utils.Download.getTopDir(tempdir, Utils.Download.downloadAndExtract(testMTMX, tempdir, outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents.xml")



    print >> sys.stderr, "---------------", "Inserting TEES-generated analyses", "---------------"
    Utils.Download.downloadAndExtract(Settings.URL["TEES_PARSES"], os.path.join(Settings.DATAPATH, "TEES-parses"), downloadDir, redownload=redownload)
    extractedFilename = os.path.join(Settings.DATAPATH, "TEES-parses") + "/DDI11"
    print >> sys.stderr, "Making sentences"
    Tools.SentenceSplitter.makeSentences(xml, extractedFilename, None)
    print >> sys.stderr, "Inserting McCC parses"
    Tools.BLLIPParser.insertParses(xml, extractedFilename, None, extraAttributes={"source":"TEES-preparsed"})
    print >> sys.stderr, "Inserting Stanford conversions"
    Tools.StanfordParser.insertParses(xml, extractedFilename, None, extraAttributes={"stanfordSource":"TEES-preparsed"})
    print >> sys.stderr, "Protein Name Splitting"
    splitTarget = "McCC"
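    # "McCC" is the name of the BLLIP (McClosky-Charniak) parse inserted above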
    #ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-"+splitTarget, "split-"+splitTarget)
    ProteinNameSplitter.mainFunc(xml, None, splitTarget, removeOld=True)
    print >> sys.stderr, "Head Detection"
    #xml = FindHeads.findHeads(xml, "split-"+splitTarget, tokenization=None, output=None, removeExisting=True)
    xml = FindHeads.findHeads(xml, splitTarget, tokenization=None, output=None, removeExisting=True)    
    
    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outDir, "DDI11", ".xml")
    
    Stream.closeLog(logFileName)
    if not debug:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)
Example #4
0
def convertDDI(outDir,
               downloadDir=None,
               redownload=False,
               makeIntermediateFiles=True,
               debug=False):
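    # Convert the DDI'11 (DrugDDI) corpus to Interaction XML: download the
    # original distribution, divide the training data into train and devel
    # sets, merge in the test set, add MTMX annotations and TEES-preparsed
    # analyses, and finally write one XML file per dataset.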
    cwd = os.getcwd()
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI11-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'11 corpus", "======================="
    corpusDir = outDir + "/DDI11-original"
    Utils.Download.downloadAndExtract(Settings.URL["DDI11_CORPUS"], corpusDir,
                                      downloadDir)

    bigfileName = os.path.join(outDir, "DDI11")
    #oldXML = ETUtils.ETFromObj(bigfileName+".xml")
    trainUnified = corpusDir + "/train"
    trainMTMX = corpusDir + "/train_MTMX"
    testUnified = corpusDir + "/test"
    testMTMX = corpusDir + "/test_MTMX"

    # Load main documents
    tempdir = tempfile.mkdtemp()
    print >> sys.stderr, "Temporary files directory at", tempdir
    documents, docById, docCounts = loadDocs(trainUnified)
    # Divide training data into a train and devel set
    sortedDocCounts = sorted(docCounts.iteritems(),
                             key=lambda (k, v): (v, k),
                             reverse=True)
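    # Walk the documents, sorted by their count pair, in groups of four:
    # three go to "train" and the fourth to "devel". The commented-out
    # variants preserve an earlier 2/1/1 train/devel/test division.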
    datasetCounts = {"train": [0, 0], "devel": [0, 0], "test": [0, 0]}
    for i in range(0, len(sortedDocCounts) - 3, 4):
        for j in [0, 1]:
            docById[sortedDocCounts[i + j][0]].set("set", "train")
            datasetCounts["train"][0] += sortedDocCounts[i + j][1][0]
            datasetCounts["train"][1] += sortedDocCounts[i + j][1][1]
        docById[sortedDocCounts[i + 2][0]].set(
            "set",
            "train")  #docById[sortedDocCounts[i+2][0]].set("set", "devel")
        docById[sortedDocCounts[i + 3][0]].set(
            "set",
            "devel")  #docById[sortedDocCounts[i+3][0]].set("set", "test")
        datasetCounts["train"][0] += sortedDocCounts[i + 2][1][
            0]  #datasetCounts["devel"][0] += sortedDocCounts[i+2][1][0]
        datasetCounts["train"][1] += sortedDocCounts[i + 2][1][
            1]  #datasetCounts["devel"][1] += sortedDocCounts[i+2][1][1]
        datasetCounts["devel"][0] += sortedDocCounts[i + 3][1][
            0]  #datasetCounts["test"][0] += sortedDocCounts[i+3][1][0]
        datasetCounts["devel"][1] += sortedDocCounts[i + 3][1][
            1]  #datasetCounts["test"][1] += sortedDocCounts[i+3][1][1]
    for document in documents:  # leftovers that did not divide evenly go to train
        if document.get("set") == None:
            document.set("set", "train")
    # Print division results
    print >> sys.stderr, datasetCounts
    for key in datasetCounts.keys():
        if datasetCounts[key][1] != 0:
            print key, datasetCounts[key][0] / float(datasetCounts[key][1])
        else:
            print key, datasetCounts[key][0], "/", float(datasetCounts[key][1])
    # Some of the train and test ids overlap. Let's change the train set ids, because test set ones are needed
    # for the final evaluation.
    changeIdCount = 1000
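    # Give each listed document a new id starting from DrugDDI.d1000 and
    # rewrite every attribute value that embeds the old id.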
    for trainId in [
            'DrugDDI.d312', 'DrugDDI.d316', 'DrugDDI.d332', 'DrugDDI.d334',
            'DrugDDI.d337', 'DrugDDI.d342', 'DrugDDI.d349', 'DrugDDI.d354',
            'DrugDDI.d373', 'DrugDDI.d379', 'DrugDDI.d383', 'DrugDDI.d388',
            'DrugDDI.d392', 'DrugDDI.d396', 'DrugDDI.d398', 'DrugDDI.d409',
            'DrugDDI.d411', 'DrugDDI.d415', 'DrugDDI.d425', 'DrugDDI.d430',
            'DrugDDI.d433', 'DrugDDI.d448', 'DrugDDI.d450', 'DrugDDI.d452',
            'DrugDDI.d462', 'DrugDDI.d467', 'DrugDDI.d470', 'DrugDDI.d474',
            'DrugDDI.d480', 'DrugDDI.d482', 'DrugDDI.d485', 'DrugDDI.d492',
            'DrugDDI.d494', 'DrugDDI.d496', 'DrugDDI.d498', 'DrugDDI.d500',
            'DrugDDI.d503', 'DrugDDI.d506', 'DrugDDI.d518', 'DrugDDI.d523',
            'DrugDDI.d528', 'DrugDDI.d535', 'DrugDDI.d539', 'DrugDDI.d552',
            'DrugDDI.d554', 'DrugDDI.d558', 'DrugDDI.d561', 'DrugDDI.d570',
            'DrugDDI.d578'
    ]:
        newId = "DrugDDI.d" + str(changeIdCount)
        print >> sys.stderr, "Changing train/devel id", trainId, "to", newId
        for element in docById[trainId].getiterator():
            for attrName, attrValue in element.attrib.iteritems():
                if trainId in attrValue:
                    element.set(attrName, attrValue.replace(trainId, newId))
        docById[newId] = docById[trainId]
        del docById[trainId]
        changeIdCount += 1
    # If test set exists, load it, too
    if testUnified != None:
        testDocuments, testDocById, testDocCounts = loadDocs(testUnified)
        for document in testDocuments:
            document.set("set", "test")
        documents = documents + testDocuments
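        # Gather any ids that still collide with the test set so a failing
        # assert below can report all of them at once.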
        overlappingIds = []
        for key in docById:
            if key in testDocById:
                overlappingIds.append(key)
        for key in docById:
            assert key not in testDocById, (key, docById[key].get("origId"),
                                            testDocById[key].get("origId"),
                                            sorted(docById.keys()),
                                            sorted(testDocById.keys()),
                                            sorted(overlappingIds))
        docById.update(testDocById)

    # Add all documents into one XML
    xmlTree = ET.ElementTree(ET.Element("corpus"))
    root = xmlTree.getroot()
    root.set("source", "DDI11")
    for document in documents:
        root.append(document)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents-notfixed.xml")
    xml = xmlTree
    print >> sys.stderr, "Fixing DDI XML"
    fixEntities(xml)
    convertToInteractions(xml)
    # Add MTMX
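    # (MTMX files carry MetaMap-derived UMLS concept annotations distributed with the corpus)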
    if trainMTMX != None:
        inDir = Utils.Download.getTopDir(
            tempdir,
            Utils.Download.downloadAndExtract(trainMTMX, tempdir,
                                              outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if testMTMX != None:
        inDir = Utils.Download.getTopDir(
            tempdir,
            Utils.Download.downloadAndExtract(testMTMX, tempdir,
                                              outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents.xml")

    print >> sys.stderr, "---------------", "Inserting TEES-generated analyses", "---------------"
    Utils.Download.downloadAndExtract(Settings.URL["TEES_PARSES"],
                                      os.path.join(Settings.DATAPATH,
                                                   "TEES-parses"),
                                      downloadDir,
                                      redownload=redownload)
    extractedFilename = os.path.join(Settings.DATAPATH,
                                     "TEES-parses") + "/DDI11"
    print >> sys.stderr, "Making sentences"
    Tools.SentenceSplitter.makeSentences(xml, extractedFilename, None)
    print >> sys.stderr, "Inserting McCC parses"
    Tools.BLLIPParser.insertParses(
        xml,
        extractedFilename,
        None,
        extraAttributes={"source": "TEES-preparsed"})
    print >> sys.stderr, "Inserting Stanford conversions"
    Tools.StanfordParser.insertParses(
        xml,
        extractedFilename,
        None,
        extraAttributes={"stanfordSource": "TEES-preparsed"})
    print >> sys.stderr, "Protein Name Splitting"
    splitTarget = "McCC"
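    # "McCC" is the name of the BLLIP (McClosky-Charniak) parse inserted above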
    #ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-"+splitTarget, "split-"+splitTarget)
    ProteinNameSplitter.mainFunc(xml, None, splitTarget, removeOld=True)
    print >> sys.stderr, "Head Detection"
    #xml = FindHeads.findHeads(xml, "split-"+splitTarget, tokenization=None, output=None, removeExisting=True)
    xml = FindHeads.findHeads(xml,
                              splitTarget,
                              tokenization=None,
                              output=None,
                              removeExisting=True)

    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outDir, "DDI11", ".xml")

    Stream.closeLog(logFileName)
    if not debug:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)