# Post-process the inserted parses: split tokens inside multi-part entity
# names, then detect the syntactic head token of each entity.
def processParses(xml, splitTarget="McCC"):
    print >> sys.stderr, "---------------", "Protein Name Splitting", "---------------"
    #ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-"+splitTarget, "split-"+splitTarget)
    ProteinNameSplitter.mainFunc(xml, None, splitTarget, removeOld=True)
    print >> sys.stderr, "---------------", "Head Detection", "---------------"
    #xml = FindHeads.findHeads(xml, "split-"+splitTarget, tokenization=None, output=None, removeExisting=True)
    xml = FindHeads.findHeads(xml, splitTarget, tokenization=None, output=None, removeExisting=True)
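# Usage sketch (an assumption, not part of the original module): processParses
# operates in place on an Interaction XML corpus that already contains parses,
# such as the one built by convertDDI below. The file names are hypothetical.
#
#     xml = ETUtils.ETFromObj("DDI11-train.xml")       # load a parsed corpus
#     processParses(xml, splitTarget="McCC")           # split names, find heads
#     ETUtils.write(xml, "DDI11-train-processed.xml")  # save the augmented corpus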
def convertDDI(outDir, downloadDir=None, redownload=False, makeIntermediateFiles=True, debug=False):
    cwd = os.getcwd()
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI11-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'11 corpus", "======================="
    corpusDir = outDir + "/DDI11-original"
    Utils.Download.downloadAndExtract(Settings.URL["DDI11_CORPUS"], corpusDir, downloadDir)
    bigfileName = os.path.join(outDir, "DDI11")
    #oldXML = ETUtils.ETFromObj(bigfileName+".xml")
    trainUnified = corpusDir + "/train"
    trainMTMX = corpusDir + "/train_MTMX"
    testUnified = corpusDir + "/test"
    testMTMX = corpusDir + "/test_MTMX"
    # Load main documents
    tempdir = tempfile.mkdtemp()
    print >> sys.stderr, "Temporary files directory at", tempdir
    documents, docById, docCounts = loadDocs(trainUnified)
    # Divide the training data into train and devel sets: documents are walked
    # largest first, and in each complete window of four the first three go to
    # train and the fourth to devel. See the standalone sketch after this function.
    sortedDocCounts = sorted(docCounts.iteritems(), key=lambda (k,v): (v,k), reverse=True)
    datasetCounts = {"train":[0,0], "devel":[0,0], "test":[0,0]}
    for i in range(0, len(sortedDocCounts)-3, 4):
        for j in [0,1]:
            docById[sortedDocCounts[i+j][0]].set("set", "train")
            datasetCounts["train"][0] += sortedDocCounts[i+j][1][0]
            datasetCounts["train"][1] += sortedDocCounts[i+j][1][1]
        docById[sortedDocCounts[i+2][0]].set("set", "train") #docById[sortedDocCounts[i+2][0]].set("set", "devel")
        docById[sortedDocCounts[i+3][0]].set("set", "devel") #docById[sortedDocCounts[i+3][0]].set("set", "test")
        datasetCounts["train"][0] += sortedDocCounts[i+2][1][0] #datasetCounts["devel"][0] += sortedDocCounts[i+2][1][0]
        datasetCounts["train"][1] += sortedDocCounts[i+2][1][1] #datasetCounts["devel"][1] += sortedDocCounts[i+2][1][1]
        datasetCounts["devel"][0] += sortedDocCounts[i+3][1][0] #datasetCounts["test"][0] += sortedDocCounts[i+3][1][0]
        datasetCounts["devel"][1] += sortedDocCounts[i+3][1][1] #datasetCounts["test"][1] += sortedDocCounts[i+3][1][1]
    for document in documents: # leftovers from an incomplete final window go to train
        if document.get("set") == None:
            document.set("set", "train")
    # Print division results
    print >> sys.stderr, datasetCounts
    for key in datasetCounts.keys():
        if datasetCounts[key][1] != 0:
            print key, datasetCounts[key][0] / float(datasetCounts[key][1])
        else:
            print key, datasetCounts[key][0], "/", float(datasetCounts[key][1])
    # Some of the train and test ids overlap. Change the train set ids, because
    # the test set ones are needed for the final evaluation.
    changeIdCount = 1000
    for trainId in ['DrugDDI.d312', 'DrugDDI.d316', 'DrugDDI.d332', 'DrugDDI.d334', 'DrugDDI.d337',
                    'DrugDDI.d342', 'DrugDDI.d349', 'DrugDDI.d354', 'DrugDDI.d373', 'DrugDDI.d379',
                    'DrugDDI.d383', 'DrugDDI.d388', 'DrugDDI.d392', 'DrugDDI.d396', 'DrugDDI.d398',
                    'DrugDDI.d409', 'DrugDDI.d411', 'DrugDDI.d415', 'DrugDDI.d425', 'DrugDDI.d430',
                    'DrugDDI.d433', 'DrugDDI.d448', 'DrugDDI.d450', 'DrugDDI.d452', 'DrugDDI.d462',
                    'DrugDDI.d467', 'DrugDDI.d470', 'DrugDDI.d474', 'DrugDDI.d480', 'DrugDDI.d482',
                    'DrugDDI.d485', 'DrugDDI.d492', 'DrugDDI.d494', 'DrugDDI.d496', 'DrugDDI.d498',
                    'DrugDDI.d500', 'DrugDDI.d503', 'DrugDDI.d506', 'DrugDDI.d518', 'DrugDDI.d523',
                    'DrugDDI.d528', 'DrugDDI.d535', 'DrugDDI.d539', 'DrugDDI.d552', 'DrugDDI.d554',
                    'DrugDDI.d558', 'DrugDDI.d561', 'DrugDDI.d570', 'DrugDDI.d578']:
        newId = "DrugDDI.d" + str(changeIdCount)
        print >> sys.stderr, "Changing train/devel id", trainId, "to", newId
        for element in docById[trainId].getiterator():
            for attrName, attrValue in element.attrib.iteritems():
                if trainId in attrValue:
                    element.set(attrName, attrValue.replace(trainId, newId))
        docById[newId] = docById[trainId]
        del docById[trainId]
        changeIdCount += 1
    # If a test set exists, load it too
    if testUnified != None:
        testDocuments, testDocById, testDocCounts = loadDocs(testUnified)
        for document in testDocuments:
            document.set("set", "test")
        documents = documents + testDocuments
        overlappingIds = []
        for key in docById:
            if key in testDocById:
                overlappingIds.append(key)
        for key in docById:
            assert key not in testDocById, (key, docById[key].get("origId"), testDocById[key].get("origId"),
                                            sorted(docById.keys()), sorted(testDocById.keys()), sorted(overlappingIds))
        docById.update(testDocById)
    # Add all documents into one XML
    xmlTree = ET.ElementTree(ET.Element("corpus"))
    root = xmlTree.getroot()
    root.set("source", "DDI11")
    for document in documents:
        root.append(document)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents-notfixed.xml")
    xml = xmlTree
    print >> sys.stderr, "Fixing DDI XML"
    fixEntities(xml)
    convertToInteractions(xml)
    # Add MTMX annotation
    if trainMTMX != None:
        inDir = Utils.Download.getTopDir(tempdir, Utils.Download.downloadAndExtract(trainMTMX, tempdir, outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if testMTMX != None:
        inDir = Utils.Download.getTopDir(tempdir, Utils.Download.downloadAndExtract(testMTMX, tempdir, outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents.xml")
    print >> sys.stderr, "---------------", "Inserting TEES-generated analyses", "---------------"
    Utils.Download.downloadAndExtract(Settings.URL["TEES_PARSES"], os.path.join(Settings.DATAPATH, "TEES-parses"), downloadDir, redownload=redownload)
    extractedFilename = os.path.join(Settings.DATAPATH, "TEES-parses") + "/DDI11"
    print >> sys.stderr, "Making sentences"
    Tools.SentenceSplitter.makeSentences(xml, extractedFilename, None)
    print >> sys.stderr, "Inserting McCC parses"
    Tools.BLLIPParser.insertParses(xml, extractedFilename, None, extraAttributes={"source":"TEES-preparsed"})
    print >> sys.stderr, "Inserting Stanford conversions"
    Tools.StanfordParser.insertParses(xml, extractedFilename, None, extraAttributes={"stanfordSource":"TEES-preparsed"})
    print >> sys.stderr, "Protein Name Splitting"
    splitTarget = "McCC"
    #ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-"+splitTarget, "split-"+splitTarget)
    ProteinNameSplitter.mainFunc(xml, None, splitTarget, removeOld=True)
    print >> sys.stderr, "Head Detection"
    #xml = FindHeads.findHeads(xml, "split-"+splitTarget, tokenization=None, output=None, removeExisting=True)
    xml = FindHeads.findHeads(xml, splitTarget, tokenization=None, output=None, removeExisting=True)
    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outDir, "DDI11", ".xml")
    Stream.closeLog(logFileName)
    if not debug:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)