import sys, os, shutil, subprocess, tempfile

# The TEES project modules used below are assumed to be importable:
# STFormat.STTools (aliased both as STTools and ST), STFormat.ConvertXML as
# STConvert, STFormat.Equiv, STFormat.Validate, Tools.GeniaSentenceSplitter,
# Tools.CharniakJohnsonParser, Tools.StanfordParser, the InteractionXML package
# (DivideSets, DeleteElements, MixSets, MergeDuplicateEntities, FixAltOffsets),
# FindHeads, ETUtils and BioNLP11GeniaTools, plus the helpers addAnalyses,
# processParses and the moveBI document-id list defined elsewhere in this module.

def removeX(filename, resultFileTag="a2"):
    # Re-write an ST-format package without the extra annotations, keeping only
    # the .a2 and .rel files, and return the path of the cleaned copy.
    documents = STTools.loadSet(filename)
    # gettempdir() is used instead of tempfile.tempdir, which is None until
    # initialized; basename keeps the result inside the temp directory even
    # when the input path is absolute.
    newFilename = os.path.join(tempfile.gettempdir(),
                               os.path.basename(filename.rsplit(".", 2)[0]) + "-no-X.tar.gz")
    STTools.writeSet(documents, newFilename, resultFileTag=resultFileTag,
                     writeExtra=False, files=["a2", "rel"])
    return newFilename
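# Usage sketch for removeX() (the package name below is hypothetical):
#cleanedPackage = removeX("GE-devel-data.tar.gz")
#print >> sys.stderr, "Cleaned package written to", cleanedPackage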
def convert(datasets, outdir, corpusName):
    # Depends on CO-conversion. Reads the ST-format dataset packages into one
    # combined Interaction XML corpus, parses it, and divides it back into sets.
    bigfileName = corpusName + "-" + "-and-".join([x[0] for x in datasets])
    documents = []
    for pair in datasets:
        print >> sys.stderr, "Reading", pair[0], "set,",
        docs = ST.loadSet(pair[1], pair[0])
        print >> sys.stderr, len(docs), "documents"
        documents.extend(docs)
    print >> sys.stderr, "Converting to", bigfileName + "-documents.xml"
    xml = STConvert.toInteractionXML(documents, corpusName, bigfileName + "-documents.xml")
    print >> sys.stderr, "Making sentences"
    xml = Tools.GeniaSentenceSplitter.makeSentences(xml, bigfileName + "-sentences.xml")
    #print >> sys.stderr, "Copying parses"
    #parsePath = "/home/jari/biotext/BioNLP2011/data/CO/co-devel-and-train-and-test.xml"
    #InteractionXML.CopyParse.copyParse(bigfileName+"-sentences.xml", parsePath, bigfileName+"-copied-parses.xml", "split-McClosky", "split-McClosky")
    print >> sys.stderr, "Parsing"
    Tools.CharniakJohnsonParser.parse(bigfileName + "-sentences.xml", bigfileName + "-parsed.xml",
                                      tokenizationName=None, parseName="McClosky", requireEntities=False)
    print >> sys.stderr, "Stanford Conversion"
    Tools.StanfordParser.convertXML("McClosky", bigfileName + "-parsed.xml", bigfileName + "-stanford.xml")
    print >> sys.stderr, "Protein Name Splitting"
    splitterCommand = "python /home/jari/cvs_checkout/PPI_Learning/Analysers/ProteinNameSplitter.py" \
                      + " -f " + bigfileName + "-stanford.xml" \
                      + " -o " + bigfileName + "-split.xml" \
                      + " -p McClosky -t McClosky -s split-McClosky -n split-McClosky"
    subprocess.call(splitterCommand, shell=True)
    print >> sys.stderr, "Head Detection"
    xml = FindHeads.findHeads(bigfileName + "-split.xml", "split-McClosky", tokenization=None,
                              output=bigfileName + ".xml", removeExisting=True)
    print >> sys.stderr, "Dividing into sets"
    InteractionXML.DivideSets.processCorpus(bigfileName + ".xml", outdir, corpusName + "-", ".xml",
                                            [("devel", "train")])
    if "devel" in [x[0] for x in datasets]:
        print >> sys.stderr, "Creating empty devel set"
        deletionRules = {"interaction": {}, "entity": {"isName": "False"}}
        InteractionXML.DeleteElements.processCorpus(corpusName + "-devel.xml",
                                                    corpusName + "-devel-empty.xml",
                                                    deletionRules)
    return xml
def convert(datasets, outdir, corpusName):
    # NOTE: this second variant of convert() shadows the one above if both are
    # kept in the same module; its processing pipeline is currently disabled.
    bigfileName = corpusName + "-" + "-and-".join([x[0] for x in datasets])
    documents = []
    for pair in datasets:
        print >> sys.stderr, "Reading", pair[0], "set,",
        docs = ST.loadSet(pair[1], pair[0])
        print >> sys.stderr, len(docs), "documents"
        documents.extend(docs)
    #print >> sys.stderr, "Converting to", bigfileName+"-documents.xml"
    #xml = STConvert.toInteractionXML(documents, corpusName, bigfileName+"-documents.xml")
    #print >> sys.stderr, "Making sentences"
    #xml = Tools.GeniaSentenceSplitter.makeSentences(xml, bigfileName+"-sentences.xml")
    #print >> sys.stderr, "Parsing"
    #Tools.CharniakJohnsonParser.parse(xml, bigfileName+"-parsed.xml", tokenizationName=None, parseName="McClosky", requireEntities=False)
    #print >> sys.stderr, "Stanford Conversion"
    #Tools.StanfordParser.convertXML("McClosky", bigfileName+"-parsed.xml", bigfileName+"-stanford.xml")
    #print >> sys.stderr, "Protein Name Splitting"
    #splitterCommand = "python /home/jari/cvs_checkout/PPI_Learning/Analysers/ProteinNameSplitter.py -f " + bigfileName+"-stanford.xml" + " -o " + bigfileName+"-split.xml" + " -p McClosky -t McClosky -s split-McClosky -n split-McClosky"
    #subprocess.call(splitterCommand, shell=True)
    #print >> sys.stderr, "Fix AltOffsets"
    #import InteractionXML.FixAltOffsets
    #xml = InteractionXML.FixAltOffsets.fixAltOffsets(bigfileName+"-split.xml")
    #print >> sys.stderr, "Head Detection"
    #xml = FindHeads.findHeads(xml, "split-McClosky", tokenization=None, output=bigfileName+".xml", removeExisting=True)
    print >> sys.stderr, "Dividing into sets"
    #InteractionXML.DivideSets.processCorpus(bigfileName+".xml", outdir, corpusName + "-", ".xml", [("devel", "train")])
    if "devel" in [x[0] for x in datasets]:
        print >> sys.stderr, "Creating empty devel set"
        deletionRules = {"interaction": {}, "entity": {"isName": "False"}}
        InteractionXML.DeleteElements.processCorpus(corpusName + "-devel.xml",
                                                    corpusName + "-devel-empty.xml",
                                                    deletionRules)
    # The steps that would produce the XML are commented out above, so there is
    # no result to return.
    return None
def convert(datasets, outdir, corpusName):
    # Variant that loads only the .a1 (given entity) annotations and skips the
    # sentence-splitter post-processing.
    bigfileName = corpusName + "-" + "-and-".join([x[0] for x in datasets])
    documents = []
    for pair in datasets:
        print >> sys.stderr, "Reading", pair[0], "set,",
        docs = ST.loadSet(pair[1], pair[0], "a1")
        print >> sys.stderr, len(docs), "documents"
        documents.extend(docs)
    print >> sys.stderr, "Converting to", bigfileName + "-documents.xml"
    xml = STConvert.toInteractionXML(documents, corpusName, bigfileName + "-documents.xml")
    print >> sys.stderr, "Making sentences"
    xml = Tools.GeniaSentenceSplitter.makeSentences(xml, bigfileName + "-sentences.xml", postProcess=False)
    print >> sys.stderr, "Parsing"
    Tools.CharniakJohnsonParser.parse(bigfileName + "-sentences.xml", bigfileName + "-parsed.xml",
                                      tokenizationName=None, parseName="McClosky", requireEntities=False)
    print >> sys.stderr, "Stanford Conversion"
    Tools.StanfordParser.convertXML("McClosky", bigfileName + "-parsed.xml", bigfileName + "-stanford.xml")
    print >> sys.stderr, "Protein Name Splitting"
    splitterCommand = "python /home/jari/cvs_checkout/PPI_Learning/Analysers/ProteinNameSplitter.py" \
                      + " -f " + bigfileName + "-stanford.xml" \
                      + " -o " + bigfileName + "-split.xml" \
                      + " -p McClosky -t McClosky -s split-McClosky -n split-McClosky"
    subprocess.call(splitterCommand, shell=True)
    print >> sys.stderr, "Head Detection"
    xml = FindHeads.findHeads(bigfileName + "-split.xml", "split-McClosky", tokenization=None,
                              output=bigfileName + ".xml", removeExisting=True)
    print >> sys.stderr, "Dividing into sets"
    InteractionXML.DivideSets.processCorpus(bigfileName + ".xml", outdir, corpusName + "-", ".xml",
                                            [("devel", "train")])
    return xml
def convert(datasets, analysisTags, analysisPath, corpusName):
    # Variant that merges pre-computed analyses (parses etc.) from analysisPath
    # into the corpus and round-trips the devel set back to ST format for evaluation.
    global moveBI
    bigfileName = corpusName + "-" + "-and-".join([x[0] for x in datasets])
    documents = []
    for pair in datasets:
        print >> sys.stderr, "Reading", pair[0], "set,",
        sitesAreArguments = False
        if corpusName == "EPI":
            sitesAreArguments = True
        docs = ST.loadSet(pair[1], pair[0], "a2", sitesAreArguments=sitesAreArguments)
        print >> sys.stderr, len(docs), "documents"
        documents.extend(docs)
    print >> sys.stderr, "Resolving equivalences"
    STFormat.Equiv.process(documents)
    print >> sys.stderr, "Checking data validity"
    for doc in documents:
        STFormat.Validate.validate(doc.events, simulation=True, verbose=True, docId=doc.id)
    print >> sys.stderr, "Writing all documents to geniaformat"
    ST.writeSet(documents, "all-geniaformat", resultFileTag="a2", debug=False, task=2, validate=False)
    print >> sys.stderr, "Converting to", bigfileName + "-documents.xml"
    xml = STConvert.toInteractionXML(documents, corpusName, bigfileName + "-documents.xml")
    if corpusName == "BI":
        InteractionXML.MixSets.mixSets(xml, None, set(moveBI), "train", "devel")
    for pair in datasets:
        if True: #corpusName != "BI":
            print >> sys.stderr, "Adding analyses for set", pair[0]
            addAnalyses(xml, analysisTags[pair[0]], analysisPath, bigfileName)
    ETUtils.write(xml, bigfileName + "-sentences.xml")
    processParses(corpusName, xml)
    # Write out converted data
    ETUtils.write(xml, bigfileName + ".xml")
    InteractionXML.MergeDuplicateEntities.mergeAll(xml, bigfileName + "-nodup.xml")
    for sourceTag in ["", "-nodup"]:
        print >> sys.stderr, "Dividing into sets"
        InteractionXML.DivideSets.processCorpus(bigfileName + sourceTag + ".xml", "./",
                                                corpusName + "-", sourceTag + ".xml",
                                                [("devel", "train")])
        if "devel" in [x[0] for x in datasets]:
            print >> sys.stderr, "Converting back"
            STConvert.toSTFormat(corpusName + "-devel" + sourceTag + ".xml",
                                 "roundtrip/" + corpusName + "-devel" + sourceTag + "-task2",
                                 outputTag="a2", task=2)
            STConvert.toSTFormat(corpusName + "-devel" + sourceTag + ".xml",
                                 "roundtrip/" + corpusName + "-devel" + sourceTag + "-task1",
                                 outputTag="a2", task=1)
            if corpusName == "GE":
                print >> sys.stderr, "Evaluating task 2 back-conversion"
                BioNLP11GeniaTools.evaluate("roundtrip/" + corpusName + "-devel" + sourceTag + "-task2",
                                            task=2, verbose=True, debug=False)
                print >> sys.stderr, "Evaluating task 1 back-conversion"
                BioNLP11GeniaTools.evaluate("roundtrip/" + corpusName + "-devel" + sourceTag + "-task1",
                                            task=1, verbose=True, debug=False)
            elif corpusName in ["BI", "BB"]:
                print >> sys.stderr, "Evaluating task 2 back-conversion"
                BioNLP11GeniaTools.evaluateBX("roundtrip/" + corpusName + "-devel" + sourceTag + "-task2",
                                              corpusName)
                print >> sys.stderr, "Evaluating task 1 back-conversion"
                BioNLP11GeniaTools.evaluateBX("roundtrip/" + corpusName + "-devel" + sourceTag + "-task1",
                                              corpusName)
            print >> sys.stderr, "Creating empty devel set"
            deletionRules = {"interaction": {}, "entity": {"isName": "False"}}
            InteractionXML.DeleteElements.processCorpus(corpusName + "-devel" + sourceTag + ".xml",
                                                        corpusName + "-devel" + sourceTag + "-empty.xml",
                                                        deletionRules)
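# Usage sketch for the analysis-variant convert() above. Each dataset pair is
# (setName, pathToSTPackage); the analysis tags and paths are hypothetical and
# depend on how the supporting analyses were downloaded.
#datasets = [("devel", "BioNLP-ST_2011_genia_devel_data.tar.gz"),
#            ("train", "BioNLP-ST_2011_genia_train_data.tar.gz")]
#analysisTags = {"devel": "devel", "train": "train"}
#convert(datasets, analysisTags, "/path/to/supporting-analyses", "GE")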
def convertDownloaded(outdir, corpus, files, intermediateFiles=True, evaluate=True):
    global moveBI
    workdir = outdir + "/conversion/" + corpus
    if os.path.exists(workdir):
        shutil.rmtree(workdir)
    os.makedirs(workdir)
    print >> sys.stderr, "---------------", "Converting to XML", "---------------"
    # All datasets are processed as one XML, to ensure all the steps
    # (parse modification etc.) are applied equally
    datasets = ["devel", "train", "test"]
    bigfileName = os.path.join(outdir, corpus + "-" + "-and-".join(datasets))
    documents = []
    for setName in datasets:
        sourceFile = files[corpus + "_" + setName.upper()]
        # The trailing comma keeps the line open so ST.loadSet can append the
        # temporary extraction path to it.
        print >> sys.stderr, "Reading", setName, "set from", sourceFile, "temp at ",
        sitesAreArguments = False
        if corpus == "EPI":
            sitesAreArguments = True
        docs = ST.loadSet(sourceFile, setName, "a2", sitesAreArguments=sitesAreArguments)
        print >> sys.stderr, "Read", len(docs), "documents"
        documents.extend(docs)
    print >> sys.stderr, "Resolving equivalences"
    STFormat.Equiv.process(documents)
    if evaluate:
        print >> sys.stderr, "Checking data validity"
        for doc in documents:
            STFormat.Validate.validate(doc.events, simulation=True, verbose=True, docId=doc.id)
        print >> sys.stderr, "Writing all documents to geniaformat"
        ST.writeSet(documents, os.path.join(workdir, "all-geniaformat"), resultFileTag="a2",
                    debug=False, task=2, validate=False)
    if intermediateFiles:
        print >> sys.stderr, "Converting to XML, writing combined corpus to", bigfileName + "-documents.xml"
        xml = STConvert.toInteractionXML(documents, corpus, bigfileName + "-documents.xml")
    else:
        print >> sys.stderr, "Converting to XML"
        xml = STConvert.toInteractionXML(documents, corpus, None)
    if corpus == "BI":
        InteractionXML.MixSets.mixSets(xml, None, set(moveBI), "train", "devel")
    addAnalyses(xml, corpus, datasets, files, bigfileName)
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName + "-sentences.xml"
        ETUtils.write(xml, bigfileName + "-sentences.xml")
    processParses(xml)
    print >> sys.stderr, "---------------", "Writing corpora", "---------------"
    # Write out converted data
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName + ".xml"
        ETUtils.write(xml, bigfileName + ".xml")
    print >> sys.stderr, "Dividing into sets"
    InteractionXML.DivideSets.processCorpus(xml, outdir, corpus, ".xml")
    if evaluate and "devel" in datasets:
        print >> sys.stderr, "---------------", "Evaluating conversion", "---------------"
        print >> sys.stderr, "Converting back"
        STConvert.toSTFormat(os.path.join(outdir, corpus + "-devel.xml"),
                             workdir + "/roundtrip/" + corpus + "-devel" + "-task1",
                             outputTag="a2", task=1)
        STConvert.toSTFormat(os.path.join(outdir, corpus + "-devel.xml"),
                             workdir + "/roundtrip/" + corpus + "-devel" + "-task2",
                             outputTag="a2", task=2)
        print >> sys.stderr, "Evaluating task 1 back-conversion"
        BioNLP11GeniaTools.evaluate(workdir + "/roundtrip/" + corpus + "-devel" + "-task1", corpus + ".1")
        print >> sys.stderr, "Evaluating task 2 back-conversion"
        BioNLP11GeniaTools.evaluate(workdir + "/roundtrip/" + corpus + "-devel" + "-task2", corpus + ".2")
        print >> sys.stderr, "Note! Evaluation of Task 2 back-conversion can be less than 100% due to site-argument mapping"
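# Hypothetical driver sketch: convertDownloaded() expects a mapping from
# "<CORPUS>_<SET>" keys to downloaded ST-format packages. The file names below
# are illustrative, not the official distribution names.
#if __name__ == "__main__":
#    files = {"GE_DEVEL": "BioNLP-ST_2011_genia_devel_data.tar.gz",
#             "GE_TRAIN": "BioNLP-ST_2011_genia_train_data.tar.gz",
#             "GE_TEST": "BioNLP-ST_2011_genia_test_data.tar.gz"}
#    convertDownloaded("/data/corpora", "GE", files)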