Example #1
0
def removeX(filename, resultFileTag="a2"):
    documents = STTools.loadSet(filename)
    newFilename = os.path.join(tempfile.tempdir,
                               filename.rsplit(".", 2)[0] + "-no-X.tar.gz")
    STTools.writeSet(documents,
                     newFilename,
                     resultFileTag=resultFileTag,
                     writeExtra=False,
                     files=["a2", "rel"])
    return newFilename
Example #2
0
def convert(datasets, outdir, corpusName):
    # Depends on CO-conversion
    
    bigfileName = corpusName + "-" + "-and-".join([x[0] for x in datasets])
    documents = []
    for pair in datasets:
        print >> sys.stderr, "Reading", pair[0], "set,",
        docs = ST.loadSet(pair[1], pair[0])
        print >> sys.stderr, len(docs), "documents"
        documents.extend(docs)

    print >> sys.stderr, "Converting to", bigfileName+"-documents.xml"
    xml = STConvert.toInteractionXML(documents, corpusName, bigfileName+"-documents.xml")
    print >> sys.stderr, "Making sentences"
    xml = Tools.GeniaSentenceSplitter.makeSentences(xml, bigfileName+"-sentences.xml")
    #print >> sys.stderr, "Copying parses"
    #parsePath = "/home/jari/biotext/BioNLP2011/data/CO/co-devel-and-train-and-test.xml"
    #InteractionXML.CopyParse.copyParse(bigfileName+"-sentences.xml", parsePath, bigfileName+"-copied-parses.xml", "split-McClosky", "split-McClosky")
    print >> sys.stderr, "Parsing"
    Tools.CharniakJohnsonParser.parse(bigfileName+"-sentences.xml", bigfileName+"-parsed.xml", tokenizationName=None, parseName="McClosky", requireEntities=False)
    print >> sys.stderr, "Stanford Conversion"
    Tools.StanfordParser.convertXML("McClosky", bigfileName+"-parsed.xml", bigfileName+"-stanford.xml")
    print >> sys.stderr, "Protein Name Splitting"
    splitterCommand = "python /home/jari/cvs_checkout/PPI_Learning/Analysers/ProteinNameSplitter.py -f " + bigfileName+"-stanford.xml" + " -o " + bigfileName+"-split.xml" + " -p " + "McClosky" + " -t " + "McClosky" + " -s split-McClosky" + " -n split-McClosky"
    subprocess.call(splitterCommand, shell=True)
    print >> sys.stderr, "Head Detection"
    xml = FindHeads.findHeads(bigfileName+"-split.xml", "split-McClosky", tokenization=None, output=bigfileName+".xml", removeExisting=True)
    print >> sys.stderr, "Dividing into sets"
    InteractionXML.DivideSets.processCorpus(bigfileName+".xml", outDir, corpusName + "-", ".xml", [("devel", "train")])
    if "devel" in [x[0] for x in datasets]:
        print >> sys.stderr, "Creating empty devel set"
        deletionRules = {"interaction":{},"entity":{"isName":"False"}}
        InteractionXML.DeleteElements.processCorpus(corpusName + "-devel.xml", corpusName + "-devel-empty.xml", deletionRules)
    return xml
Example #3
0
def convert(datasets, outdir, corpusName):
    bigfileName = corpusName + "-" + "-and-".join([x[0] for x in datasets])
    documents = []
    for pair in datasets:
        print >> sys.stderr, "Reading", pair[0], "set,",
        docs = ST.loadSet(pair[1], pair[0])
        print >> sys.stderr, len(docs), "documents"
        documents.extend(docs)


#    print >> sys.stderr, "Converting to", bigfileName+"-documents.xml"
#    xml = STConvert.toInteractionXML(documents, corpusName, bigfileName+"-documents.xml")
#    print >> sys.stderr, "Making sentences"
#    xml = Tools.GeniaSentenceSplitter.makeSentences(xml, bigfileName+"-sentences.xml")
#    print >> sys.stderr, "Parsing"
#    Tools.CharniakJohnsonParser.parse(xml, bigfileName+"-parsed.xml", tokenizationName=None, parseName="McClosky", requireEntities=False)
#    print >> sys.stderr, "Stanford Conversion"
#    Tools.StanfordParser.convertXML("McClosky", bigfileName+"-parsed.xml", bigfileName+"-stanford.xml")
#    print >> sys.stderr, "Protein Name Splitting"
#    splitterCommand = "python /home/jari/cvs_checkout/PPI_Learning/Analysers/ProteinNameSplitter.py -f " + bigfileName+"-stanford.xml" + " -o " + bigfileName+"-split.xml" + " -p " + "McClosky" + " -t " + "McClosky" + " -s split-McClosky" + " -n split-McClosky"
#    subprocess.call(splitterCommand, shell=True)
#    print >> sys.stderr, "Fix AltOffsets"
#    import InteractionXML.FixAltOffsets
#    xml = InteractionXML.FixAltOffsets.fixAltOffsets(bigfileName+"-split.xml")
#    print >> sys.stderr, "Head Detection"
#    xml = FindHeads.findHeads(xml, "split-McClosky", tokenization=None, output=bigfileName+".xml", removeExisting=True)
    print >> sys.stderr, "Dividing into sets"
    #InteractionXML.DivideSets.processCorpus(bigfileName+".xml", outDir, corpusName + "-", ".xml", [("devel", "train")])
    if "devel" in [x[0] for x in datasets]:
        print >> sys.stderr, "Creating empty devel set"
        deletionRules = {"interaction": {}, "entity": {"isName": "False"}}
        InteractionXML.DeleteElements.processCorpus(
            corpusName + "-devel.xml", corpusName + "-devel-empty.xml",
            deletionRules)
    return xml
Example #4
0
def convert(datasets, outdir, corpusName):
    bigfileName = corpusName + "-" + "-and-".join([x[0] for x in datasets])
    documents = []
    for pair in datasets:
        print >> sys.stderr, "Reading", pair[0], "set,",
        docs = ST.loadSet(pair[1], pair[0], "a1")
        print >> sys.stderr, len(docs), "documents"
        documents.extend(docs)

    print >> sys.stderr, "Converting to", bigfileName+"-documents.xml"
    xml = STConvert.toInteractionXML(documents, corpusName, bigfileName+"-documents.xml")
    print >> sys.stderr, "Making sentences"
    xml = Tools.GeniaSentenceSplitter.makeSentences(xml, bigfileName+"-sentences.xml", postProcess=False)
    print >> sys.stderr, "Parsing"
    Tools.CharniakJohnsonParser.parse(bigfileName+"-sentences.xml", bigfileName+"-parsed.xml", tokenizationName=None, parseName="McClosky", requireEntities=False)
    print >> sys.stderr, "Stanford Conversion"
    Tools.StanfordParser.convertXML("McClosky", bigfileName+"-parsed.xml", bigfileName+"-stanford.xml")
    print >> sys.stderr, "Protein Name Splitting"
    splitterCommand = "python /home/jari/cvs_checkout/PPI_Learning/Analysers/ProteinNameSplitter.py -f " + bigfileName+"-stanford.xml" + " -o " + bigfileName+"-split.xml" + " -p " + "McClosky" + " -t " + "McClosky" + " -s split-McClosky" + " -n split-McClosky"
    subprocess.call(splitterCommand, shell=True)
    print >> sys.stderr, "Head Detection"
    xml = FindHeads.findHeads(bigfileName+"-split.xml", "split-McClosky", tokenization=None, output=bigfileName+".xml", removeExisting=True)
    print >> sys.stderr, "Dividing into sets"
    InteractionXML.DivideSets.processCorpus(bigfileName+".xml", outdir, corpusName + "-", ".xml", [("devel", "train")])
def convert(datasets, analysisTags, analysisPath, corpusName):
    global moveBI

    bigfileName = corpusName + "-" + "-and-".join([x[0] for x in datasets])
    documents = []
    for pair in datasets:
        print >> sys.stderr, "Reading", pair[0], "set,",
        sitesAreArguments = False
        if corpusName == "EPI":
            sitesAreArguments = True
        docs = ST.loadSet(pair[1],
                          pair[0],
                          "a2",
                          sitesAreArguments=sitesAreArguments)
        print >> sys.stderr, len(docs), "documents"
        documents.extend(docs)

    print >> sys.stderr, "Resolving equivalences"
    STFormat.Equiv.process(documents)

    print >> sys.stderr, "Checking data validity"
    for doc in documents:
        STFormat.Validate.validate(doc.events,
                                   simulation=True,
                                   verbose=True,
                                   docId=doc.id)
    print >> sys.stderr, "Writing all documents to geniaformat"
    ST.writeSet(documents,
                "all-geniaformat",
                resultFileTag="a2",
                debug=False,
                task=2,
                validate=False)

    print >> sys.stderr, "Converting to", bigfileName + "-documents.xml"
    xml = STConvert.toInteractionXML(documents, corpusName,
                                     bigfileName + "-documents.xml")

    if corpusName == "BI":
        InteractionXML.MixSets.mixSets(xml, None, set(moveBI), "train",
                                       "devel")

    for pair in datasets:
        if True:  #corpusName != "BI":
            print >> sys.stderr, "Adding analyses for set", pair[0]
            addAnalyses(xml, analysisTags[pair[0]], analysisPath, bigfileName)
    ETUtils.write(xml, bigfileName + "-sentences.xml")
    processParses(corpusName, xml)

    # Write out converted data
    ETUtils.write(xml, bigfileName + ".xml")
    InteractionXML.MergeDuplicateEntities.mergeAll(xml,
                                                   bigfileName + "-nodup.xml")
    for sourceTag in ["", "-nodup"]:
        print >> sys.stderr, "Dividing into sets"
        InteractionXML.DivideSets.processCorpus(
            bigfileName + sourceTag + ".xml", "./", corpusName + "-",
            sourceTag + ".xml", [("devel", "train")])
        if "devel" in [x[0] for x in datasets]:
            print >> sys.stderr, "Converting back"
            STConvert.toSTFormat(corpusName + "-devel" + sourceTag + ".xml",
                                 "roundtrip/" + corpusName + "-devel" +
                                 sourceTag + "-task2",
                                 outputTag="a2",
                                 task=2)
            STConvert.toSTFormat(corpusName + "-devel" + sourceTag + ".xml",
                                 "roundtrip/" + corpusName + "-devel" +
                                 sourceTag + "-task1",
                                 outputTag="a2",
                                 task=1)
            if corpusName == "GE":
                print >> sys.stderr, "Evaluating task 2 back-conversion"
                BioNLP11GeniaTools.evaluate("roundtrip/" + corpusName +
                                            "-devel" + sourceTag + "-task2",
                                            task=2,
                                            verbose=True,
                                            debug=False)
                print >> sys.stderr, "Evaluating task 1 back-conversion"
                BioNLP11GeniaTools.evaluate("roundtrip/" + corpusName +
                                            "-devel" + sourceTag + "-task1",
                                            task=1,
                                            verbose=True,
                                            debug=False)
            elif corpusName in ["BI", "BB"]:
                print >> sys.stderr, "Evaluating task 2 back-conversion"
                BioNLP11GeniaTools.evaluateBX(
                    "roundtrip/" + corpusName + "-devel" + sourceTag +
                    "-task2", corpusName)
                print >> sys.stderr, "Evaluating task 1 back-conversion"
                BioNLP11GeniaTools.evaluateBX(
                    "roundtrip/" + corpusName + "-devel" + sourceTag +
                    "-task1", corpusName)
            print >> sys.stderr, "Creating empty devel set"
            deletionRules = {"interaction": {}, "entity": {"isName": "False"}}
            InteractionXML.DeleteElements.processCorpus(
                corpusName + "-devel" + sourceTag + ".xml",
                corpusName + "-devel" + sourceTag + "-empty.xml",
                deletionRules)
Example #6
0
def convertDownloaded(outdir,
                      corpus,
                      files,
                      intermediateFiles=True,
                      evaluate=True):
    global moveBI
    workdir = outdir + "/conversion/" + corpus
    if os.path.exists(workdir):
        shutil.rmtree(workdir)
    os.makedirs(workdir)

    print >> sys.stderr, "---------------", "Converting to XML", "---------------"
    # All datasets are processed as one XML, to ensure all the steps (parse modification etc.) are
    # applied equally
    datasets = ["devel", "train", "test"]
    bigfileName = os.path.join(outdir, corpus + "-" + "-and-".join(datasets))
    documents = []
    for setName in datasets:
        sourceFile = files[corpus + "_" + setName.upper()]
        print >> sys.stderr, "Reading", setName, "set from", sourceFile, "temp at ",
        sitesAreArguments = False
        if corpus == "EPI":
            sitesAreArguments = True
        docs = ST.loadSet(sourceFile,
                          setName,
                          "a2",
                          sitesAreArguments=sitesAreArguments)
        print >> sys.stderr, "Read", len(docs), "documents"
        documents.extend(docs)

    print >> sys.stderr, "Resolving equivalences"
    STFormat.Equiv.process(documents)

    if evaluate:
        print >> sys.stderr, "Checking data validity"
        for doc in documents:
            STFormat.Validate.validate(doc.events,
                                       simulation=True,
                                       verbose=True,
                                       docId=doc.id)
        print >> sys.stderr, "Writing all documents to geniaformat"
        ST.writeSet(documents,
                    os.path.join(workdir, "all-geniaformat"),
                    resultFileTag="a2",
                    debug=False,
                    task=2,
                    validate=False)

    if intermediateFiles:
        print >> sys.stderr, "Converting to XML, writing combined corpus to", bigfileName + "-documents.xml"
        xml = STConvert.toInteractionXML(documents, corpus,
                                         bigfileName + "-documents.xml")
    else:
        print >> sys.stderr, "Converting to XML"
        xml = STConvert.toInteractionXML(documents, corpus, None)

    if corpus == "BI":
        InteractionXML.MixSets.mixSets(xml, None, set(moveBI), "train",
                                       "devel")

    addAnalyses(xml, corpus, datasets, files, bigfileName)
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName + "-sentences.xml"
        ETUtils.write(xml, bigfileName + "-sentences.xml")
    processParses(xml)

    print >> sys.stderr, "---------------", "Writing corpora", "---------------"
    # Write out converted data
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName + ".xml"
        ETUtils.write(xml, bigfileName + ".xml")
    print >> sys.stderr, "Dividing into sets"
    InteractionXML.DivideSets.processCorpus(xml, outdir, corpus, ".xml")

    if evaluate and "devel" in datasets:
        print >> sys.stderr, "---------------", "Evaluating conversion", "---------------"
        print >> sys.stderr, "Converting back"
        STConvert.toSTFormat(os.path.join(outdir, corpus + "-devel.xml"),
                             workdir + "/roundtrip/" + corpus + "-devel" +
                             "-task1",
                             outputTag="a2",
                             task=1)
        STConvert.toSTFormat(os.path.join(outdir, corpus + "-devel.xml"),
                             workdir + "/roundtrip/" + corpus + "-devel" +
                             "-task2",
                             outputTag="a2",
                             task=2)
        print >> sys.stderr, "Evaluating task 1 back-conversion"
        BioNLP11GeniaTools.evaluate(
            workdir + "/roundtrip/" + corpus + "-devel" + "-task1",
            corpus + ".1")
        print >> sys.stderr, "Evaluating task 2 back-conversion"
        BioNLP11GeniaTools.evaluate(
            workdir + "/roundtrip/" + corpus + "-devel" + "-task2",
            corpus + ".2")
        print >> sys.stderr, "Note! Evaluation of Task 2 back-conversion can be less than 100% due to site-argument mapping"
Example #7
0
def convertDownloaded(outdir, corpus, files, intermediateFiles=True, evaluate=True):
    global moveBI
    workdir = outdir + "/conversion/" + corpus
    if os.path.exists(workdir):
        shutil.rmtree(workdir)
    os.makedirs(workdir)
    
    print >> sys.stderr, "---------------", "Converting to XML", "---------------"
    # All datasets are processed as one XML, to ensure all the steps (parse modification etc.) are
    # applied equally
    datasets = ["devel", "train", "test"]
    bigfileName = os.path.join(outdir, corpus + "-" + "-and-".join(datasets))
    documents = []
    for setName in datasets:
        sourceFile = files[corpus + "_" + setName.upper()]
        print >> sys.stderr, "Reading", setName, "set from", sourceFile, "temp at ",
        sitesAreArguments = False
        if corpus == "EPI":
            sitesAreArguments = True
        docs = ST.loadSet(sourceFile, setName, "a2", sitesAreArguments=sitesAreArguments)
        print >> sys.stderr, "Read", len(docs), "documents"
        documents.extend(docs)
    
    print >> sys.stderr, "Resolving equivalences"
    STFormat.Equiv.process(documents)
    
    if evaluate:
        print >> sys.stderr, "Checking data validity"
        for doc in documents:
            STFormat.Validate.validate(doc.events, simulation=True, verbose=True, docId=doc.id)
        print >> sys.stderr, "Writing all documents to geniaformat"
        ST.writeSet(documents, os.path.join(workdir, "all-geniaformat"), resultFileTag="a2", debug=False, task=2, validate=False)
    
    if intermediateFiles:
        print >> sys.stderr, "Converting to XML, writing combined corpus to", bigfileName+"-documents.xml"
        xml = STConvert.toInteractionXML(documents, corpus, bigfileName+"-documents.xml")
    else:
        print >> sys.stderr, "Converting to XML"
        xml = STConvert.toInteractionXML(documents, corpus, None)
    
    if corpus == "BI":
        InteractionXML.MixSets.mixSets(xml, None, set(moveBI), "train", "devel")
    
    addAnalyses(xml, corpus, datasets, files, bigfileName)
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName+"-sentences.xml"
        ETUtils.write(xml, bigfileName+"-sentences.xml")
    processParses(xml)
    
    print >> sys.stderr, "---------------", "Writing corpora", "---------------"
    # Write out converted data
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName+".xml"
        ETUtils.write(xml, bigfileName+".xml")
    print >> sys.stderr, "Dividing into sets"
    InteractionXML.DivideSets.processCorpus(xml, outdir, corpus, ".xml")
    
    if evaluate and "devel" in datasets:
        print >> sys.stderr, "---------------", "Evaluating conversion", "---------------"
        print >> sys.stderr, "Converting back"
        STConvert.toSTFormat(os.path.join(outdir, corpus + "-devel.xml"), workdir + "/roundtrip/" + corpus + "-devel" + "-task1", outputTag="a2", task=1)
        STConvert.toSTFormat(os.path.join(outdir, corpus + "-devel.xml"), workdir + "/roundtrip/" + corpus + "-devel" + "-task2", outputTag="a2", task=2)
        print >> sys.stderr, "Evaluating task 1 back-conversion"
        BioNLP11GeniaTools.evaluate(workdir + "/roundtrip/" + corpus + "-devel" + "-task1", corpus + ".1")
        print >> sys.stderr, "Evaluating task 2 back-conversion"
        BioNLP11GeniaTools.evaluate(workdir + "/roundtrip/" + corpus + "-devel" + "-task2", corpus + ".2")
        print >> sys.stderr, "Note! Evaluation of Task 2 back-conversion can be less than 100% due to site-argument mapping"
Example #8
0
def removeX(filename, resultFileTag="a2"):
    documents = STTools.loadSet(filename)
    newFilename = os.path.join(tempfile.tempdir, filename.rsplit(".", 2)[0] + "-no-X.tar.gz")
    STTools.writeSet(documents, newFilename, resultFileTag=resultFileTag, writeExtra=False, files=["a2","rel"])
    return newFilename