Example #1
0
def convertDownloaded(outdir, corpus, files, intermediateFiles=True, evaluate=True):
    """
    Convert the devel, train and test sets of a downloaded corpus to XML.

    All three sets are loaded with ST.loadSet, merged into one document list,
    converted with STConvert.toInteractionXML, given analyses and processed
    parses, and finally divided back into per-set files under outdir.

    Args:
        outdir: directory that receives every output file.
        corpus: corpus name. It is used in output file names and as the prefix
            of the keys looked up in files. "EPI", "BI", "REN" and "REL"
            trigger corpus-specific handling.
        files: mapping with the keys corpus + "_DEVEL", corpus + "_TRAIN" and
            corpus + "_TEST". The values are passed to ST.loadSet as the source
            of each set. A missing key raises KeyError.
        intermediateFiles: if true, the combined "-documents.xml",
            "-sentences.xml" and ".xml" files are also written.
        evaluate: if true, outdir/conversion/corpus is deleted and recreated,
            the documents are validated and written there in geniaformat, and
            the devel set is converted back to Shared Task format and evaluated.

    The license of the first document of a set, when present, is written to
    outdir/corpus-LICENSE. If several sets carry a license, the one from the
    last loaded set is used.
    """
    global moveBI
    if evaluate:
        # The work directory is rebuilt from scratch, any earlier content is removed
        workdir = outdir + "/conversion/" + corpus
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir)
    
    print >> sys.stderr, "---------------", "Converting to XML", "---------------"
    # All datasets are processed as one XML, to ensure all the steps (parse modification etc.) are
    # applied equally
    datasets = ["devel", "train", "test"]
    bigfileName = os.path.join(outdir, corpus + "-" + "-and-".join(datasets))
    # The flag depends only on the corpus, so it is computed once for all sets
    sitesAreArguments = (corpus == "EPI")
    documents = []
    licenseText = None
    for setName in datasets:
        sourceFile = files[corpus + "_" + setName.upper()]
        print >> sys.stderr, "Reading", setName, "set from", sourceFile
        docs = ST.loadSet(sourceFile, setName, "a2", sitesAreArguments=sitesAreArguments)
        print >> sys.stderr, "Read", len(docs), "documents"
        documents.extend(docs)
        # Remember the license per set, so that an empty or unlicensed last set
        # does not hide a license found in an earlier one
        if len(docs) > 0 and docs[0].license is not None:
            licenseText = docs[0].license
    
    if licenseText is not None:
        licenseFile = open(os.path.join(outdir, corpus + "-LICENSE"), "wt")
        try:
            licenseFile.write(licenseText)
        finally:
            # Close the handle even if the write fails
            licenseFile.close()
    
    print >> sys.stderr, "Resolving equivalences"
    Utils.STFormat.Equiv.process(documents)
    
    if evaluate:
        print >> sys.stderr, "Checking data validity"
        for doc in documents:
            Utils.STFormat.Validate.validate(doc.events, simulation=True, verbose=True, docId=doc.id)
        print >> sys.stderr, "Writing all documents to geniaformat"
        ST.writeSet(documents, os.path.join(workdir, "all-geniaformat"), resultFileTag="a2", debug=False, task=2, validate=False)
    
    if intermediateFiles:
        print >> sys.stderr, "Converting to XML, writing combined corpus to", bigfileName+"-documents.xml"
        xml = STConvert.toInteractionXML(documents, corpus, bigfileName+"-documents.xml")
    else:
        print >> sys.stderr, "Converting to XML"
        xml = STConvert.toInteractionXML(documents, corpus, None)
    
    if corpus == "BI":
        Utils.InteractionXML.MixSets.mixSets(xml, None, set(moveBI), "train", "devel")
    if corpus == "REN":
        corpusRENtoASCII(xml)
    
    addAnalyses(xml, corpus, datasets, files, bigfileName)
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName+"-sentences.xml"
        ETUtils.write(xml, bigfileName+"-sentences.xml")
    processParses(xml)
    
    print >> sys.stderr, "---------------", "Writing corpora", "---------------"
    # Write out converted data
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName+".xml"
        ETUtils.write(xml, bigfileName+".xml")
    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outdir, corpus, ".xml")
    
    if evaluate and "devel" in datasets:
        print >> sys.stderr, "---------------", "Evaluating conversion", "---------------"
        develXML = os.path.join(outdir, corpus + "-devel.xml")
        task1Dir = workdir + "/roundtrip/" + corpus + "-devel" + "-task1"
        task2Dir = workdir + "/roundtrip/" + corpus + "-devel" + "-task2"
        if corpus != "REL": # Task 1 (removal of Entity-entities) cannot work for REL
            print >> sys.stderr, "Evaluating task 1 back-conversion"
            STConvert.toSTFormat(develXML, task1Dir, outputTag="a2", task=1)
            BioNLP11GeniaTools.evaluate(task1Dir, corpus + ".1")
        print >> sys.stderr, "Evaluating task 2 back-conversion"
        STConvert.toSTFormat(develXML, task2Dir, outputTag="a2", task=2)
        BioNLP11GeniaTools.evaluate(task2Dir, corpus + ".2")
        print >> sys.stderr, "Note! Evaluation of Task 2 back-conversion can be less than 100% due to site-argument mapping"
Example #2
0
def convertDownloaded(outdir, corpus, files, intermediateFiles=True, evaluate=True, processEquiv=True, addAnalyses=True, packageSubPath=None):
    """
    Convert the available sets of a downloaded corpus to XML.

    Every set (devel, train, test) that has an entry in files is loaded with
    ST.loadSet. The documents are merged, converted with
    STConvert.toInteractionXML, optionally given analyses, and divided back
    into per-set files under outdir. A structure analysis of the result is
    printed to stderr at the end.

    Args:
        outdir: directory that receives every output file.
        corpus: corpus name. It is used in output file names and as the prefix
            of the keys looked up in files. "BI11", "REN11", "REL11" and
            "GRN13" trigger corpus-specific handling.
        files: mapping whose keys are corpus + "_" + the upper-case set name
            (e.g. "BI11_DEVEL"). Sets without a key are skipped. The values are
            passed to ST.loadSet as the source of each set.
        intermediateFiles: if true, the combined "-documents.xml",
            "-sentences.xml" and ".xml" files are also written.
        evaluate: if true, outdir/conversion/corpus is deleted and recreated,
            the documents are written there in geniaformat, and, when a devel
            set was loaded, it is converted back to Shared Task format and
            evaluated.
        processEquiv: if true, Utils.STFormat.Equiv.process is applied to the
            loaded documents.
        addAnalyses: if true, insertAnalyses is called on the converted XML.
        packageSubPath: forwarded to ST.loadSet (as subPath) and to
            insertAnalyses.

    The license of the first document of a set, when present, is written to
    outdir/corpus-LICENSE. If several sets carry a license, the one from the
    last loaded set is used.
    """
    global moveBI
    if evaluate:
        # The work directory is rebuilt from scratch, any earlier content is removed
        workdir = outdir + "/conversion/" + corpus
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir)
    
    print >> sys.stderr, "---------------", "Converting to XML", "---------------"
    # All datasets are processed as one XML, to ensure all the steps (parse modification etc.) are
    # applied equally
    datasets = []
    for setName in ["devel", "train", "test"]:
        if corpus + "_" + setName.upper() in files:
            datasets.append(setName)
    bigfileName = os.path.join(outdir, corpus + "-" + "-and-".join(datasets))
    documents = []
    # Tracked inside the loop: datasets may be empty, in which case no
    # per-set document list ever exists to inspect afterwards
    licenseText = None
    for setName in datasets:
        sourceFile = files[corpus + "_" + setName.upper()]
        print >> sys.stderr, "Reading", setName, "set from", sourceFile
        docs = ST.loadSet(sourceFile, setName, "a2", subPath=packageSubPath)
        print >> sys.stderr, "Read", len(docs), "documents"
        documents.extend(docs)
        if len(docs) > 0 and docs[0].license is not None:
            licenseText = docs[0].license
    
    if licenseText is not None:
        licenseFile = open(os.path.join(outdir, corpus + "-LICENSE"), "wt")
        try:
            licenseFile.write(licenseText)
        finally:
            # Close the handle even if the write fails
            licenseFile.close()
    
    if processEquiv:
        print >> sys.stderr, "Resolving equivalences"
        Utils.STFormat.Equiv.process(documents)
    else:
        print >> sys.stderr, "Skipping resolving of equivalences"
    
    if evaluate:
        #print >> sys.stderr, "Checking data validity"
        #for doc in documents:
        #    Utils.STFormat.Validate.validate(doc.events, simulation=True, verbose=True, docId=doc.id)
        print >> sys.stderr, "Writing all documents to geniaformat"
        ST.writeSet(documents, os.path.join(workdir, "all-geniaformat"), resultFileTag="a2", debug=False)
    
    if intermediateFiles:
        print >> sys.stderr, "Converting to XML, writing combined corpus to", bigfileName+"-documents.xml"
        xml = STConvert.toInteractionXML(documents, corpus, bigfileName+"-documents.xml")
    else:
        print >> sys.stderr, "Converting to XML"
        xml = STConvert.toInteractionXML(documents, corpus, None)
    
    if corpus == "BI11":
        Utils.InteractionXML.MixSets.mixSets(xml, None, set(moveBI), "train", "devel")
    if corpus == "REN11":
        corpusRENtoASCII(xml)
    
    if addAnalyses:
        insertAnalyses(xml, corpus, datasets, files, bigfileName, packageSubPath=packageSubPath)
    else:
        print >> sys.stderr, "Skipping adding analyses"
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName+"-sentences.xml"
        ETUtils.write(xml, bigfileName+"-sentences.xml")
    processParses(xml)
    
    # A hack for GRN13 task that breaks the official BioNLP Shared Task convention of trigger and event having the same type.
    # Let's remove the unused triggers, so that there won't be an unusable node class. There is no clean way to fix this,
    # as the GRN13 task not following the official rules introduces yet another mechanism into the Shared Task format,
    # and supporting this would require rewriting everything.
    if corpus == "GRN13":
        Utils.InteractionXML.DeleteElements.processCorpus(xml, None, {"entity":{"type":["Action"]}})
    
    print >> sys.stderr, "---------------", "Writing corpora", "---------------"
    checkAttributes(xml)
    # Write out converted data
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName+".xml"
        ETUtils.write(xml, bigfileName+".xml")
    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outdir, corpus, ".xml")
    
    if evaluate and "devel" in datasets:
        print >> sys.stderr, "---------------", "Evaluating conversion", "---------------"
        develXML = os.path.join(outdir, corpus + "-devel.xml")
        task1Dir = workdir + "/roundtrip/" + corpus + "-devel" + "-task1"
        task2Dir = workdir + "/roundtrip/" + corpus + "-devel" + "-task2"
        if corpus != "REL11": # Task 1 (removal of Entity-entities) cannot work for REL
            print >> sys.stderr, "Evaluating task 1 back-conversion"
            STConvert.toSTFormat(develXML, task1Dir, outputTag="a2", skipArgs=["Site"])
            BioNLP11GeniaTools.evaluate(task1Dir, corpus + ".1")
        print >> sys.stderr, "Evaluating task 2 back-conversion"
        STConvert.toSTFormat(develXML, task2Dir, outputTag="a2")
        BioNLP11GeniaTools.evaluate(task2Dir, corpus + ".2")
        print >> sys.stderr, "Note! Evaluation of Task 2 back-conversion can be less than 100% due to site-argument mapping"
    
    # Check what was produced by the conversion
    print >> sys.stderr, "---------------", "Corpus Structure Analysis", "---------------"
    analyzer = StructureAnalyzer()
    analyzer.analyze([xml])
    print >> sys.stderr, analyzer.toString()
def convertDownloaded(outdir,
                      corpus,
                      files,
                      intermediateFiles=True,
                      evaluate=True,
                      processEquiv=True,
                      analysisMode="INSERT",
                      packageSubPath=None,
                      debug=False):
    """
    Convert the available sets of a downloaded corpus to XML.

    Every set (devel, train, test) that has an entry in files is loaded with
    ST.loadSet. The documents are merged, converted with
    STConvert.toInteractionXML, given analyses according to analysisMode, and
    divided back into per-set files under outdir. A structure analysis of the
    result is printed to stderr at the end.

    Args:
        outdir: directory that receives every output file.
        corpus: corpus name. It is used in output file names and as the prefix
            of the keys looked up in files. "BI11", "REN11", "REL11", "GRN13"
            and names starting with "BB_" trigger corpus-specific handling.
        files: mapping whose keys are corpus + "_" + the upper-case set name
            (e.g. "BI11_DEVEL"). Sets without a key are skipped. The values are
            passed to ST.loadSet as the source of each set.
        intermediateFiles: if true, the combined "-documents.xml" and ".xml"
            files are written, plus "-sentences.xml" in "INSERT" mode. The flag
            is also forwarded to parseXML in "BUILD" mode.
        evaluate: if true, outdir/conversion/corpus is deleted and recreated,
            the documents are written there in geniaformat, and, when a devel
            set was loaded, it is converted back to Shared Task format and
            evaluated.
        processEquiv: if true, Utils.STFormat.Equiv.process is applied to the
            loaded documents.
        analysisMode: "INSERT" calls insertAnalyses followed by processParses,
            "BUILD" calls parseXML, any other value skips the analyses.
        packageSubPath: forwarded to ST.loadSet (as subPath) and to
            insertAnalyses.
        debug: forwarded to parseXML in "BUILD" mode.

    The license of the first document of a set, when present, is written to
    outdir/corpus-LICENSE. If several sets carry a license, the one from the
    last loaded set is used.
    """
    global moveBI
    if evaluate:
        # The work directory is rebuilt from scratch, any earlier content is removed
        workdir = outdir + "/conversion/" + corpus
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir)

    print >> sys.stderr, "---------------", "Converting to XML", "---------------"
    # All datasets are processed as one XML, to ensure all the steps (parse modification etc.) are
    # applied equally
    datasets = []
    for setName in ["devel", "train", "test"]:
        if corpus + "_" + setName.upper() in files:
            datasets.append(setName)
    bigfileName = os.path.join(outdir, corpus + "-" + "-and-".join(datasets))
    documents = []
    # Tracked inside the loop: datasets may be empty, in which case no
    # per-set document list ever exists to inspect afterwards
    licenseText = None
    for setName in datasets:
        sourceFile = files[corpus + "_" + setName.upper()]
        print >> sys.stderr, "Reading", setName, "set from", sourceFile
        docs = ST.loadSet(sourceFile, setName, "a2", subPath=packageSubPath)
        print >> sys.stderr, "Read", len(docs), "documents"
        documents.extend(docs)
        if len(docs) > 0 and docs[0].license is not None:
            licenseText = docs[0].license

    if licenseText is not None:
        licenseFile = open(os.path.join(outdir, corpus + "-LICENSE"), "wt")
        try:
            licenseFile.write(licenseText)
        finally:
            # Close the handle even if the write fails
            licenseFile.close()

    if processEquiv:
        print >> sys.stderr, "Resolving equivalences"
        Utils.STFormat.Equiv.process(documents)
    else:
        print >> sys.stderr, "Skipping resolving of equivalences"

    if evaluate:
        #print >> sys.stderr, "Checking data validity"
        #for doc in documents:
        #    Utils.STFormat.Validate.validate(doc.events, simulation=True, verbose=True, docId=doc.id)
        print >> sys.stderr, "Writing all documents to geniaformat"
        ST.writeSet(documents,
                    os.path.join(workdir, "all-geniaformat"),
                    resultFileTag="a2",
                    debug=False)

    if intermediateFiles:
        print >> sys.stderr, "Converting to XML, writing combined corpus to", bigfileName + "-documents.xml"
        xml = STConvert.toInteractionXML(documents, corpus,
                                         bigfileName + "-documents.xml")
    else:
        print >> sys.stderr, "Converting to XML"
        xml = STConvert.toInteractionXML(documents, corpus, None)

    if corpus == "BI11":
        Utils.InteractionXML.MixSets.mixSets(xml, None, set(moveBI), "train",
                                             "devel")
    if corpus == "REN11":
        corpusRENtoASCII(xml)

    if analysisMode == "INSERT":
        insertAnalyses(xml,
                       corpus,
                       datasets,
                       files,
                       bigfileName,
                       packageSubPath=packageSubPath)
        if intermediateFiles:
            print >> sys.stderr, "Writing combined corpus", bigfileName + "-sentences.xml"
            ETUtils.write(xml, bigfileName + "-sentences.xml")
        processParses(xml)
    elif analysisMode == "BUILD":
        parseXML(xml,
                 bigfileName,
                 intermediateFiles,
                 debug,
                 bbResources=(corpus.startswith("BB_")))
    else:
        print >> sys.stderr, "Skipping analyses"

    # A hack for GRN13 task that breaks the official BioNLP Shared Task convention of trigger and event having the same type.
    # Let's remove the unused triggers, so that there won't be an unusable node class. There is no clean way to fix this,
    # as the GRN13 task not following the official rules introduces yet another mechanism into the Shared Task format,
    # and supporting this would require rewriting everything.
    if corpus == "GRN13":
        Utils.InteractionXML.DeleteElements.processCorpus(
            xml, None, {"entity": {
                "type": ["Action"]
            }})

    print >> sys.stderr, "---------------", "Writing corpora", "---------------"
    checkAttributes(xml)
    # Write out converted data
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName + ".xml"
        ETUtils.write(xml, bigfileName + ".xml")
    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outdir, corpus, ".xml")

    if evaluate and "devel" in datasets:
        print >> sys.stderr, "---------------", "Evaluating conversion", "---------------"
        develXML = os.path.join(outdir, corpus + "-devel.xml")
        task1Dir = workdir + "/roundtrip/" + corpus + "-devel" + "-task1"
        task2Dir = workdir + "/roundtrip/" + corpus + "-devel" + "-task2"
        if corpus != "REL11":  # Task 1 (removal of Entity-entities) cannot work for REL
            print >> sys.stderr, "Evaluating task 1 back-conversion"
            STConvert.toSTFormat(develXML,
                                 task1Dir,
                                 outputTag="a2",
                                 skipArgs=["Site"])
            BioNLP11GeniaTools.evaluate(task1Dir, corpus + ".1")
        print >> sys.stderr, "Evaluating task 2 back-conversion"
        STConvert.toSTFormat(develXML, task2Dir, outputTag="a2")
        BioNLP11GeniaTools.evaluate(task2Dir, corpus + ".2")
        print >> sys.stderr, "Note! Evaluation of Task 2 back-conversion can be less than 100% due to site-argument mapping"

    # Check what was produced by the conversion
    print >> sys.stderr, "---------------", "Corpus Structure Analysis", "---------------"
    analyzer = StructureAnalyzer()
    analyzer.analyze([xml])
    print >> sys.stderr, analyzer.toString()
def convertDownloaded(outdir,
                      corpus,
                      files,
                      intermediateFiles=True,
                      evaluate=True,
                      processEquiv=True,
                      addAnalyses=True,
                      packageSubPath=None):
    """
    Convert the available sets of a downloaded corpus to XML.

    Every set (devel, train, test) that has an entry in files is loaded with
    ST.loadSet. The documents are merged, converted with
    STConvert.toInteractionXML, optionally given analyses, and divided back
    into per-set files under outdir. A structure analysis of the result is
    printed to stderr at the end.

    Args:
        outdir: directory that receives every output file.
        corpus: corpus name. It is used in output file names and as the prefix
            of the keys looked up in files. "BI11", "REN11" and "REL11"
            trigger corpus-specific handling.
        files: mapping whose keys are corpus + "_" + the upper-case set name
            (e.g. "BI11_DEVEL"). Sets without a key are skipped. The values are
            passed to ST.loadSet as the source of each set.
        intermediateFiles: if true, the combined "-documents.xml",
            "-sentences.xml" and ".xml" files are also written.
        evaluate: if true, outdir/conversion/corpus is deleted and recreated,
            the documents are written there in geniaformat, and, when a devel
            set was loaded, it is converted back to Shared Task format and
            evaluated.
        processEquiv: if true, Utils.STFormat.Equiv.process is applied to the
            loaded documents.
        addAnalyses: if true, insertAnalyses is called on the converted XML.
        packageSubPath: forwarded to ST.loadSet (as subPath) and to
            insertAnalyses.

    The license of the first document of a set, when present, is written to
    outdir/corpus-LICENSE. If several sets carry a license, the one from the
    last loaded set is used.
    """
    global moveBI
    if evaluate:
        # The work directory is rebuilt from scratch, any earlier content is removed
        workdir = outdir + "/conversion/" + corpus
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir)

    print >> sys.stderr, "---------------", "Converting to XML", "---------------"
    # All datasets are processed as one XML, to ensure all the steps (parse modification etc.) are
    # applied equally
    datasets = []
    for setName in ["devel", "train", "test"]:
        if corpus + "_" + setName.upper() in files:
            datasets.append(setName)
    bigfileName = os.path.join(outdir, corpus + "-" + "-and-".join(datasets))
    documents = []
    # Tracked inside the loop: datasets may be empty, in which case no
    # per-set document list ever exists to inspect afterwards
    licenseText = None
    for setName in datasets:
        sourceFile = files[corpus + "_" + setName.upper()]
        print >> sys.stderr, "Reading", setName, "set from", sourceFile
        docs = ST.loadSet(sourceFile, setName, "a2", subPath=packageSubPath)
        print >> sys.stderr, "Read", len(docs), "documents"
        documents.extend(docs)
        if len(docs) > 0 and docs[0].license is not None:
            licenseText = docs[0].license

    if licenseText is not None:
        licenseFile = open(os.path.join(outdir, corpus + "-LICENSE"), "wt")
        try:
            licenseFile.write(licenseText)
        finally:
            # Close the handle even if the write fails
            licenseFile.close()

    if processEquiv:
        print >> sys.stderr, "Resolving equivalences"
        Utils.STFormat.Equiv.process(documents)
    else:
        print >> sys.stderr, "Skipping resolving of equivalences"

    if evaluate:
        #print >> sys.stderr, "Checking data validity"
        #for doc in documents:
        #    Utils.STFormat.Validate.validate(doc.events, simulation=True, verbose=True, docId=doc.id)
        print >> sys.stderr, "Writing all documents to geniaformat"
        ST.writeSet(documents,
                    os.path.join(workdir, "all-geniaformat"),
                    resultFileTag="a2",
                    debug=False)

    if intermediateFiles:
        print >> sys.stderr, "Converting to XML, writing combined corpus to", bigfileName + "-documents.xml"
        xml = STConvert.toInteractionXML(documents, corpus,
                                         bigfileName + "-documents.xml")
    else:
        print >> sys.stderr, "Converting to XML"
        xml = STConvert.toInteractionXML(documents, corpus, None)

    if corpus == "BI11":
        Utils.InteractionXML.MixSets.mixSets(xml, None, set(moveBI), "train",
                                             "devel")
    if corpus == "REN11":
        corpusRENtoASCII(xml)

    if addAnalyses:
        insertAnalyses(xml,
                       corpus,
                       datasets,
                       files,
                       bigfileName,
                       packageSubPath=packageSubPath)
    else:
        print >> sys.stderr, "Skipping adding analyses"
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName + "-sentences.xml"
        ETUtils.write(xml, bigfileName + "-sentences.xml")
    processParses(xml)

    print >> sys.stderr, "---------------", "Writing corpora", "---------------"
    checkAttributes(xml)
    # Write out converted data
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName + ".xml"
        ETUtils.write(xml, bigfileName + ".xml")
    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outdir, corpus, ".xml")

    if evaluate and "devel" in datasets:
        print >> sys.stderr, "---------------", "Evaluating conversion", "---------------"
        develXML = os.path.join(outdir, corpus + "-devel.xml")
        task1Dir = workdir + "/roundtrip/" + corpus + "-devel" + "-task1"
        task2Dir = workdir + "/roundtrip/" + corpus + "-devel" + "-task2"
        if corpus != "REL11":  # Task 1 (removal of Entity-entities) cannot work for REL
            print >> sys.stderr, "Evaluating task 1 back-conversion"
            STConvert.toSTFormat(develXML,
                                 task1Dir,
                                 outputTag="a2",
                                 skipArgs=["Site"])
            BioNLP11GeniaTools.evaluate(task1Dir, corpus + ".1")
        print >> sys.stderr, "Evaluating task 2 back-conversion"
        STConvert.toSTFormat(develXML, task2Dir, outputTag="a2")
        BioNLP11GeniaTools.evaluate(task2Dir, corpus + ".2")
        print >> sys.stderr, "Note! Evaluation of Task 2 back-conversion can be less than 100% due to site-argument mapping"

    # Check what was produced by the conversion
    print >> sys.stderr, "---------------", "Corpus Structure Analysis", "---------------"
    analyzer = StructureAnalyzer()
    analyzer.analyze([xml])
    print >> sys.stderr, analyzer.toString()