Example #1
0
def convertDownloaded(outdir, corpus, files, intermediateFiles=True, evaluate=True):
    """Convert a downloaded BioNLP Shared Task corpus to Interaction XML.

    The devel, train and test sets are loaded and processed as one combined
    XML so that all processing steps (parse modification etc.) are applied
    equally, then the result is divided back into per-set XML files in outdir.

    outdir -- directory where the output XML files are written
    corpus -- corpus identifier, e.g. "EPI", "BI", "REL" or "REN"
    files -- dictionary mapping "<CORPUS>_<SETNAME>" keys to source files
    intermediateFiles -- if True, also write the intermediate combined
        corpus XML files in addition to the final per-set files
    evaluate -- if True, validate the loaded data and round-trip the devel
        set back to ST format for evaluation
    """
    # moveBI is defined at module level (not visible in this block);
    # presumably document ids to redistribute between BI sets -- TODO confirm.
    global moveBI
    if evaluate:
        # Start from a fresh working directory for evaluation artifacts
        workdir = outdir + "/conversion/" + corpus
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir)
    
    print >> sys.stderr, "---------------", "Converting to XML", "---------------"
    # All datasets are processed as one XML, to ensure all the steps (parse modification etc.) are
    # applied equally
    datasets = ["devel", "train", "test"]
    bigfileName = os.path.join(outdir, corpus + "-" + "-and-".join(datasets))
    documents = []
    for setName in datasets:
        sourceFile = files[corpus + "_" + setName.upper()]
        print >> sys.stderr, "Reading", setName, "set from", sourceFile
        # Only the EPI corpus treats site-arguments as full arguments
        sitesAreArguments = False
        if corpus == "EPI":
            sitesAreArguments = True
        docs = ST.loadSet(sourceFile, setName, "a2", sitesAreArguments=sitesAreArguments)
        print >> sys.stderr, "Read", len(docs), "documents"
        documents.extend(docs)
    
    # NOTE(review): the license text is taken from the LAST loaded set's
    # documents (docs), not from all of documents -- presumably identical
    # across the sets of one corpus; confirm.
    if len(docs) > 0 and docs[0].license != None:
        licenseFile = open(os.path.join(outdir, corpus + "-LICENSE"), "wt")
        licenseFile.write(docs[0].license)
        licenseFile.close()
    
    print >> sys.stderr, "Resolving equivalences"
    Utils.STFormat.Equiv.process(documents)
    
    if evaluate:
        print >> sys.stderr, "Checking data validity"
        for doc in documents:
            # simulation=True: report problems only, do not modify the events
            Utils.STFormat.Validate.validate(doc.events, simulation=True, verbose=True, docId=doc.id)
        print >> sys.stderr, "Writing all documents to geniaformat"
        ST.writeSet(documents, os.path.join(workdir, "all-geniaformat"), resultFileTag="a2", debug=False, task=2, validate=False)
    
    if intermediateFiles:
        print >> sys.stderr, "Converting to XML, writing combined corpus to", bigfileName+"-documents.xml"
        xml = STConvert.toInteractionXML(documents, corpus, bigfileName+"-documents.xml")
    else:
        print >> sys.stderr, "Converting to XML"
        xml = STConvert.toInteractionXML(documents, corpus, None)
    
    if corpus == "BI":
        # Redistribute the moveBI documents between the train and devel sets
        Utils.InteractionXML.MixSets.mixSets(xml, None, set(moveBI), "train", "devel")
    if corpus == "REN":
        corpusRENtoASCII(xml)
    
    addAnalyses(xml, corpus, datasets, files, bigfileName)
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName+"-sentences.xml"
        ETUtils.write(xml, bigfileName+"-sentences.xml")
    processParses(xml)
    
    print >> sys.stderr, "---------------", "Writing corpora", "---------------"
    # Write out converted data
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName+".xml"
        ETUtils.write(xml, bigfileName+".xml")
    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outdir, corpus, ".xml")
    
    # Round-trip the devel set back to ST format and score it against the
    # original annotation to verify the conversion is loss-free.
    if evaluate and "devel" in datasets:
        print >> sys.stderr, "---------------", "Evaluating conversion", "---------------"
        if corpus != "REL": # Task 1 (removal of Entity-entities) cannot work for REL
            print >> sys.stderr, "Evaluating task 1 back-conversion"
            STConvert.toSTFormat(os.path.join(outdir, corpus + "-devel.xml"), workdir + "/roundtrip/" + corpus + "-devel" + "-task1", outputTag="a2", task=1)
            BioNLP11GeniaTools.evaluate(workdir + "/roundtrip/" + corpus + "-devel" + "-task1", corpus + ".1")
        print >> sys.stderr, "Evaluating task 2 back-conversion"
        STConvert.toSTFormat(os.path.join(outdir, corpus + "-devel.xml"), workdir + "/roundtrip/" + corpus + "-devel" + "-task2", outputTag="a2", task=2)
        BioNLP11GeniaTools.evaluate(workdir + "/roundtrip/" + corpus + "-devel" + "-task2", corpus + ".2")
        print >> sys.stderr, "Note! Evaluation of Task 2 back-conversion can be less than 100% due to site-argument mapping"
def convert(datasets, analysisTags, analysisPath, corpusName):
    """Convert BioNLP Shared Task datasets into a combined Interaction XML.

    The sets are loaded and processed as one XML, written out both as a
    combined corpus (with and without duplicate entities) and divided into
    per-set files, and the devel set is round-tripped back to ST format and
    evaluated where an evaluator exists (GE, BI, BB).

    datasets -- list of (setName, sourceFile) pairs, e.g. ("devel", path)
    analysisTags -- dict mapping a set name to its analysis tags
    analysisPath -- path under which the analysis files are found
    corpusName -- corpus identifier, e.g. "GE", "BI", "BB" or "EPI"
    """
    # moveBI is defined at module level (not visible in this block);
    # presumably document ids to redistribute between BI sets -- TODO confirm.
    global moveBI

    bigfileName = corpusName + "-" + "-and-".join([x[0] for x in datasets])
    documents = []
    for pair in datasets:
        print >> sys.stderr, "Reading", pair[0], "set,",
        # Only the EPI corpus treats site-arguments as full arguments
        sitesAreArguments = False
        if corpusName == "EPI":
            sitesAreArguments = True
        docs = ST.loadSet(pair[1],
                          pair[0],
                          "a2",
                          sitesAreArguments=sitesAreArguments)
        print >> sys.stderr, len(docs), "documents"
        documents.extend(docs)

    print >> sys.stderr, "Resolving equivalences"
    STFormat.Equiv.process(documents)

    print >> sys.stderr, "Checking data validity"
    for doc in documents:
        # simulation=True: report problems only, do not modify the events
        STFormat.Validate.validate(doc.events,
                                   simulation=True,
                                   verbose=True,
                                   docId=doc.id)
    print >> sys.stderr, "Writing all documents to geniaformat"
    ST.writeSet(documents,
                "all-geniaformat",
                resultFileTag="a2",
                debug=False,
                task=2,
                validate=False)

    print >> sys.stderr, "Converting to", bigfileName + "-documents.xml"
    xml = STConvert.toInteractionXML(documents, corpusName,
                                     bigfileName + "-documents.xml")

    if corpusName == "BI":
        # Redistribute the moveBI documents between the train and devel sets
        InteractionXML.MixSets.mixSets(xml, None, set(moveBI), "train",
                                       "devel")

    # This loop was wrapped in a permanently-disabled guard
    # ("if True:  #corpusName != 'BI'"), so analyses are added for every
    # corpus; the dead conditional has been removed.
    for pair in datasets:
        print >> sys.stderr, "Adding analyses for set", pair[0]
        addAnalyses(xml, analysisTags[pair[0]], analysisPath, bigfileName)
    ETUtils.write(xml, bigfileName + "-sentences.xml")
    processParses(corpusName, xml)

    # Write out converted data
    ETUtils.write(xml, bigfileName + ".xml")
    InteractionXML.MergeDuplicateEntities.mergeAll(xml,
                                                   bigfileName + "-nodup.xml")
    for sourceTag in ["", "-nodup"]:
        print >> sys.stderr, "Dividing into sets"
        InteractionXML.DivideSets.processCorpus(
            bigfileName + sourceTag + ".xml", "./", corpusName + "-",
            sourceTag + ".xml", [("devel", "train")])
        if "devel" in [x[0] for x in datasets]:
            print >> sys.stderr, "Converting back"
            STConvert.toSTFormat(corpusName + "-devel" + sourceTag + ".xml",
                                 "roundtrip/" + corpusName + "-devel" +
                                 sourceTag + "-task2",
                                 outputTag="a2",
                                 task=2)
            STConvert.toSTFormat(corpusName + "-devel" + sourceTag + ".xml",
                                 "roundtrip/" + corpusName + "-devel" +
                                 sourceTag + "-task1",
                                 outputTag="a2",
                                 task=1)
            # GE has its own evaluator; BI and BB share the BX evaluator;
            # other corpora are not evaluated here.
            if corpusName == "GE":
                print >> sys.stderr, "Evaluating task 2 back-conversion"
                BioNLP11GeniaTools.evaluate("roundtrip/" + corpusName +
                                            "-devel" + sourceTag + "-task2",
                                            task=2,
                                            verbose=True,
                                            debug=False)
                print >> sys.stderr, "Evaluating task 1 back-conversion"
                BioNLP11GeniaTools.evaluate("roundtrip/" + corpusName +
                                            "-devel" + sourceTag + "-task1",
                                            task=1,
                                            verbose=True,
                                            debug=False)
            elif corpusName in ["BI", "BB"]:
                print >> sys.stderr, "Evaluating task 2 back-conversion"
                BioNLP11GeniaTools.evaluateBX(
                    "roundtrip/" + corpusName + "-devel" + sourceTag +
                    "-task2", corpusName)
                print >> sys.stderr, "Evaluating task 1 back-conversion"
                BioNLP11GeniaTools.evaluateBX(
                    "roundtrip/" + corpusName + "-devel" + sourceTag +
                    "-task1", corpusName)
            print >> sys.stderr, "Creating empty devel set"
            # Remove all interactions and all non-name entities to produce
            # an "empty" devel set for prediction experiments
            deletionRules = {"interaction": {}, "entity": {"isName": "False"}}
            InteractionXML.DeleteElements.processCorpus(
                corpusName + "-devel" + sourceTag + ".xml",
                corpusName + "-devel" + sourceTag + "-empty.xml",
                deletionRules)
def convertDownloaded(outdir,
                      corpus,
                      files,
                      intermediateFiles=True,
                      evaluate=True,
                      processEquiv=True,
                      analysisMode="INSERT",
                      packageSubPath=None,
                      debug=False):
    """Convert a downloaded BioNLP Shared Task corpus to Interaction XML.

    Only the sets for which a "<CORPUS>_<SETNAME>" key exists in files are
    processed; they are handled as one combined XML so every processing step
    is applied equally, then the result is divided into per-set files.

    outdir -- directory where the output XML files are written
    corpus -- corpus identifier, e.g. "BI11", "REN11", "REL11" or "GRN13"
    files -- dictionary mapping "<CORPUS>_<SETNAME>" keys to source files
    intermediateFiles -- if True, also write the intermediate combined XMLs
    evaluate -- if True, write the geniaformat dump and round-trip the devel
        set back to ST format for evaluation
    processEquiv -- if True, resolve ST-format Equiv statements
    analysisMode -- "INSERT" inserts pre-existing analyses from files,
        "BUILD" parses the corpus with parseXML, any other value skips
        analyses entirely
    packageSubPath -- optional subpath inside the source packages
    debug -- passed to parseXML when analysisMode == "BUILD"
    """
    # moveBI is defined at module level (not visible in this block);
    # presumably document ids to redistribute between BI11 sets -- TODO confirm.
    global moveBI
    if evaluate:
        # Start from a fresh working directory for evaluation artifacts
        workdir = outdir + "/conversion/" + corpus
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir)

    print >> sys.stderr, "---------------", "Converting to XML", "---------------"
    # All datasets are processed as one XML, to ensure all the steps (parse modification etc.) are
    # applied equally
    datasets = []
    for setName in ["devel", "train", "test"]:
        if corpus + "_" + setName.upper() in files:
            datasets.append(setName)
    bigfileName = os.path.join(outdir, corpus + "-" + "-and-".join(datasets))
    documents = []
    for setName in datasets:
        sourceFile = files[corpus + "_" + setName.upper()]
        print >> sys.stderr, "Reading", setName, "set from", sourceFile
        docs = ST.loadSet(sourceFile, setName, "a2", subPath=packageSubPath)
        print >> sys.stderr, "Read", len(docs), "documents"
        documents.extend(docs)

    # NOTE(review): the license text is taken from the LAST loaded set's
    # documents (docs), not from all of documents -- presumably identical
    # across the sets of one corpus; confirm.
    if len(docs) > 0 and docs[0].license != None:
        licenseFile = open(os.path.join(outdir, corpus + "-LICENSE"), "wt")
        licenseFile.write(docs[0].license)
        licenseFile.close()

    if processEquiv:
        print >> sys.stderr, "Resolving equivalences"
        Utils.STFormat.Equiv.process(documents)
    else:
        print >> sys.stderr, "Skipping resolving of equivalences"

    if evaluate:
        #print >> sys.stderr, "Checking data validity"
        #for doc in documents:
        #    Utils.STFormat.Validate.validate(doc.events, simulation=True, verbose=True, docId=doc.id)
        print >> sys.stderr, "Writing all documents to geniaformat"
        ST.writeSet(documents,
                    os.path.join(workdir, "all-geniaformat"),
                    resultFileTag="a2",
                    debug=False)

    if intermediateFiles:
        print >> sys.stderr, "Converting to XML, writing combined corpus to", bigfileName + "-documents.xml"
        xml = STConvert.toInteractionXML(documents, corpus,
                                         bigfileName + "-documents.xml")
    else:
        print >> sys.stderr, "Converting to XML"
        xml = STConvert.toInteractionXML(documents, corpus, None)

    if corpus == "BI11":
        # Redistribute the moveBI documents between the train and devel sets
        Utils.InteractionXML.MixSets.mixSets(xml, None, set(moveBI), "train",
                                             "devel")
    if corpus == "REN11":
        corpusRENtoASCII(xml)

    if analysisMode == "INSERT":
        # Use the analyses shipped with the downloaded packages
        insertAnalyses(xml,
                       corpus,
                       datasets,
                       files,
                       bigfileName,
                       packageSubPath=packageSubPath)
        if intermediateFiles:
            print >> sys.stderr, "Writing combined corpus", bigfileName + "-sentences.xml"
            ETUtils.write(xml, bigfileName + "-sentences.xml")
        processParses(xml)
    elif analysisMode == "BUILD":
        # Build the analyses from scratch; BB corpora get extra resources
        parseXML(xml,
                 bigfileName,
                 intermediateFiles,
                 debug,
                 bbResources=(corpus.startswith("BB_")))
    else:
        print >> sys.stderr, "Skipping analyses"

    # A hack for GRN13 task that breaks the official BioNLP Shared Task convention of trigger and event having the same type.
    # Let's remove the unused triggers, so that there won't be an unusable node class. There is no clean way to fix this,
    # as the GRN13 task not following the official rules introduces yet another mechanism into the Shared Task format,
    # and supporting this would require rewriting everything.
    if corpus == "GRN13":
        Utils.InteractionXML.DeleteElements.processCorpus(
            xml, None, {"entity": {
                "type": ["Action"]
            }})

    print >> sys.stderr, "---------------", "Writing corpora", "---------------"
    checkAttributes(xml)
    # Write out converted data
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName + ".xml"
        ETUtils.write(xml, bigfileName + ".xml")
    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outdir, corpus, ".xml")

    # Round-trip the devel set back to ST format and score it against the
    # original annotation to verify the conversion is loss-free.
    if evaluate and "devel" in datasets:
        print >> sys.stderr, "---------------", "Evaluating conversion", "---------------"
        if corpus != "REL11":  # Task 1 (removal of Entity-entities) cannot work for REL
            print >> sys.stderr, "Evaluating task 1 back-conversion"
            STConvert.toSTFormat(os.path.join(outdir, corpus + "-devel.xml"),
                                 workdir + "/roundtrip/" + corpus + "-devel" +
                                 "-task1",
                                 outputTag="a2",
                                 skipArgs=["Site"])
            BioNLP11GeniaTools.evaluate(
                workdir + "/roundtrip/" + corpus + "-devel" + "-task1",
                corpus + ".1")
        print >> sys.stderr, "Evaluating task 2 back-conversion"
        STConvert.toSTFormat(os.path.join(outdir, corpus + "-devel.xml"),
                             workdir + "/roundtrip/" + corpus + "-devel" +
                             "-task2",
                             outputTag="a2")
        BioNLP11GeniaTools.evaluate(
            workdir + "/roundtrip/" + corpus + "-devel" + "-task2",
            corpus + ".2")
        print >> sys.stderr, "Note! Evaluation of Task 2 back-conversion can be less than 100% due to site-argument mapping"

    # Check what was produced by the conversion
    print >> sys.stderr, "---------------", "Corpus Structure Analysis", "---------------"
    analyzer = StructureAnalyzer()
    analyzer.analyze([xml])
    print >> sys.stderr, analyzer.toString()
Example #4
0
def convertDownloaded(outdir, corpus, files, intermediateFiles=True, evaluate=True, processEquiv=True, addAnalyses=True, packageSubPath=None):
    """Convert a downloaded BioNLP Shared Task corpus to Interaction XML.

    Only the sets for which a "<CORPUS>_<SETNAME>" key exists in files are
    processed; they are handled as one combined XML so every processing step
    is applied equally, then the result is divided into per-set files.

    outdir -- directory where the output XML files are written
    corpus -- corpus identifier, e.g. "BI11", "REN11", "REL11" or "GRN13"
    files -- dictionary mapping "<CORPUS>_<SETNAME>" keys to source files
    intermediateFiles -- if True, also write the intermediate combined XMLs
    evaluate -- if True, write the geniaformat dump and round-trip the devel
        set back to ST format for evaluation
    processEquiv -- if True, resolve ST-format Equiv statements
    addAnalyses -- if True, insert the analyses shipped with the packages
    packageSubPath -- optional subpath inside the source packages
    """
    # moveBI is defined at module level (not visible in this block);
    # presumably document ids to redistribute between BI11 sets -- TODO confirm.
    global moveBI
    if evaluate:
        # Start from a fresh working directory for evaluation artifacts
        workdir = outdir + "/conversion/" + corpus
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir)
    
    print >> sys.stderr, "---------------", "Converting to XML", "---------------"
    # All datasets are processed as one XML, to ensure all the steps (parse modification etc.) are
    # applied equally
    #print corpus, files
    datasets = []
    for setName in ["devel", "train", "test"]:
        if corpus + "_" + setName.upper() in files:
            datasets.append(setName)
    bigfileName = os.path.join(outdir, corpus + "-" + "-and-".join(datasets))
    documents = []
    for setName in datasets:
        sourceFile = files[corpus + "_" + setName.upper()]
        print >> sys.stderr, "Reading", setName, "set from", sourceFile
        docs = ST.loadSet(sourceFile, setName, "a2", subPath=packageSubPath)
        print >> sys.stderr, "Read", len(docs), "documents"
        documents.extend(docs)
        
    # NOTE(review): the license text is taken from the LAST loaded set's
    # documents (docs), not from all of documents -- presumably identical
    # across the sets of one corpus; confirm.
    if len(docs) > 0 and docs[0].license != None:
        licenseFile = open(os.path.join(outdir, corpus + "-LICENSE"), "wt")
        licenseFile.write(docs[0].license)
        licenseFile.close()
    
    if processEquiv:
        print >> sys.stderr, "Resolving equivalences"
        Utils.STFormat.Equiv.process(documents)
    else:
        print >> sys.stderr, "Skipping resolving of equivalences"
    
    if evaluate:
        #print >> sys.stderr, "Checking data validity"
        #for doc in documents:
        #    Utils.STFormat.Validate.validate(doc.events, simulation=True, verbose=True, docId=doc.id)
        print >> sys.stderr, "Writing all documents to geniaformat"
        ST.writeSet(documents, os.path.join(workdir, "all-geniaformat"), resultFileTag="a2", debug=False)
    
    if intermediateFiles:
        print >> sys.stderr, "Converting to XML, writing combined corpus to", bigfileName+"-documents.xml"
        xml = STConvert.toInteractionXML(documents, corpus, bigfileName+"-documents.xml")
    else:
        print >> sys.stderr, "Converting to XML"
        xml = STConvert.toInteractionXML(documents, corpus, None)
    
    if corpus == "BI11":
        # Redistribute the moveBI documents between the train and devel sets
        Utils.InteractionXML.MixSets.mixSets(xml, None, set(moveBI), "train", "devel")
    if corpus == "REN11":
        corpusRENtoASCII(xml)
    
    if addAnalyses:
        insertAnalyses(xml, corpus, datasets, files, bigfileName, packageSubPath=packageSubPath)
    else:
        print >> sys.stderr, "Skipping adding analyses"
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName+"-sentences.xml"
        ETUtils.write(xml, bigfileName+"-sentences.xml")
    processParses(xml)
    
    # A hack for GRN13 task that breaks the official BioNLP Shared Task convention of trigger and event having the same type.
    # Let's remove the unused triggers, so that there won't be an unusable node class. There is no clean way to fix this,
    # as the GRN13 task not following the official rules introduces yet another mechanism into the Shared Task format,
    # and supporting this would require rewriting everything.
    if corpus == "GRN13":
        Utils.InteractionXML.DeleteElements.processCorpus(xml, None, {"entity":{"type":["Action"]}})
    
    print >> sys.stderr, "---------------", "Writing corpora", "---------------"
    checkAttributes(xml)
    # Write out converted data
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName+".xml"
        ETUtils.write(xml, bigfileName+".xml")
    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outdir, corpus, ".xml")
    
    # Round-trip the devel set back to ST format and score it against the
    # original annotation to verify the conversion is loss-free.
    if evaluate and "devel" in datasets:
        print >> sys.stderr, "---------------", "Evaluating conversion", "---------------"
        if corpus != "REL11": # Task 1 (removal of Entity-entities) cannot work for REL
            print >> sys.stderr, "Evaluating task 1 back-conversion"
            STConvert.toSTFormat(os.path.join(outdir, corpus + "-devel.xml"), workdir + "/roundtrip/" + corpus + "-devel" + "-task1", outputTag="a2", skipArgs=["Site"])
            BioNLP11GeniaTools.evaluate(workdir + "/roundtrip/" + corpus + "-devel" + "-task1", corpus + ".1")
        print >> sys.stderr, "Evaluating task 2 back-conversion"
        STConvert.toSTFormat(os.path.join(outdir, corpus + "-devel.xml"), workdir + "/roundtrip/" + corpus + "-devel" + "-task2", outputTag="a2")
        BioNLP11GeniaTools.evaluate(workdir + "/roundtrip/" + corpus + "-devel" + "-task2", corpus + ".2")
        print >> sys.stderr, "Note! Evaluation of Task 2 back-conversion can be less than 100% due to site-argument mapping"
    
    # Check what was produced by the conversion
    print >> sys.stderr, "---------------", "Corpus Structure Analysis", "---------------"
    analyzer = StructureAnalyzer()
    analyzer.analyze([xml])
    print >> sys.stderr, analyzer.toString()
Example #5
0
def convertDownloaded(outdir,
                      corpus,
                      files,
                      intermediateFiles=True,
                      evaluate=True):
    global moveBI
    workdir = outdir + "/conversion/" + corpus
    if os.path.exists(workdir):
        shutil.rmtree(workdir)
    os.makedirs(workdir)

    print >> sys.stderr, "---------------", "Converting to XML", "---------------"
    # All datasets are processed as one XML, to ensure all the steps (parse modification etc.) are
    # applied equally
    datasets = ["devel", "train", "test"]
    bigfileName = os.path.join(outdir, corpus + "-" + "-and-".join(datasets))
    documents = []
    for setName in datasets:
        sourceFile = files[corpus + "_" + setName.upper()]
        print >> sys.stderr, "Reading", setName, "set from", sourceFile, "temp at ",
        sitesAreArguments = False
        if corpus == "EPI":
            sitesAreArguments = True
        docs = ST.loadSet(sourceFile,
                          setName,
                          "a2",
                          sitesAreArguments=sitesAreArguments)
        print >> sys.stderr, "Read", len(docs), "documents"
        documents.extend(docs)

    print >> sys.stderr, "Resolving equivalences"
    STFormat.Equiv.process(documents)

    if evaluate:
        print >> sys.stderr, "Checking data validity"
        for doc in documents:
            STFormat.Validate.validate(doc.events,
                                       simulation=True,
                                       verbose=True,
                                       docId=doc.id)
        print >> sys.stderr, "Writing all documents to geniaformat"
        ST.writeSet(documents,
                    os.path.join(workdir, "all-geniaformat"),
                    resultFileTag="a2",
                    debug=False,
                    task=2,
                    validate=False)

    if intermediateFiles:
        print >> sys.stderr, "Converting to XML, writing combined corpus to", bigfileName + "-documents.xml"
        xml = STConvert.toInteractionXML(documents, corpus,
                                         bigfileName + "-documents.xml")
    else:
        print >> sys.stderr, "Converting to XML"
        xml = STConvert.toInteractionXML(documents, corpus, None)

    if corpus == "BI":
        InteractionXML.MixSets.mixSets(xml, None, set(moveBI), "train",
                                       "devel")

    addAnalyses(xml, corpus, datasets, files, bigfileName)
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName + "-sentences.xml"
        ETUtils.write(xml, bigfileName + "-sentences.xml")
    processParses(xml)

    print >> sys.stderr, "---------------", "Writing corpora", "---------------"
    # Write out converted data
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName + ".xml"
        ETUtils.write(xml, bigfileName + ".xml")
    print >> sys.stderr, "Dividing into sets"
    InteractionXML.DivideSets.processCorpus(xml, outdir, corpus, ".xml")

    if evaluate and "devel" in datasets:
        print >> sys.stderr, "---------------", "Evaluating conversion", "---------------"
        print >> sys.stderr, "Converting back"
        STConvert.toSTFormat(os.path.join(outdir, corpus + "-devel.xml"),
                             workdir + "/roundtrip/" + corpus + "-devel" +
                             "-task1",
                             outputTag="a2",
                             task=1)
        STConvert.toSTFormat(os.path.join(outdir, corpus + "-devel.xml"),
                             workdir + "/roundtrip/" + corpus + "-devel" +
                             "-task2",
                             outputTag="a2",
                             task=2)
        print >> sys.stderr, "Evaluating task 1 back-conversion"
        BioNLP11GeniaTools.evaluate(
            workdir + "/roundtrip/" + corpus + "-devel" + "-task1",
            corpus + ".1")
        print >> sys.stderr, "Evaluating task 2 back-conversion"
        BioNLP11GeniaTools.evaluate(
            workdir + "/roundtrip/" + corpus + "-devel" + "-task2",
            corpus + ".2")
        print >> sys.stderr, "Note! Evaluation of Task 2 back-conversion can be less than 100% due to site-argument mapping"
def convertDownloaded(outdir,
                      corpus,
                      files,
                      intermediateFiles=True,
                      evaluate=True,
                      processEquiv=True,
                      addAnalyses=True,
                      packageSubPath=None):
    """Convert a downloaded BioNLP Shared Task corpus to Interaction XML.

    Only the sets for which a "<CORPUS>_<SETNAME>" key exists in files are
    processed; they are handled as one combined XML so every processing step
    is applied equally, then the result is divided into per-set files.

    outdir -- directory where the output XML files are written
    corpus -- corpus identifier, e.g. "BI11", "REN11" or "REL11"
    files -- dictionary mapping "<CORPUS>_<SETNAME>" keys to source files
    intermediateFiles -- if True, also write the intermediate combined XMLs
    evaluate -- if True, write the geniaformat dump and round-trip the devel
        set back to ST format for evaluation
    processEquiv -- if True, resolve ST-format Equiv statements
    addAnalyses -- if True, insert the analyses shipped with the packages
    packageSubPath -- optional subpath inside the source packages
    """
    # moveBI is defined at module level (not visible in this block);
    # presumably document ids to redistribute between BI11 sets -- TODO confirm.
    global moveBI
    if evaluate:
        # Start from a fresh working directory for evaluation artifacts
        workdir = outdir + "/conversion/" + corpus
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir)

    print >> sys.stderr, "---------------", "Converting to XML", "---------------"
    # All datasets are processed as one XML, to ensure all the steps (parse modification etc.) are
    # applied equally
    #print corpus, files
    datasets = []
    for setName in ["devel", "train", "test"]:
        if corpus + "_" + setName.upper() in files:
            datasets.append(setName)
    bigfileName = os.path.join(outdir, corpus + "-" + "-and-".join(datasets))
    documents = []
    for setName in datasets:
        sourceFile = files[corpus + "_" + setName.upper()]
        print >> sys.stderr, "Reading", setName, "set from", sourceFile
        docs = ST.loadSet(sourceFile, setName, "a2", subPath=packageSubPath)
        print >> sys.stderr, "Read", len(docs), "documents"
        documents.extend(docs)

    # NOTE(review): the license text is taken from the LAST loaded set's
    # documents (docs), not from all of documents -- presumably identical
    # across the sets of one corpus; confirm.
    if len(docs) > 0 and docs[0].license != None:
        licenseFile = open(os.path.join(outdir, corpus + "-LICENSE"), "wt")
        licenseFile.write(docs[0].license)
        licenseFile.close()

    if processEquiv:
        print >> sys.stderr, "Resolving equivalences"
        Utils.STFormat.Equiv.process(documents)
    else:
        print >> sys.stderr, "Skipping resolving of equivalences"

    if evaluate:
        #print >> sys.stderr, "Checking data validity"
        #for doc in documents:
        #    Utils.STFormat.Validate.validate(doc.events, simulation=True, verbose=True, docId=doc.id)
        print >> sys.stderr, "Writing all documents to geniaformat"
        ST.writeSet(documents,
                    os.path.join(workdir, "all-geniaformat"),
                    resultFileTag="a2",
                    debug=False)

    if intermediateFiles:
        print >> sys.stderr, "Converting to XML, writing combined corpus to", bigfileName + "-documents.xml"
        xml = STConvert.toInteractionXML(documents, corpus,
                                         bigfileName + "-documents.xml")
    else:
        print >> sys.stderr, "Converting to XML"
        xml = STConvert.toInteractionXML(documents, corpus, None)

    if corpus == "BI11":
        # Redistribute the moveBI documents between the train and devel sets
        Utils.InteractionXML.MixSets.mixSets(xml, None, set(moveBI), "train",
                                             "devel")
    if corpus == "REN11":
        corpusRENtoASCII(xml)

    if addAnalyses:
        insertAnalyses(xml,
                       corpus,
                       datasets,
                       files,
                       bigfileName,
                       packageSubPath=packageSubPath)
    else:
        print >> sys.stderr, "Skipping adding analyses"
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName + "-sentences.xml"
        ETUtils.write(xml, bigfileName + "-sentences.xml")
    processParses(xml)

    print >> sys.stderr, "---------------", "Writing corpora", "---------------"
    checkAttributes(xml)
    # Write out converted data
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName + ".xml"
        ETUtils.write(xml, bigfileName + ".xml")
    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outdir, corpus, ".xml")

    # Round-trip the devel set back to ST format and score it against the
    # original annotation to verify the conversion is loss-free.
    if evaluate and "devel" in datasets:
        print >> sys.stderr, "---------------", "Evaluating conversion", "---------------"
        if corpus != "REL11":  # Task 1 (removal of Entity-entities) cannot work for REL
            print >> sys.stderr, "Evaluating task 1 back-conversion"
            STConvert.toSTFormat(os.path.join(outdir, corpus + "-devel.xml"),
                                 workdir + "/roundtrip/" + corpus + "-devel" +
                                 "-task1",
                                 outputTag="a2",
                                 skipArgs=["Site"])
            BioNLP11GeniaTools.evaluate(
                workdir + "/roundtrip/" + corpus + "-devel" + "-task1",
                corpus + ".1")
        print >> sys.stderr, "Evaluating task 2 back-conversion"
        STConvert.toSTFormat(os.path.join(outdir, corpus + "-devel.xml"),
                             workdir + "/roundtrip/" + corpus + "-devel" +
                             "-task2",
                             outputTag="a2")
        BioNLP11GeniaTools.evaluate(
            workdir + "/roundtrip/" + corpus + "-devel" + "-task2",
            corpus + ".2")
        print >> sys.stderr, "Note! Evaluation of Task 2 back-conversion can be less than 100% due to site-argument mapping"

    # Check what was produced by the conversion
    print >> sys.stderr, "---------------", "Corpus Structure Analysis", "---------------"
    analyzer = StructureAnalyzer()
    analyzer.analyze([xml])
    print >> sys.stderr, analyzer.toString()