Example #1
def convertDDI13(outDir, downloadDir=None, datasets=["DDI13_TRAIN", "DDI13_TEST_TASK_9.1", "DDI13_TEST_TASK_9.2"], redownload=False, insertParses=True, parse=False, makeIntermediateFiles=True, debug=False):
    cwd = os.getcwd()
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI13-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'13 corpus", "======================="
    
    tempdir = tempfile.mkdtemp()
    downloaded = downloadFiles(downloadDir, tempdir, redownload)
    
    for dataset in datasets:       
        corpusTree = getCorpusXML()
        xml = corpusTree.getroot()
        print >> sys.stderr, "Merging input XMLs"
        assert downloaded[dataset] != None
        combineXML(xml, "train", downloaded[dataset], subDirs=["DrugBank", "MedLine", "NER"])
        print >> sys.stderr, "Processing elements"
        processElements(xml)
        
        if dataset == "DDI13_TRAIN":
            print >> sys.stderr, "Dividing training set into folds"
            divideSets(xml, "train", 10)
        else:
            for doc in xml.getiterator("document"):
                doc.set("set", "test")

        if parse:
            print >> sys.stderr, "Parsing"
            parseXML(corpusTree, os.path.join(tempdir, "parsing"), debug)
        elif insertParses:
            assert parse == False
            print >> sys.stderr, "Inserting McCC parses"
            Tools.BLLIPParser.insertParses(corpusTree, downloaded[dataset + "_TEES_PARSES"], None, extraAttributes={"source":"TEES"})
            print >> sys.stderr, "Inserting Stanford conversions"
            Tools.StanfordParser.insertParses(corpusTree, downloaded[dataset + "_TEES_PARSES"], None, extraAttributes={"stanfordSource":"TEES"})
        # Check what was produced by the conversion
        print >> sys.stderr, "---------------", "Corpus Structure Analysis", "---------------"
        analyzer = StructureAnalyzer()
        analyzer.analyze([xml])
        print >> sys.stderr, analyzer.toString()
        if "9.1" in dataset:
            outFileName = os.path.join(outDir, "DDI13-test-task9.1.xml")
        elif "9.2" in dataset:
            outFileName = os.path.join(outDir, "DDI13-test-task9.2.xml")
        else:
            outFileName = os.path.join(outDir, "DDI13-train.xml")
        print >> sys.stderr, "Writing output to", outFileName
        ETUtils.write(xml, outFileName)
    
    Stream.closeLog(logFileName)
    if not debug and tempdir != None:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)
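Examples #1 and #9 share a fragile pattern: they save the working directory, chdir into outDir, create a temporary directory, and only restore and clean up on the normal exit path, so an exception leaves the process in outDir with the temporary directory behind. A minimal sketch of the same pattern made exception-safe with context managers (workInDir and tempWorkspace are illustrative names, not part of TEES):

import os
import sys
import shutil
import tempfile
from contextlib import contextmanager

@contextmanager
def workInDir(path):
    # Create the directory if needed, chdir into it, and always
    # restore the previous working directory on the way out.
    cwd = os.getcwd()
    if not os.path.exists(path):
        os.makedirs(path)
    os.chdir(path)
    try:
        yield path
    finally:
        os.chdir(cwd)

@contextmanager
def tempWorkspace(keep=False):
    # Yield a temporary directory and remove it afterwards,
    # unless keep=True (the debug case in convertDDI13).
    tempdir = tempfile.mkdtemp()
    try:
        yield tempdir
    finally:
        if not keep:
            print >> sys.stderr, "Removing temporary directory", tempdir
            shutil.rmtree(tempdir)

With these, the body of convertDDI13 could run inside nested with-blocks (workInDir(outDir), then tempWorkspace(keep=debug)), and the trailing cleanup lines would become unnecessary.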
Example #2
def learnSettings(inputFiles, detector, classifierParameters):
    if detector == None:
        print >> sys.stderr, "*** Analyzing input files to determine training settings ***"
        structureAnalyzer = StructureAnalyzer()
        if not os.path.exists("training/structure.txt"):
            datasets = sorted(
                filter(None, [inputFiles["train"], inputFiles["devel"]]))
            print >> sys.stderr, "input files:", datasets
            structureAnalyzer.analyze(datasets)
            print >> sys.stderr, structureAnalyzer.toString()
            structureAnalyzer.save(None, "training/structure.txt")
        else:
            print >> sys.stderr, "Using existing analysis from training/structure.txt"
            structureAnalyzer.load(None, "training/structure.txt")

    # Choose detector
    if detector == None:
        if "ENTITY" in structureAnalyzer.targets and "INTERACTION" in structureAnalyzer.targets:
            detector = "Detectors.EventDetector"
        elif "ENTITY" in structureAnalyzer.targets:
            detector = "Detectors.EntityDetector"
        elif "INTERACTION" in structureAnalyzer.targets:
            detector = "Detectors.EdgeDetector"
        else:
            assert False, structureAnalyzer.targets
    print >> sys.stderr, "Using detector '" + str(detector) + "'"

    # Set default parameters
    if detector == "Detectors.EventDetector":
        classifierParameters["unmerging"] = Parameters.cat(
            "c=1,10,100,500,1000,1500,2500,5000,10000,20000,50000,80000,100000",
            classifierParameters["unmerging"],
            "Classifier parameters for unmerging")
        classifierParameters["modifiers"] = Parameters.cat(
            "c=5000,10000,20000,50000,100000",
            classifierParameters["modifiers"],
            "Classifier parameters for modifiers")
        classifierParameters["edge"] = Parameters.cat(
            "c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000",
            classifierParameters["edge"], "Classifier parameters for edges")
        classifierParameters["trigger"] = Parameters.cat(
            "c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000",
            classifierParameters["trigger"],
            "Classifier parameters for triggers")
        classifierParameters["recall"] = Parameters.cat(
            "0.5,0.6,0.65,0.7,0.85,1.0,1.1,1.2",
            classifierParameters["recall"], "Recall adjustment parameters")
    elif detector == "Detectors.EntityDetector":
        classifierParameters["examples"] = Parameters.cat(
            "c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000",
            classifierParameters["examples"],
            "Classifier parameters for entities")
    elif detector == "Detectors.EdgeDetector":
        classifierParameters["examples"] = Parameters.cat(
            "c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000",
            classifierParameters["examples"],
            "Classifier parameters for edges")

    return detector
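Parameters.cat appears throughout these examples as Parameters.cat(defaults, current, message): a grid of default values combined with any user-supplied parameters. Assuming TEES parameter strings separate key=value pairs with ':' and values with ',', and that keys from the second argument take precedence, a self-contained sketch of that merge (catParams is an illustrative stand-in, not the real Utils.Parameters.cat):

import sys

def catParams(first, second, message=None):
    # Merge two parameter strings of the form "a=1,2:b=x:flag".
    # Keys in 'second' override the same keys in 'first'.
    merged = {}
    for source in (first, second):
        if source is None:
            continue
        for pair in source.split(":"):
            key, sep, value = pair.partition("=")
            merged[key] = value if sep else None
    if message is not None:
        print >> sys.stderr, message + ":", merged
    return ":".join(key if merged[key] is None else key + "=" + merged[key]
                    for key in sorted(merged))

For example, catParams("c=1,10,100", "c=500:d=2") keeps d=2 and replaces the default c grid with c=500, which matches how a user-supplied classifier parameter would narrow the default search grid.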
Example #3
def learnSettings(inputFiles, detector, classifierParameters):
    if detector == None:
        print >> sys.stderr, "*** Analyzing input files to determine training settings ***"
        structureAnalyzer = StructureAnalyzer()
        if not os.path.exists("training/structure.txt"): 
            datasets = sorted(filter(None, [inputFiles["train"], inputFiles["devel"]]))
            print >> sys.stderr, "input files:", datasets
            structureAnalyzer.analyze(datasets)
            print >> sys.stderr, structureAnalyzer.toString()
            structureAnalyzer.save(None, "training/structure.txt")
        else:
            print >> sys.stderr, "Using existing analysis from training/structure.txt"
            structureAnalyzer.load(None, "training/structure.txt")
    
    # Choose detector
    if detector == None:
        if "ENTITY" in structureAnalyzer.targets and "INTERACTION" in structureAnalyzer.targets:
            detector = "Detectors.EventDetector"
        elif "ENTITY" in structureAnalyzer.targets:
            detector = "Detectors.EntityDetector"
        elif "INTERACTION" in structureAnalyzer.targets:
            detector = "Detectors.EdgeDetector"
        else:
            assert False, structureAnalyzer.targets
    print >> sys.stderr, "Using detector '" + str(detector) + "'"
    
    # Set default parameters
    cp = classifierParameters
    if detector == "Detectors.EventDetector":
        # Add common classifier parameters
        if cp["examples"] != None:
            cp["unmerging"] = Parameters.cat(cp["examples"], cp["unmerging"])
            cp["modifiers"] = Parameters.cat(cp["examples"], cp["modifiers"])
            cp["edge"] = Parameters.cat(cp["examples"], cp["edge"])
            cp["trigger"] = Parameters.cat(cp["examples"], cp["trigger"])
        cp["unmerging"] = Parameters.cat("c=1,10,100,500,1000,1500,2500,5000,10000,20000,50000,80000,100000", cp["unmerging"], "Classifier parameters for unmerging")        
        cp["modifiers"] = Parameters.cat("c=5000,10000,20000,50000,100000", cp["modifiers"], "Classifier parameters for modifiers")
        cp["edge"] = Parameters.cat("c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000", cp["edge"], "Classifier parameters for edges")
        cp["trigger"] = Parameters.cat("c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000", cp["trigger"], "Classifier parameters for triggers")
        cp["recall"] = Parameters.cat("0.5,0.6,0.65,0.7,0.85,1.0,1.1,1.2", cp["recall"], "Recall adjustment parameters")
    elif detector == "Detectors.EntityDetector":
        cp["examples"] = Parameters.cat("c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000", cp["examples"], "Classifier parameters for entities")
    elif detector == "Detectors.EdgeDetector":
        cp["examples"] = Parameters.cat("c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000", cp["examples"], "Classifier parameters for edges")

    return detector
Example #4
    optparser = OptionParser(
        usage=
        "%prog [options]\nBuild machine learning examples from interaction XML."
    )
    addBasicOptions(optparser)
    (options, args) = optparser.parse_args()

    if options.gold == "AUTO":
        options.gold = options.input

    print >> sys.stderr, "Importing modules"
    exec "from ExampleBuilders." + options.exampleBuilder + " import " + options.exampleBuilder + " as ExampleBuilderClass"

    structureAnalyzer = None
    if options.structure == None:  # define structure from input file
        structureAnalyzer = StructureAnalyzer()
        structureAnalyzer.analyze(options.input)
        print >> sys.stderr, "--- Structure Analysis ----"
        print >> sys.stderr, structureAnalyzer.toString()
    elif options.structure != "NONE":  # a file name
        structureAnalyzer = StructureAnalyzer(options.structure)
    #input, output, parse, tokenization, style, classIds=None, featureIds=None, gold=None, append=False)
    ExampleBuilderClass.run(options.input,
                            options.output,
                            options.parse,
                            None,
                            options.parameters,
                            options.classes,
                            options.features,
                            allowNewIds=options.addIds,
                            structureAnalyzer=structureAnalyzer,
                            debug=options.debug,
                            gold=options.gold)
Example #5
def convertDownloaded(outdir,
                      corpus,
                      files,
                      intermediateFiles=True,
                      evaluate=True,
                      processEquiv=True,
                      analysisMode="INSERT",
                      packageSubPath=None,
                      debug=False):
    global moveBI
    if evaluate:
        workdir = outdir + "/conversion/" + corpus
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir)

    print >> sys.stderr, "---------------", "Converting to XML", "---------------"
    # All datasets are processed as one XML, to ensure all the steps (parse modification etc.) are
    # applied equally
    #print corpus, files
    datasets = []
    for setName in ["devel", "train", "test"]:
        if corpus + "_" + setName.upper() in files:
            datasets.append(setName)
    bigfileName = os.path.join(outdir, corpus + "-" + "-and-".join(datasets))
    documents = []
    for setName in datasets:
        sourceFile = files[corpus + "_" + setName.upper()]
        print >> sys.stderr, "Reading", setName, "set from", sourceFile
        docs = ST.loadSet(sourceFile, setName, "a2", subPath=packageSubPath)
        print >> sys.stderr, "Read", len(docs), "documents"
        documents.extend(docs)

    if len(docs) > 0 and docs[0].license != None:
        licenseFile = open(os.path.join(outdir, corpus + "-LICENSE"), "wt")
        licenseFile.write(docs[0].license)
        licenseFile.close()

    if processEquiv:
        print >> sys.stderr, "Resolving equivalences"
        Utils.STFormat.Equiv.process(documents)
    else:
        print >> sys.stderr, "Skipping resolving of equivalences"

    if evaluate:
        #print >> sys.stderr, "Checking data validity"
        #for doc in documents:
        #    Utils.STFormat.Validate.validate(doc.events, simulation=True, verbose=True, docId=doc.id)
        print >> sys.stderr, "Writing all documents to geniaformat"
        ST.writeSet(documents,
                    os.path.join(workdir, "all-geniaformat"),
                    resultFileTag="a2",
                    debug=False)

    if intermediateFiles:
        print >> sys.stderr, "Converting to XML, writing combined corpus to", bigfileName + "-documents.xml"
        xml = STConvert.toInteractionXML(documents, corpus,
                                         bigfileName + "-documents.xml")
    else:
        print >> sys.stderr, "Converting to XML"
        xml = STConvert.toInteractionXML(documents, corpus, None)

    if corpus == "BI11":
        Utils.InteractionXML.MixSets.mixSets(xml, None, set(moveBI), "train",
                                             "devel")
    if corpus == "REN11":
        corpusRENtoASCII(xml)

    if analysisMode == "INSERT":
        insertAnalyses(xml,
                       corpus,
                       datasets,
                       files,
                       bigfileName,
                       packageSubPath=packageSubPath)
        if intermediateFiles:
            print >> sys.stderr, "Writing combined corpus", bigfileName + "-sentences.xml"
            ETUtils.write(xml, bigfileName + "-sentences.xml")
        processParses(xml)
    elif analysisMode == "BUILD":
        parseXML(xml,
                 bigfileName,
                 intermediateFiles,
                 debug,
                 bbResources=(corpus.startswith("BB_")))
    else:
        print >> sys.stderr, "Skipping analyses"

    # A hack for the GRN13 task, which breaks the official BioNLP Shared Task convention of the trigger
    # and the event having the same type. Remove the unused triggers so that there won't be an unusable
    # node class. There is no clean way to fix this: by not following the official rules, GRN13 introduces
    # yet another mechanism into the Shared Task format, and supporting it would require rewriting everything.
    if corpus == "GRN13":
        Utils.InteractionXML.DeleteElements.processCorpus(
            xml, None, {"entity": {
                "type": ["Action"]
            }})

    print >> sys.stderr, "---------------", "Writing corpora", "---------------"
    checkAttributes(xml)
    # Write out converted data
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName + ".xml"
        ETUtils.write(xml, bigfileName + ".xml")
    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outdir, corpus, ".xml")

    if evaluate and "devel" in datasets:
        print >> sys.stderr, "---------------", "Evaluating conversion", "---------------"
        if corpus != "REL11":  # Task 1 (removal of Entity-entities) cannot work for REL
            print >> sys.stderr, "Evaluating task 1 back-conversion"
            STConvert.toSTFormat(os.path.join(outdir, corpus + "-devel.xml"),
                                 workdir + "/roundtrip/" + corpus + "-devel" +
                                 "-task1",
                                 outputTag="a2",
                                 skipArgs=["Site"])
            BioNLP11GeniaTools.evaluate(
                workdir + "/roundtrip/" + corpus + "-devel" + "-task1",
                corpus + ".1")
        print >> sys.stderr, "Evaluating task 2 back-conversion"
        STConvert.toSTFormat(os.path.join(outdir, corpus + "-devel.xml"),
                             workdir + "/roundtrip/" + corpus + "-devel" +
                             "-task2",
                             outputTag="a2")
        BioNLP11GeniaTools.evaluate(
            workdir + "/roundtrip/" + corpus + "-devel" + "-task2",
            corpus + ".2")
        print >> sys.stderr, "Note! Evaluation of Task 2 back-conversion can be less than 100% due to site-argument mapping"

    # Check what was produced by the conversion
    print >> sys.stderr, "---------------", "Corpus Structure Analysis", "---------------"
    analyzer = StructureAnalyzer()
    analyzer.analyze([xml])
    print >> sys.stderr, analyzer.toString()
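Examples #5 and #6 remove the GRN13 Action triggers with Utils.InteractionXML.DeleteElements.processCorpus and the filter {"entity": {"type": ["Action"]}}. A plain ElementTree sketch of that kind of filtered deletion, illustrative only and not the TEES implementation:

import sys

def deleteMatchingElements(corpusRoot, tag, attributeValues):
    # Remove every <tag> element whose attributes match one of the listed
    # values, e.g. tag="entity", attributeValues={"type": ["Action"]}.
    removed = 0
    for parent in corpusRoot.getiterator():
        for child in list(parent):
            if child.tag != tag:
                continue
            if all(child.get(name) in values
                   for name, values in attributeValues.items()):
                parent.remove(child)
                removed += 1
    print >> sys.stderr, "Removed", removed, tag, "elements"
    return removed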
Example #6
def convertDownloaded(outdir, corpus, files, intermediateFiles=True, evaluate=True, processEquiv=True, addAnalyses=True, packageSubPath=None):
    global moveBI
    if evaluate:
        workdir = outdir + "/conversion/" + corpus
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir)
    
    print >> sys.stderr, "---------------", "Converting to XML", "---------------"
    # All datasets are processed as one XML, to ensure all the steps (parse modification etc.) are
    # applied equally
    #print corpus, files
    datasets = []
    for setName in ["devel", "train", "test"]:
        if corpus + "_" + setName.upper() in files:
            datasets.append(setName)
    bigfileName = os.path.join(outdir, corpus + "-" + "-and-".join(datasets))
    documents = []
    for setName in datasets:
        sourceFile = files[corpus + "_" + setName.upper()]
        print >> sys.stderr, "Reading", setName, "set from", sourceFile
        docs = ST.loadSet(sourceFile, setName, "a2", subPath=packageSubPath)
        print >> sys.stderr, "Read", len(docs), "documents"
        documents.extend(docs)
        
    if len(docs) > 0 and docs[0].license != None:
        licenseFile = open(os.path.join(outdir, corpus + "-LICENSE"), "wt")
        licenseFile.write(docs[0].license)
        licenseFile.close()
    
    if processEquiv:
        print >> sys.stderr, "Resolving equivalences"
        Utils.STFormat.Equiv.process(documents)
    else:
        print >> sys.stderr, "Skipping resolving of equivalences"
    
    if evaluate:
        #print >> sys.stderr, "Checking data validity"
        #for doc in documents:
        #    Utils.STFormat.Validate.validate(doc.events, simulation=True, verbose=True, docId=doc.id)
        print >> sys.stderr, "Writing all documents to geniaformat"
        ST.writeSet(documents, os.path.join(workdir, "all-geniaformat"), resultFileTag="a2", debug=False)
    
    if intermediateFiles:
        print >> sys.stderr, "Converting to XML, writing combined corpus to", bigfileName+"-documents.xml"
        xml = STConvert.toInteractionXML(documents, corpus, bigfileName+"-documents.xml")
    else:
        print >> sys.stderr, "Converting to XML"
        xml = STConvert.toInteractionXML(documents, corpus, None)
    
    if corpus == "BI11":
        Utils.InteractionXML.MixSets.mixSets(xml, None, set(moveBI), "train", "devel")
    if corpus == "REN11":
        corpusRENtoASCII(xml)
    
    if addAnalyses:
        insertAnalyses(xml, corpus, datasets, files, bigfileName, packageSubPath=packageSubPath)
    else:
        print >> sys.stderr, "Skipping adding analyses"
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName+"-sentences.xml"
        ETUtils.write(xml, bigfileName+"-sentences.xml")
    processParses(xml)
    
    # A hack for the GRN13 task, which breaks the official BioNLP Shared Task convention of the trigger
    # and the event having the same type. Remove the unused triggers so that there won't be an unusable
    # node class. There is no clean way to fix this: by not following the official rules, GRN13 introduces
    # yet another mechanism into the Shared Task format, and supporting it would require rewriting everything.
    if corpus == "GRN13":
        Utils.InteractionXML.DeleteElements.processCorpus(xml, None, {"entity":{"type":["Action"]}})
    
    print >> sys.stderr, "---------------", "Writing corpora", "---------------"
    checkAttributes(xml)
    # Write out converted data
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName+".xml"
        ETUtils.write(xml, bigfileName+".xml")
    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outdir, corpus, ".xml")
    
    if evaluate and "devel" in datasets:
        print >> sys.stderr, "---------------", "Evaluating conversion", "---------------"
        if corpus != "REL11": # Task 1 (removal of Entity-entities) cannot work for REL
            print >> sys.stderr, "Evaluating task 1 back-conversion"
            STConvert.toSTFormat(os.path.join(outdir, corpus + "-devel.xml"), workdir + "/roundtrip/" + corpus + "-devel" + "-task1", outputTag="a2", skipArgs=["Site"])
            BioNLP11GeniaTools.evaluate(workdir + "/roundtrip/" + corpus + "-devel" + "-task1", corpus + ".1")
        print >> sys.stderr, "Evaluating task 2 back-conversion"
        STConvert.toSTFormat(os.path.join(outdir, corpus + "-devel.xml"), workdir + "/roundtrip/" + corpus + "-devel" + "-task2", outputTag="a2")
        BioNLP11GeniaTools.evaluate(workdir + "/roundtrip/" + corpus + "-devel" + "-task2", corpus + ".2")
        print >> sys.stderr, "Note! Evaluation of Task 2 back-conversion can be less than 100% due to site-argument mapping"
    
    # Check what was produced by the conversion
    print >> sys.stderr, "---------------", "Corpus Structure Analysis", "---------------"
    analyzer = StructureAnalyzer()
    analyzer.analyze([xml])
    print >> sys.stderr, analyzer.toString()
Example #7
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    from optparse import OptionParser
    optparser = OptionParser(usage="%prog [options]\nBuild machine learning examples from interaction XML.")
    addBasicOptions(optparser)
    (options, args) = optparser.parse_args()
    
    if options.gold == "AUTO":
        options.gold = options.input
    
    print >> sys.stderr, "Importing modules"
    exec "from ExampleBuilders." + options.exampleBuilder + " import " + options.exampleBuilder + " as ExampleBuilderClass"
    
    structureAnalyzer = None
    if options.structure == None: # define structure from input file
        structureAnalyzer = StructureAnalyzer()
        structureAnalyzer.analyze(options.input)
        print >> sys.stderr, "--- Structure Analysis ----"
        print >> sys.stderr, structureAnalyzer.toString()
    elif options.structure != "NONE": # a file name
        structureAnalyzer = StructureAnalyzer(options.structure)
    #input, output, parse, tokenization, style, classIds=None, featureIds=None, gold=None, append=False)
    ExampleBuilderClass.run(options.input, options.output, options.parse, None, options.parameters, 
                            options.classes, options.features, allowNewIds=options.addIds, 
                            structureAnalyzer=structureAnalyzer, debug=options.debug, gold=options.gold)
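Examples #4 and #7 build an import statement as a string and run it with exec. The same dynamic import works without exec via importlib (available since Python 2.7); like the exec version, this assumes the module ExampleBuilders.<name> defines a class of the same name:

import importlib

def loadExampleBuilder(name):
    # Equivalent of:
    #   exec "from ExampleBuilders." + name + " import " + name + " as ExampleBuilderClass"
    module = importlib.import_module("ExampleBuilders." + name)
    return getattr(module, name)

# ExampleBuilderClass = loadExampleBuilder(options.exampleBuilder)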
Example #8
def learnSettings(inputFiles, detector, classifierParameters, task, exampleStyles, useKerasDetector=False):
    if detector == None:
        print >> sys.stderr, "*** Analyzing input files to determine training settings ***"
        structureAnalyzer = StructureAnalyzer()
        if not os.path.exists("training/structure.txt"): 
            datasets = sorted(filter(None, [inputFiles["train"], inputFiles["devel"]]))
            print >> sys.stderr, "input files:", datasets
            structureAnalyzer.analyze(datasets)
            print >> sys.stderr, structureAnalyzer.toString()
            structureAnalyzer.save(None, "training/structure.txt")
        else:
            print >> sys.stderr, "Using existing analysis from training/structure.txt"
            structureAnalyzer.load(None, "training/structure.txt")
    
    # Choose detector
    if detector == None:
        if "ENTITY" in structureAnalyzer.targets and "INTERACTION" in structureAnalyzer.targets:
            detector = "Detectors.EventDetector"
        elif "ENTITY" in structureAnalyzer.targets:
            detector = "Detectors.EntityDetector"
        elif "INTERACTION" in structureAnalyzer.targets:
            detector = "Detectors.EdgeDetector"
        else:
            assert False, structureAnalyzer.targets

    if useKerasDetector and not "Keras" in detector:
        detector = detector.replace("Detectors.", "Detectors.Keras")
    print >> sys.stderr, "Using detector '" + str(detector) + "'"
    
    # Set default parameters
    cp = classifierParameters
    if detector == "Detectors.EventDetector":
        # Add common classifier parameters
        if cp["examples"] != None:
            cp["unmerging"] = Parameters.cat(cp["examples"], cp["unmerging"])
            cp["modifiers"] = Parameters.cat(cp["examples"], cp["modifiers"])
            cp["edge"] = Parameters.cat(cp["examples"], cp["edge"])
            cp["trigger"] = Parameters.cat(cp["examples"], cp["trigger"])
        cp["unmerging"] = Parameters.cat("c=1,10,100,500,1000,1500,2500,5000,10000,20000,50000,80000,100000", cp["unmerging"], "Classifier parameters for unmerging")        
        cp["modifiers"] = Parameters.cat("c=5000,10000,20000,50000,100000", cp["modifiers"], "Classifier parameters for modifiers")
        cp["edge"] = Parameters.cat("c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000", cp["edge"], "Classifier parameters for edges")
        cp["trigger"] = Parameters.cat("c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000", cp["trigger"], "Classifier parameters for triggers")
        cp["recall"] = Parameters.cat("0.5,0.6,0.65,0.7,0.85,1.0,1.1,1.2", cp["recall"], "Recall adjustment parameters")
    elif detector == "Detectors.EntityDetector":
        cp["examples"] = Parameters.cat("c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000", cp["examples"], "Classifier parameters for entities")
    elif detector == "Detectors.EdgeDetector":
        cp["examples"] = Parameters.cat("c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000", cp["examples"], "Classifier parameters for edges")
    elif detector == "Detectors.UnmergingDetector":
        cp["examples"] = Parameters.cat("c=1,10,100,500,1000,1500,2500,5000,10000,20000,50000,80000,100000", cp["examples"], "Classifier parameters for unmerging")
    
    #######################################################################
    # Keras example styles
    #######################################################################
    if useKerasDetector:
        task, subTask = getSubTask(task)
        msg = "Keras example style"
        #overrideStyles = {x:(Parameters.get(exampleStyles[x]) if (exampleStyles[x] != None and "override" in exampleStyles[x]) else {"override":True}) for x in exampleStyles}
        overrideStyles = {"all":{}}
        for key in exampleStyles:
            overrideStyles[key] = {}
            params = Parameters.get(exampleStyles[key])
            if "override" in params:
                exampleStyles[key] = None
                overrideStyles[key] = params
                overrideStyles[key].pop("override")
            elif "override_all" in params:
                exampleStyles[key] = None
                overrideStyles["all"] = params
                overrideStyles["all"].pop("override_all")
            #exampleStyles[key] = exampleStyles[key] if (exampleStyles[key] != None and not "override" in exampleStyles[key]) else None
        print >> sys.stderr, "Override styles:", overrideStyles
        if "EventDetector" in detector:
            if task == "EPI11":
                exampleStyles["trigger"] = Parameters.cat("keras:epochs=500:patience=10:nf=512:path=4:el=41:mods=20:epi_merge_negated", exampleStyles["trigger"])
            else:
                exampleStyles["trigger"] = Parameters.cat("keras:epochs=500:patience=10:nf=512:path=4:el=41:mods=20", exampleStyles["trigger"])
            if task in ["GE09", "GE11", "GE13"] and subTask == 1:
                exampleStyles["edge"] = Parameters.cat("keras:genia_task1:epochs=500:patience=10:nf=256:path=4:ol=15:mods=20", exampleStyles["edge"])
            else:
                exampleStyles["edge"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=4:ol=15:mods=20", exampleStyles["edge"])
            exampleStyles["unmerging"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=4:ol=15:mods=20", exampleStyles["unmerging"])
            exampleStyles["modifiers"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=4:el=41:mods=20", exampleStyles["modifiers"])
        elif "EntityDetector" in detector:
            if task == "DDI13T91":
                exampleStyles["examples"] = Parameters.cat("keras:epochs=500:patience=10:nf=512:path=4:el=41:mods=20:names:build_for_nameless", exampleStyles["examples"])
            else:
                exampleStyles["examples"] = Parameters.cat("keras:epochs=500:patience=10:nf=512:path=4:el=41:mods=20", exampleStyles["examples"])
        elif "EdgeDetector" in detector:
            if "DDI" in task:
                exampleStyles["examples"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=0:do=0.2:dense=800:ol=50:mods=20", exampleStyles["examples"])
            elif task == "CP17":
                exampleStyles["examples"] = Parameters.cat("keras:epochs=500:patience=10:nf=512:path=0:do=0.2:ol=50:skip_labels=CPR\:0,CPR\:1,CPR\:2,CPR\:7,CPR\:8,CPR\:10:mods=20", exampleStyles["examples"])
            else:
                exampleStyles["examples"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=4:ol=15:mods=20", exampleStyles["examples"])
        print >> sys.stderr, "Keras initial example styles:", exampleStyles
        for key in exampleStyles:
            if exampleStyles[key] != None:
                exampleStyles[key] = Parameters.get(exampleStyles[key])
                exampleStyles[key].update(overrideStyles[key])
                exampleStyles[key].update(overrideStyles["all"])
                exampleStyles[key] = Parameters.toString(exampleStyles[key])
            print >> sys.stderr, "Keras final example style for " + key + ": ", exampleStyles[key]
        
    return detector
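The Keras branch of Example #8 resolves example styles in two passes: a style containing "override" replaces only that component's style, one containing "override_all" feeds a shared style applied to every component, and at the end each remaining style is updated first with its per-key override and then with the global one, so the global keys win. A condensed sketch of that precedence, using plain dicts in place of TEES parameter strings (applyOverrides is an illustrative name):

def applyOverrides(styles, perKeyOverrides, globalOverrides):
    # Per-key overrides are applied first, then the global ones,
    # so keys in globalOverrides take precedence over everything else.
    result = {}
    for key, style in styles.items():
        if style is None:
            result[key] = None
            continue
        merged = dict(style)
        merged.update(perKeyOverrides.get(key, {}))
        merged.update(globalOverrides)
        result[key] = merged
    return result

# applyOverrides({"trigger": {"nf": "512"}, "edge": {"nf": "256"}},
#                {"edge": {"nf": "128"}},
#                {"epochs": "200"})
# -> {"trigger": {"nf": "512", "epochs": "200"},
#     "edge":    {"nf": "128", "epochs": "200"}}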
Example #9
def convertDDI13(
        outDir,
        downloadDir=None,
        datasets=["DDI13_TRAIN", "DDI13_TEST_TASK_9.1", "DDI13_TEST_TASK_9.2"],
        redownload=False,
        insertParses=True,
        parse=False,
        makeIntermediateFiles=True,
        debug=False):
    cwd = os.getcwd()
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI13-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'13 corpus", "======================="

    tempdir = tempfile.mkdtemp()
    downloaded = downloadFiles(downloadDir, tempdir, redownload)

    for dataset in datasets:
        corpusTree = getCorpusXML()
        xml = corpusTree.getroot()
        print >> sys.stderr, "Merging input XMLs"
        assert downloaded[dataset] != None
        combineXML(xml,
                   "train",
                   downloaded[dataset],
                   subDirs=["DrugBank", "MedLine", "NER"])
        print >> sys.stderr, "Processing elements"
        processElements(xml)

        if dataset == "DDI13_TRAIN":
            print >> sys.stderr, "Dividing training set into folds"
            divideSets(xml, "train", 10)
        else:
            for doc in xml.getiterator("document"):
                doc.set("set", "test")

        if parse:
            print >> sys.stderr, "Parsing"
            parseXML(corpusTree, os.path.join(tempdir, "parsing"), debug)
        elif insertParses:
            assert parse == False
            print >> sys.stderr, "Inserting McCC parses"
            Tools.BLLIPParser.insertParses(corpusTree,
                                           downloaded[dataset +
                                                      "_TEES_PARSES"],
                                           None,
                                           extraAttributes={"source": "TEES"})
            print >> sys.stderr, "Inserting Stanford conversions"
            Tools.StanfordParser.insertParses(
                corpusTree,
                downloaded[dataset + "_TEES_PARSES"],
                None,
                extraAttributes={"stanfordSource": "TEES"})
        # Check what was produced by the conversion
        print >> sys.stderr, "---------------", "Corpus Structure Analysis", "---------------"
        analyzer = StructureAnalyzer()
        analyzer.analyze([xml])
        print >> sys.stderr, analyzer.toString()
        if "9.1" in dataset:
            outFileName = os.path.join(outDir, "DDI13-test-task9.1.xml")
        elif "9.2" in dataset:
            outFileName = os.path.join(outDir, "DDI13-test-task9.2.xml")
        else:
            outFileName = os.path.join(outDir, "DDI13-train.xml")
        print >> sys.stderr, "Writing output to", outFileName
        ETUtils.write(xml, outFileName)

    Stream.closeLog(logFileName)
    if not debug and tempdir != None:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)
Example #10
def convertDownloaded(outdir,
                      corpus,
                      files,
                      intermediateFiles=True,
                      evaluate=True,
                      processEquiv=True,
                      addAnalyses=True,
                      packageSubPath=None):
    global moveBI
    if evaluate:
        workdir = outdir + "/conversion/" + corpus
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir)

    print >> sys.stderr, "---------------", "Converting to XML", "---------------"
    # All datasets are processed as one XML, to ensure all the steps (parse modification etc.) are
    # applied equally
    #print corpus, files
    datasets = []
    for setName in ["devel", "train", "test"]:
        if corpus + "_" + setName.upper() in files:
            datasets.append(setName)
    bigfileName = os.path.join(outdir, corpus + "-" + "-and-".join(datasets))
    documents = []
    for setName in datasets:
        sourceFile = files[corpus + "_" + setName.upper()]
        print >> sys.stderr, "Reading", setName, "set from", sourceFile
        docs = ST.loadSet(sourceFile, setName, "a2", subPath=packageSubPath)
        print >> sys.stderr, "Read", len(docs), "documents"
        documents.extend(docs)

    if len(docs) > 0 and docs[0].license != None:
        licenseFile = open(os.path.join(outdir, corpus + "-LICENSE"), "wt")
        licenseFile.write(docs[0].license)
        licenseFile.close()

    if processEquiv:
        print >> sys.stderr, "Resolving equivalences"
        Utils.STFormat.Equiv.process(documents)
    else:
        print >> sys.stderr, "Skipping resolving of equivalences"

    if evaluate:
        #print >> sys.stderr, "Checking data validity"
        #for doc in documents:
        #    Utils.STFormat.Validate.validate(doc.events, simulation=True, verbose=True, docId=doc.id)
        print >> sys.stderr, "Writing all documents to geniaformat"
        ST.writeSet(documents,
                    os.path.join(workdir, "all-geniaformat"),
                    resultFileTag="a2",
                    debug=False)

    if intermediateFiles:
        print >> sys.stderr, "Converting to XML, writing combined corpus to", bigfileName + "-documents.xml"
        xml = STConvert.toInteractionXML(documents, corpus,
                                         bigfileName + "-documents.xml")
    else:
        print >> sys.stderr, "Converting to XML"
        xml = STConvert.toInteractionXML(documents, corpus, None)

    if corpus == "BI11":
        Utils.InteractionXML.MixSets.mixSets(xml, None, set(moveBI), "train",
                                             "devel")
    if corpus == "REN11":
        corpusRENtoASCII(xml)

    if addAnalyses:
        insertAnalyses(xml,
                       corpus,
                       datasets,
                       files,
                       bigfileName,
                       packageSubPath=packageSubPath)
    else:
        print >> sys.stderr, "Skipping adding analyses"
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName + "-sentences.xml"
        ETUtils.write(xml, bigfileName + "-sentences.xml")
    processParses(xml)

    print >> sys.stderr, "---------------", "Writing corpora", "---------------"
    checkAttributes(xml)
    # Write out converted data
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName + ".xml"
        ETUtils.write(xml, bigfileName + ".xml")
    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outdir, corpus, ".xml")

    if evaluate and "devel" in datasets:
        print >> sys.stderr, "---------------", "Evaluating conversion", "---------------"
        if corpus != "REL11":  # Task 1 (removal of Entity-entities) cannot work for REL
            print >> sys.stderr, "Evaluating task 1 back-conversion"
            STConvert.toSTFormat(os.path.join(outdir, corpus + "-devel.xml"),
                                 workdir + "/roundtrip/" + corpus + "-devel" +
                                 "-task1",
                                 outputTag="a2",
                                 skipArgs=["Site"])
            BioNLP11GeniaTools.evaluate(
                workdir + "/roundtrip/" + corpus + "-devel" + "-task1",
                corpus + ".1")
        print >> sys.stderr, "Evaluating task 2 back-conversion"
        STConvert.toSTFormat(os.path.join(outdir, corpus + "-devel.xml"),
                             workdir + "/roundtrip/" + corpus + "-devel" +
                             "-task2",
                             outputTag="a2")
        BioNLP11GeniaTools.evaluate(
            workdir + "/roundtrip/" + corpus + "-devel" + "-task2",
            corpus + ".2")
        print >> sys.stderr, "Note! Evaluation of Task 2 back-conversion can be less than 100% due to site-argument mapping"

    # Check what was produced by the conversion
    print >> sys.stderr, "---------------", "Corpus Structure Analysis", "---------------"
    analyzer = StructureAnalyzer()
    analyzer.analyze([xml])
    print >> sys.stderr, analyzer.toString()
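Examples #1 and #9 split the training documents with divideSets(xml, "train", 10) before writing the output. Assuming the function retags each document of the given set with a fold-specific set name, assigned round-robin (the real TEES implementation may divide differently, e.g. with randomization), a minimal sketch:

def divideIntoFolds(corpusRoot, setName, numFolds):
    # Retag each document of the given set with a fold-specific set name,
    # e.g. "train" -> "train0" ... "train9", round-robin in document order.
    index = 0
    for document in corpusRoot.getiterator("document"):
        if document.get("set") == setName:
            document.set("set", setName + str(index % numFolds))
            index += 1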