Example #1
File: train.py Project: ninjin/TEES
def workdir(path, deleteIfExists=True, copyFrom=None, log="log.txt"):
    # When using a template, always remove existing work directory
    if copyFrom != None:
        deleteIfExists = True
    # Remove existing work directory, if requested to do so
    if os.path.exists(path) and deleteIfExists:
        print >> sys.stderr, "Output directory exists, removing", path
        shutil.rmtree(path)
    # Create work directory if needed
    if not os.path.exists(path):
        if copyFrom == None:
            print >> sys.stderr, "Making output directory", path
            os.makedirs(path)
        else:
            print >> sys.stderr, "Copying template from", options.copyFrom, "to", path
            shutil.copytree(options.copyFrom, path)
    else:
        print >> sys.stderr, "Using existing output directory", path
    # Remember current directory and switch to workdir
    atexit.register(os.chdir, os.getcwd())
    os.chdir(path)
    # Open log (if a relative path, it goes under workdir)
    if log != None:
        Stream.openLog(log)
    else:
        print >> sys.stderr, "No logging"
    return path
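A pipeline script typically calls this helper once at start-up, before producing any logged output. A minimal sketch of that call, assuming the workdir function and Stream module shown above (the directory and log names are illustrative only):

# Illustrative only: switch into a fresh work directory and log there
import sys
outPath = workdir("output/experiment1", deleteIfExists=False, log="log.txt")
print >> sys.stderr, "Working directory is now", outPath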
Example #3
def convertCorpus(corpus, outDir=None, downloadDir=None, redownload=False, makeIntermediateFiles=True, evaluate=False, processEquiv=True, analysisMode="INSERT", debug=False, preprocessorSteps=None, preprocessorParameters=None, logPath=None):
    global bioNLP13AnalysesTempDir
    
    print >> sys.stderr, "==========", "Converting BioNLP Shared Task", corpus, "corpus", "=========="
    assert analysisMode in ("AUTO", "INSERT", "BUILD", "SKIP")
    if logPath == "AUTO":
        if outDir != None:
            logPath = outDir + "/conversion/" + corpus + "-conversion-log.txt"
        else:
            logPath = None
    if logPath:
        Stream.openLog(logPath)
    downloaded = downloadCorpus(corpus, outDir, downloadDir, redownload)
    packageSubPath = None
    if corpus == "BB13T2":
        packageSubPath = "task_2"
    elif corpus == "BB13T3":
        packageSubPath = "task_3"
    xml = convertDownloaded(outDir, corpus, downloaded, makeIntermediateFiles, evaluate, processEquiv=processEquiv, analysisMode=analysisMode, packageSubPath=packageSubPath, debug=debug, preprocessorSteps=preprocessorSteps, preprocessorParameters=preprocessorParameters)
    if logPath != None:
        Stream.closeLog(logPath)
    
    if bioNLP13AnalysesTempDir != None:
        shutil.rmtree(bioNLP13AnalysesTempDir)
        bioNLP13AnalysesTempDir = None
    
    return xml
Example #4
def log(clear=False, logCmd=True, logFile="log.txt", timeStamp="[%H:%M:%S %d/%m]"):
    Stream.setLog(logFile, clear)
    if timeStamp != None:
        Stream.setTimeStamp(timeStamp, True)
    print >> sys.stderr, "####### Log opened at ", time.ctime(time.time()), "#######"
    if logCmd:
        sys.stdout.writeToLog("Command line: " + " ".join(sys.argv) + "\n")
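This helper is meant to be called at the very top of a script, so that the command line itself is recorded in the log. A minimal sketch, assuming the log function and Stream module above (the file name is illustrative only):

# Illustrative only: open a timestamped log and record how the script was invoked
log(clear=True, logCmd=True, logFile="experiment-log.txt")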
Example #5
def convert(corpora, outDir=None, downloadDir=None, redownload=False, makeIntermediateFiles=True, evaluate=False, processEquiv=True, addAnalyses=True):
    global bioNLP13AnalysesTempDir
    
    if outDir == None:
        outDir = os.path.normpath(Settings.DATAPATH + "/corpora")
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    else:
        assert os.path.isdir(outDir)
    count = 1
    for corpus in corpora:
        print >> sys.stderr, "=======================", "Converting BioNLP Shared Task", corpus, "corpus ("+str(count)+"/"+str(len(corpora))+")", "======================="
        logFileName = outDir + "/conversion/" + corpus + "-conversion-log.txt"
        Stream.openLog(logFileName)
        downloaded = downloadCorpus(corpus, outDir, downloadDir, redownload)
        packageSubPath = None
        if corpus == "BB13T2":
            packageSubPath = "task_2"
        elif corpus == "BB13T3":
            packageSubPath = "task_3"
        convertDownloaded(outDir, corpus, downloaded, makeIntermediateFiles, evaluate, processEquiv=processEquiv, addAnalyses=addAnalyses, packageSubPath=packageSubPath)
        Stream.closeLog(logFileName)
        count += 1
    
    if bioNLP13AnalysesTempDir != None:
        shutil.rmtree(bioNLP13AnalysesTempDir)
        bioNLP13AnalysesTempDir = None
Example #6
def convertCorpus(corpus, outDir=None, downloadDir=None, redownload=False, removeAnalyses=True, develFraction=0.3, logPath=None):
    assert corpus in PPI_CORPORA
    if logPath == "AUTO":
        logPath = outDir + "/conversion/" + corpus + "-conversion-log.txt" if outDir != None else None
    if logPath:
        Stream.openLog(logPath)
    print >> sys.stderr, "==========", "Converting PPI corpus", corpus, "=========="
    downloaded = downloadCorpus(corpus, outDir, downloadDir, redownload)
    print >> sys.stderr, "---------------", "Updating Interaction XML format", "---------------"
    print >> sys.stderr, "Loading", downloaded[corpus + "_LEARNING_FORMAT"]
    xml = ETUtils.ETFromObj(downloaded[corpus + "_LEARNING_FORMAT"])
    root = xml.getroot()
    updateXML(root, removeAnalyses)
    print >> sys.stderr, "---------------", "Adding sets from the PPI evaluation standard", "---------------"
    addSets(corpus, root, downloaded["PPI_EVALUATION_STANDARD"])
    if develFraction > 0.0:
        print >> sys.stderr, "---------------", "Generating devel set", "---------------"
        MakeSets.processCorpus(xml, None, "train", [("devel", develFraction), ("train", 1.0)], 1)
    if outDir != None:
        print >> sys.stderr, "---------------", "Writing corpus", "---------------"
        #if intermediateFiles:
        #print >> sys.stderr, "Writing combined corpus"
        #ETUtils.write(xml, os.path.join(outDir, corpus + ".xml"))
        print >> sys.stderr, "Dividing into sets"
        Utils.InteractionXML.DivideSets.processCorpus(xml, outDir, corpus, ".xml")
    
    if logPath != None:
        Stream.closeLog(logPath)
    return xml  
Example #7
def log(clear=False, logCmd=True, logFile="log.txt"):
    Stream.setLog(logFile, clear)
    Stream.setTimeStamp("[%H:%M:%S]", True)
    print >> sys.stderr, "####### Log opened at ", time.ctime(
        time.time()), "#######"
    if logCmd:
        sys.stdout.writeToLog("Command line: " + " ".join(sys.argv) + "\n")
Example #8
def classify(input, model, output, workDir=None, step=None, omitSteps=None, 
             goldInput=None, detector=None, debug=False, clear=False, 
             preprocessorTag="-preprocessed.xml.gz", preprocessorParams=None, bioNLPSTParams=None):
    """
    Detect events or relations from text.
    
    @param input: The input file in either interaction XML or BioNLP ST format. Can also be a PMID or TEES default corpus name.
    @param model: A path to a model file or the name of a TEES default model.
    @param output: The output file stem. Output files will be of the form output-*
    @param workDir: If intermediate files need to be saved, they will go here.
    @param step: A step=substep pair, where the steps are PREPROCESS and CLASSIFY
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param goldInput: a version of the corpus file with gold annotation. Enables measuring of performance
    @param detector: a Detector object, or a string defining one to be imported. If None, will be read from model.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param clear: Remove existing workDir
    @param preprocessorTag: preprocessor output file will be output + preprocessorTag
    @param preprocessorParams: Optional parameters controlling preprocessing. If None, will be read from model.
    @param bioNLPSTParams: Optional parameters controlling BioNLP ST format output. If None, will be read from model.
    """
    input = os.path.abspath(input)
    if goldInput != None: goldInput = os.path.abspath(goldInput)
    if model != None: model = os.path.abspath(model)
    # Initialize working directory
    if workDir != None: # use a permanent work directory
        workdir(workDir, clear)
    Stream.openLog(output + "-log.txt") # log in the output directory
    # Get input files
    input, preprocess = getInput(input)
    model = getModel(model)
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(step, omitSteps, ["PREPROCESS", "CLASSIFY"])
    if not preprocess:
        selector.markOmitSteps("PREPROCESS")
    
    classifyInput = input
    if selector.check("PREPROCESS"):
        preprocessor = Preprocessor()
        preprocessorOutput = output + preprocessorTag
        #preprocessor.debug = debug
        #preprocessor.source = input # This has to be defined already here, needs to be fixed later
        #preprocessor.requireEntitiesForParsing = True # parse only sentences which contain named entities
        if os.path.exists(preprocessorOutput) and not clear: #os.path.exists(preprocessor.getOutputPath("FIND-HEADS")):
            #print >> sys.stderr, "Preprocessor output", preprocessor.getOutputPath("FIND-HEADS"), "exists, skipping preprocessing."
            print >> sys.stderr, "Preprocessor output", preprocessorOutput, "exists, skipping preprocessing."
            classifyInput = preprocessorOutput # preprocessor.getOutputPath("FIND-HEADS")
        else:
            #print >> sys.stderr, "Preprocessor output", preprocessor.getOutputPath("FIND-HEADS"), "does not exist"
            print >> sys.stderr, "Preprocessor output", preprocessorOutput, "does not exist"
            print >> sys.stderr, "------------ Preprocessing ------------"
            # Remove some of the unnecessary intermediate files
            #preprocessor.setIntermediateFiles({"Convert":None, "SPLIT-SENTENCES":None, "PARSE":None, "CONVERT-PARSE":None, "SPLIT-NAMES":None})
            # Process input into interaction XML
            classifyInput = preprocessor.process(input, preprocessorOutput, preprocessorParams, model, [], fromStep=detectorSteps["PREPROCESS"], toStep=None, omitSteps=omitDetectorSteps["PREPROCESS"])
    
    if selector.check("CLASSIFY"):
        detector = getDetector(detector, model)[0]() # initialize detector object
        detector.debug = debug
        detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(bioNLPSTParams, model)
        detector.classify(classifyInput, model, output, goldData=goldInput, fromStep=detectorSteps["CLASSIFY"], omitSteps=omitDetectorSteps["CLASSIFY"], workDir=workDir)
Example #9
def convertDDI13(outDir, downloadDir=None, datasets=["DDI13_TRAIN", "DDI13_TEST_TASK_9.1", "DDI13_TEST_TASK_9.2"], redownload=False, insertParses=True, parse=False, makeIntermediateFiles=True, debug=False):
    cwd = os.getcwd()
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI13-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'13 corpus", "======================="
    
    tempdir = tempfile.mkdtemp()
    downloaded = downloadFiles(downloadDir, tempdir, redownload)
    
    for dataset in datasets:       
        corpusTree = getCorpusXML()
        xml = corpusTree.getroot()
        print >> sys.stderr, "Merging input XMLs"
        assert downloaded[dataset] != None
        combineXML(xml, "train", downloaded[dataset], subDirs=["DrugBank", "MedLine", "NER"])
        print >> sys.stderr, "Processing elements"
        processElements(xml)
        
        if dataset == "DDI13_TRAIN":
            print >> sys.stderr, "Dividing training set into folds"
            divideSets(xml, "train", 10)
        else:
            for doc in xml.getiterator("document"):
                doc.set("set", "test")

        if parse:
            print >> sys.stderr, "Parsing"
            parseXML(corpusTree, os.path.join(tempdir, "parsing"), debug)
        elif insertParses:
            assert parse == False
            print >> sys.stderr, "Inserting McCC parses"
            Tools.BLLIPParser.insertParses(corpusTree, downloaded[dataset + "_TEES_PARSES"], None, extraAttributes={"source":"TEES"})
            print >> sys.stderr, "Inserting Stanford conversions"
            Tools.StanfordParser.insertParses(corpusTree, downloaded[dataset + "_TEES_PARSES"], None, extraAttributes={"stanfordSource":"TEES"})
        # Check what was produced by the conversion
        print >> sys.stderr, "---------------", "Corpus Structure Analysis", "---------------"
        analyzer = StructureAnalyzer()
        analyzer.analyze([xml])
        print >> sys.stderr, analyzer.toString()
        if "9.1" in dataset:
            outFileName = os.path.join(outDir, "DDI13-test-task9.1.xml")
        elif "9.2" in dataset:
            outFileName = os.path.join(outDir, "DDI13-test-task9.2.xml")
        else:
            outFileName = os.path.join(outDir, "DDI13-train.xml")
        print >> sys.stderr, "Writing output to", outFileName
        ETUtils.write(xml, outFileName)
    
    Stream.closeLog(logFileName)
    if not debug and tempdir != None:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)
Example #10
def beginLog(outDir, logPath="AUTO"):
    if logPath == "AUTO":
        logPath = os.path.join(outDir, "log.txt")
    elif logPath == "None":
        logPath = None
    if logPath != None:
        if not os.path.exists(os.path.dirname(logPath)):
            os.makedirs(os.path.dirname(logPath))
        Stream.openLog(logPath)
    return logPath
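beginLog is normally paired with endLog (see Example #24), so that the log is closed only if one was actually opened. A minimal sketch under that assumption; runTask is a hypothetical placeholder for the work done while logging is active:

logPath = beginLog("output", logPath="AUTO")  # resolves to output/log.txt
try:
    runTask()  # hypothetical: work performed inside the logged region
finally:
    endLog(logPath)  # Example #24: closes the log only when logPath is not None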
Example #12
 def process(self,
             source,
             output=None,
             model=None,
             fromStep=None,
             toStep=None,
             omitSteps=None,
             logPath=None):
     if logPath == "AUTO":
         if output != None:
             logPath = output
             if "*" in logPath:
                 logPath = logPath.split("*")[0].rstrip("-")
             logPath = os.path.join(
                 logPath.rstrip("/").rstrip("\\") + "-log.txt")
         else:
             logPath = None
     elif logPath == "None":
         logPath = None
     if logPath != None:
         if not os.path.exists(os.path.dirname(logPath)):
             os.makedirs(os.path.dirname(logPath))
         Stream.openLog(logPath)
     print >> sys.stderr, "Preprocessor steps:", [
         x.name for x in self.steps
     ]
     if len(self.steps) == 0:
         raise Exception("No preprocessing steps defined")
     #if omitSteps != None and((type(omitSteps) in types.StringTypes and omitSteps == "CONVERT") or "CONVERT" in omitSteps):
     #    raise Exception("Preprocessor step 'CONVERT' may not be omitted")
     #if isinstance(source, basestring) and os.path.basename(source).isdigit(): # PMID
     #    print >> sys.stderr, "Preprocessing PubMed abstract", os.path.basename(source)
     #    source = Utils.Download.getPubMed(int(source))
     # Initialize variables and save existing default values
     #self.intermediateFileTag = corpusName
     #parameters = self.getParameters(parameters, model)
     #parameters["CONVERT.dataSetNames"] = sourceDataSetNames
     #parameters["CONVERT.corpusName"] = corpusName
     #convertSetNames = self.stepArgs("CONVERT")["dataSetNames"]
     #convertCorpusName = self.stepArgs("CONVERT")["corpusName"]
     #self.stepArgs("CONVERT")["dataSetNames"] = sourceDataSetNames
     #self.stepArgs("CONVERT")["corpusName"] = corpusName
     # Run the tool chain
     xml = ToolChain.process(self, source, output, model, fromStep, toStep,
                             omitSteps)
     # Reset variables to saved default values
     #self.stepArgs("CONVERT")["dataSetNames"] = convertSetNames
     #self.stepArgs("CONVERT")["corpusName"] = convertCorpusName
     if logPath != None:
         Stream.closeLog(logPath)
     return xml
Example #13
def convert(corpora, outDir, downloadDir=None, redownload=False, makeIntermediateFiles=True, evaluate=False):
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    else:
        assert os.path.isdir(outDir)
    count = 1
    for corpus in corpora:
        print >> sys.stderr, "=======================", "Converting BioNLP'11", corpus, "corpus ("+str(count)+"/"+str(len(corpora))+")", "======================="
        logFileName = outDir + "/conversion/" + corpus + "-conversion-log.txt"
        Stream.openLog(logFileName)
        downloaded = downloadCorpus(corpus, downloadDir, None, redownload)
        convertDownloaded(outDir, corpus, downloaded, makeIntermediateFiles, evaluate)
        Stream.closeLog(logFileName)
        count += 1
Example #14
def convert(inPath, outDir, corpusId, directed, negatives, preprocess, preprocessorParameters=None, debug=False, clear=False, constParser="BLLIP-BIO", depParser="STANFORD-CONVERT", logging=True):
    assert negatives in ("INCLUDE", "SKIP", "REVERSE_POS")
    # Download the corpus if needed
    if inPath == None:
        if not hasattr(Settings, "SE10T8_CORPUS"):
            SemEval2010Task8Tools.install()
        inPath = Settings.SE10T8_CORPUS
    assert os.path.exists(inPath)
    # Prepare the output directory
    if not os.path.exists(outDir):
        print "Making output directory", outDir
        os.makedirs(outDir)
    elif clear:
        print "Removing output directory", outDir
        shutil.rmtree(outDir)
    # Start logging
    if logging:
        Stream.openLog(os.path.join(outDir, "log.txt"), clear=clear)
    # Read and process the corpus files
    archive = zipfile.ZipFile(inPath, 'r')
    usedIds = set()
    tree = None
    for fileName, setName in [("SemEval2010_task8_all_data/SemEval2010_task8_training/TRAIN_FILE.TXT", "train"),\
                              ("SemEval2010_task8_all_data/SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT", "test")]:
        print "Processing file", fileName, "as set", setName
        f = archive.open(fileName)
        tree = processLines(f.readlines(), setName, directed=directed, negatives=negatives, usedIds=usedIds, tree=tree, corpusId=corpusId)
        f.close()
    # Divide the training set into training and development sets
    MakeSets.processCorpus(tree, None, "train", [("train", 0.7), ("devel", 1.0)], 1)
    # Write out the converted corpus
    convertedPath = os.path.join(outDir, corpusId + "-converted.xml")
    ETUtils.write(tree.getroot(), convertedPath)
    # Preprocess the converted corpus
    if preprocess:
        outPath = os.path.join(outDir, corpusId + ".xml")
        preprocessor = Preprocessor(constParser, depParser)
        preprocessor.setArgForAllSteps("debug", debug)
        preprocessor.stepArgs("CONVERT")["corpusName"] = corpusId
        preprocessor.process(convertedPath, outPath, preprocessorParameters, omitSteps=["SPLIT-SENTENCES", "NER", "SPLIT-NAMES"])
    # Stop logging
    if logging:
        Stream.closeLog(os.path.join(outDir, "log.txt"))
Example #15
def optimizeLocal(Classifier,
                  Evaluator,
                  trainExamples,
                  testExamples,
                  classIds,
                  combinations,
                  workDir=None,
                  timeout=None):
    bestResult = None
    combinationCount = 1
    for combination in combinations:
        Stream.setIndent(" ")
        print >> sys.stderr, "Parameters " + str(combinationCount) + "/" + str(
            len(combinations)) + ":", str(combination)
        Stream.setIndent("  ")
        combinationId = getCombinationString(combination)
        # Train
        trainOutput = "model-" + combinationId
        if workDir != None:
            trainOutput = os.path.join(workDir, trainOutput)
        print >> sys.stderr, "Training..."
        timer = Timer()
        Classifier.train(trainExamples, combination, trainOutput)
        print >> sys.stderr, "Training Complete, time:", timer.toString()
        # Test
        testOutput = "classifications-" + combinationId
        if workDir != None:
            testOutput = os.path.join(workDir, testOutput)
        print >> sys.stderr, "Testing..."
        timer = Timer()
        Classifier.test(testExamples, trainOutput, testOutput)
        print >> sys.stderr, "Testing Complete, time:", timer.toString()
        # Evaluate
        evaluationOutput = "evaluation-" + combinationId + ".csv"
        if workDir != None:
            evaluationOutput = os.path.join(workDir, evaluationOutput)
        Stream.setIndent("   ")
        evaluator = Evaluator.evaluate(testExamples, testOutput, classIds,
                                       evaluationOutput)
        #print >> sys.stderr, evaluator.toStringConcise("  ")

        if bestResult == None or evaluator.compare(
                bestResult[0]
        ) > 0:  #: averageResult.fScore > bestResult[1].fScore:
            bestResult = [
                evaluator, trainOutput, testOutput, evaluationOutput,
                combination
            ]
        combinationCount += 1
    Stream.setIndent()
    print >> sys.stderr, "Selected parameters", bestResult[-1]
    return bestResult
Example #16
def convert(corpora,
            outDir,
            downloadDir=None,
            redownload=False,
            makeIntermediateFiles=True,
            evaluate=False):
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    else:
        assert os.path.isdir(outDir)
    count = 1
    for corpus in corpora:
        print >> sys.stderr, "=======================", "Converting BioNLP'11", corpus, "corpus (" + str(
            count) + "/" + str(len(corpora)) + ")", "======================="
        logFileName = outDir + "/conversion/" + corpus + "-conversion-log.txt"
        Stream.openLog(logFileName)
        downloaded = downloadCorpus(corpus, downloadDir, None, redownload)
        convertDownloaded(outDir, corpus, downloaded, makeIntermediateFiles,
                          evaluate)
        Stream.closeLog(logFileName)
        count += 1
Example #17
 def process(self, source, output=None, model=None, fromStep=None, toStep=None, omitSteps=None, logPath=None):
     if logPath == "AUTO":
         if output != None:
             logPath = output
             if "*" in logPath:
                 logPath = logPath.split("*")[0].rstrip("-")
             logPath = os.path.join(logPath.rstrip("/").rstrip("\\") + "-log.txt")
         else:
             logPath = None
     elif logPath == "None":
         logPath = None
     if logPath != None:
         if not os.path.exists(os.path.dirname(logPath)):
             os.makedirs(os.path.dirname(logPath))
         Stream.openLog(logPath)
     print >> sys.stderr, "Preprocessor steps:", [x.name for x in self.steps]
     if len(self.steps) == 0:
         raise Exception("No preprocessing steps defined")
     #if omitSteps != None and((type(omitSteps) in types.StringTypes and omitSteps == "CONVERT") or "CONVERT" in omitSteps):
     #    raise Exception("Preprocessor step 'CONVERT' may not be omitted")
     #if isinstance(source, basestring) and os.path.basename(source).isdigit(): # PMID
     #    print >> sys.stderr, "Preprocessing PubMed abstract", os.path.basename(source)
     #    source = Utils.Download.getPubMed(int(source))   
     # Initialize variables and save existing default values
     #self.intermediateFileTag = corpusName
     #parameters = self.getParameters(parameters, model)
     #parameters["CONVERT.dataSetNames"] = sourceDataSetNames
     #parameters["CONVERT.corpusName"] = corpusName
     #convertSetNames = self.stepArgs("CONVERT")["dataSetNames"]
     #convertCorpusName = self.stepArgs("CONVERT")["corpusName"]
     #self.stepArgs("CONVERT")["dataSetNames"] = sourceDataSetNames
     #self.stepArgs("CONVERT")["corpusName"] = corpusName
     # Run the tool chain
     xml = ToolChain.process(self, source, output, model, fromStep, toStep, omitSteps)
     # Reset variables to saved default values
     #self.stepArgs("CONVERT")["dataSetNames"] = convertSetNames
     #self.stepArgs("CONVERT")["corpusName"] = convertCorpusName
     if logPath != None:
         Stream.closeLog(logPath)
     return xml
Example #18
def optimizeLocal(Classifier, Evaluator, trainExamples, testExamples, classIds, combinations, workDir=None, timeout=None):
    bestResult = None
    combinationCount = 1
    for combination in combinations:
        Stream.setIndent(" ")
        print >> sys.stderr, "Parameters "+str(combinationCount)+"/"+str(len(combinations))+":", str(combination)
        Stream.setIndent("  ")
        combinationId = getCombinationString(combination)
        # Train
        trainOutput = "model-" + combinationId
        if workDir != None:
            trainOutput = os.path.join(workDir, trainOutput)
        print >> sys.stderr, "Training..."
        timer = Timer()
        Classifier.train(trainExamples, combination, trainOutput)
        print >> sys.stderr, "Training Complete, time:", timer.toString()
        # Test
        testOutput = "classifications-" + combinationId
        if workDir != None:
            testOutput = os.path.join(workDir, testOutput)
        print >> sys.stderr, "Testing..."
        timer = Timer()
        Classifier.test(testExamples, trainOutput, testOutput)
        print >> sys.stderr, "Testing Complete, time:", timer.toString()
        # Evaluate
        evaluationOutput = "evaluation-" + combinationId + ".csv"
        if workDir != None:
            evaluationOutput = os.path.join(workDir, evaluationOutput)
        Stream.setIndent("   ")
        evaluator = Evaluator.evaluate(testExamples, testOutput, classIds, evaluationOutput)
        #print >> sys.stderr, evaluator.toStringConcise("  ")

        if bestResult == None or evaluator.compare(bestResult[0]) > 0: #: averageResult.fScore > bestResult[1].fScore:
            bestResult = [evaluator, trainOutput, testOutput, evaluationOutput, combination]
        combinationCount += 1
    Stream.setIndent()
    print >> sys.stderr, "Selected parameters", bestResult[-1]
    return bestResult
Example #19
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"
    from optparse import OptionParser
    optparser = OptionParser(description="A tool chain for making interaction XML, sentence splitting, NER and parsing")
    optparser.add_option("-i", "--input", default=None, dest="input", help="")
    optparser.add_option("-n", "--inputNames", default=None, dest="inputNames", help="")
    optparser.add_option("-c", "--corpus", default=None, dest="corpus", help="corpus name")
    optparser.add_option("-o", "--output", default=None, dest="output", help="output directory")
    optparser.add_option("-p", "--parameters", default=None, dest="parameters", help="preprocessing parameters")
    optparser.add_option("-s", "--step", default=None, dest="step", help="")
    optparser.add_option("-t", "--toStep", default=None, dest="toStep", help="")
    optparser.add_option("--omitSteps", default=None, dest="omitSteps", help="")
    optparser.add_option("--noLog", default=False, action="store_true", dest="noLog", help="")
    optparser.add_option("--debug", default=False, action="store_true", dest="debug", help="")
    optparser.add_option("--requireEntities", default=False, action="store_true", dest="requireEntities", help="")
    (options, args) = optparser.parse_args()
    if options.omitSteps != None:
        options.omitSteps = options.omitSteps.split(",")
    
    if not options.noLog:
        Stream.openLog(os.path.join(options.output + "-log.txt"))
        #log(False, True, os.path.join(options.output, options.corpus + "-log.txt"))
    preprocessor = Preprocessor()
    preprocessor.setArgForAllSteps("debug", options.debug)
    preprocessor.stepArgs("CONVERT")["corpusName"] = options.corpus
    preprocessor.stepArgs("PARSE")["requireEntities"] = options.requireEntities
    preprocessor.process(options.input, options.output, options.parameters, None, options.inputNames, fromStep=options.step, toStep=options.toStep, omitSteps=options.omitSteps)
Example #20
def combine(inputA,
            inputB,
            inputGold,
            outPath=None,
            mode="OR",
            skip=None,
            logPath="AUTO"):
    assert mode in ("AND", "OR")
    if skip != None and isinstance(skip, basestring):
        skip = set(skip.split(","))
    if skip != None:
        print "Skipping interaction types:", skip
    if logPath == "AUTO":
        if outPath != None:
            logPath = os.path.join(
                outPath.rstrip("/").rstrip("\\") + "-log.txt")
        else:
            logPath = None
    if logPath != None:
        if not os.path.exists(os.path.dirname(logPath)):
            os.makedirs(os.path.dirname(logPath))
        Stream.openLog(logPath)
    print "Loading the Interaction XML files"
    print "Loading A from", inputA
    a = ETUtils.ETFromObj(inputA)
    print "Loading B from", inputB
    b = ETUtils.ETFromObj(inputB)
    gold = None
    if inputGold:
        print "Loading gold from", inputGold
        gold = ETUtils.ETFromObj(inputGold) if inputGold else None
    print "Copying a as template"
    template = copy.deepcopy(a)
    print "Calculating confidence score ranges"
    scoreRanges = {}
    scoreRanges["a"] = getScoreRange(a, skip)
    scoreRanges["b"] = getScoreRange(b, skip)
    print scoreRanges
    print "Combining"
    counts = defaultdict(int)
    counts["skipped"] = defaultdict(int)
    counter = ProgressCounter(len([x for x in a.findall("document")]),
                              "Combine")
    for docA, docB, docGold, docTemplate in itertools.izip_longest(
            *[x.findall("document") for x in (a, b, gold, template)]):
        counter.update()
        assert len(
            set([x.get("id")
                 for x in (docA, docB, docGold, docTemplate)])) == 1
        for sentA, sentB, sentGold, sentTemplate in itertools.izip_longest(*[
                x.findall("sentence")
                for x in (docA, docB, docGold, docTemplate)
        ]):
            assert len(
                set([
                    x.get("id") for x in (sentA, sentB, sentGold, sentTemplate)
                ])) == 1
            interactions = getInteractions(sentA, sentB, sentGold, skip,
                                           counts["skipped"])
            for interaction in sentTemplate.findall("interaction"):
                sentTemplate.remove(interaction)
            analyses = sentTemplate.find("analyses")
            if analyses:
                sentTemplate.remove(analyses)
            for key in interactions:
                interaction = getCombinedInteraction(interactions[key], mode,
                                                     counts, scoreRanges)
                if interaction != None:
                    sentTemplate.append(copy.deepcopy(interaction))
            if analyses:
                sentTemplate.append(analyses)
    counts["skipped"] = dict(counts["skipped"])
    print "Counts:", dict(counts)
    if gold != None:
        print "****** Evaluating A ******"
        evaluateChemProt(
            a, gold
        )  #EvaluateIXML.run(AveragingMultiClassEvaluator, a, gold, "McCC")
        print "****** Evaluating B ******"
        evaluateChemProt(
            b, gold
        )  #EvaluateIXML.run(AveragingMultiClassEvaluator, b, gold, "McCC")
        print "****** Evaluating Combined ******"
        evaluateChemProt(
            template, gold
        )  #EvaluateIXML.run(AveragingMultiClassEvaluator, template, gold, "McCC")
    if outPath != None:
        print "Writing output to", outPath
        if outPath.endswith(".tsv"):
            Preprocessor(steps=["EXPORT_CHEMPROT"]).process(template, outPath)
        else:
            ETUtils.write(template, outPath)
    if logPath != None:
        Stream.closeLog(logPath)
Example #21
def convertDDI(outDir, trainUnified=None, trainMTMX=None, testUnified=None, testMTMX=None, downloadDir=None, redownload=False, makeIntermediateFiles=True, debug=False):
    cwd = os.getcwd()
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'11 corpus", "======================="
    
    bigfileName = os.path.join(outDir, "DDI")
    #oldXML = ETUtils.ETFromObj(bigfileName+".xml")
    if trainUnified == None:
        trainUnified = Settings.URL["DDI_TRAIN_UNIFIED"]
    if trainMTMX == None:
        trainMTMX = Settings.URL["DDI_TRAIN_MTMX"]
    if testUnified == None:
        testUnified = Settings.URL["DDI_TEST_UNIFIED"]
    if testMTMX == None:
        testMTMX = Settings.URL["DDI_TEST_MTMX"]
    
    tempdir = tempfile.mkdtemp()
    print >> sys.stderr, "Temporary files directory at", tempdir
    if True:
        documents, docById, docCounts = loadDocs(trainUnified, outDir, tempdir)
        
        sortedDocCounts = sorted(docCounts.iteritems(), key=lambda (k,v): (v,k), reverse=True)
        datasetCounts = {"train":[0,0], "devel":[0,0], "test":[0,0]}
        for i in range(0, len(sortedDocCounts)-3, 4):
            for j in [0,1]:
                docById[sortedDocCounts[i+j][0]].set("set", "train")
                datasetCounts["train"][0] += sortedDocCounts[i+j][1][0]
                datasetCounts["train"][1] += sortedDocCounts[i+j][1][1]
            docById[sortedDocCounts[i+2][0]].set("set", "train") #docById[sortedDocCounts[i+2][0]].set("set", "devel")
            docById[sortedDocCounts[i+3][0]].set("set", "devel") #docById[sortedDocCounts[i+3][0]].set("set", "test")
            datasetCounts["train"][0] += sortedDocCounts[i+2][1][0] #datasetCounts["devel"][0] += sortedDocCounts[i+2][1][0]
            datasetCounts["train"][1] += sortedDocCounts[i+2][1][1] #datasetCounts["devel"][1] += sortedDocCounts[i+2][1][1]
            datasetCounts["devel"][0] += sortedDocCounts[i+3][1][0] #datasetCounts["test"][0] += sortedDocCounts[i+3][1][0]
            datasetCounts["devel"][1] += sortedDocCounts[i+3][1][1] #datasetCounts["test"][1] += sortedDocCounts[i+3][1][1]
        for document in documents: # leftover documents that did not fit the division
            if document.get("set") == None:
                document.set("set", "train")
        
        print datasetCounts
        for key in datasetCounts.keys():
            if datasetCounts[key][1] != 0:
                print key, datasetCounts[key][0] / float(datasetCounts[key][1])
            else:
                print key, datasetCounts[key][0], "/", float(datasetCounts[key][1])
        
        if testUnified != None:
            testDocuments, testDocById, testDocCounts = loadDocs(testUnified, tempdir)
            for document in testDocuments:
                document.set("set", "test")
            documents = documents + testDocuments
        
    xmlTree = ET.ElementTree(ET.Element("corpus"))
    root = xmlTree.getroot()
    root.set("source", "DrugDDI")
    for document in documents:
        root.append(document)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents-notfixed.xml")
    xml = xmlTree
    print >> sys.stderr, "Fixing DDI XML"
    fixEntities(xml)
    convertToInteractions(xml)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents.xml")
    #sys.exit()
        
    if False:
        print >> sys.stderr, "Parsing"
        Tools.CharniakJohnsonParser.parse(xml, bigfileName+"-parsed.xml", tokenizationName=None, parseName="McClosky", requireEntities=True, timeout=10)
        print >> sys.stderr, "Stanford Conversion"
        Tools.StanfordParser.convertXML("McClosky", xml, bigfileName+"-stanford.xml")
    
        #if True:
        #xml = bigfileName + "-stanford.xml"        
        print >> sys.stderr, "Protein Name Splitting"
        splitTarget = "McClosky"
        xml = ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-"+splitTarget, "split-"+splitTarget)
        print >> sys.stderr, "Head Detection"
        xml = FindHeads.findHeads(xml, "split-McClosky", tokenization=None, output=bigfileName+".xml", removeExisting=True)
        print >> sys.stderr, "Dividing into sets"
        InteractionXML.DivideSets.processCorpus(xml, outDir, "DrugDDI-", ".xml", [("devel", "train", "test"), ("devel", "train")])
        #InteractionXML.DivideSets.processCorpus(oldXML, outDir, "DrugDDI-", ".xml", [("devel", "train", "test"), ("devel", "train")])
    #InteractionXML.DivideSets.processCorpus(bigfileName+".xml", outDir, "DrugDDI-", ".xml", [("devel", "train", "test"), ("devel", "train")])
    #if "devel" in [x[0] for x in datasets]:
    #    print >> sys.stderr, "Creating empty devel set"
    #    deletionRules = {"interaction":{},"entity":{"isName":"False"}}
    #    InteractionXML.DeleteElements.processCorpus(corpusName + "-devel.xml", corpusName + "-devel-empty.xml", deletionRules)
    #return xml
    Stream.closeLog(logFileName)
    if not debug:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)
Example #22
def convertDDI13(
        outDir,
        downloadDir=None,
        datasets=["DDI13_TRAIN", "DDI13_TEST_TASK_9.1", "DDI13_TEST_TASK_9.2"],
        redownload=False,
        insertParses=True,
        parse=False,
        makeIntermediateFiles=True,
        debug=False):
    cwd = os.getcwd()
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI13-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'13 corpus", "======================="

    tempdir = tempfile.mkdtemp()
    downloaded = downloadFiles(downloadDir, tempdir, redownload)

    for dataset in datasets:
        corpusTree = getCorpusXML()
        xml = corpusTree.getroot()
        print >> sys.stderr, "Merging input XMLs"
        assert downloaded[dataset] != None
        combineXML(xml,
                   "train",
                   downloaded[dataset],
                   subDirs=["DrugBank", "MedLine", "NER"])
        print >> sys.stderr, "Processing elements"
        processElements(xml)

        if dataset == "DDI13_TRAIN":
            print >> sys.stderr, "Dividing training set into folds"
            divideSets(xml, "train", 10)
        else:
            for doc in xml.getiterator("document"):
                doc.set("set", "test")

        if parse:
            print >> sys.stderr, "Parsing"
            parseXML(corpusTree, os.path.join(tempdir, "parsing"), debug)
        elif insertParses:
            assert parse == False
            print >> sys.stderr, "Inserting McCC parses"
            Tools.BLLIPParser.insertParses(corpusTree,
                                           downloaded[dataset +
                                                      "_TEES_PARSES"],
                                           None,
                                           extraAttributes={"source": "TEES"})
            print >> sys.stderr, "Inserting Stanford conversions"
            Tools.StanfordParser.insertParses(
                corpusTree,
                downloaded[dataset + "_TEES_PARSES"],
                None,
                extraAttributes={"stanfordSource": "TEES"})
        # Check what was produced by the conversion
        print >> sys.stderr, "---------------", "Corpus Structure Analysis", "---------------"
        analyzer = StructureAnalyzer()
        analyzer.analyze([xml])
        print >> sys.stderr, analyzer.toString()
        if "9.1" in dataset:
            outFileName = os.path.join(outDir, "DDI13-test-task9.1.xml")
        elif "9.2" in dataset:
            outFileName = os.path.join(outDir, "DDI13-test-task9.2.xml")
        else:
            outFileName = os.path.join(outDir, "DDI13-train.xml")
        print >> sys.stderr, "Writing output to", outFileName
        ETUtils.write(xml, outFileName)

    Stream.closeLog(logFileName)
    if not debug and tempdir != None:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)
Example #23
def convertDDI(outDir,
               downloadDir=None,
               redownload=False,
               makeIntermediateFiles=True,
               debug=False):
    cwd = os.getcwd()
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI11-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'11 corpus", "======================="
    corpusDir = outDir + "/DDI11-original"
    Utils.Download.downloadAndExtract(Settings.URL["DDI11_CORPUS"], corpusDir,
                                      downloadDir)

    bigfileName = os.path.join(outDir, "DDI11")
    #oldXML = ETUtils.ETFromObj(bigfileName+".xml")
    trainUnified = corpusDir + "/train"
    trainMTMX = corpusDir + "/train_MTMX"
    testUnified = corpusDir + "/test"
    testMTMX = corpusDir + "/test_MTMX"

    # Load main documents
    tempdir = tempfile.mkdtemp()
    print >> sys.stderr, "Temporary files directory at", tempdir
    documents, docById, docCounts = loadDocs(trainUnified)
    # Divide training data into a train and devel set
    sortedDocCounts = sorted(docCounts.iteritems(),
                             key=lambda (k, v): (v, k),
                             reverse=True)
    datasetCounts = {"train": [0, 0], "devel": [0, 0], "test": [0, 0]}
    for i in range(0, len(sortedDocCounts) - 3, 4):
        for j in [0, 1]:
            docById[sortedDocCounts[i + j][0]].set("set", "train")
            datasetCounts["train"][0] += sortedDocCounts[i + j][1][0]
            datasetCounts["train"][1] += sortedDocCounts[i + j][1][1]
        docById[sortedDocCounts[i + 2][0]].set(
            "set",
            "train")  #docById[sortedDocCounts[i+2][0]].set("set", "devel")
        docById[sortedDocCounts[i + 3][0]].set(
            "set",
            "devel")  #docById[sortedDocCounts[i+3][0]].set("set", "test")
        datasetCounts["train"][0] += sortedDocCounts[i + 2][1][
            0]  #datasetCounts["devel"][0] += sortedDocCounts[i+2][1][0]
        datasetCounts["train"][1] += sortedDocCounts[i + 2][1][
            1]  #datasetCounts["devel"][1] += sortedDocCounts[i+2][1][1]
        datasetCounts["devel"][0] += sortedDocCounts[i + 3][1][
            0]  #datasetCounts["test"][0] += sortedDocCounts[i+3][1][0]
        datasetCounts["devel"][1] += sortedDocCounts[i + 3][1][
            1]  #datasetCounts["test"][1] += sortedDocCounts[i+3][1][1]
    for document in documents:  # leftover documents that did not fit the division
        if document.get("set") == None:
            document.set("set", "train")
    # Print division results
    print >> sys.stderr, datasetCounts
    for key in datasetCounts.keys():
        if datasetCounts[key][1] != 0:
            print key, datasetCounts[key][0] / float(datasetCounts[key][1])
        else:
            print key, datasetCounts[key][0], "/", float(datasetCounts[key][1])
    # Some of the train and test ids overlap. Let's change the train set ids, because test set ones are needed
    # for the final evaluation.
    changeIdCount = 1000
    for trainId in [
            'DrugDDI.d312', 'DrugDDI.d316', 'DrugDDI.d332', 'DrugDDI.d334',
            'DrugDDI.d337', 'DrugDDI.d342', 'DrugDDI.d349', 'DrugDDI.d354',
            'DrugDDI.d373', 'DrugDDI.d379', 'DrugDDI.d383', 'DrugDDI.d388',
            'DrugDDI.d392', 'DrugDDI.d396', 'DrugDDI.d398', 'DrugDDI.d409',
            'DrugDDI.d411', 'DrugDDI.d415', 'DrugDDI.d425', 'DrugDDI.d430',
            'DrugDDI.d433', 'DrugDDI.d448', 'DrugDDI.d450', 'DrugDDI.d452',
            'DrugDDI.d462', 'DrugDDI.d467', 'DrugDDI.d470', 'DrugDDI.d474',
            'DrugDDI.d480', 'DrugDDI.d482', 'DrugDDI.d485', 'DrugDDI.d492',
            'DrugDDI.d494', 'DrugDDI.d496', 'DrugDDI.d498', 'DrugDDI.d500',
            'DrugDDI.d503', 'DrugDDI.d506', 'DrugDDI.d518', 'DrugDDI.d523',
            'DrugDDI.d528', 'DrugDDI.d535', 'DrugDDI.d539', 'DrugDDI.d552',
            'DrugDDI.d554', 'DrugDDI.d558', 'DrugDDI.d561', 'DrugDDI.d570',
            'DrugDDI.d578'
    ]:
        newId = "DrugDDI.d" + str(changeIdCount)
        print >> sys.stderr, "Changing train/devel id", trainId, "to", newId
        for element in docById[trainId].getiterator():
            for attrName, attrValue in element.attrib.iteritems():
                if trainId in attrValue:
                    element.set(attrName, attrValue.replace(trainId, newId))
        docById[newId] = docById[trainId]
        del docById[trainId]
        changeIdCount += 1
    # If test set exists, load it, too
    if testUnified != None:
        testDocuments, testDocById, testDocCounts = loadDocs(testUnified)
        for document in testDocuments:
            document.set("set", "test")
        documents = documents + testDocuments
        overlappingIds = []
        for key in docById:
            if key in testDocById:
                overlappingIds.append(key)
        for key in docById:
            assert key not in testDocById, (key, docById[key].get("origId"),
                                            testDocById[key].get("origId"),
                                            sorted(docById.keys()),
                                            sorted(testDocById.keys()),
                                            sorted(overlappingIds))
        docById.update(testDocById)

    # Add all documents into one XML
    xmlTree = ET.ElementTree(ET.Element("corpus"))
    root = xmlTree.getroot()
    root.set("source", "DDI11")
    for document in documents:
        root.append(document)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents-notfixed.xml")
    xml = xmlTree
    print >> sys.stderr, "Fixing DDI XML"
    fixEntities(xml)
    convertToInteractions(xml)
    # Add MTMX
    if trainMTMX != None:
        inDir = Utils.Download.getTopDir(
            tempdir,
            Utils.Download.downloadAndExtract(trainMTMX, tempdir,
                                              outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if testMTMX != None:
        inDir = Utils.Download.getTopDir(
            tempdir,
            Utils.Download.downloadAndExtract(testMTMX, tempdir,
                                              outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents.xml")

    print >> sys.stderr, "---------------", "Inserting TEES-generated analyses", "---------------"
    Utils.Download.downloadAndExtract(Settings.URL["TEES_PARSES"],
                                      os.path.join(Settings.DATAPATH,
                                                   "TEES-parses"),
                                      downloadDir,
                                      redownload=redownload)
    extractedFilename = os.path.join(Settings.DATAPATH,
                                     "TEES-parses") + "/DDI11"
    print >> sys.stderr, "Making sentences"
    Tools.SentenceSplitter.makeSentences(xml, extractedFilename, None)
    print >> sys.stderr, "Inserting McCC parses"
    Tools.BLLIPParser.insertParses(
        xml,
        extractedFilename,
        None,
        extraAttributes={"source": "TEES-preparsed"})
    print >> sys.stderr, "Inserting Stanford conversions"
    Tools.StanfordParser.insertParses(
        xml,
        extractedFilename,
        None,
        extraAttributes={"stanfordSource": "TEES-preparsed"})
    print >> sys.stderr, "Protein Name Splitting"
    splitTarget = "McCC"
    #ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-"+splitTarget, "split-"+splitTarget)
    ProteinNameSplitter.mainFunc(xml, None, splitTarget, removeOld=True)
    print >> sys.stderr, "Head Detection"
    #xml = FindHeads.findHeads(xml, "split-"+splitTarget, tokenization=None, output=None, removeExisting=True)
    xml = FindHeads.findHeads(xml,
                              splitTarget,
                              tokenization=None,
                              output=None,
                              removeExisting=True)

    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outDir, "DDI11", ".xml")

    Stream.closeLog(logFileName)
    if not debug:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)
Example #24
def endLog(logPath):
    if logPath != None:
        Stream.closeLog(logPath)
Example #25
def optimizeCSC(Classifier,
                Evaluator,
                trainExamples,
                testExamples,
                classIds,
                combinations,
                workDir=None,
                timeout=None,
                cscConnection=None,
                downloadAllModels=False,
                steps="BOTH",
                threshold=False):
    bestResult = None
    combinationCount = 1
    combinationIds = []
    assert steps in ["BOTH", "SUBMIT", "RESULTS"], steps

    if type(classIds) == types.StringType:
        classIds = IdSet(filename=classIds)
    if Classifier.__name__ == "MultiLabelClassifier":
        negClass1 = True
        if "classifier" in combinations[0] and combinations[0][
                "classifier"] == "svmperf":
            negClass1 = False
        print "negclass1", negClass1
        Classifier.makeClassFiles(trainExamples,
                                  testExamples,
                                  classIds,
                                  negClass1=negClass1)

    if steps in ["BOTH", "SUBMIT"]:
        print >> sys.stderr, "Initializing runs"
        for combination in combinations:
            Stream.setIndent(" ")
            print >> sys.stderr, "Parameters " + str(
                combinationCount) + "/" + str(
                    len(combinations)) + ":", str(combination)
            # Train
            combinationIds.append(
                Classifier.initTrainAndTestOnLouhi(trainExamples, testExamples,
                                                   combination, cscConnection,
                                                   workDir, classIds))
            combinationCount += 1
    else:
        for combination in combinations:
            idStr = ""
            for key in sorted(combination.keys()):
                idStr += "-" + str(key) + "_" + str(combination[key])
            combinationIds.append(idStr)
    Stream.setIndent()

    if steps in ["BOTH", "RESULTS"]:
        Stream.setIndent(" ")
        print >> sys.stderr, "Waiting for results"
        finished = 0
        louhiTimer = Timer()
        #combinationStatus = {}
        while (True):
            # count finished
            finished = 0
            processStatus = {
                "FINISHED": 0,
                "QUEUED": 0,
                "FAILED": 0,
                "RUNNING": 0
            }
            for id in combinationIds:
                #status = Classifier.getLouhiStatus(id, cscConnection)
                #combinationStatus[id] = status
                #processStatus[status] += 1
                Classifier.getLouhiStatus(id, cscConnection, processStatus,
                                          classIds)
            p = processStatus
            processStatusString = str(p["QUEUED"]) + " queued, " + str(
                p["RUNNING"]) + " running, " + str(
                    p["FINISHED"]) + " finished, " + str(
                        p["FAILED"]) + " failed"
            if processStatus["QUEUED"] + processStatus["RUNNING"] == 0:
                print >> sys.stderr
                print >> sys.stderr, "All runs done (" + processStatusString + ")"
                break
            # decide what to do
            if timeout == None or louhiTimer.getElapsedTime() < timeout:
                sleepString = " [          ]     "
                print >> sys.stderr, "\rWaiting for " + str(
                    len(combinations)
                ) + " on " + cscConnection.machineName + "(" + processStatusString + "),", louhiTimer.elapsedTimeToString(
                ) + sleepString,
                #time.sleep(60)
                sleepTimer = Timer()
                while sleepTimer.getElapsedTime() < 60:
                    steps = int(10 * sleepTimer.getElapsedTime() / 60) + 1
                    sleepString = " [" + steps * "." + (10 -
                                                        steps) * " " + "]     "
                    print >> sys.stderr, "\rWaiting for " + str(
                        len(combinations)
                    ) + " on " + cscConnection.machineName + "(" + processStatusString + "),", louhiTimer.elapsedTimeToString(
                    ) + sleepString,
                    time.sleep(5)
            else:
                print >> sys.stderr
                print >> sys.stderr, "Timed out, ", louhiTimer.elapsedTimeToString(
                )
                break

        print >> sys.stderr, "Evaluating results"
        #if type(testExamples) != types.ListType:
        #    print >> sys.stderr, "Loading examples from file", testExamples
        #    testExamples = ExampleUtils.readExamples(testExamples,False)
        bestCombinationId = None
        for i in range(len(combinationIds)):
            id = combinationIds[i]
            Stream.setIndent(" ")
            # Evaluate
            predictions = Classifier.getLouhiPredictions(
                id, cscConnection, workDir, classIds)
            if predictions == None:
                print >> sys.stderr, "No results for combination" + id
            else:
                if downloadAllModels:
                    modelFileName = Classifier.downloadModel(
                        id, cscConnection, workDir)
                    if workDir != None:
                        modelFileName = os.path.join(workDir, modelFileName)
                        subprocess.call("gzip -fv " + modelFileName,
                                        shell=True)
                print >> sys.stderr, "Evaluating results for combination" + id
                evaluationOutput = "evaluation" + id + ".csv"
                if workDir != None:
                    evaluationOutput = os.path.join(workDir, evaluationOutput)
                evaluator = Evaluator.evaluate(testExamples, predictions,
                                               classIds, evaluationOutput)
                if threshold:
                    print >> sys.stderr, "Thresholding"
                    evaluator.determineThreshold(testExamples, predictions)
                if Classifier.__name__ != "MultiLabelClassifier":
                    if bestResult == None or evaluator.compare(
                            bestResult[0]
                    ) > 0:  #: averageResult.fScore > bestResult[1].fScore:
                        bestResult = [
                            evaluator, None, predictions, evaluationOutput,
                            combinations[i]
                        ]
                        bestCombinationId = id
                else:
                    assert Evaluator.__name__ == "MultiLabelEvaluator", Evaluator.__name__
                    if bestResult == None:
                        bestResult = [{}, None]
                        for className in classIds.Ids:
                            if className != "neg" and "---" not in className:
                                bestResult[0][className] = [
                                    -1, None,
                                    classIds.getId(className), None
                                ]
                    for className in classIds.Ids:
                        if className != "neg" and "---" not in className:
                            fscore = evaluator.dataByClass[classIds.getId(
                                className)].fscore
                            if fscore > bestResult[0][className][0]:
                                bestResult[0][className] = [
                                    fscore, id, bestResult[0][className][2]
                                ]
                                if threshold:
                                    classId = classIds.getId(className, False)
                                    if classId in evaluator.thresholds:
                                        bestResult[0][className].append(
                                            evaluator.thresholds[classId])
                                    else:
                                        bestResult[0][className].append(0.0)
                                else:
                                    bestResult[0][className].append(None)
                    bestCombinationId = bestResult
                os.remove(predictions)  # remove predictions to save space
        Stream.setIndent()
        print >> sys.stderr, "Selected parameters", bestResult[-1]
        #if Classifier.__name__ == "MultiLabelClassifier":
        #    evaluator = Evaluator.evaluate(testExamples, predictions, classIds, evaluationOutput)

        # Download best model and predictions
        modelFileName = Classifier.downloadModel(bestCombinationId,
                                                 cscConnection, workDir)
        if workDir != None:
            modelFileName = os.path.join(workDir, modelFileName)
        subprocess.call("gzip -fv " + modelFileName, shell=True)
        modelFileName = modelFileName + ".gz"
        #if Classifier.__name__ != "MultiLabelClassifier":
        #bestResult = [None, None]
        bestResult[1] = modelFileName
        return bestResult
Example #26
0
        options.triggerExampleBuilder = "PhraseTriggerExampleBuilder"
        options.edgeParams = "10,100,1000,5000,7500,10000,20000,25000,28000,50000,60000,65000,100000,500000,1000000"
        options.recallAdjustParams = "0.8,0.9,0.95,1.0"

# These commands will be in the beginning of most pipelines
WORKDIR=options.output
if options.copyFrom != None:
    if os.path.exists(WORKDIR):
        shutil.rmtree(WORKDIR)
    print >> sys.stderr, "Copying template from", options.copyFrom
    shutil.copytree(options.copyFrom, WORKDIR)
    workdir(WORKDIR, False)
else:
    workdir(WORKDIR, options.clearAll) # Select a working directory, optionally remove existing files
if not options.noLog:
    Stream.openLog("log.txt")
    #log() # Start logging into a file in working directory

## Make downsampling for learning curve
#downSampleTag = "-r" + str(options.downSampleTrain) + "_s" + str(options.downSampleSeed)
#newTrainFile = makeSubset(TRAIN_FILE, options.task + "-train-nodup" + options.extraTag + downSampleTag + ".xml", options.downSampleTrain, options.downSampleSeed)
#makeSubset(TRAIN_FILE.replace("-nodup", ""), options.task + "-train" + options.extraTag + downSampleTag + ".xml", options.downSampleTrain, options.downSampleSeed)
#TRAIN_FILE = newTrainFile

if subTask != None:
    print >> sys.stderr, "Task:", options.task + "." + str(subTask)
else:
    print >> sys.stderr, "Task:", options.task

eventDetector = EventDetector()
eventDetector.debug = options.debug
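The snippet above reads several attributes from an options object defined earlier in the pipeline script. A minimal sketch of the optparse definitions those accesses imply (option names are inferred from the attribute accesses, not taken from the original file):

from optparse import OptionParser

optparser = OptionParser(usage="%prog [options]\nPipeline setup")
optparser.add_option("-o", "--output", default=None, dest="output", help="working directory for the pipeline")
optparser.add_option("-t", "--task", default=None, dest="task", help="task name, e.g. GE11")
optparser.add_option("--copyFrom", default=None, dest="copyFrom", help="existing output directory to use as a template")
optparser.add_option("--clearAll", default=False, action="store_true", dest="clearAll", help="remove an existing working directory")
optparser.add_option("--noLog", default=False, action="store_true", dest="noLog", help="do not write log.txt into the working directory")
optparser.add_option("--debug", default=False, action="store_true", dest="debug", help="verbose debug output")
(options, args) = optparser.parse_args()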
Example #27
0
        "-c",
        "--corpora",
        default="GE",
        dest="corpora",
        help="corpus names in a comma-separated list, e.g. \"GE,EPI,ID\"")
    optparser.add_option("-o",
                         "--outdir",
                         default=os.path.normpath(Settings.DATAPATH +
                                                  "/corpora"),
                         dest="outdir",
                         help="directory for output files")
    optparser.add_option("-d",
                         "--downloaddir",
                         default=None,
                         dest="downloaddir",
                         help="directory to download corpus files to")
    optparser.add_option("--intermediateFiles",
                         default=False,
                         action="store_true",
                         dest="intermediateFiles",
                         help="save intermediate corpus files")
    optparser.add_option("--forceDownload",
                         default=False,
                         action="store_true",
                         dest="forceDownload",
                         help="re-download all source files")
    (options, args) = optparser.parse_args()

    Stream.openLog(os.path.join(options.outdir, "conversion-log.txt"))
    convert(options.corpora.split(","), options.outdir, options.downloaddir,
            options.forceDownload, options.intermediateFiles)
Example #28
0
def convert(inPath,
            outDir,
            corpusId,
            directed,
            negatives,
            preprocess,
            preprocessorParameters=None,
            debug=False,
            clear=False,
            constParser="BLLIP-BIO",
            depParser="STANFORD-CONVERT",
            logging=True):
    assert negatives in ("INCLUDE", "SKIP", "REVERSE_POS")
    # Download the corpus if needed
    if inPath == None:
        if not hasattr(Settings, "SE10T8_CORPUS"):
            SemEval2010Task8Tools.install()
        inPath = Settings.SE10T8_CORPUS
    assert os.path.exists(inPath)
    # Prepare the output directory
    if not os.path.exists(outDir):
        print "Making output directory", outDir
        os.makedirs(outDir)
    elif clear:
        print "Removing output directory", outDir
        shutil.rmtree(outDir)
        os.makedirs(outDir)  # recreate the directory so the log file can be opened below
    # Start logging
    if logging:
        Stream.openLog(os.path.join(outDir, "log.txt"), clear=clear)
    # Read and process the corpus files
    archive = zipfile.ZipFile(inPath, 'r')
    usedIds = set()
    tree = None
    for fileName, setName in [("SemEval2010_task8_all_data/SemEval2010_task8_training/TRAIN_FILE.TXT", "train"),\
                              ("SemEval2010_task8_all_data/SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT", "test")]:
        print "Processing file", fileName, "as set", setName
        f = archive.open(fileName)
        tree = processLines(f.readlines(),
                            setName,
                            directed=directed,
                            negatives=negatives,
                            usedIds=usedIds,
                            tree=tree,
                            corpusId=corpusId)
        f.close()
    # Divide the training set into training and development sets
    MakeSets.processCorpus(tree, None, "train", [("train", 0.7),
                                                 ("devel", 1.0)], 1)
    # Write out the converted corpus
    convertedPath = os.path.join(outDir, corpusId + "-converted.xml")
    ETUtils.write(tree.getroot(), convertedPath)
    # Preprocess the converted corpus
    if preprocess:
        outPath = os.path.join(outDir, corpusId + ".xml")
        preprocessor = Preprocessor(constParser, depParser)
        preprocessor.setArgForAllSteps("debug", debug)
        preprocessor.stepArgs("CONVERT")["corpusName"] = corpusId
        preprocessor.process(
            convertedPath,
            outPath,
            preprocessorParameters,
            omitSteps=["SPLIT-SENTENCES", "NER", "SPLIT-NAMES"])
    # Stop logging
    if logging:
        Stream.closeLog(os.path.join(outDir, "log.txt"))
Example #29
0
def combine(inputA, inputB, inputGold, outPath=None, mode="OR", skip=None, logPath="AUTO"):
    assert options.mode in ("AND", "OR")
    if skip != None and isinstance(skip, basestring):
        skip = set(skip.split(","))
    if skip != None:
        print "Skipping interaction types:", skip
    if logPath == "AUTO":
        if outPath != None:
            logPath = os.path.join(outPath.rstrip("/").rstrip("\\") + "-log.txt")
        else:
            logPath = None
    if logPath != None:
        if not os.path.exists(os.path.dirname(logPath)):
            os.makedirs(os.path.dirname(logPath))
        Stream.openLog(logPath)
    print "Loading the Interaction XML files"
    print "Loading A from", inputA
    a = ETUtils.ETFromObj(inputA)
    print "Loading B from", inputB
    b = ETUtils.ETFromObj(inputB)
    gold = None
    if inputGold:
        print "Loading gold from", inputGold
        gold = ETUtils.ETFromObj(inputGold) if inputGold else None
    print "Copying a as template"
    template = copy.deepcopy(a)
    print "Calculating confidence score ranges"
    scoreRanges = {}
    scoreRanges["a"] = getScoreRange(a, skip)
    scoreRanges["b"] = getScoreRange(b, skip)
    print scoreRanges
    print "Combining"
    counts = defaultdict(int)
    counts["skipped"] = defaultdict(int)
    counter = ProgressCounter(len([x for x in a.findall("document")]), "Combine")
    for docA, docB, docGold, docTemplate in itertools.izip_longest(*[x.findall("document") for x in (a, b, gold, template)]):
        counter.update()
        assert len(set([x.get("id") for x in (docA, docB, docGold, docTemplate)])) == 1
        for sentA, sentB, sentGold, sentTemplate in itertools.izip_longest(*[x.findall("sentence") for x in (docA, docB, docGold, docTemplate)]):
            assert len(set([x.get("id") for x in (sentA, sentB, sentGold, sentTemplate)])) == 1
            interactions = getInteractions(sentA, sentB, sentGold, skip, counts["skipped"])
            for interaction in sentTemplate.findall("interaction"):
                sentTemplate.remove(interaction)
            analyses = sentTemplate.find("analyses") 
            if analyses is not None:
                sentTemplate.remove(analyses)
            for key in interactions:
                interaction = getCombinedInteraction(interactions[key], mode, counts, scoreRanges)
                if interaction != None:
                    sentTemplate.append(copy.deepcopy(interaction))
            if analyses is not None:
                sentTemplate.append(analyses)
    counts["skipped"] = dict(counts["skipped"])
    print "Counts:", dict(counts)
    if gold != None:
        print "****** Evaluating A ******"
        evaluateChemProt(a, gold) #EvaluateIXML.run(AveragingMultiClassEvaluator, a, gold, "McCC")
        print "****** Evaluating B ******"
        evaluateChemProt(b, gold) #EvaluateIXML.run(AveragingMultiClassEvaluator, b, gold, "McCC")
        print "****** Evaluating Combined ******"
        evaluateChemProt(template, gold) #EvaluateIXML.run(AveragingMultiClassEvaluator, template, gold, "McCC")
    if outPath != None:
        print "Writing output to", outPath
        if outPath.endswith(".tsv"):
            Preprocessor(steps=["EXPORT_CHEMPROT"]).process(template, outPath)
        else:
            ETUtils.write(template, outPath)
    if logPath != None:
        Stream.closeLog(logPath)
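The helpers getScoreRange and getCombinedInteraction are defined elsewhere in the same module and are not shown. One plausible way for getCombinedInteraction to compare the two systems' confidences, sketched under the assumption that each interaction element carries a numeric "conf" attribute and that scoreRanges maps "a" and "b" to (min, max) pairs:

def normalizeScore(value, scoreRange):
    # Map a raw confidence into [0, 1] relative to the system's observed range,
    # so that scores from inputs A and B become directly comparable.
    minVal, maxVal = scoreRange
    if maxVal == minVal:
        return 0.5
    return (value - minVal) / float(maxVal - minVal)

# e.g. normalizeScore(float(interaction.get("conf")), scoreRanges["a"])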
Example #30
0
            Tools.StanfordParser.insertParses(xml, tempdir + "/" + os.path.basename(files[corpus + "_" + setName.upper() + "_McCC"])[:-len(".tar.gz")].split("-", 2)[-1] + "/mccc/sd_ccproc", None, extraAttributes={"stanfordSource":"BioNLP'11"})
            print >> sys.stderr, "Removing temporary directory", tempdir
            shutil.rmtree(tempdir)

def processParses(xml, splitTarget="McCC"):
    print >> sys.stderr, "Protein Name Splitting"
    ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-"+splitTarget, "split-"+splitTarget)
    print >> sys.stderr, "Head Detection"
    xml = FindHeads.findHeads(xml, "split-"+splitTarget, tokenization=None, output=None, removeExisting=True)

if __name__=="__main__":
    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    from optparse import OptionParser
    from Utils.Parameters import *
    optparser = OptionParser(usage="%prog [options]\nBioNLP'11 Shared Task corpus conversion")
    optparser.add_option("-c", "--corpora", default="GE", dest="corpora", help="corpus names in a comma-separated list, e.g. \"GE,EPI,ID\"")
    optparser.add_option("-o", "--outdir", default=os.path.normpath(Settings.DATAPATH + "/corpora"), dest="outdir", help="directory for output files")
    optparser.add_option("-d", "--downloaddir", default=None, dest="downloaddir", help="directory to download corpus files to")
    optparser.add_option("--intermediateFiles", default=False, action="store_true", dest="intermediateFiles", help="save intermediate corpus files")
    optparser.add_option("--forceDownload", default=False, action="store_true", dest="forceDownload", help="re-download all source files")
    (options, args) = optparser.parse_args()
    
    Stream.openLog(os.path.join(options.outdir, "conversion-log.txt"))
    convert(options.corpora.split(","), options.outdir, options.downloaddir, options.forceDownload, options.intermediateFiles)
Example #31
0
def endLog(logPath):
    if logPath != None:
        Stream.closeLog(logPath)
Example #32
0
def convertDDI(outDir, downloadDir=None, redownload=False, makeIntermediateFiles=True, debug=False):
    cwd = os.getcwd()
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI11-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'11 corpus", "======================="
    corpusDir = outDir + "/DDI11-original"
    Utils.Download.downloadAndExtract(Settings.URL["DDI11_CORPUS"], corpusDir, downloadDir)
    
    bigfileName = os.path.join(outDir, "DDI11")
    #oldXML = ETUtils.ETFromObj(bigfileName+".xml")
    trainUnified = corpusDir + "/train"
    trainMTMX = corpusDir + "/train_MTMX"
    testUnified = corpusDir + "/test"
    testMTMX = corpusDir + "/test_MTMX"
    
    # Load main documents
    tempdir = tempfile.mkdtemp()
    print >> sys.stderr, "Temporary files directory at", tempdir
    documents, docById, docCounts = loadDocs(trainUnified)
    # Divide training data into a train and devel set
    sortedDocCounts = sorted(docCounts.iteritems(), key=lambda (k,v): (v,k), reverse=True)
    datasetCounts = {"train":[0,0], "devel":[0,0], "test":[0,0]}
    for i in range(0, len(sortedDocCounts)-3, 4):
        for j in [0,1]:
            docById[sortedDocCounts[i+j][0]].set("set", "train")
            datasetCounts["train"][0] += sortedDocCounts[i+j][1][0]
            datasetCounts["train"][1] += sortedDocCounts[i+j][1][1]
        docById[sortedDocCounts[i+2][0]].set("set", "train") #docById[sortedDocCounts[i+2][0]].set("set", "devel")
        docById[sortedDocCounts[i+3][0]].set("set", "devel") #docById[sortedDocCounts[i+3][0]].set("set", "test")
        datasetCounts["train"][0] += sortedDocCounts[i+2][1][0] #datasetCounts["devel"][0] += sortedDocCounts[i+2][1][0]
        datasetCounts["train"][1] += sortedDocCounts[i+2][1][1] #datasetCounts["devel"][1] += sortedDocCounts[i+2][1][1]
        datasetCounts["devel"][0] += sortedDocCounts[i+3][1][0] #datasetCounts["test"][0] += sortedDocCounts[i+3][1][0]
        datasetCounts["devel"][1] += sortedDocCounts[i+3][1][1] #datasetCounts["test"][1] += sortedDocCounts[i+3][1][1]
    for document in documents: # leftover documents that were not assigned above go to the training set
        if document.get("set") == None:
            document.set("set", "train")
    # Print division results
    print >> sys.stderr, datasetCounts
    for key in datasetCounts.keys():
        if datasetCounts[key][1] != 0:
            print key, datasetCounts[key][0] / float(datasetCounts[key][1])
        else:
            print key, datasetCounts[key][0], "/", float(datasetCounts[key][1])
    # Some of the train and test ids overlap. Let's change the train set ids, because test set ones are needed
    # for the final evaluation.
    changeIdCount = 1000
    for trainId in ['DrugDDI.d312', 'DrugDDI.d316', 'DrugDDI.d332', 'DrugDDI.d334', 'DrugDDI.d337', 
                    'DrugDDI.d342', 'DrugDDI.d349', 'DrugDDI.d354', 'DrugDDI.d373', 'DrugDDI.d379', 
                    'DrugDDI.d383', 'DrugDDI.d388', 'DrugDDI.d392', 'DrugDDI.d396', 'DrugDDI.d398', 
                    'DrugDDI.d409', 'DrugDDI.d411', 'DrugDDI.d415', 'DrugDDI.d425', 'DrugDDI.d430', 
                    'DrugDDI.d433', 'DrugDDI.d448', 'DrugDDI.d450', 'DrugDDI.d452', 'DrugDDI.d462', 
                    'DrugDDI.d467', 'DrugDDI.d470', 'DrugDDI.d474', 'DrugDDI.d480', 'DrugDDI.d482', 
                    'DrugDDI.d485', 'DrugDDI.d492', 'DrugDDI.d494', 'DrugDDI.d496', 'DrugDDI.d498', 
                    'DrugDDI.d500', 'DrugDDI.d503', 'DrugDDI.d506', 'DrugDDI.d518', 'DrugDDI.d523', 
                    'DrugDDI.d528', 'DrugDDI.d535', 'DrugDDI.d539', 'DrugDDI.d552', 'DrugDDI.d554', 
                    'DrugDDI.d558', 'DrugDDI.d561', 'DrugDDI.d570', 'DrugDDI.d578']:
        newId = "DrugDDI.d" + str(changeIdCount)
        print >> sys.stderr, "Changing train/devel id", trainId, "to", newId
        for element in docById[trainId].getiterator():
            for attrName, attrValue in element.attrib.iteritems():
                if trainId in attrValue:
                    element.set(attrName, attrValue.replace(trainId, newId))
        docById[newId] = docById[trainId]
        del docById[trainId]
        changeIdCount += 1
    # If test set exists, load it, too
    if testUnified != None:
        testDocuments, testDocById, testDocCounts = loadDocs(testUnified)
        for document in testDocuments:
            document.set("set", "test")
        documents = documents + testDocuments
        overlappingIds = []
        for key in docById:
            if key in testDocById:
                overlappingIds.append(key)
        for key in docById:
            assert key not in testDocById, (key, docById[key].get("origId"), testDocById[key].get("origId"), sorted(docById.keys()), sorted(testDocById.keys()), sorted(overlappingIds))
        docById.update(testDocById)
    
    # Add all documents into one XML
    xmlTree = ET.ElementTree(ET.Element("corpus"))
    root = xmlTree.getroot()
    root.set("source", "DDI11")
    for document in documents:
        root.append(document)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents-notfixed.xml")
    xml = xmlTree
    print >> sys.stderr, "Fixing DDI XML"
    fixEntities(xml)
    convertToInteractions(xml)
    # Add MTMX
    if trainMTMX != None:
        inDir = Utils.Download.getTopDir(tempdir, Utils.Download.downloadAndExtract(trainMTMX, tempdir, outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if testMTMX != None:
        inDir = Utils.Download.getTopDir(tempdir, Utils.Download.downloadAndExtract(testMTMX, tempdir, outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents.xml")



    print >> sys.stderr, "---------------", "Inserting TEES-generated analyses", "---------------"
    Utils.Download.downloadAndExtract(Settings.URL["TEES_PARSES"], os.path.join(Settings.DATAPATH, "TEES-parses"), downloadDir, redownload=redownload)
    extractedFilename = os.path.join(Settings.DATAPATH, "TEES-parses") + "/DDI11"
    print >> sys.stderr, "Making sentences"
    Tools.SentenceSplitter.makeSentences(xml, extractedFilename, None)
    print >> sys.stderr, "Inserting McCC parses"
    Tools.BLLIPParser.insertParses(xml, extractedFilename, None, extraAttributes={"source":"TEES-preparsed"})
    print >> sys.stderr, "Inserting Stanford conversions"
    Tools.StanfordParser.insertParses(xml, extractedFilename, None, extraAttributes={"stanfordSource":"TEES-preparsed"})
    print >> sys.stderr, "Protein Name Splitting"
    splitTarget = "McCC"
    #ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-"+splitTarget, "split-"+splitTarget)
    ProteinNameSplitter.mainFunc(xml, None, splitTarget, removeOld=True)
    print >> sys.stderr, "Head Detection"
    #xml = FindHeads.findHeads(xml, "split-"+splitTarget, tokenization=None, output=None, removeExisting=True)
    xml = FindHeads.findHeads(xml, splitTarget, tokenization=None, output=None, removeExisting=True)    
    
    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outDir, "DDI11", ".xml")
    
    Stream.closeLog(logFileName)
    if not debug:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)
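The id-renaming loop above generalizes into a small helper. A minimal sketch following the same ElementTree conventions (every attribute of the document element and its descendants that mentions the old id is rewritten):

def renameDocId(docElement, oldId, newId):
    # Rewrite the document id wherever it appears so that sentence, entity and
    # interaction ids stay consistent with the renamed document.
    for element in docElement.getiterator():
        for attrName, attrValue in element.attrib.items():
            if oldId in attrValue:
                element.set(attrName, attrValue.replace(oldId, newId))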
Example #33
0
def train(output, task=None, detector=None, inputFiles=None, models=None, parse=None,
          processUnmerging=None, processModifiers=None, 
          bioNLPSTParams=None, preprocessorParams=None, exampleStyles=None, 
          classifierParams=None,  doFullGrid=False, deleteOutput=False, copyFrom=None, 
          log="log.txt", step=None, omitSteps=None, debug=False, connection=None, subset=None, 
          folds=None, corpusDir=None, corpusPreprocessing=None, evaluator=None):
    """
    Train a new model for event or relation detection.
    
    @param output: A directory where output files will appear.
    @param task: If defined, overridable default settings are used for many of the training parameters. Must be one of the supported TEES tasks.
    @param detector: a Detector object, or a string defining one to be imported
    @param inputFiles: A dictionary of file names, with keys "train", "devel" and, "test"
    @param models: A dictionary of file names defining the place for the new models, with keys "devel" and, "test"
    @param parse: The parse element name in the training interaction XML
    @param processUnmerging: Use the unmerging step of EventDetector. True, False or None for task default.
    @param processModifiers: Use the modifier detection step of EventDetector. True, False or None for task default.
    @param bioNLPSTParams: Parameters controlling BioNLP ST format output.
    @param preprocessorParams: Parameters controlling the preprocessor. Not used for training, but saved to the model for use when classifying.
    @param exampleStyles: A parameter set for controlling example builders.
    @param classifierParams: A parameter set for controlling classifiers.
    @param doFullGrid: Whether all parameters, as opposed to just recall adjustment, are tested in the EventDetector grid search.
    @param deleteOutput: Remove an existing output directory
    @param copyFrom: Copy an existing output directory for use as a template
    @param log: An optional alternative name for the log file. None is for no logging.
    @param step: A step=substep pair, where the steps are "TRAIN", "DEVEL", "EMPTY" and "TEST"
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param connection: A parameter set defining a local or remote connection for training the classifier
    @param subset: A parameter set for making subsets of input files
    """
    # Insert default arguments where needed
    inputFiles = setDictDefaults(inputFiles, {"train":None, "devel":None, "test":None})
    models = setDictDefaults(models, {"devel":"model-devel", "test":"model-test"})
    exampleStyles = setDictDefaults(exampleStyles, {"examples":None, "trigger":None, "edge":None, "unmerging":None, "modifiers":None})
    classifierParams = setDictDefaults(classifierParams, {"examples":None, "trigger":None, "recall":None, "edge":None, "unmerging":None, "modifiers":None})
    subset = setDictDefaults(Parameters.get(subset), {"train":None, "devel":None, "test":None, "seed":0, "all":None})
    folds = setDictDefaults(folds, {"train":None, "devel":None, "test":None})
    processUnmerging = getDefinedBool(processUnmerging)
    processModifiers = getDefinedBool(processModifiers)
    # Initialize working directory
    workdir(output, deleteOutput, copyFrom, log)
    # Get task specific parameters
    useKerasDetector = False
    if detector != None and "keras" in detector.lower():
        print >> sys.stderr, "Using a Keras Detector"
        useKerasDetector = True
        if detector.lower() == "keras":
            detector = None
    detector, bioNLPSTParams, preprocessorParams, folds = getTaskSettings(task, detector, 
        bioNLPSTParams, preprocessorParams, inputFiles, exampleStyles, classifierParams, folds, corpusDir=corpusDir, useKerasDetector=useKerasDetector)
    # Learn training settings from input files
    detector = learnSettings(inputFiles, detector, classifierParams, task, exampleStyles, useKerasDetector=useKerasDetector)   
    # Get corpus subsets   
    getFolds(inputFiles, folds)
    getSubsets(inputFiles, subset)
    if task != None: 
        task = task.replace("-FULL", "")
    if "." in task:
        _, subTask = getSubTask(task)
        if subTask != 3:
            processModifiers = False
    # Preprocess the corpus if required
    if corpusPreprocessing != None:
        preprocessor = Preprocessor(steps=corpusPreprocessing)
        assert preprocessor.steps[0].name == "MERGE_SETS"
        assert preprocessor.steps[-1].name == "DIVIDE_SETS"
        preprocessedCorpusDir = os.path.join(output, "corpus")
        #outputFiles = {x:os.path.join(preprocessedCorpusDir, os.path.basename(inputFiles[x])) for x in inputFiles}
        preprocessor.process(inputFiles, os.path.join(preprocessedCorpusDir, task))
        #inputFiles = outputFiles
        for setName in inputFiles.keys():
            if inputFiles[setName] != None:
                inputFiles[setName] = os.path.join(preprocessedCorpusDir, task + "-" + setName + ".xml")
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(step, omitSteps, ["TRAIN", "DEVEL", "EMPTY", "TEST"])
    
    # Initialize the detector
    detector, detectorName = getDetector(detector, evaluator=evaluator)
    evaluator, evaluatorName = importClass(evaluator, "evaluator")
    detector = detector() # initialize object
    if evaluator != None:
        print >> sys.stderr, "Using evaluator", evaluator.__name__
        detector.evaluator = evaluator
    detector.debug = debug
    detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(bioNLPSTParams)
    #detector.useBioNLPSTFormat = useBioNLPSTFormat # classify-output and grid evaluation in ST-format
    #detector.stWriteScores = True # write confidence scores into additional st-format files
    connection = getConnection(connection)
    detector.setConnection(connection)
    connection.debug = debug
    if deleteOutput:
        connection.clearWorkDir()
    
    # Train
    if selector.check("TRAIN"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------------ Train Detector ------------------"
        print >> sys.stderr, "----------------------------------------------------"
        if not isinstance(detector, EventDetector):
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["examples"], classifierParams["examples"], parse, None, task,
                           fromStep=detectorSteps["TRAIN"], workDir="training", testData=inputFiles["test"])
        else:
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["trigger"], exampleStyles["edge"], exampleStyles["unmerging"], exampleStyles["modifiers"],
                           classifierParams["trigger"], classifierParams["edge"], classifierParams["unmerging"], classifierParams["modifiers"],
                           classifierParams["recall"], processUnmerging, processModifiers, 
                           doFullGrid, task, parse, None,
                           fromStep=detectorSteps["TRAIN"], workDir="training", testData=inputFiles["test"])
        # Save the detector type
        for model in [models["devel"], models["test"]]:
            if model != None and os.path.exists(model):
                model = Model(model, "a")
                model.addStr("detector", detectorName)
                if evaluatorName != None:
                    model.addStr("detector", evaluatorName)
                if preprocessorParams != None:
                    preprocessor = Preprocessor()
                    model.addStr("preprocessorParams", Parameters.toString(preprocessor.getParameters(preprocessorParams)))
                model.save()
                model.close()
    if selector.check("DEVEL"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Check devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
        detector.classify(inputFiles["devel"], models["devel"], "classification-devel/devel", goldData=inputFiles["devel"], fromStep=detectorSteps["DEVEL"], workDir="classification-devel")
    if selector.check("EMPTY"):
        # By passing an emptied devel set through the prediction system, we can check that we get the same predictions
        # as in the DEVEL step, ensuring the model does not use leaked information.
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Empty devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
        removalScope = "non-given"
        if "names" in str(exampleStyles["examples"]) or "names" in str(exampleStyles["trigger"]):
            removalScope = "all"
        elif "Edge" in detector.__class__.__name__:
            removalScope = "interactions"
        detector.classify(getEmptyCorpus(inputFiles["devel"], scope=removalScope), models["devel"], "classification-empty/devel-empty", fromStep=detectorSteps["EMPTY"], workDir="classification-empty")
        print >> sys.stderr, "*** Evaluate empty devel classification ***"
        if os.path.exists("classification-empty/devel-empty-pred.xml.gz"):
            EvaluateInteractionXML.run(detector.evaluator, "classification-empty/devel-empty-pred.xml.gz", inputFiles["devel"], parse)
        else:
            print >> sys.stderr, "No output file for evaluation"
    if selector.check("TEST"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------- Test set classification --------------"
        print >> sys.stderr, "----------------------------------------------------"
        if inputFiles["test"] == None or not os.path.exists(inputFiles["test"]):
            print >> sys.stderr, "Skipping, test file", inputFiles["test"], "does not exist"
        else:
            #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
            detector.classify(inputFiles["test"], models["test"] if models["test"] != None else models["devel"], "classification-test/test", fromStep=detectorSteps["TEST"], workDir="classification-test")
            if detector.bioNLPSTParams["convert"]:
                extension = ".zip" if (detector.bioNLPSTParams["convert"] == "zip") else ".tar.gz" 
                Utils.STFormat.Compare.compare("classification-test/test-events" + extension, "classification-devel/devel-events" + extension, "a2")
    # Stop logging
    if log != None:
        Stream.closeLog(log)
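For reference, a hedged sketch of how this entry point might be called; the file names, output directory and task name below are placeholders, not defaults from the project:

# Illustrative call from the same module; see the docstring above for the
# meaning of each argument.
train(output="/tmp/GE11-training",
      task="GE11",
      inputFiles={"train": "GE11-train.xml", "devel": "GE11-devel.xml", "test": None},
      models={"devel": "model-devel", "test": "model-test"},
      parse="McCC",
      log="log.txt",
      debug=False)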
Example #34
0
                         help="")
    optparser.add_option("--debug",
                         default=False,
                         action="store_true",
                         dest="debug",
                         help="")
    optparser.add_option("--requireEntities",
                         default=False,
                         action="store_true",
                         dest="requireEntities",
                         help="")
    (options, args) = optparser.parse_args()
    if options.omitSteps != None:
        options.omitSteps = options.omitSteps.split(",")

    if not options.noLog:
        Stream.openLog(os.path.join(options.output + "-log.txt"))
        #log(False, True, os.path.join(options.output, options.corpus + "-log.txt"))
    preprocessor = Preprocessor()
    preprocessor.setArgForAllSteps("debug", options.debug)
    preprocessor.stepArgs("CONVERT")["corpusName"] = options.corpus
    preprocessor.stepArgs("PARSE")["requireEntities"] = options.requireEntities
    preprocessor.process(options.input,
                         options.output,
                         options.parameters,
                         None,
                         options.inputNames,
                         fromStep=options.step,
                         toStep=options.toStep,
                         omitSteps=options.omitSteps)
Example #35
0
def classify(input,
             model,
             output,
             workDir=None,
             step=None,
             omitSteps=None,
             goldInput=None,
             detector=None,
             debug=False,
             clear=False,
             preprocessorTag="-preprocessed.xml.gz",
             preprocessorParams=None,
             bioNLPSTParams=None):
    """
    Detect events or relations from text.
    
    @param input: The input file in either interaction XML or BioNLP ST format. Can also be a PMID or TEES default corpus name.
    @param model: A path to a model file or the name of a TEES default model.
    @param output: The output file stem. Output files will be of the form output-*
    @param workDir: If intermediate files need to be saved, they will go here.
    @param step: A step=substep pair, where the steps are PREPROCESS and CLASSIFY
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param goldInput: a version of the corpus file with gold annotation. Enables measuring of performance
    @param detector: a Detector object, or a string defining one to be imported. If None, will be read from model.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param clear: Remove existing workDir
    @param preprocessorTag: preprocessor output file will be output + preprocessorTag
    @param preprocessorParams: Optional parameters controlling preprocessing. If None, will be read from model.
    @param bioNLPSTParams: Optional parameters controlling BioNLP ST format output. If None, will be read from model.
    """
    input = os.path.abspath(input)
    if goldInput != None: goldInput = os.path.abspath(goldInput)
    if model != None: model = os.path.abspath(model)
    # Initialize working directory
    if workDir != None:  # use a permanent work directory
        workdir(workDir, clear)
    Stream.openLog(output + "-log.txt")  # log in the output directory
    # Get input files
    input, preprocess = getInput(input)
    model = getModel(model)
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(
        step, omitSteps, ["PREPROCESS", "CLASSIFY"])
    if not preprocess:
        selector.markOmitSteps("PREPROCESS")

    classifyInput = input
    if selector.check("PREPROCESS"):
        if preprocessorParams == None:
            preprocessorParams = [
                "LOAD", "GENIA_SPLITTER", "BANNER", "BLLIP_BIO",
                "STANFORD_CONVERT", "SPLIT_NAMES", "FIND_HEADS", "SAVE"
            ]
        preprocessor = Preprocessor(preprocessorParams)
        if debug:
            preprocessor.setArgForAllSteps("debug", True)
        preprocessorOutput = output + preprocessorTag
        #preprocessor.debug = debug
        #preprocessor.source = input # This has to be defined already here, needs to be fixed later
        #preprocessor.requireEntitiesForParsing = True # parse only sentences which contain named entities
        if os.path.exists(
                preprocessorOutput
        ) and not clear:  #os.path.exists(preprocessor.getOutputPath("FIND-HEADS")):
            #print >> sys.stderr, "Preprocessor output", preprocessor.getOutputPath("FIND-HEADS"), "exists, skipping preprocessing."
            print >> sys.stderr, "Preprocessor output", preprocessorOutput, "exists, skipping preprocessing."
            classifyInput = preprocessorOutput  # preprocessor.getOutputPath("FIND-HEADS")
        else:
            #print >> sys.stderr, "Preprocessor output", preprocessor.getOutputPath("FIND-HEADS"), "does not exist"
            print >> sys.stderr, "Preprocessor output", preprocessorOutput, "does not exist"
            print >> sys.stderr, "------------ Preprocessing ------------"
            # Remove some of the unnecessary intermediate files
            #preprocessor.setIntermediateFiles({"Convert":None, "SPLIT-SENTENCES":None, "PARSE":None, "CONVERT-PARSE":None, "SPLIT-NAMES":None})
            # Process input into interaction XML
            classifyInput = preprocessor.process(input, preprocessorOutput,
                                                 model)

    if selector.check("CLASSIFY"):
        detector = getDetector(detector,
                               model)[0]()  # initialize detector object
        detector.debug = debug
        detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(
            bioNLPSTParams, model)
        detector.classify(classifyInput,
                          model,
                          output,
                          goldData=goldInput,
                          fromStep=detectorSteps["CLASSIFY"],
                          omitSteps=omitDetectorSteps["CLASSIFY"],
                          workDir=workDir)
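Correspondingly, a minimal classification call might look as follows; the model name and paths are illustrative (the docstring above notes that input can also be an interaction XML file or a PMID):

# Illustrative call from the same module.
classify(input="input-corpus.xml",
         model="GE11-devel",              # assumed to name a trained TEES model
         output="/tmp/classified/GE11",
         workDir="/tmp/classify-work",
         clear=True,
         debug=False)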
Example #36
0
         options.triggerExampleBuilder = "PhraseTriggerExampleBuilder"
         options.edgeParams = "10,100,1000,5000,7500,10000,20000,25000,28000,50000,60000,65000,100000,500000,1000000"
         options.recallAdjustParams = "0.8,0.9,0.95,1.0"
 
 # These commands will be in the beginning of most pipelines
 WORKDIR=options.output
 if options.copyFrom != None:
     if os.path.exists(WORKDIR):
         shutil.rmtree(WORKDIR)
     print >> sys.stderr, "Copying template from", options.copyFrom
     shutil.copytree(options.copyFrom, WORKDIR)
     workdir(WORKDIR, False)
 else:
     workdir(WORKDIR, options.clearAll) # Select a working directory, optionally remove existing files
 if not options.noLog:
     Stream.openLog("log.txt")
     #log() # Start logging into a file in working directory
 
 print >> sys.stderr, "Importing detector", options.detector
 Detector = eval("from " + options.detector + " import " + options.detector.split(".")[-1])
 detector = Detector()
 detector.debug = options.debug
 detector.stWriteScores = True # write confidence scores into additional st-format files
 detector.setConnection(getConnection(options.connection)).debug = options.debug
 # Pre-calculate all the required SVM models
 if selector.check("TRAIN"):
     print >> sys.stderr, "----------------------------------------------------"
     print >> sys.stderr, "------------------ Train Detector ------------------"
     print >> sys.stderr, "----------------------------------------------------"
     if options.singleStage:
         detector.train(trainFile, develFile, options.develModel, options.testModel,
Example #37
0
def optimizeCSC(Classifier, Evaluator, trainExamples, testExamples, classIds, combinations, workDir=None, timeout=None, cscConnection=None, downloadAllModels=False, steps="BOTH", threshold=False):
    bestResult = None
    combinationCount = 1
    combinationIds = []
    assert steps in ["BOTH", "SUBMIT", "RESULTS"], steps
    
    if type(classIds) == types.StringType:
        classIds = IdSet(filename=classIds)
    if Classifier.__name__ == "MultiLabelClassifier":
        negClass1 = True
        if "classifier" in combinations[0] and combinations[0]["classifier"] == "svmperf":
            negClass1 = False
        print "negclass1", negClass1
        Classifier.makeClassFiles(trainExamples, testExamples, classIds, negClass1=negClass1)
    
    if steps in ["BOTH", "SUBMIT"]:
        print >> sys.stderr, "Initializing runs"
        for combination in combinations:
            Stream.setIndent(" ")
            print >> sys.stderr, "Parameters "+str(combinationCount)+"/"+str(len(combinations))+":", str(combination)
            # Train
            combinationIds.append(Classifier.initTrainAndTestOnLouhi(trainExamples, testExamples, combination, cscConnection, workDir, classIds) )
            combinationCount += 1
    else:
        for combination in combinations:
            idStr = ""
            for key in sorted(combination.keys()):
                idStr += "-" + str(key) + "_" + str(combination[key])
            combinationIds.append(idStr)
    Stream.setIndent()
    
    if steps in ["BOTH", "RESULTS"]:
        Stream.setIndent(" ")
        print >> sys.stderr, "Waiting for results"
        finished = 0
        louhiTimer = Timer()
        #combinationStatus = {}
        while(True):
            # count finished
            finished = 0
            processStatus = {"FINISHED":0, "QUEUED":0, "FAILED":0, "RUNNING":0}
            for id in combinationIds:
                #status = Classifier.getLouhiStatus(id, cscConnection)
                #combinationStatus[id] = status
                #processStatus[status] += 1
                Classifier.getLouhiStatus(id, cscConnection, processStatus, classIds)
            p = processStatus
            processStatusString = str(p["QUEUED"]) + " queued, " + str(p["RUNNING"]) + " running, " + str(p["FINISHED"]) + " finished, " + str(p["FAILED"]) + " failed"
            if processStatus["QUEUED"] + processStatus["RUNNING"] == 0:
                print >> sys.stderr
                print >> sys.stderr, "All runs done (" + processStatusString + ")"
                break
            # decide what to do
            if timeout == None or louhiTimer.getElapsedTime() < timeout:
                sleepString = " [          ]     "
                print >> sys.stderr, "\rWaiting for " + str(len(combinations)) + " on " + cscConnection.machineName + "(" + processStatusString + "),", louhiTimer.elapsedTimeToString() + sleepString,
                #time.sleep(60)
                sleepTimer = Timer()
                while sleepTimer.getElapsedTime() < 60:
                    steps = int(10 * sleepTimer.getElapsedTime() / 60) + 1
                    sleepString = " [" + steps * "." + (10-steps) * " " + "]     "
                    print >> sys.stderr, "\rWaiting for " + str(len(combinations)) + " on " + cscConnection.machineName + "(" + processStatusString + "),", louhiTimer.elapsedTimeToString() + sleepString,
                    time.sleep(5)                
            else:
                print >> sys.stderr
                print >> sys.stderr, "Timed out, ", louhiTimer.elapsedTimeToString()
                break
        
        print >> sys.stderr, "Evaluating results"
        #if type(testExamples) != types.ListType:
        #    print >> sys.stderr, "Loading examples from file", testExamples
        #    testExamples = ExampleUtils.readExamples(testExamples,False)
        bestCombinationId = None
        for i in range(len(combinationIds)):
            id = combinationIds[i]
            Stream.setIndent(" ")
            # Evaluate
            predictions = Classifier.getLouhiPredictions(id, cscConnection, workDir, classIds)
            if predictions == None:
                print >> sys.stderr, "No results for combination" + id
            else:
                if downloadAllModels:
                    modelFileName = Classifier.downloadModel(id, cscConnection, workDir)
                    if workDir != None:
                        modelFileName = os.path.join(workDir, modelFileName)
                        subprocess.call("gzip -fv " + modelFileName, shell=True)
                print >> sys.stderr, "Evaluating results for combination" + id
                evaluationOutput = "evaluation" + id + ".csv"
                if workDir != None:
                    evaluationOutput = os.path.join(workDir, evaluationOutput)
                evaluator = Evaluator.evaluate(testExamples, predictions, classIds, evaluationOutput)
                if threshold:
                    print >> sys.stderr, "Thresholding"
                    evaluator.determineThreshold(testExamples, predictions)
                if Classifier.__name__ != "MultiLabelClassifier":
                    if bestResult == None or evaluator.compare(bestResult[0]) > 0: #: averageResult.fScore > bestResult[1].fScore:
                        bestResult = [evaluator, None, predictions, evaluationOutput, combinations[i]]
                        bestCombinationId = id
                else:
                    assert Evaluator.__name__ == "MultiLabelEvaluator", Evaluator.__name__
                    if bestResult == None:
                        bestResult = [{}, None]
                        for className in classIds.Ids:
                            if className != "neg" and "---" not in className:
                                bestResult[0][className] = [-1, None, classIds.getId(className), None]
                    for className in classIds.Ids:
                        if className != "neg" and "---" not in className:
                            fscore = evaluator.dataByClass[classIds.getId(className)].fscore
                            if fscore > bestResult[0][className][0]:
                                bestResult[0][className] = [fscore, id, bestResult[0][className][2]]
                                if threshold:
                                    classId = classIds.getId(className, False)
                                    if classId in evaluator.thresholds:
                                        bestResult[0][className].append(evaluator.thresholds[classId])
                                    else:
                                        bestResult[0][className].append(0.0)
                                else:
                                    bestResult[0][className].append(None)
                    bestCombinationId = bestResult
                os.remove(predictions) # remove predictions to save space
        Stream.setIndent()
        print >> sys.stderr, "Selected parameters", bestResult[-1]
        #if Classifier.__name__ == "MultiLabelClassifier":
        #    evaluator = Evaluator.evaluate(testExamples, predictions, classIds, evaluationOutput)
    
        # Download best model and predictions
        modelFileName = Classifier.downloadModel(bestCombinationId, cscConnection, workDir)
        if workDir != None:
            modelFileName = os.path.join(workDir, modelFileName)
        subprocess.call("gzip -fv " + modelFileName, shell=True)
        modelFileName = modelFileName + ".gz"
        #if Classifier.__name__ != "MultiLabelClassifier":
            #bestResult = [None, None]
        bestResult[1] = modelFileName
        return bestResult
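The combinations argument above is a list of parameter dictionaries, one per grid point. A hedged sketch of how such a grid can be built from per-parameter value lists (the helper name and the example values are illustrative):

import itertools

def makeCombinations(paramGrid):
    # paramGrid maps a parameter name to the values to try, e.g.
    # {"c": [1000, 10000, 100000], "classifier": ["svmperf"]}.
    names = sorted(paramGrid.keys())
    combinations = []
    for values in itertools.product(*[paramGrid[name] for name in names]):
        combinations.append(dict(zip(names, values)))
    return combinations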