Example #1
def classify(input, model, output, workDir=None, step=None, omitSteps=None, 
             goldInput=None, detector=None, debug=False, clear=False, 
             preprocessorTag="-preprocessed.xml.gz", preprocessorParams=None, bioNLPSTParams=None):
    """
    Detect events or relations from text.
    
    @param input: The input file in either interaction XML or BioNLP ST format. Can also be a PMID or TEES default corpus name.
    @param model: A path to a model file or the name of a TEES default model.
    @param output: The output file stem. Output files will be of the form output-*
    @param workDir: If intermediate files need to be saved, they will go here.
    @param step: A step=substep pair, where the steps are PREPROCESS and CLASSIFY
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param goldInput: A version of the corpus file with gold annotation. Enables performance measurement.
    @param detector: A Detector object, or a string defining one to be imported. If None, it will be read from the model.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param clear: Remove existing workDir
    @param preprocessorTag: The preprocessor output file will be named output + preprocessorTag
    @param preprocessorParams: Optional parameters controlling preprocessing. If None, will be read from model.
    @param bioNLPSTParams: Optional parameters controlling BioNLP ST format output. If None, will be read from model.
    """
    input = os.path.abspath(input)
    if goldInput != None: goldInput = os.path.abspath(goldInput)
    if model != None: model = os.path.abspath(model)
    # Initialize working directory
    if workDir != None: # use a permanent work directory
        workdir(workDir, clear)
    Stream.openLog(output + "-log.txt") # log in the output directory
    # Get input files
    input, preprocess = getInput(input)
    model = getModel(model)
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(step, omitSteps, ["PREPROCESS", "CLASSIFY"])
    if not preprocess:
        selector.markOmitSteps("PREPROCESS")
    
    classifyInput = input
    if selector.check("PREPROCESS"):
        preprocessor = Preprocessor()
        preprocessorOutput = output + preprocessorTag
        #preprocessor.debug = debug
        #preprocessor.source = input # This has to be defined already here, needs to be fixed later
        #preprocessor.requireEntitiesForParsing = True # parse only sentences which contain named entities
        if os.path.exists(preprocessorOutput) and not clear: #os.path.exists(preprocessor.getOutputPath("FIND-HEADS")):
            #print >> sys.stderr, "Preprocessor output", preprocessor.getOutputPath("FIND-HEADS"), "exists, skipping preprocessing."
            print >> sys.stderr, "Preprocessor output", preprocessorOutput, "exists, skipping preprocessing."
            classifyInput = preprocessorOutput # preprocessor.getOutputPath("FIND-HEADS")
        else:
            #print >> sys.stderr, "Preprocessor output", preprocessor.getOutputPath("FIND-HEADS"), "does not exist"
            print >> sys.stderr, "Preprocessor output", preprocessorOutput, "does not exist"
            print >> sys.stderr, "------------ Preprocessing ------------"
            # Remove some of the unnecessary intermediate files
            #preprocessor.setIntermediateFiles({"Convert":None, "SPLIT-SENTENCES":None, "PARSE":None, "CONVERT-PARSE":None, "SPLIT-NAMES":None})
            # Process input into interaction XML
            classifyInput = preprocessor.process(input, preprocessorOutput, preprocessorParams, model, [], fromStep=detectorSteps["PREPROCESS"], toStep=None, omitSteps=omitDetectorSteps["PREPROCESS"])
    
    if selector.check("CLASSIFY"):
        detector = getDetector(detector, model)[0]() # initialize detector object
        detector.debug = debug
        detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(bioNLPSTParams, model)
        detector.classify(classifyInput, model, output, goldData=goldInput, fromStep=detectorSteps["CLASSIFY"], omitSteps=omitDetectorSteps["CLASSIFY"], workDir=workDir)
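
A minimal invocation sketch for the classify() function above. The import path and model name are assumptions (in the TEES repository the function is defined in the top-level classify.py, and default model names such as "GE11" are installed separately):

from classify import classify  # assumed import path

# Detect events with a default model; results are written as output-* files.
classify("17538899",                # input: a PMID, default corpus name, or XML/ST file
         "GE11",                    # assumed default model name (or a path to a model file)
         "/tmp/tees-out/events",    # output file stem
         workDir="/tmp/tees-work",  # keep intermediate files here
         debug=False)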
Example #2
def evaluateChemProt(xml, gold):
    EvaluateIXML.run(AveragingMultiClassEvaluator, xml, gold, "McCC")
    preprocessor = Preprocessor(steps=["EXPORT_CHEMPROT"])
    tempDir = tempfile.mkdtemp()
    print >> sys.stderr, "Using temporary evaluation directory", tempDir
    tsvPath = os.path.join(tempDir, "predictions.tsv")
    preprocessor.process(xml, tsvPath)
    ChemProtEvaluator().evaluateTSV(tsvPath, tempDir)
    print >> sys.stderr, "Removing temporary evaluation directory", tempDir
    shutil.rmtree(tempDir)
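
A usage sketch for the helper above, assuming the surrounding module's imports (EvaluateIXML, Preprocessor, ChemProtEvaluator) are in scope and that both arguments are interaction XML files containing the "McCC" parse:

# Score predicted interactions against the gold corpus; the ChemProt TSV
# export and its temporary directory are handled inside the function.
evaluateChemProt("chemprot-devel-pred.xml.gz", "chemprot-devel-gold.xml.gz")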
Example #3
def parseXML(xml, intermediateFileDir, debug=False):
    preprocessor = Preprocessor()
    preprocessor.setArgForAllSteps("debug", debug)
    preprocessor.stepArgs("PARSE")["requireEntities"] = False
    #preprocessor.process(xml, intermediateFileDir, fromStep="SPLIT-SENTENCES", toStep="FIND-HEADS", omitSteps=["NER"])
    #preprocessor.process(xml, intermediateFileDir, fromStep="PARSE", toStep="FIND-HEADS")
    # Entity name splitting is omitted as this data may be used for predicting entities
    preprocessor.process(xml, intermediateFileDir, omitSteps=["CONVERT", "SPLIT-SENTENCES", "NER", "SPLIT-NAMES", "DIVIDE-SETS"])
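
A hedged usage sketch for parseXML() above (file names illustrative). Because CONVERT, SPLIT-SENTENCES, NER and SPLIT-NAMES are omitted inside the function, the input corpus must already be sentence-split:

# Re-parse an interaction XML corpus, keeping the per-step intermediate
# files in the given directory.
parseXML("corpus-input.xml.gz", "/tmp/parse-intermediate", debug=True)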
Example #4
def run(inPath, outPath, subDirs, model, connection, numJobs, subTask=3, posTags=None, useTestSet=False, clear=True, debug=False, force=False, training=True, preprocessorSteps=None, subset=None):
    # Remove existing non-empty work directory, if requested to do so
    if os.path.exists(outPath) and len(os.listdir(outPath)) > 0 and clear:
        if force or ask("Output directory '" + outPath + "' exists, remove?"):
            print >> sys.stderr, "Output directory exists, removing", outPath
            shutil.rmtree(outPath)
    # Create work directory if needed
    if not os.path.exists(outPath):
        print >> sys.stderr, "Making output directory", outPath
        os.makedirs(outPath)
    
    # Begin logging
    logPath = beginLog(outPath)
    
    # Collect the parse files
    parseDir = os.path.join(outPath, "parses")
    if not os.path.exists(parseDir) or len(os.listdir(parseDir)) == 0:
        parseDir = combineParses(inPath, parseDir, subDirs)
    else:
        print >> sys.stderr, "Using collected parses from", parseDir
    
    # Import the parses
    corpusDir = os.path.join(outPath, "corpus")
    if not os.path.exists(corpusDir):
        if preprocessorSteps == None:
            preprocessorSteps = ["MERGE_SETS", "REMOVE_ANALYSES", "REMOVE_HEADS", "MERGE_SENTENCES", "IMPORT_PARSE", "SPLIT_NAMES", "FIND_HEADS", "DIVIDE_SETS"]
        preprocessor = Preprocessor(preprocessorSteps)
        #preprocessor = Preprocessor(["MERGE-SETS", "REMOVE-ANALYSES", "REMOVE-HEADS", "MERGE-SENTENCES", "IMPORT-PARSE", "VALIDATE", "DIVIDE-SETS"])
        preprocessor.setArgForAllSteps("debug", debug)
        preprocessor.getStep("IMPORT_PARSE").setArg("parseDir", parseDir)
        preprocessor.getStep("IMPORT_PARSE").setArg("posTags", posTags)
        modelPattern = model + ".+\.xml" if useTestSet else model + "-devel\.xml|" + model + "-train\.xml"
        preprocessor.process(modelPattern, os.path.join(corpusDir, model), logPath=None)
    else:
        print >> sys.stderr, "Using imported parses from", corpusDir
    
    # Train the model
    if training:
        connection = connection.replace("$JOBS", str(numJobs))
        if subTask > 0:
            model = model + "." + str(subTask)
        train(outPath, model, parse="McCC", debug=debug, connection=connection, corpusDir=corpusDir, subset=subset, log=None) #classifierParams={"examples":None, "trigger":"150000", "recall":None, "edge":"7500", "unmerging":"2500", "modifiers":"10000"})
        
    # Close the log
    endLog(logPath)
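
An invocation sketch for run() above. The directory layout and the connection string are assumptions; run() only requires that the string contain the "$JOBS" placeholder, which it replaces with numJobs:

# Collect and import parses without training (training=False skips train()).
run("/data/parses-in",       # inPath: where the parse files are collected from
    "/data/work-out",        # outPath: work directory, created if missing
    ["batch1", "batch2"],    # subDirs, passed through to combineParses()
    "GE11",                  # model/corpus name used in the file patterns
    "jobLimit=$JOBS",        # assumed connection string; $JOBS is substituted
    4,                       # numJobs
    training=False)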
Example #5
def convert(inPath, outDir, corpusId, directed, negatives, preprocess, preprocessorParameters=None, debug=False, clear=False, constParser="BLLIP-BIO", depParser="STANFORD-CONVERT", logging=True):
    assert negatives in ("INCLUDE", "SKIP", "REVERSE_POS")
    # Download the corpus if needed
    if inPath == None:
        if not hasattr(Settings, "SE10T8_CORPUS"):
            SemEval2010Task8Tools.install()
        inPath = Settings.SE10T8_CORPUS
    assert os.path.exists(inPath)
    # Prepare the output directory
    if not os.path.exists(outDir):
        print "Making output directory", outDir
        os.makedirs(outDir)
    elif clear:
        print "Removing output directory", outDir
        shutil.rmtree(outDir)
    # Start logging
    if logging:
        Stream.openLog(os.path.join(outDir, "log.txt"), clear=clear)
    # Read and process the corpus files
    archive = zipfile.ZipFile(inPath, 'r')
    usedIds = set()
    tree = None
    for fileName, setName in [("SemEval2010_task8_all_data/SemEval2010_task8_training/TRAIN_FILE.TXT", "train"),\
                              ("SemEval2010_task8_all_data/SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT", "test")]:
        print "Processing file", fileName, "as set", setName
        f = archive.open(fileName)
        tree = processLines(f.readlines(), setName, directed=directed, negatives=negatives, usedIds=usedIds, tree=tree, corpusId=corpusId)
        f.close()
    # Divide the training set into training and development sets
    MakeSets.processCorpus(tree, None, "train", [("train", 0.7), ("devel", 1.0)], 1)
    # Write out the converted corpus
    convertedPath = os.path.join(outDir, corpusId + "-converted.xml")
    ETUtils.write(tree.getroot(), convertedPath)
    # Preprocess the converted corpus
    if preprocess:
        outPath = os.path.join(outDir, corpusId + ".xml")
        preprocessor = Preprocessor(constParser, depParser)
        preprocessor.setArgForAllSteps("debug", debug)
        preprocessor.stepArgs("CONVERT")["corpusName"] = corpusId
        preprocessor.process(convertedPath, outPath, preprocessorParameters, omitSteps=["SPLIT-SENTENCES", "NER", "SPLIT-NAMES"])
    # Stop logging
    if logging:
        Stream.closeLog(os.path.join(outDir, "log.txt"))
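
A usage sketch for the SemEval 2010 Task 8 converter above. With inPath=None the corpus is downloaded via Settings.SE10T8_CORPUS, and preprocess=True runs the BLLIP-BIO/STANFORD-CONVERT pipeline on the converted XML:

convert(None,              # download the corpus if it is not yet configured
        "/tmp/se10t8",     # output directory
        "SE10T8",          # corpus id used in the output file names
        directed=True,
        negatives="INCLUDE",
        preprocess=True,
        clear=True)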
Example #6
#if os.path.isfile(options.input):
#    if INPUT_TAG.endswith(".tar.gz"):
#        INPUT_TAG = INPUT_TAG[:-len(".tar.gz")]
open(INPUT_TAG+"-STARTED", "w").close() # Mark process status
        
# Start logging
WORKDIR = options.workdir
if WORKDIR == None:
    WORKDIR = tempfile.mkdtemp()
workdir(WORKDIR, options.clearAll) # Select a working directory, optionally remove existing files
if not options.noLog:
    log(options.clearAll, True, INPUT_TAG + "-" + options.eventTag + ".log") # Start logging into a file in working directory
print >> sys.stderr, "Work directory at", WORKDIR

eventDetectionInput = None
preprocessor = Preprocessor()
preprocessor.debug = options.debug
preprocessor.source = options.input # This has to be defined already here, needs to be fixed later
preprocessor.compressIntermediateFiles = True # save space
preprocessor.intermediateFilesAtSource = True # create output at source file location
preprocessor.requireEntitiesForParsing = True # parse only sentences which contain BANNER entities
if selector.check("PREPROCESS"):
    if os.path.exists(preprocessor.getOutputPath("FIND-HEADS")):
        print >> sys.stderr, "Preprocessor output", preprocessor.getOutputPath("FIND-HEADS"), "exists, skipping preprocessing."
        eventDetectionInput = preprocessor.getOutputPath("FIND-HEADS")
    else:
        print >> sys.stderr, "Preprocessor output", preprocessor.getOutputPath("FIND-HEADS"), "does not exist"
        print >> sys.stderr, "------------ Preprocessing ------------"
        # Remove some of the unnecessary intermediate files
        preprocessor.setIntermediateFile("CONVERT", None)
        preprocessor.setIntermediateFile("SPLIT-SENTENCES", None)
Example #7
def combine(inputA,
            inputB,
            inputGold,
            outPath=None,
            mode="OR",
            skip=None,
            logPath="AUTO"):
    assert mode in ("AND", "OR")
    if skip != None and isinstance(skip, basestring):
        skip = set(skip.split(","))
    if skip != None:
        print "Skipping interaction types:", skip
    if logPath == "AUTO":
        if outPath != None:
            logPath = outPath.rstrip("/").rstrip("\\") + "-log.txt"
        else:
            logPath = None
    if logPath != None:
        if not os.path.exists(os.path.dirname(logPath)):
            os.makedirs(os.path.dirname(logPath))
        Stream.openLog(logPath)
    print "Loading the Interaction XML files"
    print "Loading A from", inputA
    a = ETUtils.ETFromObj(inputA)
    print "Loading B from", inputB
    b = ETUtils.ETFromObj(inputB)
    gold = None
    if inputGold:
        print "Loading gold from", inputGold
        gold = ETUtils.ETFromObj(inputGold)
    print "Copying a as template"
    template = copy.deepcopy(a)
    print "Calculating confidence score ranges"
    scoreRanges = {}
    scoreRanges["a"] = getScoreRange(a, skip)
    scoreRanges["b"] = getScoreRange(b, skip)
    print scoreRanges
    print "Combining"
    counts = defaultdict(int)
    counts["skipped"] = defaultdict(int)
    counter = ProgressCounter(len(a.findall("document")), "Combine")
    for docA, docB, docGold, docTemplate in itertools.izip_longest(
            *[x.findall("document") for x in (a, b, gold, template)]):
        counter.update()
        assert len(
            set([x.get("id")
                 for x in (docA, docB, docGold, docTemplate)])) == 1
        for sentA, sentB, sentGold, sentTemplate in itertools.izip_longest(*[
                x.findall("sentence")
                for x in (docA, docB, docGold, docTemplate)
        ]):
            assert len(
                set([
                    x.get("id") for x in (sentA, sentB, sentGold, sentTemplate)
                ])) == 1
            interactions = getInteractions(sentA, sentB, sentGold, skip,
                                           counts["skipped"])
            for interaction in sentTemplate.findall("interaction"):
                sentTemplate.remove(interaction)
            analyses = sentTemplate.find("analyses")
            if analyses is not None:  # an Element with no children is falsy
                sentTemplate.remove(analyses)
            for key in interactions:
                interaction = getCombinedInteraction(interactions[key], mode,
                                                     counts, scoreRanges)
                if interaction != None:
                    sentTemplate.append(copy.deepcopy(interaction))
            if analyses is not None:
                sentTemplate.append(analyses)
    counts["skipped"] = dict(counts["skipped"])
    print "Counts:", dict(counts)
    if gold != None:
        print "****** Evaluating A ******"
        evaluateChemProt(
            a, gold
        )  #EvaluateIXML.run(AveragingMultiClassEvaluator, a, gold, "McCC")
        print "****** Evaluating B ******"
        evaluateChemProt(
            b, gold
        )  #EvaluateIXML.run(AveragingMultiClassEvaluator, b, gold, "McCC")
        print "****** Evaluating Combined ******"
        evaluateChemProt(
            template, gold
        )  #EvaluateIXML.run(AveragingMultiClassEvaluator, template, gold, "McCC")
    if outPath != None:
        print "Writing output to", outPath
        if outPath.endswith(".tsv"):
            Preprocessor(steps=["EXPORT_CHEMPROT"]).process(template, outPath)
        else:
            ETUtils.write(template, outPath)
    if logPath != None:
        Stream.closeLog(logPath)
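
A hedged call sketch for combine() above (file names illustrative). The mode argument must be "AND" or "OR"; with "OR" an interaction is kept when either input predicted it:

combine("system-a-pred.xml.gz",
        "system-b-pred.xml.gz",
        "chemprot-devel-gold.xml.gz",   # enables the three evaluation runs
        outPath="combined-pred.xml.gz", # use a .tsv suffix for ChemProt TSV export
        mode="OR")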
Example #8
def parseXML(xml,
             outStem,
             intermediateFiles=True,
             debug=False,
             bbResources=False):
    preprocessor = Preprocessor()
    if bbResources:
        preprocessor.insertStep(5, "BB_RESOURCES", insertResources.process, {},
                                "bb-resources.xml")
    preprocessor.setArgForAllSteps("debug", debug)
    preprocessor.stepArgs("PARSE")["requireEntities"] = False
    if not intermediateFiles:
        preprocessor.setNoIntermediateFiles()
    preprocessor.process(xml, outStem, omitSteps=["NER", "DIVIDE-SETS"])
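
A usage sketch for this parseXML() variant. With bbResources=True the Bacteria Biotopes resource step is inserted at position 5 of the pipeline, as shown above; the file names are illustrative:

# Parse a corpus and write output files named outStem-*, skipping NER and
# set division; keep per-step intermediate files for inspection.
parseXML("BB13-corpus.xml", "/tmp/bb13/parsed",
         intermediateFiles=True, debug=True, bbResources=True)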
Example #9
                     help="AUTO, None, or a path")
    #debug.add_option("--intermediateFiles", default=False, action="store_true", dest="intermediateFiles", help="Save an intermediate file for each step")
    debug.add_option("--debug",
                     default=False,
                     action="store_true",
                     dest="debug",
                     help="Set debug mode for all steps")
    optparser.add_option_group(debug)
    (options, args) = optparser.parse_args()

    #     if options.steps != None:
    #         options.steps = [x.strip() for x in options.steps.split(",")]
    #     if options.omitSteps != None:
    #         options.omitSteps = options.omitSteps.split(",")
    #
    preprocessor = Preprocessor(options.steps, options.parseName,
                                options.requireEntities)
    if options.steps == None:
        print >> sys.stderr, preprocessor.getHelpString()
    else:
        preprocessor.setArgForAllSteps("debug", options.debug)
        if preprocessor.hasStep("CONVERT"):
            if options.corpus != None:
                preprocessor.getStep("CONVERT").setArg("corpusName",
                                                       options.corpus)
            if options.dataSetNames != None:
                preprocessor.getStep("CONVERT").setArg("dataSetNames",
                                                       options.dataSetNames)
        if options.parseDir:
            preprocessor.getStep("IMPORT_PARSE").setArg(
                "parseDir", options.parseDir)
        if options.exportFormats and preprocessor.hasStep("EXPORT"):
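
The option wiring above maps onto a direct Preprocessor construction. A hedged sketch of the equivalent programmatic calls, using step names that appear elsewhere on this page (the import path is an assumption):

from Detectors.Preprocessor import Preprocessor  # assumed import path

preprocessor = Preprocessor(steps=["LOAD", "GENIA_SPLITTER", "BANNER",
                                   "BLLIP_BIO", "STANFORD_CONVERT",
                                   "SPLIT_NAMES", "FIND_HEADS", "SAVE"])
preprocessor.setArgForAllSteps("debug", True)
if preprocessor.hasStep("CONVERT"):
    preprocessor.getStep("CONVERT").setArg("corpusName", "MYCORPUS")
preprocessor.process("input.xml", "output.xml")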
Example #10
def train(output,
          task=None,
          detector=None,
          inputFiles=None,
          models=None,
          parse=None,
          processUnmerging=None,
          processModifiers=None,
          bioNLPSTParams=None,
          preprocessorParams=None,
          exampleStyles=None,
          classifierParams=None,
          doFullGrid=False,
          deleteOutput=False,
          copyFrom=None,
          log="log.txt",
          step=None,
          omitSteps=None,
          debug=False,
          connection=None,
          subset=None,
          folds=None):
    """
    Train a new model for event or relation detection.
    
    @param output: A directory where output files will appear.
    @param task: If defined, overridable default settings are used for many of the training parameters. Must be one of the supported TEES tasks.
    @param detector: A Detector object, or a string defining one to be imported
    @param inputFiles: A dictionary of file names, with keys "train", "devel", and "test"
    @param models: A dictionary of file names defining the place for the new models, with keys "devel" and "test"
    @param parse: The parse element name in the training interaction XML
    @param processUnmerging: Use the unmerging step of EventDetector. True, False or None for task default.
    @param processModifiers: Use the modifier detection step of EventDetector. True, False or None for task default.
    @param bioNLPSTParams: Parameters controlling BioNLP ST format output.
    @param preprocessorParams: Parameters controlling the preprocessor. Not used for training, but saved to the model for use when classifying.
    @param exampleStyles: A parameter set for controlling example builders.
    @param classifierParams: A parameter set for controlling classifiers.
    @param doFullGrid: Whether all parameters, as opposed to just recall adjustment, are tested in the EventDetector grid search.
    @param deleteOutput: Remove an existing output directory
    @param copyFrom: Copy an existing output directory for use as a template
    @param log: An optional alternative name for the log file. Use None for no logging.
    @param step: A step=substep pair, where the steps are "TRAIN", "DEVEL", "EMPTY" and "TEST"
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param connection: A parameter set defining a local or remote connection for training the classifier
    @param subset: A parameter set for making subsets of input files
    """
    # Insert default arguments where needed
    inputFiles = setDictDefaults(inputFiles, {
        "train": None,
        "devel": None,
        "test": None
    })
    models = setDictDefaults(models, {"devel": None, "test": None})
    exampleStyles = setDictDefaults(
        exampleStyles, {
            "examples": None,
            "trigger": None,
            "edge": None,
            "unmerging": None,
            "modifiers": None
        })
    classifierParams = setDictDefaults(
        classifierParams, {
            "examples": None,
            "trigger": None,
            "recall": None,
            "edge": None,
            "unmerging": None,
            "modifiers": None
        })
    subset = setDictDefaults(Parameters.get(subset), {
        "train": None,
        "devel": None,
        "test": None,
        "seed": 0,
        "all": None
    })
    folds = setDictDefaults(folds, {
        "train": None,
        "devel": None,
        "test": None
    })
    processUnmerging = getDefinedBool(processUnmerging)
    processModifiers = getDefinedBool(processModifiers)
    # Initialize working directory
    workdir(output, deleteOutput, copyFrom, log)
    # Get task specific parameters
    detector, bioNLPSTParams, preprocessorParams = getTaskSettings(
        task, detector, bioNLPSTParams, preprocessorParams, inputFiles,
        exampleStyles, classifierParams)
    # Learn training settings from input files
    detector = learnSettings(inputFiles, detector, classifierParams)
    # Get corpus subsets
    getFolds(inputFiles, folds)
    getSubsets(inputFiles, subset)
    if task != None:
        task = task.replace("-FULL", "")
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(
        step, omitSteps, ["TRAIN", "DEVEL", "EMPTY", "TEST"])

    # Initialize the detector
    detector, detectorName = getDetector(detector)
    detector = detector()  # initialize object
    detector.debug = debug
    detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(
        bioNLPSTParams)
    #detector.useBioNLPSTFormat = useBioNLPSTFormat # classify-output and grid evaluation in ST-format
    #detector.stWriteScores = True # write confidence scores into additional st-format files
    connection = getConnection(connection)
    detector.setConnection(connection)
    connection.debug = debug
    if deleteOutput:
        connection.clearWorkDir()

    # Train
    if selector.check("TRAIN"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------------ Train Detector ------------------"
        print >> sys.stderr, "----------------------------------------------------"
        if isinstance(detector, SingleStageDetector):
            detector.train(inputFiles["train"],
                           inputFiles["devel"],
                           models["devel"],
                           models["test"],
                           exampleStyles["examples"],
                           classifierParams["examples"],
                           parse,
                           None,
                           task,
                           fromStep=detectorSteps["TRAIN"],
                           workDir="training")
        else:
            detector.train(inputFiles["train"],
                           inputFiles["devel"],
                           models["devel"],
                           models["test"],
                           exampleStyles["trigger"],
                           exampleStyles["edge"],
                           exampleStyles["unmerging"],
                           exampleStyles["modifiers"],
                           classifierParams["trigger"],
                           classifierParams["edge"],
                           classifierParams["unmerging"],
                           classifierParams["modifiers"],
                           classifierParams["recall"],
                           processUnmerging,
                           processModifiers,
                           doFullGrid,
                           task,
                           parse,
                           None,
                           fromStep=detectorSteps["TRAIN"],
                           workDir="training")
        # Save the detector type
        for model in [models["devel"], models["test"]]:
            if model != None and os.path.exists(model):
                model = Model(model, "a")
                model.addStr("detector", detectorName)
                if preprocessorParams != None:
                    preprocessor = Preprocessor()
                    model.addStr(
                        "preprocessorParams",
                        Parameters.toString(
                            preprocessor.getParameters(preprocessorParams)))
                model.save()
                model.close()
    if selector.check("DEVEL"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Check devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
        detector.classify(inputFiles["devel"],
                          models["devel"],
                          "classification-devel/devel",
                          goldData=inputFiles["devel"],
                          fromStep=detectorSteps["DEVEL"],
                          workDir="classification-devel")
    if selector.check("EMPTY"):
        # By passing an emptied devel set through the prediction system, we can check that we get the same predictions
        # as in the DEVEL step, ensuring the model does not use leaked information.
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Empty devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
        detector.classify(getEmptyCorpus(
            inputFiles["devel"],
            removeNames=("names" in str(exampleStyles["examples"])
                         or "names" in str(exampleStyles["trigger"]))),
                          models["devel"],
                          "classification-empty/devel-empty",
                          fromStep=detectorSteps["EMPTY"],
                          workDir="classification-empty")
    if selector.check("TEST"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------- Test set classification --------------"
        print >> sys.stderr, "----------------------------------------------------"
        if inputFiles["test"] == None or not os.path.exists(
                inputFiles["test"]):
            print >> sys.stderr, "Skipping, test file", inputFiles[
                "test"], "does not exist"
        else:
            #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
            detector.classify(inputFiles["test"],
                              models["test"],
                              "classification-test/test",
                              fromStep=detectorSteps["TEST"],
                              workDir="classification-test")
            if detector.bioNLPSTParams["convert"]:
                Utils.STFormat.Compare.compare(
                    "classification-test/test-events.tar.gz",
                    "classification-devel/devel-events.tar.gz", "a2")
Example #11
def train(output, task=None, detector=None, inputFiles=None, models=None, parse=None,
          processUnmerging=None, processModifiers=None, 
          bioNLPSTParams=None, preprocessorParams=None, exampleStyles=None, 
          classifierParams=None,  doFullGrid=False, deleteOutput=False, copyFrom=None, 
          log="log.txt", step=None, omitSteps=None, debug=False, connection=None, subset=None, 
          folds=None, corpusDir=None, corpusPreprocessing=None, evaluator=None):
    """
    Train a new model for event or relation detection.
    
    @param output: A directory where output files will appear.
    @param task: If defined, overridable default settings are used for many of the training parameters. Must be one of the supported TEES tasks.
    @param detector: A Detector object, or a string defining one to be imported
    @param inputFiles: A dictionary of file names, with keys "train", "devel", and "test"
    @param models: A dictionary of file names defining the place for the new models, with keys "devel" and "test"
    @param parse: The parse element name in the training interaction XML
    @param processUnmerging: Use the unmerging step of EventDetector. True, False or None for task default.
    @param processModifiers: Use the modifier detection step of EventDetector. True, False or None for task default.
    @param bioNLPSTParams: Parameters controlling BioNLP ST format output.
    @param preprocessorParams: Parameters controlling the preprocessor. Not used for training, but saved to the model for use when classifying.
    @param exampleStyles: A parameter set for controlling example builders.
    @param classifierParams: A parameter set for controlling classifiers.
    @param doFullGrid: Whether all parameters, as opposed to just recall adjustment, are tested in the EventDetector grid search.
    @param deleteOutput: Remove an existing output directory
    @param copyFrom: Copy an existing output directory for use as a template
    @param log: An optional alternative name for the log file. Use None for no logging.
    @param step: A step=substep pair, where the steps are "TRAIN", "DEVEL", "EMPTY" and "TEST"
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param connection: A parameter set defining a local or remote connection for training the classifier
    @param subset: A parameter set for making subsets of input files
    """
    # Insert default arguments where needed
    inputFiles = setDictDefaults(inputFiles, {"train":None, "devel":None, "test":None})
    models = setDictDefaults(models, {"devel":"model-devel", "test":"model-test"})
    exampleStyles = setDictDefaults(exampleStyles, {"examples":None, "trigger":None, "edge":None, "unmerging":None, "modifiers":None})
    classifierParams = setDictDefaults(classifierParams, {"examples":None, "trigger":None, "recall":None, "edge":None, "unmerging":None, "modifiers":None})
    subset = setDictDefaults(Parameters.get(subset), {"train":None, "devel":None, "test":None, "seed":0, "all":None})
    folds = setDictDefaults(folds, {"train":None, "devel":None, "test":None})
    processUnmerging = getDefinedBool(processUnmerging)
    processModifiers = getDefinedBool(processModifiers)
    # Initialize working directory
    workdir(output, deleteOutput, copyFrom, log)
    # Get task specific parameters
    useKerasDetector = False
    if detector != None and "keras" in detector.lower():
        print >> sys.stderr, "Using a Keras Detector"
        useKerasDetector = True
        if detector.lower() == "keras":
            detector = None
    detector, bioNLPSTParams, preprocessorParams, folds = getTaskSettings(task, detector, 
        bioNLPSTParams, preprocessorParams, inputFiles, exampleStyles, classifierParams, folds, corpusDir=corpusDir, useKerasDetector=useKerasDetector)
    # Learn training settings from input files
    detector = learnSettings(inputFiles, detector, classifierParams, task, exampleStyles, useKerasDetector=useKerasDetector)   
    # Get corpus subsets   
    getFolds(inputFiles, folds)
    getSubsets(inputFiles, subset)
    if task != None:
        task = task.replace("-FULL", "")
        if "." in task:
            _, subTask = getSubTask(task)
            if subTask != 3:
                processModifiers = False
    # Preprocess the corpus if required
    if corpusPreprocessing != None:
        preprocessor = Preprocessor(steps=corpusPreprocessing)
        assert preprocessor.steps[0].name == "MERGE_SETS"
        assert preprocessor.steps[-1].name == "DIVIDE_SETS"
        preprocessedCorpusDir = os.path.join(output, "corpus")
        #outputFiles = {x:os.path.join(preprocessedCorpusDir, os.path.basename(inputFiles[x])) for x in inputFiles}
        preprocessor.process(inputFiles, os.path.join(preprocessedCorpusDir, task))
        #inputFiles = outputFiles
        for setName in inputFiles.keys():
            if inputFiles[setName] != None:
                inputFiles[setName] = os.path.join(preprocessedCorpusDir, task + "-" + setName + ".xml")
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(step, omitSteps, ["TRAIN", "DEVEL", "EMPTY", "TEST"])
    
    # Initialize the detector
    detector, detectorName = getDetector(detector, evaluator=evaluator)
    evaluator, evaluatorName = importClass(evaluator, "evaluator")
    detector = detector() # initialize object
    if evaluator != None:
        print >> sys.stderr, "Using evaluator", evaluator.__name__
        detector.evaluator = evaluator
    detector.debug = debug
    detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(bioNLPSTParams)
    #detector.useBioNLPSTFormat = useBioNLPSTFormat # classify-output and grid evaluation in ST-format
    #detector.stWriteScores = True # write confidence scores into additional st-format files
    connection = getConnection(connection)
    detector.setConnection(connection)
    connection.debug = debug
    if deleteOutput:
        connection.clearWorkDir()
    
    # Train
    if selector.check("TRAIN"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------------ Train Detector ------------------"
        print >> sys.stderr, "----------------------------------------------------"
        if not isinstance(detector, EventDetector):
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["examples"], classifierParams["examples"], parse, None, task,
                           fromStep=detectorSteps["TRAIN"], workDir="training", testData=inputFiles["test"])
        else:
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["trigger"], exampleStyles["edge"], exampleStyles["unmerging"], exampleStyles["modifiers"],
                           classifierParams["trigger"], classifierParams["edge"], classifierParams["unmerging"], classifierParams["modifiers"],
                           classifierParams["recall"], processUnmerging, processModifiers, 
                           doFullGrid, task, parse, None,
                           fromStep=detectorSteps["TRAIN"], workDir="training", testData=inputFiles["test"])
        # Save the detector type
        for model in [models["devel"], models["test"]]:
            if model != None and os.path.exists(model):
                model = Model(model, "a")
                model.addStr("detector", detectorName)
                if evaluatorName != None:
                    model.addStr("detector", evaluatorName)
                if preprocessorParams != None:
                    preprocessor = Preprocessor()
                    model.addStr("preprocessorParams", Parameters.toString(preprocessor.getParameters(preprocessorParams)))
                model.save()
                model.close()
    if selector.check("DEVEL"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Check devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
        detector.classify(inputFiles["devel"], models["devel"], "classification-devel/devel", goldData=inputFiles["devel"], fromStep=detectorSteps["DEVEL"], workDir="classification-devel")
    if selector.check("EMPTY"):
        # By passing an emptied devel set through the prediction system, we can check that we get the same predictions
        # as in the DEVEL step, ensuring the model does not use leaked information.
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Empty devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
        removalScope = "non-given"
        if "names" in str(exampleStyles["examples"]) or "names" in str(exampleStyles["trigger"]):
            removalScope = "all"
        elif "Edge" in detector.__class__.__name__:
            removalScope = "interactions"
        detector.classify(getEmptyCorpus(inputFiles["devel"], scope=removalScope), models["devel"], "classification-empty/devel-empty", fromStep=detectorSteps["EMPTY"], workDir="classification-empty")
        print >> sys.stderr, "*** Evaluate empty devel classification ***"
        if os.path.exists("classification-empty/devel-empty-pred.xml.gz"):
            EvaluateInteractionXML.run(detector.evaluator, "classification-empty/devel-empty-pred.xml.gz", inputFiles["devel"], parse)
        else:
            print >> sys.stderr, "No output file for evaluation"
    if selector.check("TEST"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------- Test set classification --------------"
        print >> sys.stderr, "----------------------------------------------------"
        if inputFiles["test"] == None or not os.path.exists(inputFiles["test"]):
            print >> sys.stderr, "Skipping, test file", inputFiles["test"], "does not exist"
        else:
            #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
            detector.classify(inputFiles["test"], models["test"] if models["test"] != None else models["devel"], "classification-test/test", fromStep=detectorSteps["TEST"], workDir="classification-test")
            if detector.bioNLPSTParams["convert"]:
                extension = ".zip" if (detector.bioNLPSTParams["convert"] == "zip") else ".tar.gz" 
                Utils.STFormat.Compare.compare("classification-test/test-events" + extension, "classification-devel/devel-events" + extension, "a2")
    # Stop logging
    if log != None:
        Stream.closeLog(log)
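
This train() variant can also preprocess the corpus in-flight via corpusPreprocessing. A hedged sketch of a step list that satisfies the assertions above (the step names between the asserted first and last entries are illustrative, taken from the step lists elsewhere on this page):

# The list must start with MERGE_SETS and end with DIVIDE_SETS, as
# asserted in the function body above.
corpusSteps = ["MERGE_SETS", "REMOVE_ANALYSES", "MERGE_SENTENCES",
               "IMPORT_PARSE", "FIND_HEADS", "DIVIDE_SETS"]
train("/tmp/ge11-train", task="GE11", corpusPreprocessing=corpusSteps)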
Example #12
    def toStringConcise(self, indent="", title=None):
        return self.internal.toStringConcise(indent, title)
        
    def toDict(self):
        return {x:self.results[x] for x in self.results}

if __name__=="__main__":
    from optparse import OptionParser
    optparser = OptionParser(usage="%prog [options]\nCalculate f-score and other statistics.")
    optparser.add_option("-e", "--examples", default=None, dest="examples", help="", metavar="FILE")
    optparser.add_option("-p", "--predictions", default=None, dest="predictions", help="", metavar="FILE")
    optparser.add_option("-c", "--classSet", default=None, dest="classSet", help="", metavar="FILE")
    optparser.add_option("-d", "--dataSet", default="devel", dest="dataSet", help="", metavar="FILE")
    (options, args) = optparser.parse_args()
    
    assert options.dataSet in ("devel", "test")
    options.dataSet = {"devel":"./data/chemprot_development_gold_standard.tsv", "test":"./data/chemprot_test_gold_standard.tsv"}[options.dataSet]
    
    if options.examples.endswith(".xml") or options.examples.endswith(".xml.gz"):
        preprocessor = Preprocessor(steps="EXPORT_CHEMPROT")
        tempDir = tempfile.mkdtemp()
        tsvPath = os.path.join(tempDir, os.path.basename(options.examples) + ".tsv")
        preprocessor.process(options.examples, tsvPath)
        ChemProtEvaluator().evaluateTSV(tsvPath, options.dataSet)
        shutil.rmtree(tempDir)
    if options.examples.endswith(".tsv"):
        ChemProtEvaluator().evaluateTSV(options.examples, options.dataSet)
    else:
        ev = ChemProtEvaluator(options.examples, options.predictions, options.classSet)
    #print ev.toStringConcise()
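
The fragment above is the module's command-line entry point. A programmatic equivalent of its XML branch, with illustrative file names:

# Export an interaction XML prediction file to ChemProt TSV and score it
# against the development set gold standard used above.
preprocessor = Preprocessor(steps="EXPORT_CHEMPROT")
preprocessor.process("predictions.xml.gz", "predictions.tsv")
ChemProtEvaluator().evaluateTSV("predictions.tsv",
                                "./data/chemprot_development_gold_standard.tsv")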
Example #13
def classify(input,
             model,
             output,
             workDir=None,
             step=None,
             omitSteps=None,
             goldInput=None,
             detector=None,
             debug=False,
             clear=False,
             preprocessorTag="-preprocessed.xml.gz",
             preprocessorParams=None,
             bioNLPSTParams=None):
    """
    Detect events or relations from text.
    
    @param input: The input file in either interaction XML or BioNLP ST format. Can also be a PMID or TEES default corpus name.
    @param model: A path to a model file or the name of a TEES default model.
    @param output: The output file stem. Output files will be of the form output-*
    @param workDir: If intermediate files need to be saved, they will go here.
    @param step: A step=substep pair, where the steps are PREPROCESS and CLASSIFY
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param goldInput: A version of the corpus file with gold annotation. Enables performance measurement.
    @param detector: A Detector object, or a string defining one to be imported. If None, it will be read from the model.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param clear: Remove existing workDir
    @param preprocessorTag: The preprocessor output file will be named output + preprocessorTag
    @param preprocessorParams: Optional parameters controlling preprocessing. If None, will be read from model.
    @param bioNLPSTParams: Optional parameters controlling BioNLP ST format output. If None, will be read from model.
    """
    input = os.path.abspath(input)
    if goldInput != None: goldInput = os.path.abspath(goldInput)
    if model != None: model = os.path.abspath(model)
    # Initialize working directory
    if workDir != None:  # use a permanent work directory
        workdir(workDir, clear)
    Stream.openLog(output + "-log.txt")  # log in the output directory
    # Get input files
    input, preprocess = getInput(input)
    model = getModel(model)
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(
        step, omitSteps, ["PREPROCESS", "CLASSIFY"])
    if not preprocess:
        selector.markOmitSteps("PREPROCESS")

    classifyInput = input
    if selector.check("PREPROCESS"):
        if preprocessorParams == None:
            preprocessorParams = [
                "LOAD", "GENIA_SPLITTER", "BANNER", "BLLIP_BIO",
                "STANFORD_CONVERT", "SPLIT_NAMES", "FIND_HEADS", "SAVE"
            ]
        preprocessor = Preprocessor(preprocessorParams)
        if debug:
            preprocessor.setArgForAllSteps("debug", True)
        preprocessorOutput = output + preprocessorTag
        #preprocessor.debug = debug
        #preprocessor.source = input # This has to be defined already here, needs to be fixed later
        #preprocessor.requireEntitiesForParsing = True # parse only sentences which contain named entities
        if os.path.exists(preprocessorOutput) and not clear:  #os.path.exists(preprocessor.getOutputPath("FIND-HEADS")):
            #print >> sys.stderr, "Preprocessor output", preprocessor.getOutputPath("FIND-HEADS"), "exists, skipping preprocessing."
            print >> sys.stderr, "Preprocessor output", preprocessorOutput, "exists, skipping preprocessing."
            classifyInput = preprocessorOutput  # preprocessor.getOutputPath("FIND-HEADS")
        else:
            #print >> sys.stderr, "Preprocessor output", preprocessor.getOutputPath("FIND-HEADS"), "does not exist"
            print >> sys.stderr, "Preprocessor output", preprocessorOutput, "does not exist"
            print >> sys.stderr, "------------ Preprocessing ------------"
            # Remove some of the unnecessary intermediate files
            #preprocessor.setIntermediateFiles({"Convert":None, "SPLIT-SENTENCES":None, "PARSE":None, "CONVERT-PARSE":None, "SPLIT-NAMES":None})
            # Process input into interaction XML
            classifyInput = preprocessor.process(input, preprocessorOutput,
                                                 model)

    if selector.check("CLASSIFY"):
        detector = getDetector(detector,
                               model)[0]()  # initialize detector object
        detector.debug = debug
        detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(
            bioNLPSTParams, model)
        detector.classify(classifyInput,
                          model,
                          output,
                          goldData=goldInput,
                          fromStep=detectorSteps["CLASSIFY"],
                          omitSteps=omitDetectorSteps["CLASSIFY"],
                          workDir=workDir)
Example #14
File: train.py Project: ninjin/TEES
def train(output, task=None, detector=None, inputFiles=None, models=None, parse=None,
          processUnmerging=None, processModifiers=None, isSingleStage=False, 
          bioNLPSTParams=None, preprocessorParams=None, exampleStyles=None, 
          classifierParams=None,  doFullGrid=False, deleteOutput=False, copyFrom=None, 
          log="log.txt", step=None, omitSteps=None, debug=False, connection=None):
    """
    Train a new model for event or relation detection.
    
    @param output: A directory where output files will appear.
    @param task: If defined, overridable default settings are used for many of the training parameters. Must be one of the supported TEES tasks.
    @param detector: A Detector object, or a string defining one to be imported
    @param inputFiles: A dictionary of file names, with keys "train", "devel", and "test"
    @param models: A dictionary of file names defining the place for the new models, with keys "devel" and "test"
    @param parse: The parse element name in the training interaction XML
    @param processUnmerging: Use the unmerging step of EventDetector. True, False or None for task default.
    @param processModifiers: Use the modifier detection step of EventDetector. True, False or None for task default.
    @param isSingleStage: False for EventDetector, True for a single stage detector.
    @param bioNLPSTParams: Parameters controlling BioNLP ST format output.
    @param preprocessorParams: Parameters controlling the preprocessor. Not used for training, but saved to the model for use when classifying.
    @param exampleStyles: A parameter set for controlling example builders.
    @param classifierParams: A parameter set for controlling classifiers.
    @param doFullGrid: Whether all parameters, as opposed to just recall adjustment, are tested in the EventDetector grid search.
    @param deleteOutput: Remove an existing output directory
    @param copyFrom: Copy an existing output directory for use as a template
    @param log: An optional alternative name for the log file. Use None for no logging.
    @param step: A step=substep pair, where the steps are "TRAIN", "DEVEL", "EMPTY" and "TEST"
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param connection: A parameter set defining a local or remote connection for training the classifier
    """
    # Insert default arguments where needed
    inputFiles = Parameters.get(inputFiles, {"train":None, "devel":None, "test":None})
    models = Parameters.get(models, {"devel":None, "test":None})
    exampleStyles = Parameters.get(exampleStyles, {"examples":None, "trigger":None, "edge":None, "unmerging":None, "modifiers":None})
    classifierParams = Parameters.get(classifierParams, {"examples":None, "trigger":None, "recall":None, "edge":None, "unmerging":None, "modifiers":None})
    processUnmerging = getDefinedBool(processUnmerging)
    processModifiers = getDefinedBool(processModifiers)
    # Initialize working directory
    workdir(output, deleteOutput, copyFrom, log)
    # Get task specific parameters
    detector, processUnmerging, processModifiers, isSingleStage, bioNLPSTParams, preprocessorParams, exampleStyles, classifierParams, removeNamesFromEmpty = getTaskSettings(task, 
        detector, processUnmerging, processModifiers, isSingleStage, bioNLPSTParams, preprocessorParams, inputFiles, exampleStyles, classifierParams)   
    if task != None: task = task.replace("-MINI", "").replace("-FULL", "")
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(step, omitSteps, ["TRAIN", "DEVEL", "EMPTY", "TEST"])
    
    # Initialize the detector
    detector, detectorName = getDetector(detector)
    detector = detector() # initialize object
    detector.debug = debug
    detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(bioNLPSTParams)
    #detector.useBioNLPSTFormat = useBioNLPSTFormat # classify-output and grid evaluation in ST-format
    #detector.stWriteScores = True # write confidence scores into additional st-format files
    connection = getConnection(connection)
    detector.setConnection(connection)
    connection.debug = debug
    if deleteOutput:
        connection.clearWorkDir()
    
    # Train
    if selector.check("TRAIN"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------------ Train Detector ------------------"
        print >> sys.stderr, "----------------------------------------------------"
        if isSingleStage:
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["examples"], classifierParams["examples"], parse, None, task,
                           fromStep=detectorSteps["TRAIN"], workDir="training")
        else:
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["trigger"], exampleStyles["edge"], exampleStyles["unmerging"], exampleStyles["modifiers"],
                           classifierParams["trigger"], classifierParams["edge"], classifierParams["unmerging"], classifierParams["modifiers"],
                           classifierParams["recall"], processUnmerging, processModifiers, 
                           doFullGrid, task, parse, None,
                           fromStep=detectorSteps["TRAIN"], workDir="training")
        # Save the detector type
        for modelPath in [models["devel"], models["test"]]:
            if os.path.exists(modelPath):
                model = Model(modelPath, "a") # open the model in append mode
                model.addStr("detector", detectorName)
                if preprocessorParams != None:
                    preprocessor = Preprocessor()
                    model.addStr("preprocessorParams", Parameters.toString(preprocessor.getParameters(preprocessorParams)))
                model.save()
                model.close()
    if selector.check("DEVEL"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Check devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        detector.classify(inputFiles["devel"], models["devel"], "classification-devel/devel", goldData=inputFiles["devel"], fromStep=detectorSteps["DEVEL"], workDir="classification-devel")
    if selector.check("EMPTY"):
        # By passing an emptied devel set through the prediction system, we can check that we get the same predictions
        # as in the DEVEL step, ensuring the model does not use leaked information.
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Empty devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        detector.classify(getEmptyCorpus(inputFiles["devel"], removeNames=removeNamesFromEmpty), models["devel"], "classification-empty/devel-empty", fromStep=detectorSteps["EMPTY"], workDir="classification-empty")
    if selector.check("TEST"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------- Test set classification --------------"
        print >> sys.stderr, "----------------------------------------------------"
        if inputFiles["test"] == None or not os.path.exists(inputFiles["test"]):
            print >> sys.stderr, "Skipping, test file", inputFiles["test"], "does not exist"
        else:
            detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
            detector.classify(inputFiles["test"], models["test"], "classification-test/test", fromStep=detectorSteps["TEST"], workDir="classification-test")
            if detector.bioNLPSTParams["convert"]:
                Utils.STFormat.Compare.compare("classification-test/test-events.tar.gz", "classification-devel/devel-events.tar.gz", "a2")
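# --- Usage sketch (not part of the original example) ---
# The docstring above defines the full train() interface; the following is a
# minimal, hypothetical invocation. The task name "GE09", the parse name "McCC",
# and all file paths are placeholder assumptions, not values taken from this code.
train(output="ge09-training",                 # directory where output files will appear
      task="GE09",                            # assumed name; must be a supported TEES task
      inputFiles={"train": "train.xml", "devel": "devel.xml", "test": None},
      models={"devel": "model-devel", "test": "model-test"},
      parse="McCC",                           # assumed parse element name in the interaction XML
      log="train-log.txt")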
Example #19
0
                         metavar="FILE")
    optparser.add_option("-d",
                         "--dataSet",
                         default="devel",
                         dest="dataSet",
                         help="",
                         metavar="FILE")
    (options, args) = optparser.parse_args()

    assert options.dataSet in ("devel", "test")
    options.dataSet = {
        "devel": "./data/chemprot_development_gold_standard.tsv",
        "test": "./data/chemprot_test_gold_standard.tsv"
    }[options.dataSet]

    if options.examples.endswith(".xml") or options.examples.endswith(
            ".xml.gz"):
        preprocessor = Preprocessor(steps="EXPORT_CHEMPROT")
        tempDir = tempfile.mkdtemp()
        tsvPath = os.path.join(tempDir,
                               os.path.basename(options.examples) + ".tsv")
        preprocessor.process(options.examples, tsvPath)
        ChemProtEvaluator().evaluateTSV(tsvPath, options.dataSet)
        shutil.rmtree(tempDir)
    elif options.examples.endswith(".tsv"):  # elif, so XML input does not also fall into the else branch below
        ChemProtEvaluator().evaluateTSV(options.examples, options.dataSet)
    else:
        ev = ChemProtEvaluator(options.examples, options.predictions,
                               options.classSet)
    #print ev.toStringConcise()
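# --- Usage sketch (not part of the original example) ---
# A minimal sketch of driving the same evaluation path directly, reusing only the
# calls visible above (Preprocessor(steps="EXPORT_CHEMPROT"), process(), and
# ChemProtEvaluator().evaluateTSV()); the file names are placeholders. Unlike the
# example above, the try/finally removes the temporary directory even if
# evaluation raises an exception.
import os, shutil, tempfile

tempDir = tempfile.mkdtemp()
try:
    tsvPath = os.path.join(tempDir, "predictions.tsv")  # placeholder name
    Preprocessor(steps="EXPORT_CHEMPROT").process("predictions.xml.gz", tsvPath)
    ChemProtEvaluator().evaluateTSV(tsvPath, "./data/chemprot_development_gold_standard.tsv")
finally:
    shutil.rmtree(tempDir)  # clean up the temporary directory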
Example #20
0
    debug = OptionGroup(optparser, "Debug and Process Control Options", "")
    debug.add_option("--logPath", default="AUTO", dest="logPath", help="AUTO, None, or a path")
    debug.add_option("--debug", default=False, action="store_true", dest="debug", help="Set debug mode for all steps")
    optparser.add_option_group(debug)
    (options, args) = optparser.parse_args()
    
    # Build the preprocessing pipeline from the requested steps
    preprocessor = Preprocessor(options.steps, options.parseName, options.requireEntities)
    if options.steps == None:
        print >> sys.stderr, preprocessor.getHelpString()
    else:
        preprocessor.setArgForAllSteps("debug", options.debug)
        if preprocessor.hasStep("CONVERT"):
            if options.corpus != None:
                preprocessor.getStep("CONVERT").setArg("corpusName", options.corpus)
            if options.dataSetNames != None:
                preprocessor.getStep("CONVERT").setArg("dataSetNames", options.dataSetNames)
        if options.parseDir and preprocessor.hasStep("IMPORT_PARSE"):
            preprocessor.getStep("IMPORT_PARSE").setArg("parseDir", options.parseDir)
        if options.exportFormats and preprocessor.hasStep("EXPORT"):
            preprocessor.getStep("EXPORT").setArg("formats", options.exportFormats.split(","))
        if options.importFormats:
            if preprocessor.hasStep("LOAD"):