def evaluateChemProt(xml, gold):
    """Evaluate ChemProt predictions in interaction XML format against a gold corpus.

    Runs the generic interaction XML evaluation first, then exports the
    predictions to the ChemProt TSV format in a temporary directory and runs
    the ChemProt-specific evaluator on the exported file.

    @param xml: predicted corpus (interaction XML object or path)
    @param gold: gold corpus for the generic evaluation
    """
    # Generic multiclass evaluation on the interaction XML level
    EvaluateIXML.run(AveragingMultiClassEvaluator, xml, gold, "McCC")
    # Export the predictions as ChemProt TSV into a scratch directory
    workDir = tempfile.mkdtemp()
    print >> sys.stderr, "Using temporary evaluation directory", workDir
    exporter = Preprocessor(steps=["EXPORT_CHEMPROT"])
    predictionsPath = os.path.join(workDir, "predictions.tsv")
    exporter.process(xml, predictionsPath)
    # Task-specific evaluation on the exported TSV
    ChemProtEvaluator().evaluateTSV(predictionsPath, workDir)
    print >> sys.stderr, "Removing temporary evaluation directory", workDir
    shutil.rmtree(workDir)
def parseXML(xml, intermediateFileDir, debug=False):
    """Run the parsing stages of the preprocessor on an interaction XML corpus.

    @param xml: the corpus to process (interaction XML object or path)
    @param intermediateFileDir: directory/stem for the preprocessor's output files
    @param debug: if True, enable debug mode for every preprocessing step
    """
    prep = Preprocessor()
    prep.setArgForAllSteps("debug", debug)
    # Parse all sentences, not only those that contain annotated entities
    prep.stepArgs("PARSE")["requireEntities"] = False
    # Entity name splitting is omitted as this data may be used for predicting entities
    skipped = ["CONVERT", "SPLIT-SENTENCES", "NER", "SPLIT-NAMES", "DIVIDE-SETS"]
    prep.process(xml, intermediateFileDir, omitSteps=skipped)
def parseXML(xml, outStem, intermediateFiles=True, debug=False, bbResources=False):
    """Preprocess and parse a corpus, optionally adding Bacteria Biotopes resources.

    @param xml: the corpus to process (interaction XML object or path)
    @param outStem: output stem for the preprocessor's files
    @param intermediateFiles: if False, do not save per-step intermediate files
    @param debug: if True, enable debug mode for every preprocessing step
    @param bbResources: if True, insert the BB resource annotation step
    """
    prep = Preprocessor()
    if bbResources:
        # The resource step is inserted before debug flags are applied so that
        # it, too, receives the debug setting below.
        prep.insertStep(5, "BB_RESOURCES", insertResources.process, {}, "bb-resources.xml")
    prep.setArgForAllSteps("debug", debug)
    # Parse all sentences, not only those that contain annotated entities
    prep.stepArgs("PARSE")["requireEntities"] = False
    if not intermediateFiles:
        prep.setNoIntermediateFiles()
    prep.process(xml, outStem, omitSteps=["NER", "DIVIDE-SETS"])
metavar="FILE")
# NOTE(review): the line above is the tail of an optparser.add_option call whose
# beginning is outside this view.
optparser.add_option("-d", "--dataSet", default="devel", dest="dataSet", help="", metavar="FILE")
(options, args) = optparser.parse_args()

# Map the symbolic data set name to the path of its gold standard TSV file
assert options.dataSet in ("devel", "test")
options.dataSet = {"devel": "./data/chemprot_development_gold_standard.tsv",
                   "test": "./data/chemprot_test_gold_standard.tsv"}[options.dataSet]
if options.examples.endswith(".xml") or options.examples.endswith(".xml.gz"):
    # Interaction XML input: export to ChemProt TSV in a temp dir, then evaluate
    preprocessor = Preprocessor(steps="EXPORT_CHEMPROT")
    tempDir = tempfile.mkdtemp()
    tsvPath = os.path.join(tempDir, os.path.basename(options.examples) + ".tsv")
    preprocessor.process(options.examples, tsvPath)
    ChemProtEvaluator().evaluateTSV(tsvPath, options.dataSet)
    shutil.rmtree(tempDir)
if options.examples.endswith(".tsv"):
    # TSV input: evaluate directly
    ChemProtEvaluator().evaluateTSV(options.examples, options.dataSet)
else:
    # Fallback: evaluate from example/prediction files
    # NOTE(review): an .xml input also falls into this else-branch because the
    # two ifs are not chained — presumably intentional, but worth confirming.
    ev = ChemProtEvaluator(options.examples, options.predictions, options.classSet)
    #print ev.toStringConcise()
def classify(input, model, output, workDir=None, step=None, omitSteps=None,
             goldInput=None, detector=None, debug=False, clear=False,
             preprocessorTag="-preprocessed.xml.gz", preprocessorParams=None,
             bioNLPSTParams=None):
    """
    Detect events or relations from text.

    @param input: The input file in either interaction XML or BioNLP ST format. Can also be a PMID or TEES default corpus name.
    @param model: A path to a model file or the name of a TEES default model.
    @param output: The output file stem. Output files will be of the form output-*
    @param workDir: If intermediate files need to be saved, they will go here.
    @param step: A step=substep pair, where the steps are PREPROCESS and CLASSIFY
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param goldInput: a version of the corpus file with gold annotation. Enables measuring of performance
    @param detector: a Detector object, or a string defining one to be imported. If None, will be read from model.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param clear: Remove existing workDir
    @param preprocessorTag: preprocessor output file will be output + preprocessorTag
    @param preprocessorParams: Optional parameters controlling preprocessing. If None, will be read from model.
    @param bioNLPSTParams: Optional parameters controlling BioNLP ST format output. If None, will be read from model.
    """
    # Resolve all paths before the working directory is changed below
    input = os.path.abspath(input)
    if goldInput != None: goldInput = os.path.abspath(goldInput)
    if model != None: model = os.path.abspath(model)
    # Initialize working directory
    if workDir != None: # use a permanent work directory
        workdir(workDir, clear)
    Stream.openLog(output + "-log.txt") # log in the output directory
    # Get input files; getInput also decides whether preprocessing is needed
    # (presumably False for already-preprocessed corpora — TODO confirm)
    input, preprocess = getInput(input)
    model = getModel(model)
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(step, omitSteps, ["PREPROCESS", "CLASSIFY"])
    if not preprocess:
        selector.markOmitSteps("PREPROCESS")

    classifyInput = input
    if selector.check("PREPROCESS"):
        if preprocessorParams == None:
            # Default preprocessing pipeline: sentence splitting, NER, parsing,
            # conversion, name splitting and head token detection
            preprocessorParams = ["LOAD", "GENIA_SPLITTER", "BANNER", "BLLIP_BIO",
                                  "STANFORD_CONVERT", "SPLIT_NAMES", "FIND_HEADS", "SAVE"]
        preprocessor = Preprocessor(preprocessorParams)
        if debug:
            preprocessor.setArgForAllSteps("debug", True)
        preprocessorOutput = output + preprocessorTag
        #preprocessor.debug = debug
        #preprocessor.source = input # This has to be defined already here, needs to be fixed later
        #preprocessor.requireEntitiesForParsing = True # parse only sentences which contain named entities
        # Reuse existing preprocessor output unless a clean run was requested
        if os.path.exists(preprocessorOutput) and not clear: #os.path.exists(preprocessor.getOutputPath("FIND-HEADS")):
            #print >> sys.stderr, "Preprocessor output", preprocessor.getOutputPath("FIND-HEADS"), "exists, skipping preprocessing."
            print >> sys.stderr, "Preprocessor output", preprocessorOutput, "exists, skipping preprocessing."
            classifyInput = preprocessorOutput # preprocessor.getOutputPath("FIND-HEADS")
        else:
            #print >> sys.stderr, "Preprocessor output", preprocessor.getOutputPath("FIND-HEADS"), "does not exist"
            print >> sys.stderr, "Preprocessor output", preprocessorOutput, "does not exist"
            print >> sys.stderr, "------------ Preprocessing ------------"
            # Remove some of the unnecessary intermediate files
            #preprocessor.setIntermediateFiles({"Convert":None, "SPLIT-SENTENCES":None, "PARSE":None, "CONVERT-PARSE":None, "SPLIT-NAMES":None})
            # Process input into interaction XML
            classifyInput = preprocessor.process(input, preprocessorOutput, model)

    if selector.check("CLASSIFY"):
        detector = getDetector(detector, model)[0]() # initialize detector object
        detector.debug = debug
        detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(bioNLPSTParams, model)
        detector.classify(classifyInput, model, output, goldData=goldInput,
                          fromStep=detectorSteps["CLASSIFY"],
                          omitSteps=omitDetectorSteps["CLASSIFY"],
                          workDir=workDir)
#if os.path.isfile(options.input): # if INPUT_TAG.endswith(".tar.gz"): # INPUT_TAG = INPUT_TAG[:-len(".tar.gz")] open(INPUT_TAG+"-STARTED", "w").close() # Mark process status # Start logging WORKDIR = options.workdir if WORKDIR == None: WORKDIR = tempfile.mkdtemp() workdir(WORKDIR, options.clearAll) # Select a working directory, optionally remove existing files if not options.noLog: log(options.clearAll, True, INPUT_TAG + "-" + options.eventTag + ".log") # Start logging into a file in working directory print >> sys.stderr, "Work directory at", WORKDIR eventDetectionInput = None preprocessor = Preprocessor() preprocessor.debug = options.debug preprocessor.source = options.input # This has to be defined already here, needs to be fixed later preprocessor.compressIntermediateFiles = True # save space preprocessor.intermediateFilesAtSource = True # create output at source file location preprocessor.requireEntitiesForParsing = True # parse only sentences which contain BANNER entities if selector.check("PREPROCESS"): if os.path.exists(preprocessor.getOutputPath("FIND-HEADS")): print >> sys.stderr, "Preprocessor output", preprocessor.getOutputPath("FIND-HEADS"), "exists, skipping preprocessing." eventDetectionInput = preprocessor.getOutputPath("FIND-HEADS") else: print >> sys.stderr, "Preprocessor output", preprocessor.getOutputPath("FIND-HEADS"), "does not exist" print >> sys.stderr, "------------ Preprocessing ------------" # Remove some of the unnecessary intermediate files preprocessor.setIntermediateFile("CONVERT", None) preprocessor.setIntermediateFile("SPLIT-SENTENCES", None)
def combine(inputA, inputB, inputGold, outPath=None, mode="OR", skip=None, logPath="AUTO"): assert options.mode in ("AND", "OR") if skip != None and isinstance(skip, basestring): skip = set(skip.split(",")) if skip != None: print "Skipping interaction types:", skip if logPath == "AUTO": if outPath != None: logPath = os.path.join( outPath.rstrip("/").rstrip("\\") + "-log.txt") else: logPath = None if logPath != None: if not os.path.exists(os.path.dirname(logPath)): os.makedirs(os.path.dirname(logPath)) Stream.openLog(logPath) print "Loading the Interaction XML files" print "Loading A from", inputA a = ETUtils.ETFromObj(inputA) print "Loading B from", inputB b = ETUtils.ETFromObj(inputB) gold = None if inputGold: print "Loading gold from", inputGold gold = ETUtils.ETFromObj(inputGold) if inputGold else None print "Copying a as template" template = copy.deepcopy(a) print "Calculating confidence score ranges" scoreRanges = {} scoreRanges["a"] = getScoreRange(a, skip) scoreRanges["b"] = getScoreRange(b, skip) print scoreRanges print "Combining" counts = defaultdict(int) counts["skipped"] = defaultdict(int) counter = ProgressCounter(len([x for x in a.findall("document")]), "Combine") for docA, docB, docGold, docTemplate in itertools.izip_longest( *[x.findall("document") for x in (a, b, gold, template)]): counter.update() assert len( set([x.get("id") for x in (docA, docB, docGold, docTemplate)])) == 1 for sentA, sentB, sentGold, sentTemplate in itertools.izip_longest(*[ x.findall("sentence") for x in (docA, docB, docGold, docTemplate) ]): assert len( set([ x.get("id") for x in (sentA, sentB, sentGold, sentTemplate) ])) == 1 interactions = getInteractions(sentA, sentB, sentGold, skip, counts["skipped"]) for interaction in sentTemplate.findall("interaction"): sentTemplate.remove(interaction) analyses = sentTemplate.find("analyses") if analyses: sentTemplate.remove(analyses) for key in interactions: interaction = getCombinedInteraction(interactions[key], mode, counts, 
scoreRanges) if interaction != None: sentTemplate.append(copy.deepcopy(interaction)) if analyses: sentTemplate.append(analyses) counts["skipped"] = dict(counts["skipped"]) print "Counts:", dict(counts) if gold != None: print "****** Evaluating A ******" evaluateChemProt( a, gold ) #EvaluateIXML.run(AveragingMultiClassEvaluator, a, gold, "McCC") print "****** Evaluating B ******" evaluateChemProt( b, gold ) #EvaluateIXML.run(AveragingMultiClassEvaluator, b, gold, "McCC") print "****** Evaluating Combined ******" evaluateChemProt( template, gold ) #EvaluateIXML.run(AveragingMultiClassEvaluator, template, gold, "McCC") if outPath != None: print "Writing output to", outPath if outPath.endswith(".tsv"): Preprocessor(steps=["EXPORT_CHEMPROT"]).process(template, outPath) else: ETUtils.write(template, outPath) if logPath != None: Stream.closeLog(logPath)
def run(inPath, outPath, subDirs, model, connection, numJobs, subTask=3, posTags=None,
        useTestSet=False, clear=True, debug=False, force=False, training=True,
        preprocessorSteps=None, subset=None):
    """Collect parses, import them into a corpus and optionally train a model.

    @param inPath: input directory containing the parse sub-directories
    @param outPath: work directory; parses, corpus and training output go here
    @param subDirs: parse sub-directories to collect (passed to combineParses)
    @param model: corpus/model name stem; also used to match corpus set files
    @param connection: connection string; "$JOBS" is replaced with numJobs
    @param numJobs: number of parallel training jobs
    @param subTask: if > 0, appended to the model name as a suffix
    @param posTags: optional POS tag setting for the IMPORT_PARSE step
    @param useTestSet: if True, import all set files, else only devel and train
    @param clear: remove an existing non-empty output directory (with confirmation)
    @param debug: enable debug mode for all preprocessing steps
    @param force: skip the interactive confirmation when clearing
    @param training: if True, train a model after importing the parses
    @param preprocessorSteps: optional custom list of preprocessing steps
    @param subset: optional subset parameters passed through to train()
    """
    # Remove existing non-empty work directory, if requested to do so
    if os.path.exists(outPath) and len(os.listdir(outPath)) > 0 and clear:
        if force or ask("Output directory '" + outPath + "' exists, remove?"):
            print >> sys.stderr, "Output directory exists, removing", outPath
            shutil.rmtree(outPath)
    # Create work directory if needed
    if not os.path.exists(outPath):
        print >> sys.stderr, "Making output directory", outPath
        os.makedirs(outPath)
    # Begin logging
    logPath = beginLog(outPath)
    # Collect the parse files (reused from an earlier run when already present)
    parseDir = os.path.join(outPath, "parses")
    if not os.path.exists(parseDir) or len(os.listdir(parseDir)) == 0:
        parseDir = combineParses(inPath, parseDir, subDirs)
    else:
        print >> sys.stderr, "Using collected parses from", parseDir
    # Import the parses
    corpusDir = os.path.join(outPath, "corpus")
    if not os.path.exists(corpusDir):
        if preprocessorSteps == None:
            preprocessorSteps = ["MERGE_SETS", "REMOVE_ANALYSES", "REMOVE_HEADS",
                                 "MERGE_SENTENCES", "IMPORT_PARSE", "SPLIT_NAMES",
                                 "FIND_HEADS", "DIVIDE_SETS"]
        preprocessor = Preprocessor(preprocessorSteps)
        #preprocessor = Preprocessor(["MERGE-SETS", "REMOVE-ANALYSES", "REMOVE-HEADS", "MERGE-SENTENCES", "IMPORT-PARSE", "VALIDATE", "DIVIDE-SETS"])
        preprocessor.setArgForAllSteps("debug", debug)
        preprocessor.getStep("IMPORT_PARSE").setArg("parseDir", parseDir)
        preprocessor.getStep("IMPORT_PARSE").setArg("posTags", posTags)
        # Regex pattern selecting which corpus set files to import
        modelPattern = model + ".+\.xml" if useTestSet else model + "-devel\.xml|" + model + "-train\.xml"
        preprocessor.process(modelPattern, os.path.join(corpusDir, model), logPath=None)
    else:
        print >> sys.stderr, "Using imported parses from", corpusDir
    # Train the model
    if training:
        connection = connection.replace("$JOBS", str(numJobs))
        if subTask > 0:
            model = model + "." + str(subTask)
        train(outPath, model, parse="McCC", debug=debug, connection=connection,
              corpusDir=corpusDir, subset=subset, log=None) #classifierParams={"examples":None, "trigger":"150000", "recall":None, "edge":"7500", "unmerging":"2500", "modifiers":"10000"})
    # Close the log
    endLog(logPath)
help="AUTO, None, or a path")
# NOTE(review): the line above is the tail of a debug.add_option call whose
# beginning is outside this view.
#debug.add_option("--intermediateFiles", default=False, action="store_true", dest="intermediateFiles", help="Save an intermediate file for each step")
debug.add_option("--debug", default=False, action="store_true", dest="debug", help="Set debug mode for all steps")
optparser.add_option_group(debug)
(options, args) = optparser.parse_args()

# if options.steps != None:
#     options.steps = [x.strip() for x in options.steps.split(",")]
# if options.omitSteps != None:
#     options.omitSteps = options.omitSteps.split(",")
#
preprocessor = Preprocessor(options.steps, options.parseName, options.requireEntities)
if options.steps == None:
    # With no steps requested, just print the available steps and exit
    print >> sys.stderr, preprocessor.getHelpString()
else:
    preprocessor.setArgForAllSteps("debug", options.debug)
    # Forward command line options to the steps that use them
    if preprocessor.hasStep("CONVERT"):
        if options.corpus != None:
            preprocessor.getStep("CONVERT").setArg("corpusName", options.corpus)
        if options.dataSetNames != None:
            preprocessor.getStep("CONVERT").setArg("dataSetNames", options.dataSetNames)
    if options.parseDir:
        preprocessor.getStep("IMPORT_PARSE").setArg("parseDir", options.parseDir)
    # NOTE(review): the body of this if continues beyond the visible chunk
    if options.exportFormats and preprocessor.hasStep("EXPORT"):
def train(output, task=None, detector=None, inputFiles=None, models=None, parse=None,
          processUnmerging=None, processModifiers=None, bioNLPSTParams=None,
          preprocessorParams=None, exampleStyles=None, classifierParams=None,
          doFullGrid=False, deleteOutput=False, copyFrom=None, log="log.txt",
          step=None, omitSteps=None, debug=False, connection=None, subset=None, folds=None):
    """
    Train a new model for event or relation detection.

    @param output: A directory where output files will appear.
    @param task: If defined, overridable default settings are used for many of the training parameters. Must be one of the supported TEES tasks.
    @param detector: a Detector object, or a string defining one to be imported
    @param inputFiles: A dictionary of file names, with keys "train", "devel" and, "test"
    @param models: A dictionary of file names defining the place for the new models, with keys "devel" and, "test"
    @param parse: The parse element name in the training interaction XML
    @param processUnmerging: Use the unmerging step of EventDetector. True, False or None for task default.
    @param processModifiers: Use the modifier detection step of EventDetector. True, False or None for task default.
    @param bioNLPSTParams: Parameters controlling BioNLP ST format output.
    @param preprocessorParams: Parameters controlling the preprocessor. Not used for training, but saved to the model for use when classifying.
    @param exampleStyles: A parameter set for controlling example builders.
    @param classifierParams: A parameter set for controlling classifiers.
    @param doFullGrid: Whether all parameters, as opposed to just recall adjustment, are tested in the EventDetector grid search.
    @param deleteOutput: Remove an existing output directory
    @param copyFrom: Copy an existing output directory for use as a template
    @param log: An optional alternative name for the log file. None is for no logging.
    @param step: A step=substep pair, where the steps are "TRAIN", "DEVEL", "EMPTY" and "TEST"
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param connection: A parameter set defining a local or remote connection for training the classifier
    @param subset: A parameter set for making subsets of input files
    """
    # Insert default arguments where needed
    inputFiles = setDictDefaults(inputFiles, {"train": None, "devel": None, "test": None})
    models = setDictDefaults(models, {"devel": None, "test": None})
    exampleStyles = setDictDefaults(exampleStyles, {"examples": None, "trigger": None,
                                                    "edge": None, "unmerging": None,
                                                    "modifiers": None})
    classifierParams = setDictDefaults(classifierParams, {"examples": None, "trigger": None,
                                                          "recall": None, "edge": None,
                                                          "unmerging": None, "modifiers": None})
    subset = setDictDefaults(Parameters.get(subset), {"train": None, "devel": None,
                                                      "test": None, "seed": 0, "all": None})
    folds = setDictDefaults(folds, {"train": None, "devel": None, "test": None})
    processUnmerging = getDefinedBool(processUnmerging)
    processModifiers = getDefinedBool(processModifiers)
    # Initialize working directory
    workdir(output, deleteOutput, copyFrom, log)
    # Get task specific parameters
    detector, bioNLPSTParams, preprocessorParams = getTaskSettings(task, detector,
        bioNLPSTParams, preprocessorParams, inputFiles, exampleStyles, classifierParams)
    # Learn training settings from input files
    detector = learnSettings(inputFiles, detector, classifierParams)
    # Get corpus subsets
    getFolds(inputFiles, folds)
    getSubsets(inputFiles, subset)
    if task != None:
        task = task.replace("-FULL", "")
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(step, omitSteps, ["TRAIN", "DEVEL", "EMPTY", "TEST"])

    # Initialize the detector
    detector, detectorName = getDetector(detector)
    detector = detector() # initialize object
    detector.debug = debug
    detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(bioNLPSTParams)
    #detector.useBioNLPSTFormat = useBioNLPSTFormat # classify-output and grid evaluation in ST-format
    #detector.stWriteScores = True # write confidence scores into additional st-format files
    connection = getConnection(connection)
    detector.setConnection(connection)
    connection.debug = debug
    if deleteOutput:
        connection.clearWorkDir()

    # Train
    if selector.check("TRAIN"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------------ Train Detector ------------------"
        print >> sys.stderr, "----------------------------------------------------"
        # Single-stage detectors take one example style/parameter pair; the
        # multi-stage EventDetector takes per-stage styles and parameters.
        if isinstance(detector, SingleStageDetector):
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"],
                           models["test"], exampleStyles["examples"],
                           classifierParams["examples"], parse, None, task,
                           fromStep=detectorSteps["TRAIN"], workDir="training")
        else:
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"],
                           models["test"], exampleStyles["trigger"], exampleStyles["edge"],
                           exampleStyles["unmerging"], exampleStyles["modifiers"],
                           classifierParams["trigger"], classifierParams["edge"],
                           classifierParams["unmerging"], classifierParams["modifiers"],
                           classifierParams["recall"], processUnmerging, processModifiers,
                           doFullGrid, task, parse, None,
                           fromStep=detectorSteps["TRAIN"], workDir="training")
        # Save the detector type
        for model in [models["devel"], models["test"]]:
            if model != None and os.path.exists(model):
                model = Model(model, "a")
                model.addStr("detector", detectorName)
                # Preprocessor settings are stored so classification can
                # reproduce the same preprocessing later
                if preprocessorParams != None:
                    preprocessor = Preprocessor()
                    model.addStr("preprocessorParams",
                                 Parameters.toString(preprocessor.getParameters(preprocessorParams)))
                model.save()
                model.close()
    if selector.check("DEVEL"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Check devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
        detector.classify(inputFiles["devel"], models["devel"], "classification-devel/devel",
                          goldData=inputFiles["devel"], fromStep=detectorSteps["DEVEL"],
                          workDir="classification-devel")
    if selector.check("EMPTY"):
        # By passing an emptied devel set through the prediction system, we can check that we get the same predictions
        # as in the DEVEL step, ensuring the model does not use leaked information.
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Empty devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
        detector.classify(getEmptyCorpus(inputFiles["devel"],
                              removeNames=("names" in str(exampleStyles["examples"]) or
                                           "names" in str(exampleStyles["trigger"]))),
                          models["devel"], "classification-empty/devel-empty",
                          fromStep=detectorSteps["EMPTY"], workDir="classification-empty")
    if selector.check("TEST"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------- Test set classification --------------"
        print >> sys.stderr, "----------------------------------------------------"
        if inputFiles["test"] == None or not os.path.exists(inputFiles["test"]):
            print >> sys.stderr, "Skipping, test file", inputFiles["test"], "does not exist"
        else:
            #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
            detector.classify(inputFiles["test"], models["test"], "classification-test/test",
                              fromStep=detectorSteps["TEST"], workDir="classification-test")
            if detector.bioNLPSTParams["convert"]:
                Utils.STFormat.Compare.compare("classification-test/test-events.tar.gz",
                                               "classification-devel/devel-events.tar.gz", "a2")
def convert(inPath, outDir, corpusId, directed, negatives, preprocess, preprocessorParameters=None, debug=False, clear=False, constParser="BLLIP-BIO", depParser="STANFORD-CONVERT", logging=True): assert negatives in ("INCLUDE", "SKIP", "REVERSE_POS") # Download the corpus if needed if inPath == None: if not hasattr(Settings, "SE10T8_CORPUS"): SemEval2010Task8Tools.install() inPath = Settings.SE10T8_CORPUS assert os.path.exists(inPath) # Prepare the output directory if not os.path.exists(outDir): print "Making output directory", outDir os.makedirs(outDir) elif clear: print "Removing output directory", outDir shutil.rmtree(outDir) # Start logging if logging: Stream.openLog(os.path.join(outDir, "log.txt"), clear=clear) # Read and process the corpus files archive = zipfile.ZipFile(inPath, 'r') usedIds = set() tree = None for fileName, setName in [("SemEval2010_task8_all_data/SemEval2010_task8_training/TRAIN_FILE.TXT", "train"),\ ("SemEval2010_task8_all_data/SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT", "test")]: print "Processing file", fileName, "as set", setName f = archive.open(fileName) tree = processLines(f.readlines(), setName, directed=directed, negatives=negatives, usedIds=usedIds, tree=tree, corpusId=corpusId) f.close() # Divide the training set into training and development sets MakeSets.processCorpus(tree, None, "train", [("train", 0.7), ("devel", 1.0)], 1) # Write out the converted corpus convertedPath = os.path.join(outDir, corpusId + "-converted.xml") ETUtils.write(tree.getroot(), convertedPath) # Preprocess the converted corpus if preprocess: outPath = os.path.join(outDir, corpusId + ".xml") preprocessor = Preprocessor(constParser, depParser) preprocessor.setArgForAllSteps("debug", debug) preprocessor.stepArgs("CONVERT")["corpusName"] = corpusId preprocessor.process( convertedPath, outPath, preprocessorParameters, omitSteps=["SPLIT-SENTENCES", "NER", "SPLIT-NAMES"]) # Stop logging if logging: Stream.closeLog(os.path.join(outDir, "log.txt"))