コード例 #1
0
ファイル: convertDDI13.py プロジェクト: DUT-LiuYang/TEES
def parseXML(xml, intermediateFileDir, debug=False):
    """
    Run the preprocessor with the CONVERT, SPLIT-SENTENCES, NER, SPLIT-NAMES
    and DIVIDE-SETS steps omitted.

    @param xml: the input, handed to Preprocessor.process as its first argument
    @param intermediateFileDir: handed to Preprocessor.process as its second argument
    @param debug: value of the "debug" argument set for all preprocessing steps
    """
    # Entity name splitting is omitted as this data may be used for predicting entities
    skippedSteps = ["CONVERT", "SPLIT-SENTENCES", "NER", "SPLIT-NAMES", "DIVIDE-SETS"]
    pipeline = Preprocessor()
    pipeline.setArgForAllSteps("debug", debug)
    # Parse every sentence, not only those that contain entities
    parseArgs = pipeline.stepArgs("PARSE")
    parseArgs["requireEntities"] = False
    pipeline.process(xml, intermediateFileDir, omitSteps=skippedSteps)
コード例 #2
0
def parseXML(xml, intermediateFileDir, debug=False):
    """
    Preprocess the input while leaving out the CONVERT, SPLIT-SENTENCES, NER,
    SPLIT-NAMES and DIVIDE-SETS steps.

    @param xml: first argument given to Preprocessor.process
    @param intermediateFileDir: second argument given to Preprocessor.process
    @param debug: stored as the "debug" argument of all steps
    """
    stepRunner = Preprocessor()
    stepRunner.setArgForAllSteps("debug", debug)
    # The PARSE step must not require entities to be present
    stepRunner.stepArgs("PARSE")["requireEntities"] = False
    # Entity name splitting is omitted as this data may be used for predicting entities
    omitted = ["CONVERT", "SPLIT-SENTENCES", "NER"]
    omitted += ["SPLIT-NAMES", "DIVIDE-SETS"]
    stepRunner.process(xml, intermediateFileDir, omitSteps=omitted)
コード例 #3
0
def parseXML(xml, outStem, intermediateFiles=True, debug=False, bbResources=False):
    """
    Run the preprocessor with the NER and DIVIDE-SETS steps omitted.

    @param xml: first argument given to Preprocessor.process
    @param outStem: second argument given to Preprocessor.process
    @param intermediateFiles: if false, setNoIntermediateFiles() is called on the preprocessor
    @param debug: stored as the "debug" argument of all steps
    @param bbResources: if true, a "BB_RESOURCES" step running insertResources.process
                        is inserted at index 5 with output name "bb-resources.xml"
    """
    pipeline = Preprocessor()
    if bbResources:
        resourceStepArgs = {}
        pipeline.insertStep(5, "BB_RESOURCES", insertResources.process, resourceStepArgs, "bb-resources.xml")
    pipeline.setArgForAllSteps("debug", debug)
    parseArgs = pipeline.stepArgs("PARSE")
    parseArgs["requireEntities"] = False
    if not intermediateFiles:
        pipeline.setNoIntermediateFiles()
    skippedSteps = ["NER", "DIVIDE-SETS"]
    pipeline.process(xml, outStem, omitSteps=skippedSteps)
コード例 #4
0
ファイル: EvaluateEPE.py プロジェクト: jbjorne/TEES
def run(inPath, outPath, subDirs, model, connection, numJobs, subTask=3, posTags=None, useTestSet=False, clear=True, debug=False, force=False, training=True, preprocessorSteps=None, subset=None):
    """
    Collect parses, import them into a corpus and optionally train a model.

    All work happens under outPath: collected parses go to outPath/parses and
    the imported corpus to outPath/corpus. Either stage is skipped when its
    directory already exists (for parses: exists and is non-empty).

    @param inPath: source location given to combineParses
    @param outPath: the work directory, created if missing
    @param subDirs: given to combineParses
    @param model: model name; used as the prefix of the corpus file pattern and as the
                  output name inside the corpus directory. For training, "." + subTask
                  is appended when subTask > 0.
    @param connection: connection string; the text "$JOBS" is replaced with numJobs
    @param numJobs: number substituted for "$JOBS" in the connection string
    @param subTask: appended to the model name for training when greater than zero
    @param posTags: set as the "posTags" argument of the IMPORT_PARSE step
    @param useTestSet: if true, the corpus pattern is model + ".+\\.xml", otherwise only
                       the model's -devel.xml and -train.xml files are matched
    @param clear: remove an existing, non-empty outPath (after confirmation unless force is set)
    @param debug: stored as the "debug" argument of all preprocessing steps and given to train
    @param force: remove outPath without asking
    @param training: if true, train() is called after the import
    @param preprocessorSteps: list of preprocessor step names; a default list is used when None
    @param subset: given to train
    """
    # Remove existing non-empty work directory, if requested to do so
    if os.path.exists(outPath) and len(os.listdir(outPath)) > 0 and clear:
        if force or ask("Output directory '" + outPath + "' exists, remove?"):
            print >> sys.stderr, "Output directory exists, removing", outPath
            shutil.rmtree(outPath)
    # Create work directory if needed
    if not os.path.exists(outPath):
        print >> sys.stderr, "Making output directory", outPath
        os.makedirs(outPath)
    
    # Begin logging
    logPath = beginLog(outPath)
    
    # Collect the parse files
    parseDir = os.path.join(outPath, "parses")
    if not os.path.exists(parseDir) or len(os.listdir(parseDir)) == 0:
        parseDir = combineParses(inPath, parseDir, subDirs)
    else:
        print >> sys.stderr, "Using collected parses from", parseDir
    
    # Import the parses
    corpusDir = os.path.join(outPath, "corpus")
    if not os.path.exists(corpusDir):
        if preprocessorSteps is None:
            preprocessorSteps = ["MERGE_SETS", "REMOVE_ANALYSES", "REMOVE_HEADS", "MERGE_SENTENCES", "IMPORT_PARSE", "SPLIT_NAMES", "FIND_HEADS", "DIVIDE_SETS"]
        preprocessor = Preprocessor(preprocessorSteps)
        preprocessor.setArgForAllSteps("debug", debug)
        importStep = preprocessor.getStep("IMPORT_PARSE")
        importStep.setArg("parseDir", parseDir)
        importStep.setArg("posTags", posTags)
        # Raw strings keep the regex backslashes explicit; the resulting pattern text is unchanged
        if useTestSet:
            modelPattern = model + r".+\.xml"
        else:
            modelPattern = model + r"-devel\.xml|" + model + r"-train\.xml"
        preprocessor.process(modelPattern, os.path.join(corpusDir, model), logPath=None)
    else:
        print >> sys.stderr, "Using imported parses from", corpusDir
    
    # Train the model
    if training:
        connection = connection.replace("$JOBS", str(numJobs))
        if subTask > 0:
            model = model + "." + str(subTask)
        train(outPath, model, parse="McCC", debug=debug, connection=connection, corpusDir=corpusDir, subset=subset, log=None)
        
    # Close the log
    endLog(logPath)
コード例 #5
0
def convert(inPath, outDir, corpusId, directed, negatives, preprocess, preprocessorParameters=None, debug=False, clear=False, constParser="BLLIP-BIO", depParser="STANFORD-CONVERT", logging=True):
    assert negatives in ("INCLUDE", "SKIP", "REVERSE_POS")
    # Download the corpus if needed
    if inPath == None:
        if not hasattr(Settings, "SE10T8_CORPUS"):
            SemEval2010Task8Tools.install()
        inPath = Settings.SE10T8_CORPUS
    assert os.path.exists(inPath)
    # Prepare the output directory
    if not os.path.exists(outDir):
        print "Making output directory", outDir
        os.makedirs(outDir)
    elif clear:
        print "Removing output directory", outDir
        shutil.rmtree(outDir)
    # Start logging
    if logging:
        Stream.openLog(os.path.join(outDir, "log.txt"), clear=clear)
    # Read and process the corpus files
    archive = zipfile.ZipFile(inPath, 'r')
    usedIds = set()
    tree = None
    for fileName, setName in [("SemEval2010_task8_all_data/SemEval2010_task8_training/TRAIN_FILE.TXT", "train"),\
                              ("SemEval2010_task8_all_data/SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT", "test")]:
        print "Processing file", fileName, "as set", setName
        f = archive.open(fileName)
        tree = processLines(f.readlines(), setName, directed=directed, negatives=negatives, usedIds=usedIds, tree=tree, corpusId=corpusId)
        f.close()
    # Divide the training set into training and development sets
    MakeSets.processCorpus(tree, None, "train", [("train", 0.7), ("devel", 1.0)], 1)
    # Write out the converted corpus
    convertedPath = os.path.join(outDir, corpusId + "-converted.xml")
    ETUtils.write(tree.getroot(), convertedPath)
    # Preprocess the converted corpus
    if preprocess:
        outPath = os.path.join(outDir, corpusId + ".xml")
        preprocessor = Preprocessor(constParser, depParser)
        preprocessor.setArgForAllSteps("debug", debug)
        preprocessor.stepArgs("CONVERT")["corpusName"] = corpusId
        preprocessor.process(convertedPath, outPath, preprocessorParameters, omitSteps=["SPLIT-SENTENCES", "NER", "SPLIT-NAMES"])
    # Stop logging
    if logging:
        Stream.closeLog(os.path.join(outDir, "log.txt"))
コード例 #6
0
def classify(input,
             model,
             output,
             workDir=None,
             step=None,
             omitSteps=None,
             goldInput=None,
             detector=None,
             debug=False,
             clear=False,
             preprocessorTag="-preprocessed.xml.gz",
             preprocessorParams=None,
             bioNLPSTParams=None):
    """
    Detect events or relations from text.
    
    @param input: The input file in either interaction XML or BioNLP ST format. Can also be a PMID or TEES default corpus name.
    @param model: A path to a model file or the name of a TEES default model.
    @param output: The output file stem. Output files will be of the form output-*
    @param workDir: If intermediate files need to be saved, they will go here.
    @param step: A step=substep pair, where the steps are PREPROCESS and CLASSIFY
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param goldInput: a version of the corpus file with gold annotation. Enables measuring of performance
    @param detector: a Detector object, or a string defining one to be imported. If None, will be read from model.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param clear: Remove existing workDir. Also forces preprocessing to be redone even if the preprocessor output file already exists.
    @param preprocessorTag: preprocessor output file will be output + preprocessorTag
    @param preprocessorParams: The list of preprocessor steps. If None, a default list (LOAD, GENIA_SPLITTER, BANNER, BLLIP_BIO, STANFORD_CONVERT, SPLIT_NAMES, FIND_HEADS, SAVE) is used.
    @param bioNLPSTParams: Optional parameters controlling BioNLP ST format output. If None, will be read from model.
    """
    input = os.path.abspath(input)
    if goldInput is not None:
        goldInput = os.path.abspath(goldInput)
    if model is not None:
        model = os.path.abspath(model)
    # Initialize working directory
    if workDir is not None:  # use a permanent work directory
        workdir(workDir, clear)
    Stream.openLog(output + "-log.txt")  # log in the output directory
    # Get input files
    input, preprocess = getInput(input)
    model = getModel(model)
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(
        step, omitSteps, ["PREPROCESS", "CLASSIFY"])
    if not preprocess:
        selector.markOmitSteps("PREPROCESS")

    classifyInput = input
    if selector.check("PREPROCESS"):
        if preprocessorParams is None:
            preprocessorParams = [
                "LOAD", "GENIA_SPLITTER", "BANNER", "BLLIP_BIO",
                "STANFORD_CONVERT", "SPLIT_NAMES", "FIND_HEADS", "SAVE"
            ]
        preprocessor = Preprocessor(preprocessorParams)
        if debug:
            preprocessor.setArgForAllSteps("debug", True)
        preprocessorOutput = output + preprocessorTag
        if os.path.exists(preprocessorOutput) and not clear:
            # Reuse the result of an earlier run
            print >> sys.stderr, "Preprocessor output", preprocessorOutput, "exists, skipping preprocessing."
            classifyInput = preprocessorOutput
        else:
            print >> sys.stderr, "Preprocessor output", preprocessorOutput, "does not exist"
            print >> sys.stderr, "------------ Preprocessing ------------"
            # Process input into interaction XML
            # NOTE(review): detectorSteps["PREPROCESS"] and omitDetectorSteps["PREPROCESS"]
            # are not forwarded to the preprocessor here - confirm this is intended
            classifyInput = preprocessor.process(input, preprocessorOutput,
                                                 model)

    if selector.check("CLASSIFY"):
        detector = getDetector(detector,
                               model)[0]()  # initialize detector object
        detector.debug = debug
        detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(
            bioNLPSTParams, model)
        detector.classify(classifyInput,
                          model,
                          output,
                          goldData=goldInput,
                          fromStep=detectorSteps["CLASSIFY"],
                          omitSteps=omitDetectorSteps["CLASSIFY"],
                          workDir=workDir)
コード例 #7
0
ファイル: classify.py プロジェクト: arvindcr2/TEES
def classify(input, model, output, workDir=None, step=None, omitSteps=None, 
             goldInput=None, detector=None, debug=False, clear=False, 
             preprocessorTag="-preprocessed.xml.gz", preprocessorParams=None, bioNLPSTParams=None):
    """
    Detect events or relations from text.
    
    @param input: The input file in either interaction XML or BioNLP ST format. Can also be a PMID or TEES default corpus name.
    @param model: A path to a model file or the name of a TEES default model.
    @param output: The output file stem. Output files will be of the form output-*
    @param workDir: If intermediate files need to be saved, they will go here.
    @param step: A step=substep pair, where the steps are PREPROCESS and CLASSIFY
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param goldInput: a version of the corpus file with gold annotation. Enables measuring of performance
    @param detector: a Detector object, or a string defining one to be imported. If None, will be read from model.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param clear: Remove existing workDir. Also forces preprocessing to be redone even if the preprocessor output file already exists.
    @param preprocessorTag: preprocessor output file will be output + preprocessorTag
    @param preprocessorParams: Optional parameters controlling preprocessing. If None, will be read from model.
    @param bioNLPSTParams: Optional parameters controlling BioNLP ST format output. If None, will be read from model.
    """
    input = os.path.abspath(input)
    if goldInput is not None:
        goldInput = os.path.abspath(goldInput)
    if model is not None:
        model = os.path.abspath(model)
    # Initialize working directory
    if workDir is not None: # use a permanent work directory
        workdir(workDir, clear)
    Stream.openLog(output + "-log.txt") # log in the output directory
    # Get input files
    input, preprocess = getInput(input)
    model = getModel(model)
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(step, omitSteps, ["PREPROCESS", "CLASSIFY"])
    if not preprocess:
        selector.markOmitSteps("PREPROCESS")
    
    classifyInput = input
    if selector.check("PREPROCESS"):
        preprocessor = Preprocessor()
        if debug: 
            preprocessor.setArgForAllSteps("debug", True)
        preprocessorOutput = output + preprocessorTag
        if os.path.exists(preprocessorOutput) and not clear:
            # Reuse the result of an earlier run
            print >> sys.stderr, "Preprocessor output", preprocessorOutput, "exists, skipping preprocessing."
            classifyInput = preprocessorOutput
        else:
            print >> sys.stderr, "Preprocessor output", preprocessorOutput, "does not exist"
            print >> sys.stderr, "------------ Preprocessing ------------"
            # Process input into interaction XML
            classifyInput = preprocessor.process(input, preprocessorOutput, preprocessorParams, model, [], fromStep=detectorSteps["PREPROCESS"], toStep=None, omitSteps=omitDetectorSteps["PREPROCESS"])
    
    if selector.check("CLASSIFY"):
        detector = getDetector(detector, model)[0]() # initialize detector object
        detector.debug = debug
        detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(bioNLPSTParams, model)
        detector.classify(classifyInput, model, output, goldData=goldInput, fromStep=detectorSteps["CLASSIFY"], omitSteps=omitDetectorSteps["CLASSIFY"], workDir=workDir)
コード例 #8
0
ファイル: EvaluateEPE.py プロジェクト: Mu-Y/BioNLPST
def run(inPath,
        outPath,
        subDirs,
        model,
        connection,
        numJobs,
        subTask=3,
        posTags=None,
        useTestSet=False,
        clear=True,
        debug=False,
        force=False,
        training=True,
        preprocessorSteps=None,
        subset=None):
    """
    Collect parses, import them into a corpus and optionally train a model.

    All work happens under outPath: collected parses go to outPath/parses and
    the imported corpus to outPath/corpus. Either stage is skipped when its
    directory already exists (for parses: exists and is non-empty).

    @param inPath: source location given to combineParses
    @param outPath: the work directory, created if missing
    @param subDirs: given to combineParses
    @param model: model name; used as the prefix of the corpus file pattern and as the
                  output name inside the corpus directory. For training, "." + subTask
                  is appended when subTask > 0.
    @param connection: connection string; the text "$JOBS" is replaced with numJobs
    @param numJobs: number substituted for "$JOBS" in the connection string
    @param subTask: appended to the model name for training when greater than zero
    @param posTags: set as the "posTags" argument of the IMPORT_PARSE step
    @param useTestSet: if true, the corpus pattern is model + ".+\\.xml", otherwise only
                       the model's -devel.xml and -train.xml files are matched
    @param clear: remove an existing, non-empty outPath (after confirmation unless force is set)
    @param debug: stored as the "debug" argument of all preprocessing steps and given to train
    @param force: remove outPath without asking
    @param training: if true, train() is called after the import
    @param preprocessorSteps: list of preprocessor step names; a default list is used when None
    @param subset: given to train
    """
    # Remove existing non-empty work directory, if requested to do so
    if os.path.exists(outPath) and len(os.listdir(outPath)) > 0 and clear:
        if force or ask("Output directory '" + outPath + "' exists, remove?"):
            print >> sys.stderr, "Output directory exists, removing", outPath
            shutil.rmtree(outPath)
    # Create work directory if needed
    if not os.path.exists(outPath):
        print >> sys.stderr, "Making output directory", outPath
        os.makedirs(outPath)

    # Begin logging
    logPath = beginLog(outPath)

    # Collect the parse files
    parseDir = os.path.join(outPath, "parses")
    if not os.path.exists(parseDir) or len(os.listdir(parseDir)) == 0:
        parseDir = combineParses(inPath, parseDir, subDirs)
    else:
        print >> sys.stderr, "Using collected parses from", parseDir

    # Import the parses
    corpusDir = os.path.join(outPath, "corpus")
    if not os.path.exists(corpusDir):
        if preprocessorSteps is None:
            preprocessorSteps = [
                "MERGE_SETS", "REMOVE_ANALYSES", "REMOVE_HEADS",
                "MERGE_SENTENCES", "IMPORT_PARSE", "SPLIT_NAMES", "FIND_HEADS",
                "DIVIDE_SETS"
            ]
        preprocessor = Preprocessor(preprocessorSteps)
        preprocessor.setArgForAllSteps("debug", debug)
        importStep = preprocessor.getStep("IMPORT_PARSE")
        importStep.setArg("parseDir", parseDir)
        importStep.setArg("posTags", posTags)
        # Raw strings keep the regex backslashes explicit; the resulting pattern text is unchanged
        if useTestSet:
            modelPattern = model + r".+\.xml"
        else:
            modelPattern = model + r"-devel\.xml|" + model + r"-train\.xml"
        preprocessor.process(modelPattern,
                             os.path.join(corpusDir, model),
                             logPath=None)
    else:
        print >> sys.stderr, "Using imported parses from", corpusDir

    # Train the model
    if training:
        connection = connection.replace("$JOBS", str(numJobs))
        if subTask > 0:
            model = model + "." + str(subTask)
        train(outPath,
              model,
              parse="McCC",
              debug=debug,
              connection=connection,
              corpusDir=corpusDir,
              subset=subset,
              log=None)

    # Close the log
    endLog(logPath)
コード例 #9
0
                     dest="debug",
                     help="Set debug mode for all steps")
    optparser.add_option_group(debug)
    (options, args) = optparser.parse_args()

    #     if options.steps != None:
    #         options.steps = [x.strip() for x in options.steps.split(",")]
    #     if options.omitSteps != None:
    #         options.omitSteps = options.omitSteps.split(",")
    #
    preprocessor = Preprocessor(options.steps, options.parseName,
                                options.requireEntities)
    if options.steps == None:
        print >> sys.stderr, preprocessor.getHelpString()
    else:
        preprocessor.setArgForAllSteps("debug", options.debug)
        if preprocessor.hasStep("CONVERT"):
            if options.corpus != None:
                preprocessor.getStep("CONVERT").setArg("corpusName",
                                                       options.corpus)
            if options.dataSetNames != None:
                preprocessor.getStep("CONVERT").setArg("dataSetNames",
                                                       options.dataSetNames)
        if options.parseDir:
            preprocessor.getStep("IMPORT_PARSE").setArg(
                "parseDir", options.parseDir)
        if options.exportFormats and preprocessor.hasStep("EXPORT"):
            preprocessor.getStep("EXPORT").setArg(
                "formats", options.exportFormats.split(","))
        if options.importFormats:
            if preprocessor.hasStep("LOAD"):
コード例 #10
0
def convert(inPath,
            outDir,
            corpusId,
            directed,
            negatives,
            preprocess,
            preprocessorParameters=None,
            debug=False,
            clear=False,
            constParser="BLLIP-BIO",
            depParser="STANFORD-CONVERT",
            logging=True):
    assert negatives in ("INCLUDE", "SKIP", "REVERSE_POS")
    # Download the corpus if needed
    if inPath == None:
        if not hasattr(Settings, "SE10T8_CORPUS"):
            SemEval2010Task8Tools.install()
        inPath = Settings.SE10T8_CORPUS
    assert os.path.exists(inPath)
    # Prepare the output directory
    if not os.path.exists(outDir):
        print "Making output directory", outDir
        os.makedirs(outDir)
    elif clear:
        print "Removing output directory", outDir
        shutil.rmtree(outDir)
    # Start logging
    if logging:
        Stream.openLog(os.path.join(outDir, "log.txt"), clear=clear)
    # Read and process the corpus files
    archive = zipfile.ZipFile(inPath, 'r')
    usedIds = set()
    tree = None
    for fileName, setName in [("SemEval2010_task8_all_data/SemEval2010_task8_training/TRAIN_FILE.TXT", "train"),\
                              ("SemEval2010_task8_all_data/SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT", "test")]:
        print "Processing file", fileName, "as set", setName
        f = archive.open(fileName)
        tree = processLines(f.readlines(),
                            setName,
                            directed=directed,
                            negatives=negatives,
                            usedIds=usedIds,
                            tree=tree,
                            corpusId=corpusId)
        f.close()
    # Divide the training set into training and development sets
    MakeSets.processCorpus(tree, None, "train", [("train", 0.7),
                                                 ("devel", 1.0)], 1)
    # Write out the converted corpus
    convertedPath = os.path.join(outDir, corpusId + "-converted.xml")
    ETUtils.write(tree.getroot(), convertedPath)
    # Preprocess the converted corpus
    if preprocess:
        outPath = os.path.join(outDir, corpusId + ".xml")
        preprocessor = Preprocessor(constParser, depParser)
        preprocessor.setArgForAllSteps("debug", debug)
        preprocessor.stepArgs("CONVERT")["corpusName"] = corpusId
        preprocessor.process(
            convertedPath,
            outPath,
            preprocessorParameters,
            omitSteps=["SPLIT-SENTENCES", "NER", "SPLIT-NAMES"])
    # Stop logging
    if logging:
        Stream.closeLog(os.path.join(outDir, "log.txt"))
コード例 #11
0
ファイル: preprocess.py プロジェクト: jbjorne/TEES
    debug.add_option("--logPath", default="AUTO", dest="logPath", help="AUTO, None, or a path")
    #debug.add_option("--intermediateFiles", default=False, action="store_true", dest="intermediateFiles", help="Save an intermediate file for each step")
    debug.add_option("--debug", default=False, action="store_true", dest="debug", help="Set debug mode for all steps")
    optparser.add_option_group(debug)
    (options, args) = optparser.parse_args()
    
#     if options.steps != None:
#         options.steps = [x.strip() for x in options.steps.split(",")]
#     if options.omitSteps != None:
#         options.omitSteps = options.omitSteps.split(",")
#         
    preprocessor = Preprocessor(options.steps, options.parseName, options.requireEntities)
    if options.steps == None:
        print >> sys.stderr, preprocessor.getHelpString()
    else:
        preprocessor.setArgForAllSteps("debug", options.debug)
        if preprocessor.hasStep("CONVERT"):
            if options.corpus != None:
                preprocessor.getStep("CONVERT").setArg("corpusName", options.corpus)
            if options.dataSetNames != None:
                preprocessor.getStep("CONVERT").setArg("dataSetNames", options.dataSetNames)
        if options.parseDir:
            preprocessor.getStep("IMPORT_PARSE").setArg("parseDir", options.parseDir)
        if options.exportFormats and preprocessor.hasStep("EXPORT"):
            preprocessor.getStep("EXPORT").setArg("formats", options.exportFormats.split(","))
        if options.importFormats:
            if preprocessor.hasStep("LOAD"):
                preprocessor.getStep("LOAD").setArg("extensions", options.importFormats.split(","))
            if preprocessor.hasStep("IMPORT_PARSE"):
                preprocessor.getStep("IMPORT_PARSE").setArg("extensions", options.importFormats.split(","))
        #if options.intermediateFiles: