def run(inPath, outPath, subDirs, model, connection, numJobs, subTask=3, posTags=None, useTestSet=False, clear=True, debug=False, force=False, training=True, preprocessorSteps=None, subset=None):
    """Build a corpus from collected parses and (optionally) train a model.

    inPath / outPath: input data directory and work/output directory.
    subDirs: subdirectories of inPath to search for parse files (forwarded to
        combineParses).
    model: corpus/model name prefix; also used to match the imported XML files.
    connection: connection string; every literal "$JOBS" is replaced with numJobs.
    numJobs: number of parallel jobs substituted into the connection string.
    subTask: if > 0, appended to the model name as ".<subTask>" before training.
    posTags: forwarded to the IMPORT_PARSE preprocessing step.
    useTestSet: if True import all "<model>*.xml" files, otherwise only the
        "-devel" and "-train" sets.
    clear: if True, remove an existing non-empty output directory (after an
        interactive confirmation unless force is set).
    force: skip the interactive confirmation when clearing.
    training: if False, stop after corpus preprocessing.
    preprocessorSteps: list of preprocessing step names; a default pipeline is
        used when None.
    subset: forwarded to train() -- presumably a corpus subset; confirm there.
    """
    # Remove existing non-empty work directory, if requested to do so
    if os.path.exists(outPath) and len(os.listdir(outPath)) > 0 and clear:
        if force or ask("Output directory '" + outPath + "' exists, remove?"):
            print >> sys.stderr, "Output directory exists, removing", outPath
            shutil.rmtree(outPath)
    # Create work directory if needed
    if not os.path.exists(outPath):
        print >> sys.stderr, "Making output directory", outPath
        os.makedirs(outPath)
    # Begin logging
    logPath = beginLog(outPath)
    # Collect the parse files, reusing an earlier collection if one exists
    parseDir = os.path.join(outPath, "parses")
    if not os.path.exists(parseDir) or len(os.listdir(parseDir)) == 0:
        parseDir = combineParses(inPath, parseDir, subDirs)
    else:
        print >> sys.stderr, "Using collected parses from", parseDir
    # Import the parses, unless a corpus has already been built
    corpusDir = os.path.join(outPath, "corpus")
    if not os.path.exists(corpusDir):
        if preprocessorSteps is None:  # PEP 8: compare to None with 'is'
            preprocessorSteps = ["MERGE_SETS", "REMOVE_ANALYSES", "REMOVE_HEADS", "MERGE_SENTENCES", "IMPORT_PARSE", "SPLIT_NAMES", "FIND_HEADS", "DIVIDE_SETS"]
        preprocessor = Preprocessor(preprocessorSteps)
        preprocessor.setArgForAllSteps("debug", debug)
        preprocessor.getStep("IMPORT_PARSE").setArg("parseDir", parseDir)
        preprocessor.getStep("IMPORT_PARSE").setArg("posTags", posTags)
        # Raw strings make the regex escapes explicit (same byte values as the
        # old literals). NOTE(review): 'model' is interpolated unescaped --
        # fine for plain names, confirm model names never contain regex
        # metacharacters.
        modelPattern = model + r".+\.xml" if useTestSet else model + r"-devel\.xml|" + model + r"-train\.xml"
        preprocessor.process(modelPattern, os.path.join(corpusDir, model), logPath=None)
    else:
        print >> sys.stderr, "Using imported parses from", corpusDir
    # Train the model
    if training:
        connection = connection.replace("$JOBS", str(numJobs))
        if subTask > 0:
            model = model + "." + str(subTask)
        train(outPath, model, parse="McCC", debug=debug, connection=connection, corpusDir=corpusDir, subset=subset, log=None)
    # Close the log
    endLog(logPath)
def run(inPath, outPath, subDirs, model, connection, numJobs, subTask=3, posTags=None, useTestSet=False, clear=True, debug=False, force=False, training=True, preprocessorSteps=None, subset=None):
    """Collect parses for 'model', import them into a corpus and train on it."""
    # Optionally wipe a pre-existing, non-empty work directory first.
    if clear and os.path.exists(outPath) and len(os.listdir(outPath)) > 0:
        confirmed = force or ask("Output directory '" + outPath + "' exists, remove?")
        if confirmed:
            print >> sys.stderr, "Output directory exists, removing", outPath
            shutil.rmtree(outPath)
    # Make sure the work directory exists before logging into it.
    if not os.path.exists(outPath):
        print >> sys.stderr, "Making output directory", outPath
        os.makedirs(outPath)
    logFile = beginLog(outPath)
    # Reuse previously collected parses when the directory is already populated.
    collectedDir = os.path.join(outPath, "parses")
    haveCollected = os.path.exists(collectedDir) and len(os.listdir(collectedDir)) > 0
    if haveCollected:
        print >> sys.stderr, "Using collected parses from", collectedDir
    else:
        collectedDir = combineParses(inPath, collectedDir, subDirs)
    # Import the parses into a corpus, unless one has already been built.
    corpusPath = os.path.join(outPath, "corpus")
    if os.path.exists(corpusPath):
        print >> sys.stderr, "Using imported parses from", corpusPath
    else:
        steps = preprocessorSteps
        if steps == None:
            # Default preprocessing pipeline.
            steps = ["MERGE_SETS", "REMOVE_ANALYSES", "REMOVE_HEADS", "MERGE_SENTENCES", "IMPORT_PARSE", "SPLIT_NAMES", "FIND_HEADS", "DIVIDE_SETS"]
        prep = Preprocessor(steps)
        prep.setArgForAllSteps("debug", debug)
        prep.getStep("IMPORT_PARSE").setArg("parseDir", collectedDir)
        prep.getStep("IMPORT_PARSE").setArg("posTags", posTags)
        # Match either all of the model's XML files or just devel + train.
        if useTestSet:
            filePattern = model + ".+\.xml"
        else:
            filePattern = model + "-devel\.xml|" + model + "-train\.xml"
        prep.process(filePattern, os.path.join(corpusPath, model), logPath=None)
    # Train the model (skipped when only preprocessing is wanted).
    if training:
        connection = connection.replace("$JOBS", str(numJobs))
        trainedName = model
        if subTask > 0:
            trainedName = model + "." + str(subTask)
        train(outPath, trainedName, parse="McCC", debug=debug, connection=connection, corpusDir=corpusPath, subset=subset, log=None)
    endLog(logFile)
# NOTE(review): this span appears to be a truncated duplicate of the complete
# option-handling block that follows it (it ends mid-call, at `setArg(`) --
# most likely a copy/paste or file-extraction artifact. Confirm against
# version control and remove the duplicate; kept byte-identical here.
(options, args) = optparser.parse_args() # if options.steps != None: # options.steps = [x.strip() for x in options.steps.split(",")] # if options.omitSteps != None: # options.omitSteps = options.omitSteps.split(",") # preprocessor = Preprocessor(options.steps, options.parseName, options.requireEntities) if options.steps == None: print >> sys.stderr, preprocessor.getHelpString() else: preprocessor.setArgForAllSteps("debug", options.debug) if preprocessor.hasStep("CONVERT"): if options.corpus != None: preprocessor.getStep("CONVERT").setArg("corpusName", options.corpus) if options.dataSetNames != None: preprocessor.getStep("CONVERT").setArg("dataSetNames", options.dataSetNames) if options.parseDir: preprocessor.getStep("IMPORT_PARSE").setArg( "parseDir", options.parseDir) if options.exportFormats and preprocessor.hasStep("EXPORT"): preprocessor.getStep("EXPORT").setArg( "formats", options.exportFormats.split(",")) if options.importFormats: if preprocessor.hasStep("LOAD"): preprocessor.getStep("LOAD").setArg( "extensions", options.importFormats.split(",")) if preprocessor.hasStep("IMPORT_PARSE"): preprocessor.getStep("IMPORT_PARSE").setArg(
# Command-line entry: parse the options and configure the preprocessing steps.
# ('optparser' and the 'debug' option group are built earlier in this file.)
debug.add_option("--debug", default=False, action="store_true", dest="debug", help="Set debug mode for all steps")
optparser.add_option_group(debug)
(options, args) = optparser.parse_args()

preprocessor = Preprocessor(options.steps, options.parseName, options.requireEntities)
if options.steps is None:  # PEP 8: compare to None with 'is' / 'is not'
    # No steps requested: just list the available steps and their arguments.
    print >> sys.stderr, preprocessor.getHelpString()
else:
    preprocessor.setArgForAllSteps("debug", options.debug)
    # Corpus conversion options apply only when the CONVERT step is active.
    if preprocessor.hasStep("CONVERT"):
        if options.corpus is not None:
            preprocessor.getStep("CONVERT").setArg("corpusName", options.corpus)
        if options.dataSetNames is not None:
            preprocessor.getStep("CONVERT").setArg("dataSetNames", options.dataSetNames)
    # NOTE(review): unlike the importFormats branch below, this assumes the
    # IMPORT_PARSE step is present whenever --parseDir is given -- confirm.
    if options.parseDir:
        preprocessor.getStep("IMPORT_PARSE").setArg("parseDir", options.parseDir)
    if options.exportFormats and preprocessor.hasStep("EXPORT"):
        preprocessor.getStep("EXPORT").setArg("formats", options.exportFormats.split(","))
    # File-extension filters for the input-reading steps.
    if options.importFormats:
        if preprocessor.hasStep("LOAD"):
            preprocessor.getStep("LOAD").setArg("extensions", options.importFormats.split(","))
        if preprocessor.hasStep("IMPORT_PARSE"):
            preprocessor.getStep("IMPORT_PARSE").setArg("extensions", options.importFormats.split(","))
    preprocessor.process(options.input, options.output, model=None, logPath=options.logPath)