def __init__(self, steps, parseName="McCC", requireEntities=False):
    """
    Initialize the preprocessor tool chain.

    @param steps: passed unchanged to ToolChain.__init__
    @param parseName: stored on the instance as self.parseName
    @param requireEntities: stored on the instance as self.requireEntities
    """
    # Both attributes are set before the base-class initializer runs.
    self.requireEntities, self.parseName = requireEntities, parseName
    ToolChain.__init__(self, steps)
    # Set after the base initializer so its value is not overwritten there.
    # NOTE(review): presumably the name under which preprocessing parameters
    # are stored in a model -- confirm against ToolChain.
    self.modelParameterStringName = "preprocessorParams"
def __init__(self):
    """
    Initialize the tool chain and register the default preprocessing steps.

    The steps are registered with self.addStep in the order listed below.
    """
    ToolChain.__init__(self)
    # Each entry holds the positional arguments for one addStep call:
    # (step name, callable, default arguments[, file name]).
    # NOTE(review): the optional file name is presumably the intermediate
    # output file of the step -- confirm against ToolChain.addStep.
    defaultSteps = (
        ("CONVERT", self.convert,
         {"dataSetNames": None, "corpusName": None},
         "documents.xml"),
        ("SPLIT-SENTENCES", Tools.GeniaSentenceSplitter.makeSentences,
         {"debug": False, "postProcess": True},
         "sentences.xml"),
        ("NER", Tools.BANNER.run,
         {"elementName": "entity", "processElement": "sentence", "debug": False, "splitNewlines": True},
         "ner.xml"),
        ("PARSE", Tools.CharniakJohnsonParser.parse,
         {"parseName": "McCC", "requireEntities": False, "debug": False},
         "parse.xml"),
        ("CONVERT-PARSE", Tools.StanfordParser.convertXML,
         {"parser": "McCC", "debug": False},
         "converted-parse.xml"),
        ("SPLIT-NAMES", ProteinNameSplitter.mainFunc,
         {"parseName": "McCC"},
         "split-names.xml"),
        ("FIND-HEADS", FindHeads.findHeads,
         {"parse": "McCC", "removeExisting": True},
         "heads.xml"),
        # The last step is registered without a file name.
        ("DIVIDE-SETS", self.divideSets,
         {"outputStem": None, "saveCombined": True}),
    )
    for stepSpec in defaultSteps:
        self.addStep(*stepSpec)
def process(self, source, output, parameters=None, model=None, sourceDataSetNames=None, fromStep=None, toStep=None, omitSteps=None):
    """
    Run the tool chain, refusing any request to omit the 'CONVERT' step.

    @param source: passed to ToolChain.process
    @param output: passed to ToolChain.process
    @param parameters: passed to ToolChain.process
    @param model: passed to ToolChain.process
    @param sourceDataSetNames: accepted for interface compatibility; not used by this method
    @param fromStep: passed to ToolChain.process
    @param toStep: passed to ToolChain.process
    @param omitSteps: None, a step name, a comma-separated string of step
                      names, or a collection of step names
    @return: the value returned by ToolChain.process
    @raise Exception: if 'CONVERT' is among the omitted steps
    """
    if omitSteps is not None:
        # types.StringTypes exists only on Python 2; fall back to str elsewhere.
        stringTypes = getattr(types, "StringTypes", (str,))
        if isinstance(omitSteps, stringTypes):
            # Compare whole step names. A plain substring test would also
            # reject other steps whose name merely contains "CONVERT",
            # such as "CONVERT-PARSE".
            omittedNames = [name.strip() for name in omitSteps.split(",")]
        else:
            omittedNames = omitSteps
        if "CONVERT" in omittedNames:
            raise Exception("Preprocessor step 'CONVERT' may not be omitted")
    # Run the tool chain
    return ToolChain.process(self, source, output, parameters, model, fromStep, toStep, omitSteps)
def process(self, source, output, parameters=None, model=None, sourceDataSetNames=None, fromStep=None, toStep=None, omitSteps=None):
    """
    Run the tool chain by delegating to ToolChain.process.

    All arguments except sourceDataSetNames are forwarded positionally;
    sourceDataSetNames is accepted but not used by this method.

    @return: the value returned by ToolChain.process
    """
    return ToolChain.process(self, source, output, parameters, model, fromStep, toStep, omitSteps)
def save(self, input, output=None):
    """
    Save the processed data, interpreting a '*' wildcard in the output path.

    - No output, or an output without '*': delegated to ToolChain.save.
    - Output ending in '*.xml': self.divideSets is called with the text
      before the '*' (trailing '-' characters removed) as the stem.
    - Any other output containing '*': the text before the '*' is used as
      an export directory (created if missing) and the text after it,
      without surrounding dots, as the single export format.

    @param input: the data to save, forwarded to the selected method
    @param output: output path, optionally containing one '*'
    @return: the result of ToolChain.save or self.divideSets; None for the
             export case, because the result of self.export is not returned
    """
    # The default output=None used to fail on the wildcard test below
    # ("*" in None raises TypeError), so treat it like a plain path.
    if output is None or "*" not in output:
        return ToolChain.save(self, input, output)
    if output.endswith("*.xml"):
        return self.divideSets(input, output.split("*")[0].rstrip("-"))
    exportPath, extension = output.split("*")
    extension = extension.strip(".")
    if not os.path.exists(exportPath):
        os.makedirs(exportPath)
    self.export(input, exportPath, [extension])
def process(self, source, output=None, model=None, fromStep=None, toStep=None, omitSteps=None, logPath=None):
    """
    Run the preprocessing steps, optionally writing a log file.

    @param source: passed to ToolChain.process
    @param output: passed to ToolChain.process; also used to derive the log
                   file name when logPath is "AUTO"
    @param model: passed to ToolChain.process
    @param fromStep: passed to ToolChain.process
    @param toStep: passed to ToolChain.process
    @param omitSteps: passed to ToolChain.process
    @param logPath: path of the log file. "AUTO" derives it from output
                    (text before any '*', with trailing '-' and path
                    separators removed, plus "-log.txt"), or disables
                    logging if output is None. None or the string "None"
                    disables logging.
    @return: the value returned by ToolChain.process
    @raise Exception: if no steps are defined
    """
    if logPath == "AUTO":
        if output is not None:
            logPath = output
            if "*" in logPath:
                logPath = logPath.split("*")[0].rstrip("-")
            logPath = logPath.rstrip("/").rstrip("\\") + "-log.txt"
        else:
            logPath = None
    elif logPath == "None":
        logPath = None
    if logPath is not None:
        logDir = os.path.dirname(logPath)
        # A bare file name has an empty directory part; os.makedirs("")
        # raises, so only create the directory when there is one.
        if logDir != "" and not os.path.exists(logDir):
            os.makedirs(logDir)
        Stream.openLog(logPath)
    print >> sys.stderr, "Preprocessor steps:", [x.name for x in self.steps]
    if len(self.steps) == 0:
        raise Exception("No preprocessing steps defined")
    # Run the tool chain
    xml = ToolChain.process(self, source, output, model, fromStep, toStep, omitSteps)
    # NOTE(review): the log is closed only on success; if a step raises it
    # stays open. Left unchanged -- confirm whether that is intended (e.g. so
    # that the traceback still reaches the log).
    if logPath is not None:
        Stream.closeLog(logPath)
    return xml
def __init__(self):
    """
    Set up the base tool chain and add the default preprocessing steps.

    Steps are added in this order: CONVERT, SPLIT-SENTENCES, NER, PARSE,
    CONVERT-PARSE, SPLIT-NAMES, FIND-HEADS, DIVIDE-SETS.
    """
    ToolChain.__init__(self)

    # Steps
    convertArgs = {"dataSetNames": None, "corpusName": None}
    self.addStep("CONVERT", self.convert, convertArgs, "documents.xml")

    splitterArgs = {"debug": False, "postProcess": True}
    self.addStep("SPLIT-SENTENCES", Tools.GeniaSentenceSplitter.makeSentences, splitterArgs, "sentences.xml")

    nerArgs = {
        "elementName": "entity",
        "processElement": "sentence",
        "debug": False,
        "splitNewlines": True,
    }
    self.addStep("NER", Tools.BANNER.run, nerArgs, "ner.xml")

    parseArgs = {"parseName": "McCC", "requireEntities": False, "debug": False}
    self.addStep("PARSE", Tools.CharniakJohnsonParser.parse, parseArgs, "parse.xml")

    convertParseArgs = {"parser": "McCC", "debug": False}
    self.addStep("CONVERT-PARSE", Tools.StanfordParser.convertXML, convertParseArgs, "converted-parse.xml")

    splitNamesArgs = {"parseName": "McCC"}
    self.addStep("SPLIT-NAMES", ProteinNameSplitter.mainFunc, splitNamesArgs, "split-names.xml")

    findHeadsArgs = {"parse": "McCC", "removeExisting": True}
    self.addStep("FIND-HEADS", FindHeads.findHeads, findHeadsArgs, "heads.xml")

    # The final step is added without a file name argument.
    divideSetsArgs = {"outputStem": None, "saveCombined": True}
    self.addStep("DIVIDE-SETS", self.divideSets, divideSetsArgs)
def process(self, source, output, parameters=None, model=None, sourceDataSetNames=None, fromStep=None, toStep=None, omitSteps=None):
    """
    Run the tool chain after checking that 'CONVERT' is not being omitted.

    @param source: forwarded to ToolChain.process
    @param output: forwarded to ToolChain.process
    @param parameters: forwarded to ToolChain.process
    @param model: forwarded to ToolChain.process
    @param sourceDataSetNames: kept in the signature for compatibility; this method does not use it
    @param fromStep: forwarded to ToolChain.process
    @param toStep: forwarded to ToolChain.process
    @param omitSteps: None, a single step name, a comma-separated string of
                      step names, or a collection of step names
    @return: whatever ToolChain.process returns
    @raise Exception: if 'CONVERT' is one of the steps to omit
    """
    omitted = omitSteps
    # types.StringTypes is Python 2 only; use str where it is unavailable.
    if isinstance(omitSteps, getattr(types, "StringTypes", (str,))):
        # Match complete step names only: a substring test on the raw
        # string would wrongly reject e.g. "CONVERT-PARSE".
        omitted = [stepName.strip() for stepName in omitSteps.split(",")]
    if omitted is not None and "CONVERT" in omitted:
        raise Exception("Preprocessor step 'CONVERT' may not be omitted")
    # Run the tool chain
    xml = ToolChain.process(self, source, output, parameters, model, fromStep, toStep, omitSteps)
    return xml
def process(self, source, output=None, model=None, fromStep=None, toStep=None, omitSteps=None, logPath=None):
    """
    Execute the defined preprocessing steps, with optional logging to a file.

    @param source: forwarded to ToolChain.process
    @param output: forwarded to ToolChain.process; when logPath is "AUTO"
                   the log file name is built from it
    @param model: forwarded to ToolChain.process
    @param fromStep: forwarded to ToolChain.process
    @param toStep: forwarded to ToolChain.process
    @param omitSteps: forwarded to ToolChain.process
    @param logPath: log file path. With "AUTO" the path is the output path
                    (cut at the first '*', trailing '-', '/' and '\\'
                    removed) followed by "-log.txt"; if output is None no
                    log is written. None and the string "None" both mean
                    no log.
    @return: whatever ToolChain.process returns
    @raise Exception: if self.steps is empty
    """
    # Resolve the special logPath values to a real path or to None.
    if logPath == "None":
        logPath = None
    elif logPath == "AUTO":
        if output is None:
            logPath = None
        else:
            logStem = output
            if "*" in logStem:
                logStem = logStem.split("*")[0].rstrip("-")
            logPath = logStem.rstrip("/").rstrip("\\") + "-log.txt"

    useLog = logPath is not None
    if useLog:
        parentDir = os.path.dirname(logPath)
        # For a log file in the current directory the directory part is
        # empty, and os.makedirs("") would raise; nothing to create then.
        if parentDir and not os.path.exists(parentDir):
            os.makedirs(parentDir)
        Stream.openLog(logPath)

    print >> sys.stderr, "Preprocessor steps:", [x.name for x in self.steps]
    if len(self.steps) == 0:
        raise Exception("No preprocessing steps defined")

    # Run the tool chain
    xml = ToolChain.process(self, source, output, model, fromStep, toStep, omitSteps)

    # NOTE(review): reached only when processing succeeds, so a failing step
    # leaves the log open. Behavior kept as is -- confirm whether intended.
    if useLog:
        Stream.closeLog(logPath)
    return xml
def __init__(self):
    """
    Initialize the base ToolChain, then set modelParameterStringName.
    """
    ToolChain.__init__(self)
    # Assigned after the base initializer so this value is the one that sticks.
    # NOTE(review): presumably the name under which preprocessing parameters
    # are stored in a model -- confirm against ToolChain.
    self.modelParameterStringName = "preprocessorParams"