def __init__(self, configuration, dataStream=None, connect="", exceptions=True):
    if isinstance(configuration, config.AugustusConfiguration):
        configuration.validate(exception=True)
    else:
        try:
            configuration = xmlbase.loadfile(configuration, config.Config, lineNumbers=True)
        except IOError:
            configuration = xmlbase.load(configuration, config.Config)

    if configuration.exists(config.CustomProcessing):
        raise Exception("The Augustus class defines its own <CustomProcessing>; please leave this out of the configuration file.")

    self.dataStream = dataStream
    if configuration.child(config.DataInput).exists(config.Interactive):
        if dataStream is None:
            raise Exception("If the configuration has a DataInput <Interactive> block, then a DataStream object must be provided.")
    else:
        if dataStream is not None:
            raise Exception("If the configuration has no DataInput <Interactive> block, then a DataStream object must not be provided.")

    persistentStorage = config.PersistentStorage(connect=connect)
    persistentStorage.validate()

    customProcessing = config.CustomProcessing(persistentStorage)
    customProcessing.code = None
    customProcessing.callbackClass = self
    configuration.children.append(customProcessing)

    self.configuration = configuration
    self.mainLoop = augustus.engine.mainloop.MainLoop(self.configuration, rethrowExceptions=exceptions, dataStream=self.dataStream)
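A minimal usage sketch for the constructor above. The constructor accepts a pre-validated AugustusConfiguration object, a file name, or a literal configuration string; the class name and file name used here are placeholders for illustration, not taken from this snippet.

    # hypothetical names: the enclosing class is referred to as AugustusInterface here,
    # and "augustus_config.xcfg" is a placeholder configuration file
    interface = AugustusInterface("augustus_config.xcfg")
    interface.mainLoop.run()   # assumption: MainLoop exposes a run()-style entry point

If the configuration contains a DataInput <Interactive> block, a DataStream object must also be passed as the dataStream argument.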
def post_validate(self):
    if "validate" not in self.attrib:
        self["validate"] = True
    try:
        self.data = xmlbase.loadfile(self["fileName"], pmml.X_ODG_PMML, validation=self["validate"])
    except XMLValidationError, err:
        raise RuntimeError("PMML file %s failed validation: %s" % (self["fileName"], str(err)))
def getModel(configOptions):
    """Return a pmmlElement object: the root of the model.

    Arguments:

        configOptions (XML object, defined in xmlbase):
            The XML element <ModelInput>...</ModelInput> which contains
            the source location for the PMML model.
    """
    #xsd: Assume FromFile/FromFifo, with file name required.
    sourceElement = configOptions.child(lambda x: x.tag.startswith("From"))
    filename = sourceElement["name"]

    if sourceElement.tag.endswith("File"):
        selectmode = sourceElement.attrib.get("selectmode", "lastAlphabetic")

        if filename.startswith("http://") or filename.startswith("https://"):
            pass
        else:
            filelist = glob.glob(filename)
            if len(filelist) > 1:
                filelist = [f for f in filelist if _modelExceptionIdentifier not in f]
            if len(filelist) == 0:
                raise RuntimeError("no files matched the given filename/glob: %s" % filename)

            if selectmode == "mostRecent":
                filename = max(filelist, key=lambda x: os.stat(x).st_mtime)
            else:
                filelist.sort()
                filename = filelist[-1]

            if _modelExceptionIdentifier in filename:
                logging.getLogger().warning("Using a PMML model that was written on exception. File name: %s" % filename)

    try:
        # TODO: make lineNumbers optional (better diagnostics with them, better performance without them)
        model = xmlbase.loadfile(filename, pmml.X_ODG_PMML, lineNumbers=True)
    except:
        logging.getLogger().error("Error loading PMML model from %s." % filename)
        raise

    return model, filename
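The selectmode handling above ("mostRecent" vs. "lastAlphabetic") is self-contained enough to test in isolation. A standalone sketch using only the standard library; the function name and glob pattern are illustrative and not part of the Augustus API:

    import glob, os

    def pickModelFile(pattern, selectmode="lastAlphabetic"):
        # choose one PMML file from a glob pattern, the same way getModel does
        candidates = glob.glob(pattern)
        if not candidates:
            raise RuntimeError("no files matched the given filename/glob: %s" % pattern)
        if selectmode == "mostRecent":
            return max(candidates, key=lambda f: os.stat(f).st_mtime)   # newest by modification time
        candidates.sort()
        return candidates[-1]                                           # alphabetically last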
import glob
from augustus.core.xmlbase import loadfile
import augustus.core.pmml41 as pmml
from cassius import *

modelFiles = glob.glob("_out/modelout*.pmml")
modelFiles.sort()

models = []
for modelFile in modelFiles:
    model = loadfile(modelFile, pmml.X_ODG_PMML)
    models.append(model)
    print model.tree()

plots = []
for model in models:
    eventNumber = model.descendant(pmml.X_ODG_Eventstamp)["number"]
    discretize = model.descendant(pmml.Discretize)

    bins = []
    for discretizeBin in discretize.matches(pmml.DiscretizeBin):
        binName = discretizeBin["binValue"]
        leftMargin = discretizeBin.child(pmml.Interval)["leftMargin"]
        rightMargin = discretizeBin.child(pmml.Interval)["rightMargin"]
        bins.append((binName, leftMargin, rightMargin))
    bins.sort(lambda a, b: cmp(a[1], b[1]))

    h = HistogramNonUniform([(b[1], b[2]) for b in bins], fillcolor="yellow",
def __init__(self, configuration, model=None, dataStream=None, rethrowExceptions=None):
    self.model = model
    self.dataStream = dataStream
    self.rethrowExceptions = rethrowExceptions
    self.fileNameOnException = None

    # get the configuration, in whatever form you find it
    if isinstance(configuration, config.AugustusConfiguration):
        pass
    elif isinstance(configuration, basestring):
        try:
            configuration = xmlbase.loadfile(configuration, config.Config, lineNumbers=True)
        except IOError:
            configuration = xmlbase.load(configuration, config.Config, lineNumbers=True)
    else:
        raise ConfigurationError("Configuration must be a pre-validated XML object, a fileName, or a literal configuration string.")

    # set up logging
    setupLogging(configuration.matches(lambda x: isinstance(x, (config.Logging, config.Metadata))))
    self.logger = logging.getLogger()
    self.metadata = logging.getLogger("metadata")

    # begin "initialization" phase
    for l in self.logger, self.metadata:
        if "initialization" in l.differentLevel:
            l.setLevel(l.differentLevel["initialization"])
        else:
            l.setLevel(l.naturalLevel)

    # get the model, in whatever form you find it
    self.logger.info("Loading PMML model.")
    self.metadata.startTiming("Time to load PMML model")
    modelFileName = "(none)"
    maturityThreshold = 0
    if self.model is None:
        modelInput = configuration.child(config.ModelInput, exception=None)
        if modelInput is None:
            raise ConfigurationError("If a model is not provided to MainLoop explicitly, it must be present in the configuration file.")

        fileLocation = modelInput["fileLocation"]
        if not fileLocation.startswith("http://") and not fileLocation.startswith("https://"):
            fileList = glob.glob(fileLocation)
            if len(fileList) > 1:
                fileList = [f for f in fileList if self._modelExceptionIdentifier not in f]
            if len(fileList) == 0:
                raise IOError("No files matched the ModelInput fileLocation \"%s\"." % fileLocation)

            selectmode = modelInput.attrib.get("selectmode", "lastAlphabetic")
            if selectmode == "mostRecent":
                fileLocation = max(fileList, key=lambda x: os.stat(x).st_mtime)
            elif selectmode == "lastAlphabetic":
                fileList.sort()
                fileLocation = fileList[-1]
            else:
                assert False

            if self._modelExceptionIdentifier in fileLocation:
                self.logger.warning("Using a PMML model that was written on exception (fileName \"%s\")" % fileLocation)

        self.model = xmlbase.loadfile(fileLocation, pmml.X_ODG_PMML, lineNumbers=True)

        if "maturityThreshold" in modelInput.attrib:
            maturityThreshold = modelInput["maturityThreshold"]

    elif isinstance(self.model, pmml.PMML):
        pass
    elif isinstance(self.model, basestring):
        try:
            self.model, modelFileName = xmlbase.loadfile(self.model, pmml.X_ODG_PMML, lineNumbers=True), self.model
        except IOError:
            self.model = xmlbase.load(self.model, pmml.X_ODG_PMML, lineNumbers=True)
    else:
        raise ConfigurationError("Model must be a pre-validated XML object, a fileName, or a literal PMML string.")

    self.metadata.stopTiming("Time to load PMML model")
    self.metadata.data["PMML model file"] = modelFileName

    # globally set random number seeds
    if "randomSeed" in configuration.attrib:
        augustusRandomSeed = configuration["randomSeed"]
        random.seed(augustusRandomSeed)
        numpy.random.seed(augustusRandomSeed + 1)
    else:
        augustusRandomSeed = "unspecified"

    # globally set numpy error handling
    numpy.seterr(divide="raise", over="raise", under="ignore", invalid="raise")

    # update schemes (producerUpdateScheme may be redefined below)
    consumerUpdateScheme = self._getUpdateScheme(configuration.child(config.ConsumerBlending, exception=False))
    producerUpdateScheme = self._getUpdateScheme(None)

    # set up scoring output
    outputConfig = configuration.child(config.Output, exception=False)
    if outputConfig is None:
        self.outputWriter = None
    else:
        outputParams = {"pmmlFileName": modelFileName, "mode": outputConfig.destination.attrib.get("type", "XML").lower()}

        if isinstance(outputConfig.destination, config.ToFile):
            if outputConfig.destination.attrib.get("overwrite", False):
                outputStream = codecs.open(outputConfig.destination["name"], "w", encoding="utf-8")
            else:
                outputStream = codecs.open(outputConfig.destination["name"], "a", encoding="utf-8")
        elif isinstance(outputConfig.destination, config.ToStandardError):
            outputStream = sys.stderr
        elif isinstance(outputConfig.destination, config.ToStandardOut):
            outputStream = sys.stdout
        else:
            assert False

        reportTag = outputConfig.child("ReportTag", exception=False)
        if reportTag:
            outputParams["reportName"] = reportTag.attrib.get("name", "Report")

        eventTag = outputConfig.child("EventTag", exception=False)
        if eventTag:
            outputParams["eventName"] = eventTag.attrib.get("name", "Event")
            outputParams["pseudoEventName"] = eventTag.attrib.get("pseudoName", "pseudoEvent")

        self.outputWriter = OutputWriter(outputStream, **outputParams)

    # initialize for the case of no output model
    engineSettings = {"maturityThreshold": maturityThreshold, "augustusRandomSeed": augustusRandomSeed}
    self.modelWriter = None
    segmentationScheme = SegmentationScheme(None, self.model)
    self.updateFlag = False
    self.aggregateUpdateFlag = False

    producerAlgorithm = dict(config.producerAlgorithmDefaults)
    for pa in producerAlgorithm.values():
        validationResult = pa.validate()
        assert validationResult is None

    # set up output model, if present in the configuration
    modelSetup = configuration.child(config.ModelSetup, exception=False)
    engineSettings["hasProducer"] = modelSetup is not None
    if engineSettings["hasProducer"]:
        self.logger.info("Setting up model updating/producing.")

        producerBlending = modelSetup.child(config.ProducerBlending, exception=False)
        producerUpdateScheme = self._getUpdateScheme(producerBlending)
        if producerBlending is not None and producerBlending.contains(config.MaturityThreshold):
            maturityConfig = producerBlending.child(config.MaturityThreshold)
            engineSettings["maturityThreshold"] = int(maturityConfig.attrib.get("threshold", 1))
            try:
                engineSettings["lockingThreshold"] = int(maturityConfig.attrib["lockingThreshold"])
            except KeyError:
                engineSettings["lockingThreshold"] = None

        engineSettings["lockAllSegments"] = modelSetup.attrib.get("mode", None) == "lockExisting"
        if engineSettings["lockAllSegments"] and segmentationScheme is not None and not segmentationScheme._generic and not segmentationScheme._whiteList:
            self.logger.warning("The model is locked and no new segments are specified...new model files will be unchanged.")

        self.modelWriter = getModelWriter(modelSetup)
        if self.modelWriter is not None:
            if self.modelWriter.baseName is None:
                self.fileNameOnException = self._modelExceptionIdentifier + ".pmml"
            else:
                self.fileNameOnException = "".join([self.modelWriter.baseName, self._modelExceptionIdentifier, ".pmml"])
        else:
            self.logger.warning("There is no outputFile attribute in the ModelSetup; no new model file will be created.")

        segmentationScheme = SegmentationScheme(modelSetup.child(config.SegmentationSchema, exception=False), self.model)
        self.updateFlag = modelSetup.attrib.get("updateEvery", "event") in ("event", "both")
        self.aggregateUpdateFlag = modelSetup.attrib.get("updateEvery", "event") in ("aggregate", "both")

        for pa in modelSetup.matches(config.ProducerAlgorithm):
            producerAlgorithm[pa["model"]] = pa
        if modelSetup.attrib.get("mode", None) == "updateExisting":
            for pa in producerAlgorithm.values():
                pa.parameters["updateExisting"] = True
        if modelSetup.attrib.get("mode", None) == "replaceExisting":
            for pa in producerAlgorithm.values():
                pa.parameters["updateExisting"] = False

    # to score or not to score
    eventSettings = configuration.child(config.EventSettings, exception=False)
    if eventSettings is not None:
        self.logger.info("Setting up output.")
        self.scoreFlag = eventSettings["score"]
        self.outputFlag = eventSettings["output"]
    else:
        self.scoreFlag = False
        self.outputFlag = False

    aggregationConfig = configuration.child(config.AggregationSettings, exception=False)
    if aggregationConfig is not None:
        self.aggregateScoreFlag = aggregationConfig["score"]
        self.aggregateOutputFlag = aggregationConfig["output"]
        self.aggregationSettings = dict(aggregationConfig.attrib)
    else:
        self.aggregateScoreFlag = False
        self.aggregateOutputFlag = False
        self.aggregationSettings = None

    self.metadata.data["Update model"] = "true" if self.updateFlag or self.aggregateUpdateFlag else "false"

    # build a scoring engine once without a dataStream (to evaluate any verification blocks)
    self.engine = Engine(self.model, None, producerUpdateScheme, consumerUpdateScheme, segmentationScheme, producerAlgorithm, **engineSettings)
    self.engine.initialize()
    if self.outputWriter is not None:
        self.outputWriter.open()

    # begin "verification" phase
    for l in self.logger, self.metadata:
        if "verification" in l.differentLevel:
            l.eventLogLevel = l.differentLevel["verification"]
            l.setLevel(l.differentLevel["verification"])
        else:
            l.eventLogLevel = l.naturalLevel
            l.setLevel(l.naturalLevel)

    # evaluate verification blocks
    modelVerificationConfig = configuration.child(config.ModelVerification, exception=False)
    if modelVerificationConfig is not None:
        verify(modelVerificationConfig, self.engine, self.logger, self.outputWriter)

    # verification can increment aggregate variables, but
    # aggregates should all start at zero at the start of real
    # processing, whether verification happened or not
    self.engine.flushAggregates()

    # get the dataStream, in whatever form you find it
    self.logger.info("Setting up data input.")
    if self.dataStream is None:
        configDataInput = configuration.child(config.DataInput, exception=None)
        if configDataInput is None:
            raise ConfigurationError("If a dataStream is not provided to MainLoop explicitly, it must be present in the configuration file.")

        if configDataInput.contains(config.FromFile):
            self.dataStream = DataStreamer(configDataInput.child(config.FromFile), self.engine.pmmlModel)
        elif configDataInput.contains(config.FromStandardIn):
            self.dataStream = DataStreamer(configDataInput.child(config.FromStandardIn), self.engine.pmmlModel)
        elif configDataInput.contains(config.FromHTTP):
            self.dataStream = AugustusHTTPDataStream(configDataInput.child(config.FromHTTP))
            if self.outputWriter is None:
                self.dataStream.respond = False
            if self.dataStream.respond:
                self.dataStream.setupOutput(self.outputWriter)
        else:
            assert False

    # begin "eventLoop" phase
    for l in self.logger, self.metadata:
        if "eventloop" in l.differentLevel:
            l.eventLogLevel = l.differentLevel["eventloop"]
            l.setLevel(l.differentLevel["eventloop"])
        else:
            l.eventLogLevel = l.naturalLevel
            l.setLevel(l.naturalLevel)

    # possibly set up custom processing
    self.customProcessing = configuration.child(config.CustomProcessing, exception=False)
    if self.customProcessing is not None:
        constants = self.engine.pmmlModel.child(pmml.Extension, exception=False)
        if constants is None:
            constants = NameSpaceReadOnly()
        else:
            constants = constants.child(pmml.X_ODG_CustomProcessingConstants, exception=False)
            if constants is None:
                constants = NameSpaceReadOnly()
            else:
                constants = constants.nameSpace

        atoms = {"INVALID": INVALID, "MISSING": MISSING, "IMMATURE": IMMATURE, "MATURE": MATURE, "LOCKED": LOCKED, "UNINITIALIZED": UNINITIALIZED}
        for thing in pmml.OutputField.__dict__.values() + pmml.X_ODG_OutputField.__dict__.values():
            if isinstance(thing, Atom):
                atoms[repr(thing)] = thing

        self.customProcessing.initialize(self.model, self.engine.pmmlModel, constants, [s.userFriendly for s in self.engine.segmentRecords], atoms, self.logger, self.metadata, consumerUpdateScheme, producerUpdateScheme)
        self.engine.customProcessing = self.customProcessing
        self.engine.reinitialize()

    else:
        # only turn off circular garbage collection if there is no CustomProcessing or AugustusInterface
        gc.disable()
def post_validate(self): self.config = xmlbase.loadfile(self["fileName"], augustus.applications.scoresAwk.root, lineNumbers=True)
def post_validate(self): self.config = xmlbase.loadfile(self["fileName"], augustus.core.config.Config, lineNumbers=True)
def post_validate(self):
    self.data = xmlbase.loadfile(self["fileName"], pmml.X_ODG_PMML, validation=False)
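The three post_validate hooks above share one pattern: once the element's attributes have been validated, eagerly load the file named by its "fileName" attribute into a typed XML tree. A hedged sketch of the common shape; TargetSchema is a stand-in for whichever schema class applies, not a real Augustus name:

    def post_validate(self):
        # TargetSchema is hypothetical: substitute the schema class for the referenced file
        self.data = xmlbase.loadfile(self["fileName"], TargetSchema, lineNumbers=True)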
def main(config_file=None, rethrowExceptions=False):
    """From the configuration, set up the Augustus engine.

    Set up segments, PMML tree, I/O, and logging for the ProducerConsumer.
    Identify what task is to be done: Producing, Consuming, or Automatic
    Incremental Model updates (AIM).  Identify the model type and start that
    model, passing the segments, PMML tree, and I/O information.

    Arguments:

        config_file (string): Path to the configuration file.
    """
    # Get the configuration settings (as an XML instance)
    if isinstance(config_file, config.AugustusConfiguration):
        configRoot = config_file
    else:
        configRoot = xmlbase.loadfile(config_file, config.Config, lineNumbers=True)

    setupLogging(configRoot.matches(lambda x: x.tag in ("Logging", "Metadata")))
    logger = logging.getLogger()
    metadata = logging.getLogger('metadata')

    logger.info("Loading PMML model.")
    child = configRoot.child(config.ModelInput, exception=False)
    metadata.startTiming("Time to load PMML model")
    pmmlModel, pmmlFileName = getModel(child)
    metadata.stopTiming("Time to load PMML model")
    metadata.data["PMML model file"] = pmmlFileName

    logger.info("Setting up data input.")
    child = configRoot.child(config.DataInput, exception=False)
    dataStreamer = getDataStreamer(child)

    child = configRoot.child(config.ConsumerBlending, exception=False)
    consumerUpdateScheme = getUpdateScheme(child)

    child = configRoot.child(config.ModelSetup, exception=False)

    # Default Model setup parameters
    modelWriter = getModelWriter(None)
    engineSettings = dict(maturityThreshold=0)
    producerParameters = {}
    filenameOnException = None
    producerUpdateScheme = getUpdateScheme(None)
    segmentationScheme = SegmentationScheme(None, pmmlModel)
    aggregateUpdateFlag = updateFlag = False

    # Model setup
    if child:
        logger.info("Setting up model updating/producing.")
        modelWriter = getModelWriter(child)
        segmentationScheme = SegmentationScheme(child.child(config.SegmentationSchema, exception=False), pmmlModel)

        if modelWriter is not None:
            engineSettings['lockAllSegments'] = child.attrib.get("mode", None) == "lockExisting"
            producerParameters['resume'] = child.attrib.get("mode", None) == "updateExisting"
            updateFlag = child.attrib.get("updateEvery", "event") in ("event", "both")
            aggregateUpdateFlag = child.attrib.get("updateEvery", "event") in ("aggregate", "both")
            filenameOnException = "".join([modelWriter.baseName, _modelExceptionIdentifier, ".pmml"])

            child = child.child(config.ProducerBlending, exception=False)
            producerUpdateScheme = getUpdateScheme(child)
            if child and child.exists(config.MaturityThreshold):
                maturityConfig = child.child(config.MaturityThreshold)
                engineSettings['maturityThreshold'] = int(maturityConfig.attrib.get("threshold", 1))
                engineSettings['lockingThreshold'] = \
                    None if "lockingThreshold" not in maturityConfig.attrib \
                    else int(maturityConfig["lockingThreshold"])

            if engineSettings['lockAllSegments'] and not segmentationScheme._generic and not segmentationScheme.whiteList:
                logger.warning("The model is locked and no new segments are specified...new model files will be unchanged.")
        else:
            logger.warning("There is no outputFile attribute in the ModelSetup; no new model file will be created.")

    # Set up output
    child = configRoot.child(config.Output, exception=False)
    outputWriter = getOutputWriter(child, pmmlFileName)

    child = configRoot.child(config.EventSettings, exception=False)
    if child is not None:
        logger.info("Setting up output.")
        # not in a dictionary to reduce the number of lookups while looping
        scoreFlag = child.attrib['score']
        outputFlag = child.attrib['output']
    else:
        scoreFlag = outputFlag = False

    child = configRoot.child(config.AggregationSettings, exception=False)
    if child is not None:
        aggregateScoreFlag = child.attrib['score']
        aggregateOutputFlag = child.attrib['output']
        aggregationSettings = child.attrib
    else:
        aggregateScoreFlag = aggregateOutputFlag = False
        aggregationSettings = None

    metadata.data['Update model'] = "true" if updateFlag or aggregateUpdateFlag else "false"

    # build engine once without a data stream
    engine = Engine(pmmlModel, None, producerUpdateScheme, consumerUpdateScheme, segmentationScheme, **engineSettings)
    engine.initialize(producerParameters=producerParameters)

    if outputWriter:
        outputWriter.open()

    # score fake data from <ModelVerifications>
    modelVerificationConfig = configRoot.child(config.ModelVerification, exception=False)
    if modelVerificationConfig is not None:
        augustus.engine.verification.verify(modelVerificationConfig, engine, logger, outputWriter)

    # start of real data
    logger.info("Setting up Augustus's main engine.")
    engine.resetDataStream(dataStreamer)
    dataStreamer.start_streaming()
    metadata.data['Events'] = 0
    logger.info("Calculating.")
    metadata.startTiming("Run time")

    try:
        while True:
            try:
                score = engine.event(score=scoreFlag, update=updateFlag)
                metadata.data['Events'] += 1

                if outputWriter and outputFlag:
                    try:
                        outputWriter.write(score)
                    except IOError:
                        ## FIXME: this exception should be raised to the top level; I do not
                        ## understand why it is handled here, nor why a 'good' model is written...--tanya
                        if modelWriter:
                            modelWriter.write(pmmlModel)
                        break

                if modelWriter:
                    modelWriter.serialize(pmmlModel, metadata.data['Events'])

                if aggregationSettings:
                    if engine.checkPseudoeventReadiness(aggregationSettings):
                        score = engine.pseudoevent(score=aggregateScoreFlag, update=aggregateUpdateFlag)
                        if outputWriter and aggregateOutputFlag:
                            outputWriter.write(score)

            except StopIteration:
                if modelWriter:
                    if modelWriter.serialization:
                        modelWriter.serialize(pmmlModel, metadata.data['Events'])
                    else:
                        modelWriter.write(pmmlModel)
                break

        if aggregationSettings is not None and aggregationSettings['atEnd']:
            score = engine.pseudoevent(score=aggregateScoreFlag, update=aggregateUpdateFlag)
            if outputWriter and aggregateOutputFlag:
                outputWriter.write(score)

    except (Exception, KeyboardInterrupt), err:
        if rethrowExceptions:
            raise

        logger.error("Shutting down on exception...")
        excinfo = sys.exc_info()
        logger.error("...%s" % excinfo[0])
        logger.error("...%s" % excinfo[1])
        logger.error("...%s" % traceback.format_exc())

        if filenameOnException:
            logger.error("Writing last model in location %s" % filenameOnException)
            pmmlModel.write(filenameOnException)

        sys.exit("Shutting down on exception; for more information check the logfile (if logging is enabled)...\n%s" % traceback.format_exc())
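A hedged usage sketch for main(): the configuration file name below is hypothetical, and rethrowExceptions=True makes failures propagate to the caller instead of ending in sys.exit.

    main("producer_config.xcfg", rethrowExceptions=True)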
for line in input_file:
    number_of_rows += 1
    if number_of_rows <= 3:
        print "\t", line.strip()
        print "\t", line

checkForQuit()

print "To load an XML file using Augustus' xmlbase library, first include the library"
print "from augustus.core import xmlbase   # type this at the top of a script"
print "\nthen load the file:"
print """
    filename = "../results/example_scores.xml"
    root_element = xmlbase.loadfile(filename)
"""
checkForQuit()

root_element = xmlbase.loadfile(filename)

print "To access a tag of an xml element, use 'element.tag' for example,"
print "if we type:"
print ">>> root_element.tag"
checkForQuit()
print "we get:"
print root_element.tag
checkForQuit()

print "The element's attributes are stored as a Python dictionary."
print "The dictionary is named 'attrib', so for example,"
print ">>> root_element.attrib"
checkForQuit()
print "gets:"
print root_element.attrib
def __init__(self, config_file=None, rethrowExceptions=False, dataStream=None):
    # Get the configuration settings (as an XML instance)
    if isinstance(config_file, config.AugustusConfiguration):
        configRoot = config_file
    else:
        configRoot = xmlbase.loadfile(config_file, config.Config, lineNumbers=True)

    if "randomSeed" in configRoot.attrib:
        augustusRandomSeed = configRoot.attrib["randomSeed"]
        random.seed(augustusRandomSeed)
        numpy.random.seed(augustusRandomSeed + 1)
    else:
        augustusRandomSeed = "unspecified"

    setupLogging(configRoot.matches(lambda x: x.tag in ("Logging", "Metadata")))
    logger = logging.getLogger()
    metadata = logging.getLogger("metadata")

    for l in logger, metadata:
        if "initialization" in l.differentLevel:
            l.setLevel(l.differentLevel["initialization"])
        else:
            l.setLevel(l.naturalLevel)

    logger.info("Loading PMML model.")
    modelInput = configRoot.child(config.ModelInput, exception=False)
    metadata.startTiming("Time to load PMML model")
    pmmlModel, pmmlFileName = getModel(modelInput)
    metadata.stopTiming("Time to load PMML model")
    metadata.data["PMML model file"] = pmmlFileName

    logger.info("Setting up data input.")
    child = configRoot.child(config.DataInput, exception=False)
    if dataStream is None:
        fromHTTP = child.child(config.FromHTTP, exception=False)
        if fromHTTP is None:
            dataStreamer = getDataStreamer(child)
        else:
            dataStreamer = AugustusHTTPDataStream(fromHTTP)
    else:
        dataStreamer = dataStream

    child = configRoot.child(config.ConsumerBlending, exception=False)
    consumerUpdateScheme = getUpdateScheme(child)

    child = configRoot.child(config.ModelSetup, exception=False)

    # Default Model setup parameters
    modelWriter = getModelWriter(None)
    engineSettings = {"maturityThreshold": modelInput.attrib.get("maturityThreshold", 0),
                      "augustusRandomSeed": augustusRandomSeed,
                      "hasProducer": True,
                      }
    filenameOnException = None
    producerUpdateScheme = getUpdateScheme(None)
    segmentationScheme = SegmentationScheme(None, pmmlModel)
    aggregateUpdateFlag = updateFlag = False

    producerAlgorithm = config.producerAlgorithmDefaults
    for pa in producerAlgorithm.values():
        if pa.validate() is not None:
            raise Exception("Programmer error in producerAlgorithmDefaults")
    if child is not None:
        for pa in child.matches(config.ProducerAlgorithm):
            producerAlgorithm[pa.attrib["model"]] = pa

    # Model setup
    if child:
        logger.info("Setting up model updating/producing.")
        modelWriter = getModelWriter(child)
        segmentationScheme = SegmentationScheme(child.child(config.SegmentationSchema, exception=False), pmmlModel)

        if modelWriter is not None:
            engineSettings["lockAllSegments"] = child.attrib.get("mode", None) == "lockExisting"
            if child.attrib.get("mode", None) == "updateExisting":
                for pa in producerAlgorithm.values():
                    pa.parameters["updateExisting"] = True
            if child.attrib.get("mode", None) == "replaceExisting":
                for pa in producerAlgorithm.values():
                    pa.parameters["updateExisting"] = False

            updateFlag = child.attrib.get("updateEvery", "event") in ("event", "both")
            aggregateUpdateFlag = child.attrib.get("updateEvery", "event") in ("aggregate", "both")
            filenameOnException = "".join([modelWriter.baseName, _modelExceptionIdentifier, ".pmml"])

            child = child.child(config.ProducerBlending, exception=False)
            producerUpdateScheme = getUpdateScheme(child)
            if child and child.exists(config.MaturityThreshold):
                maturityConfig = child.child(config.MaturityThreshold)
                engineSettings["maturityThreshold"] = int(maturityConfig.attrib.get("threshold", 1))
                engineSettings["lockingThreshold"] = \
                    None if "lockingThreshold" not in maturityConfig.attrib \
                    else int(maturityConfig["lockingThreshold"])

            if engineSettings["lockAllSegments"] and segmentationScheme is not None and not segmentationScheme._generic and not segmentationScheme._whiteList:
                logger.warning("The model is locked and no new segments are specified...new model files will be unchanged.")
        else:
            logger.warning("There is no outputFile attribute in the ModelSetup; no new model file will be created.")
    else:
        engineSettings["hasProducer"] = False

    # Set up output
    child = configRoot.child(config.Output, exception=False)
    outputWriter = getOutputWriter(child, pmmlFileName)

    child = configRoot.child(config.EventSettings, exception=False)
    if child is not None:
        logger.info("Setting up output.")
        # not in a dictionary to reduce the number of lookups while looping
        scoreFlag = child.attrib["score"]
        outputFlag = child.attrib["output"]
    else:
        scoreFlag = outputFlag = False

    child = configRoot.child(config.AggregationSettings, exception=False)
    if child is not None:
        aggregateScoreFlag = child.attrib["score"]
        aggregateOutputFlag = child.attrib["output"]
        aggregationSettings = child.attrib
    else:
        aggregateScoreFlag = False
        aggregateOutputFlag = False
        aggregationSettings = None

    metadata.data["Update model"] = "true" if updateFlag or aggregateUpdateFlag else "false"

    # build engine once without a data stream
    engine = Engine(pmmlModel, None, producerUpdateScheme, consumerUpdateScheme, segmentationScheme, producerAlgorithm, **engineSettings)
    engine.initialize()

    if outputWriter:
        outputWriter.open()

    for l in logger, metadata:
        if "verification" in l.differentLevel:
            l.eventLogLevel = l.differentLevel["verification"]
            l.setLevel(l.differentLevel["verification"])
        else:
            l.eventLogLevel = l.naturalLevel
            l.setLevel(l.naturalLevel)

    # score fake data from <ModelVerifications>
    modelVerificationConfig = configRoot.child(config.ModelVerification, exception=False)
    if modelVerificationConfig is not None:
        augustus.engine.verification.verify(modelVerificationConfig, engine, logger, outputWriter)

    # verification can increment aggregate variables, but
    # aggregates should all start at zero at the start of real
    # processing, whether verification happened or not
    engine.flushAggregates()

    if isinstance(dataStreamer, AugustusHTTPDataStream):
        if outputWriter is None:
            dataStreamer.respond = False
        if dataStreamer.respond:
            dataStreamer.setupOutput(outputWriter)

    for l in logger, metadata:
        if "eventloop" in l.differentLevel:
            l.eventLogLevel = l.differentLevel["eventloop"]
            l.setLevel(l.differentLevel["eventloop"])
        else:
            l.eventLogLevel = l.naturalLevel
            l.setLevel(l.naturalLevel)

    # possibly set up custom processing
    customProcessing = configRoot.child(config.CustomProcessing, exception=False)
    if customProcessing is not None:
        constants = engine.pmmlModel.child(pmml.Extension, exception=False)
        if constants is None:
            constants = NameSpaceReadOnly()
        else:
            constants = constants.child(pmml.X_ODG_CustomProcessingConstants, exception=False)
            if constants is None:
                constants = NameSpaceReadOnly()
            else:
                constants = constants.nameSpace

        atoms = {"INVALID": INVALID, "MISSING": MISSING, "IMMATURE": IMMATURE, "MATURE": MATURE, "LOCKED": LOCKED, "UNINITIALIZED": UNINITIALIZED}
        for thing in pmml.OutputField.__dict__.values() + pmml.X_ODG_OutputField.__dict__.values():
            if isinstance(thing, Atom):
                atoms[repr(thing)] = thing

        customProcessing.initialize(pmmlModel, engine.pmmlModel, constants, [s.userFriendly for s in engine.segmentRecords], atoms, logger, metadata, consumerUpdateScheme, producerUpdateScheme)
        engine.customProcessing = customProcessing
        engine.reinitialize()
    else:
        # only shut off circular garbage collection if there is no CustomProcessing or AugustusInterface
        gc.disable()

    self.dataStreamer = dataStreamer
    self.logger = logger
    self.engine = engine
    self.metadata = metadata
    self.aggregationSettings = aggregationSettings
    self.rethrowExceptions = rethrowExceptions
    self.scoreFlag = scoreFlag
    self.updateFlag = updateFlag
    self.outputWriter = outputWriter
    self.outputFlag = outputFlag
    self.modelWriter = modelWriter
    self.filenameOnException = filenameOnException
    self.pmmlModel = pmmlModel
    self.aggregateScoreFlag = aggregateScoreFlag
    self.aggregateUpdateFlag = aggregateUpdateFlag
    self.aggregateOutputFlag = aggregateOutputFlag
    self.customProcessing = customProcessing
def pmmlDiff(name1, name2, validate=False, numSigfigs=6, header=False, extensions=False):
    if validate:
        file1 = loadfile(name1, pmml.X_ODG_PMML, lineNumbers=True)
        file2 = loadfile(name2, pmml.X_ODG_PMML, lineNumbers=True)
    else:
        file1 = loadfile(name1, lineNumbers=True)
        file2 = loadfile(name2, lineNumbers=True)

    if not header:
        index = file1.index(lambda x: x.tag == "Header")
        del file1[index]
        index = file2.index(lambda x: x.tag == "Header")
        del file2[index]

    if extensions:
        index1 = [i for i, x in file1.walk()]
        index1.insert(0, None)
        index2 = [i for i, x in file2.walk()]
        index2.insert(0, None)
    else:
        index1 = [i for i, x in file1.walk(lambda x: isinstance(x, pmml.Extension))]
        index1.insert(0, None)
        index2 = [i for i, x in file2.walk(lambda x: isinstance(x, pmml.Extension))]
        index2.insert(0, None)

    if len(index1) < len(index2):
        index1 += [BROKEN] * (len(index2) - len(index1))
    if len(index2) < len(index1):
        index2 += [BROKEN] * (len(index1) - len(index2))

    # show problems in the order that they appear in the files
    for i, (i1, i2) in enumerate(zip(index1, index2)):
        if i1 is None:
            elem1 = file1
        elif i1 is BROKEN:
            elem1 = BROKEN
        else:
            elem1 = file1[i1]

        if i2 is None:
            elem2 = file2
        elif i2 is BROKEN:
            elem2 = BROKEN
        else:
            elem2 = file2[i2]

        # if we have a structure problem
        if i1 != i2:
            return "Different structure:%s" % _comparitor(name1, elem1, name2, elem2)

        else:
            if elem1.tag != elem2.tag:
                return "Different tag: \"%s\" vs. \"%s\"%s" % (elem1.tag, elem2.tag, _comparitor(name1, elem1, name2, elem2))

            if set(elem1.attrib.keys()) != set(elem2.attrib.keys()):
                return "Different attributes: %s vs. %s%s" % (sorted(elem1.attrib.keys()), sorted(elem2.attrib.keys()), _comparitor(name1, elem1, name2, elem2))

            for k in sorted(elem1.attrib.keys()):
                value1 = elem1.attrib[k]
                value2 = elem2.attrib[k]
                if isinstance(value1, float) and isinstance(value2, float):
                    value1 = sigfigs(value1, numSigfigs)
                    value2 = sigfigs(value2, numSigfigs)
                if value1 != value2:
                    return "Different attribute value for \"%s\": %s %s vs. %s %s%s" % (k, value1, str(type(elem1.attrib[k])), value2, str(type(elem2.attrib[k])), _comparitor(name1, elem1, name2, elem2))

            v1 = getattr(elem1, "value", NOTFOUND)
            v2 = getattr(elem2, "value", NOTFOUND)
            if v1 is not NOTFOUND or v2 is not NOTFOUND:
                value1, value2 = v1, v2
                if isinstance(value1, (tuple, list)) and isinstance(value2, (tuple, list)) and len(value1) == len(value2):
                    out1, out2 = [], []
                    for val1, val2 in zip(value1, value2):
                        if isinstance(val1, float) and isinstance(val2, float):
                            val1 = sigfigs(val1, numSigfigs)
                            val2 = sigfigs(val2, numSigfigs)
                        out1.append(val1)
                        out2.append(val2)
                    value1, value2 = out1, out2
                elif isinstance(v1, float) and isinstance(v2, float):
                    value1 = sigfigs(v1, numSigfigs)
                    value2 = sigfigs(v2, numSigfigs)
                if value1 != value2:
                    return "Different value for \"%s\": %s %s vs. %s %s%s" % (k, value1, str(type(value1)), value2, str(type(v2)), _comparitor(name1, elem1, name2, elem2))

            # if both elements are leaves
            if len(elem1.matches()) == 0 and len(elem2.matches()) == 0:
                content1 = elem1.content()
                content2 = elem2.content()
                if content1 != content2:
                    return "Different content: \"%s\" vs. \"%s\"%s" % (content1, content2, _comparitor(name1, elem1, name2, elem2))

    return None
def pmmlDiff(name1, name2, validate=False, numSigfigs=6, header=False, extensions=False):
    if validate:
        file1 = loadfile(name1, pmml.X_ODG_PMML, lineNumbers=True)
        file2 = loadfile(name2, pmml.X_ODG_PMML, lineNumbers=True)
    else:
        file1 = loadfile(name1, lineNumbers=True)
        file2 = loadfile(name2, lineNumbers=True)

    if not header:
        index = file1.index(lambda x: x.tag == "Header", exception=False)
        if index is not None:
            del file1[index]
        index = file2.index(lambda x: x.tag == "Header", exception=False)
        if index is not None:
            del file2[index]

    if not extensions:
        while True:
            index = file1.index(lambda x: x.tag == "Extension", maxdepth=None, exception=False)
            if index is None:
                break
            else:
                del file1[index]
        while True:
            index = file2.index(lambda x: x.tag == "Extension", maxdepth=None, exception=False)
            if index is None:
                break
            else:
                del file2[index]

    index1 = [i for i, x in file1.walk()]
    index1.insert(0, None)
    index2 = [i for i, x in file2.walk()]
    index2.insert(0, None)

    if len(index1) < len(index2):
        index1 += [BROKEN] * (len(index2) - len(index1))
    if len(index2) < len(index1):
        index2 += [BROKEN] * (len(index1) - len(index2))

    # show problems in the order that they appear in the files
    for i, (i1, i2) in enumerate(zip(index1, index2)):
        if i1 is None:
            elem1 = file1
        elif i1 is BROKEN:
            elem1 = BROKEN
        else:
            elem1 = file1[i1]

        if i2 is None:
            elem2 = file2
        elif i2 is BROKEN:
            elem2 = BROKEN
        else:
            elem2 = file2[i2]

        # if we have a structure problem
        if i1 != i2:
            return "Different structure:%s" % _comparitor(name1, elem1, name2, elem2)

        else:
            if elem1.tag != elem2.tag:
                return "Different tag: \"%s\" vs. \"%s\"%s" % (elem1.tag, elem2.tag, _comparitor(name1, elem1, name2, elem2))

            if set(elem1.attrib.keys()) != set(elem2.attrib.keys()):
                return "Different attributes: %s vs. %s%s" % (sorted(elem1.attrib.keys()), sorted(elem2.attrib.keys()), _comparitor(name1, elem1, name2, elem2))

            for k in sorted(elem1.attrib.keys()):
                value1 = elem1.attrib[k]
                value2 = elem2.attrib[k]
                if isinstance(value1, float) and isinstance(value2, float):
                    value1 = sigfigs(value1, numSigfigs)
                    value2 = sigfigs(value2, numSigfigs)
                if value1 != value2:
                    return "Different attribute value for \"%s\": %s %s vs. %s %s%s" % (k, value1, str(type(elem1.attrib[k])), value2, str(type(elem2.attrib[k])), _comparitor(name1, elem1, name2, elem2))

            v1 = getattr(elem1, "value", NOTFOUND)
            v2 = getattr(elem2, "value", NOTFOUND)
            if v1 is not NOTFOUND or v2 is not NOTFOUND:
                value1, value2 = v1, v2
                if isinstance(value1, (tuple, list)) and isinstance(value2, (tuple, list)) and len(value1) == len(value2):
                    out1, out2 = [], []
                    for val1, val2 in zip(value1, value2):
                        if isinstance(val1, float) and isinstance(val2, float):
                            val1 = sigfigs(val1, numSigfigs)
                            val2 = sigfigs(val2, numSigfigs)
                        out1.append(val1)
                        out2.append(val2)
                    value1, value2 = out1, out2
                elif isinstance(v1, float) and isinstance(v2, float):
                    value1 = sigfigs(v1, numSigfigs)
                    value2 = sigfigs(v2, numSigfigs)
                if value1 != value2:
                    return "Different value for \"%s\": %s %s vs. %s %s%s" % (k, value1, str(type(value1)), value2, str(type(v2)), _comparitor(name1, elem1, name2, elem2))

            # if both elements are leaves
            if len(elem1.matches()) == 0 and len(elem2.matches()) == 0:
                content1 = elem1.content()
                content2 = elem2.content()
                if content1 != content2:
                    return "Different content: \"%s\" vs. \"%s\"%s" % (content1, content2, _comparitor(name1, elem1, name2, elem2))

    return None
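Both pmmlDiff variants compare floating-point values only to numSigfigs significant figures through the codebase's sigfigs() helper. A plausible stand-in with the same intent, useful for exercising the diff logic on its own; this is an assumption about what sigfigs() does, not its actual implementation:

    import math

    def sigfigsApprox(value, n):
        # round a float to n significant figures
        if value == 0.0:
            return 0.0
        return round(value, -int(math.floor(math.log10(abs(value)))) + (n - 1))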