def main(config_file=None, rethrowExceptions=False):
    """From the configuration, set up the Augustus engine and run it.

    Set up segments, PMML tree, I/O, and logging for the
    ProducerConsumer.  Identify what task is to be done:
    Producing, Consuming, or Automatic Incremental Model
    updates (AIM).  Identify the model type and start that
    model, passing the segments, PMML tree, and I/O information.

    Arguments:

        config_file (string or config.AugustusConfiguration):
            Path to the configuration file, or an already-loaded
            configuration object (used as-is).

        rethrowExceptions (bool):
            If true, any exception raised inside the event loop is
            re-raised to the caller.  Otherwise it is logged, the
            last model is written (when an exception filename is
            known), and the process exits through sys.exit.
    """
    # Get the configuration settings (as an XML instance)
    if isinstance(config_file, config.AugustusConfiguration):
        configRoot = config_file
    else:
        configRoot = xmlbase.loadfile(config_file, config.Config, lineNumbers=True)

    setupLogging(configRoot.matches(lambda x: x.tag in ("Logging", "Metadata")))
    logger = logging.getLogger()
    metadata = logging.getLogger('metadata')

    logger.info("Loading PMML model.")
    child = configRoot.child(config.ModelInput, exception=False)
    metadata.startTiming("Time to load PMML model")
    pmmlModel, pmmlFileName = getModel(child)
    metadata.stopTiming("Time to load PMML model")
    metadata.data["PMML model file"] = pmmlFileName

    logger.info("Setting up data input.")
    child = configRoot.child(config.DataInput, exception=False)
    dataStreamer = getDataStreamer(child)

    child = configRoot.child(config.ConsumerBlending, exception=False)
    consumerUpdateScheme = getUpdateScheme(child)

    child = configRoot.child(config.ModelSetup, exception=False)
    # Default Model setup parameters
    modelWriter = getModelWriter(None)
    engineSettings = dict(maturityThreshold=0)
    producerParameters = {}
    filenameOnException = None
    producerUpdateScheme = getUpdateScheme(None)
    segmentationScheme = SegmentationScheme(None, pmmlModel)
    aggregateUpdateFlag = updateFlag = False

    # Model setup
    if child:
        logger.info("Setting up model updating/producing.")
        modelWriter = getModelWriter(child)
        segmentationScheme = SegmentationScheme(
            child.child(config.SegmentationSchema, exception=False), pmmlModel)
        if modelWriter is not None:
            mode = child.attrib.get("mode", None)
            updateEvery = child.attrib.get("updateEvery", "event")
            engineSettings['lockAllSegments'] = mode == "lockExisting"
            producerParameters['resume'] = mode == "updateExisting"
            updateFlag = updateEvery in ("event", "both")
            aggregateUpdateFlag = updateEvery in ("aggregate", "both")
            filenameOnException = "".join(
                [modelWriter.baseName, _modelExceptionIdentifier, ".pmml"])

            # from here on, child is the <ProducerBlending> element (or None)
            child = child.child(config.ProducerBlending, exception=False)
            producerUpdateScheme = getUpdateScheme(child)
            if child and child.exists(config.MaturityThreshold):
                maturityConfig = child.child(config.MaturityThreshold)
                engineSettings['maturityThreshold'] = int(
                    maturityConfig.attrib.get("threshold", 1))
                if "lockingThreshold" in maturityConfig.attrib:
                    engineSettings['lockingThreshold'] = int(
                        maturityConfig["lockingThreshold"])
                else:
                    engineSettings['lockingThreshold'] = None

            # NOTE(review): the attribute is spelled `whiteList` here; confirm
            # it matches the name SegmentationScheme actually defines.
            if (engineSettings['lockAllSegments']
                    and not segmentationScheme._generic
                    and not segmentationScheme.whiteList):
                logger.warning("The model is locked and no new segments are specified...new model files will be unchanged.")
        else:
            logger.warning("There is no outputFile attribute in the ModelSetup; no new model file will be created.")

    # Set up output
    child = configRoot.child(config.Output, exception=False)
    outputWriter = getOutputWriter(child, pmmlFileName)
    child = configRoot.child(config.EventSettings, exception=False)
    if child is not None:
        logger.info("Setting up output.")
        # not in a dictionary to reduce the number of lookups while looping
        scoreFlag = child.attrib['score']
        outputFlag = child.attrib['output']
    else:
        scoreFlag = outputFlag = False
    child = configRoot.child(config.AggregationSettings, exception=False)
    if child is not None:
        aggregateScoreFlag = child.attrib['score']
        aggregateOutputFlag = child.attrib['output']
        aggregationSettings = child.attrib
    else:
        # keep the aggregate flags bound even without <AggregationSettings>
        aggregateScoreFlag = False
        aggregateOutputFlag = False
        aggregationSettings = None

    metadata.data['Update model'] = "true" if updateFlag or aggregateUpdateFlag else "false"

    # build engine once without a data stream
    engine = Engine(pmmlModel, None, producerUpdateScheme, consumerUpdateScheme,
                    segmentationScheme, **engineSettings)
    engine.initialize(producerParameters=producerParameters)
    if outputWriter:
        outputWriter.open()

    # score fake data from <ModelVerifications>
    modelVerificationConfig = configRoot.child(config.ModelVerification, exception=False)
    if modelVerificationConfig is not None:
        augustus.engine.verification.verify(modelVerificationConfig, engine, logger, outputWriter)

    # start of real data
    logger.info("Setting up Augustus's main engine.")
    engine.resetDataStream(dataStreamer)
    dataStreamer.start_streaming()

    metadata.data['Events'] = 0
    logger.info("Calculating.")
    metadata.startTiming("Run time")

    try:
        while True:
            try:
                score = engine.event(score=scoreFlag, update=updateFlag)
                metadata.data['Events'] += 1
                if outputWriter and outputFlag:
                    try:
                        outputWriter.write(score)
                    except IOError:
                        ## FIXME: this exception should be raised to the top level; I do not
                        ## understand why it is handled here, nor why a 'good' model is written...--tanya
                        if modelWriter:
                            modelWriter.write(pmmlModel)
                        break
                if modelWriter:
                    modelWriter.serialize(pmmlModel, metadata.data['Events'])

                if aggregationSettings:
                    if engine.checkPseudoeventReadiness(aggregationSettings):
                        score = engine.pseudoevent(score=aggregateScoreFlag, update=aggregateUpdateFlag)
                        # was `outputOnAggregate`, a name that is never defined
                        if outputWriter and aggregateOutputFlag:
                            outputWriter.write(score)

            except StopIteration:
                # end of the data stream: write the final model and leave the loop
                if modelWriter:
                    if modelWriter.serialization:
                        modelWriter.serialize(pmmlModel, metadata.data['Events'])
                    else:
                        modelWriter.write(pmmlModel)
                break

        if aggregationSettings is not None and aggregationSettings['atEnd']:
            score = engine.pseudoevent(score=aggregateScoreFlag, update=aggregateUpdateFlag)
            if outputWriter and aggregateOutputFlag:
                outputWriter.write(score)

    except (Exception, KeyboardInterrupt):
        if rethrowExceptions:
            raise

        logger.error("Shutting down on exception...")
        excinfo = sys.exc_info()
        logger.error("...%s" % excinfo[0])
        logger.error("...%s" % excinfo[1])
        logger.error("...%s" % traceback.format_exc())
        if filenameOnException:
            logger.error("Writing last model in location %s" % filenameOnException)
            pmmlModel.write(filenameOnException)
        sys.exit("Shutting down on exception; for more information check the logfile (if logging is enabled)...\n%s" % traceback.format_exc())
def __init__(self, config_file=None, rethrowExceptions=False, dataStream=None):
    """Configure logging, the PMML model, I/O and the engine from a configuration.

    Loads the configuration, optionally seeds the random number
    generators, loads the PMML model, builds the data streamer, model
    writer, output writer and Engine, runs model verification if
    configured, optionally initializes custom processing, and stores
    the resulting objects as attributes on self.

    Arguments:

        config_file (string or config.AugustusConfiguration):
            Path to the configuration file, or an already-loaded
            configuration object (used as-is).

        rethrowExceptions (bool):
            Stored on self.rethrowExceptions; not otherwise used here.

        dataStream:
            If not None, used as the data streamer instead of building
            one from the <DataInput> configuration.
    """
    # Get the configuration settings (as an XML instance)
    if isinstance(config_file, config.AugustusConfiguration):
        configRoot = config_file
    else:
        configRoot = xmlbase.loadfile(config_file, config.Config, lineNumbers=True)

    if "randomSeed" in configRoot.attrib:
        augustusRandomSeed = configRoot.attrib["randomSeed"]
        random.seed(augustusRandomSeed)
        numpy.random.seed(augustusRandomSeed + 1)
    else:
        augustusRandomSeed = "unspecified"

    setupLogging(configRoot.matches(lambda x: x.tag in ("Logging", "Metadata")))
    logger = logging.getLogger()
    metadata = logging.getLogger("metadata")

    for log in logger, metadata:
        if "initialization" in log.differentLevel:
            log.setLevel(log.differentLevel["initialization"])
        else:
            log.setLevel(log.naturalLevel)

    logger.info("Loading PMML model.")
    modelInput = configRoot.child(config.ModelInput, exception=False)
    metadata.startTiming("Time to load PMML model")
    pmmlModel, pmmlFileName = getModel(modelInput)
    metadata.stopTiming("Time to load PMML model")
    metadata.data["PMML model file"] = pmmlFileName

    logger.info("Setting up data input.")
    child = configRoot.child(config.DataInput, exception=False)
    if dataStream is None:
        # <DataInput> is looked up with exception=False, so it may be None;
        # only probe for <FromHTTP> when the element actually exists.
        if child is not None:
            fromHTTP = child.child(config.FromHTTP, exception=False)
        else:
            fromHTTP = None
        if fromHTTP is None:
            dataStreamer = getDataStreamer(child)
        else:
            dataStreamer = AugustusHTTPDataStream(fromHTTP)
    else:
        dataStreamer = dataStream

    child = configRoot.child(config.ConsumerBlending, exception=False)
    consumerUpdateScheme = getUpdateScheme(child)

    child = configRoot.child(config.ModelSetup, exception=False)
    # Default Model setup parameters
    modelWriter = getModelWriter(None)
    engineSettings = {
        "maturityThreshold": modelInput.attrib.get("maturityThreshold", 0),
        "augustusRandomSeed": augustusRandomSeed,
        "hasProducer": True,
    }
    filenameOnException = None
    producerUpdateScheme = getUpdateScheme(None)
    segmentationScheme = SegmentationScheme(None, pmmlModel)
    aggregateUpdateFlag = updateFlag = False

    # NOTE(review): this is the same object as config.producerAlgorithmDefaults,
    # so the item assignments below modify that shared mapping -- confirm
    # that this is intended.
    producerAlgorithm = config.producerAlgorithmDefaults
    for pa in producerAlgorithm.values():
        if pa.validate() is not None:
            raise Exception("Programmer error in producerAlgorithmDefaults")
    if child is not None:
        for pa in child.matches(config.ProducerAlgorithm):
            producerAlgorithm[pa.attrib["model"]] = pa

    # Model setup
    if child:
        logger.info("Setting up model updating/producing.")
        modelWriter = getModelWriter(child)
        segmentationScheme = SegmentationScheme(
            child.child(config.SegmentationSchema, exception=False), pmmlModel)
        if modelWriter is not None:
            mode = child.attrib.get("mode", None)
            updateEvery = child.attrib.get("updateEvery", "event")
            engineSettings["lockAllSegments"] = mode == "lockExisting"
            if mode == "updateExisting":
                for pa in producerAlgorithm.values():
                    pa.parameters["updateExisting"] = True
            if mode == "replaceExisting":
                for pa in producerAlgorithm.values():
                    pa.parameters["updateExisting"] = False

            updateFlag = updateEvery in ("event", "both")
            aggregateUpdateFlag = updateEvery in ("aggregate", "both")
            filenameOnException = "".join(
                [modelWriter.baseName, _modelExceptionIdentifier, ".pmml"])

            # from here on, child is the <ProducerBlending> element (or None)
            child = child.child(config.ProducerBlending, exception=False)
            producerUpdateScheme = getUpdateScheme(child)
            if child and child.exists(config.MaturityThreshold):
                maturityConfig = child.child(config.MaturityThreshold)
                engineSettings["maturityThreshold"] = int(
                    maturityConfig.attrib.get("threshold", 1))
                if "lockingThreshold" in maturityConfig.attrib:
                    engineSettings["lockingThreshold"] = int(
                        maturityConfig["lockingThreshold"])
                else:
                    engineSettings["lockingThreshold"] = None

            if (engineSettings["lockAllSegments"]
                    and segmentationScheme is not None
                    and not segmentationScheme._generic
                    and not segmentationScheme._whiteList):
                logger.warning("The model is locked and no new segments are specified...new model files will be unchanged.")
        else:
            logger.warning("There is no outputFile attribute in the ModelSetup; no new model file will be created.")
    else:
        engineSettings["hasProducer"] = False

    # Set up output
    child = configRoot.child(config.Output, exception=False)
    outputWriter = getOutputWriter(child, pmmlFileName)
    child = configRoot.child(config.EventSettings, exception=False)
    if child is not None:
        logger.info("Setting up output.")
        # not in a dictionary to reduce the number of lookups while looping
        scoreFlag = child.attrib["score"]
        outputFlag = child.attrib["output"]
    else:
        scoreFlag = outputFlag = False
    child = configRoot.child(config.AggregationSettings, exception=False)
    if child is not None:
        aggregateScoreFlag = child.attrib["score"]
        aggregateOutputFlag = child.attrib["output"]
        aggregationSettings = child.attrib
    else:
        aggregateScoreFlag = False
        aggregateOutputFlag = False
        aggregationSettings = None

    metadata.data["Update model"] = "true" if updateFlag or aggregateUpdateFlag else "false"

    # build engine once without a data stream
    engine = Engine(pmmlModel, None, producerUpdateScheme, consumerUpdateScheme,
                    segmentationScheme, producerAlgorithm, **engineSettings)
    engine.initialize()
    if outputWriter:
        outputWriter.open()

    for log in logger, metadata:
        if "verification" in log.differentLevel:
            log.eventLogLevel = log.differentLevel["verification"]
            log.setLevel(log.differentLevel["verification"])
        else:
            log.eventLogLevel = log.naturalLevel
            log.setLevel(log.naturalLevel)

    # score fake data from <ModelVerifications>
    modelVerificationConfig = configRoot.child(config.ModelVerification, exception=False)
    if modelVerificationConfig is not None:
        augustus.engine.verification.verify(modelVerificationConfig, engine, logger, outputWriter)

    # verification can increment aggregate variables, but
    # aggregates should all start at zero at the start of real
    # processing, whether verification happened or not
    engine.flushAggregates()

    if isinstance(dataStreamer, AugustusHTTPDataStream):
        if outputWriter is None:
            dataStreamer.respond = False
        if dataStreamer.respond:
            dataStreamer.setupOutput(outputWriter)

    for log in logger, metadata:
        if "eventloop" in log.differentLevel:
            log.eventLogLevel = log.differentLevel["eventloop"]
            log.setLevel(log.differentLevel["eventloop"])
        else:
            log.eventLogLevel = log.naturalLevel
            log.setLevel(log.naturalLevel)

    # possibly set up custom processing
    customProcessing = configRoot.child(config.CustomProcessing, exception=False)
    if customProcessing is not None:
        constants = engine.pmmlModel.child(pmml.Extension, exception=False)
        if constants is None:
            constants = NameSpaceReadOnly()
        else:
            constants = constants.child(pmml.X_ODG_CustomProcessingConstants, exception=False)
            if constants is None:
                constants = NameSpaceReadOnly()
            else:
                constants = constants.nameSpace

        atoms = {"INVALID": INVALID, "MISSING": MISSING, "IMMATURE": IMMATURE,
                 "MATURE": MATURE, "LOCKED": LOCKED, "UNINITIALIZED": UNINITIALIZED}
        # list() keeps the concatenation valid when values() returns a view
        candidates = (list(pmml.OutputField.__dict__.values())
                      + list(pmml.X_ODG_OutputField.__dict__.values()))
        for thing in candidates:
            if isinstance(thing, Atom):
                atoms[repr(thing)] = thing

        customProcessing.initialize(pmmlModel, engine.pmmlModel, constants,
                                    [s.userFriendly for s in engine.segmentRecords],
                                    atoms, logger, metadata,
                                    consumerUpdateScheme, producerUpdateScheme)
        engine.customProcessing = customProcessing
        engine.reinitialize()

    else:
        # only shut off circular garbage collection if there is no CustomProcessing or AugustusInterface
        gc.disable()

    self.dataStreamer = dataStreamer
    self.logger = logger
    self.engine = engine
    self.metadata = metadata
    self.aggregationSettings = aggregationSettings
    self.rethrowExceptions = rethrowExceptions
    self.scoreFlag = scoreFlag
    self.updateFlag = updateFlag
    self.outputWriter = outputWriter
    self.outputFlag = outputFlag
    self.modelWriter = modelWriter
    self.filenameOnException = filenameOnException
    self.pmmlModel = pmmlModel
    self.aggregateScoreFlag = aggregateScoreFlag
    self.aggregateUpdateFlag = aggregateUpdateFlag
    self.aggregateOutputFlag = aggregateOutputFlag
    self.customProcessing = customProcessing
def main(config_file=None, rethrowExceptions=False):
    """From the configuration, set up the Augustus engine and run it.

    Set up segments, PMML tree, I/O, and logging for the
    ProducerConsumer.  Identify what task is to be done:
    Producing, Consuming, or Automatic Incremental Model
    updates (AIM).  Identify the model type and start that
    model, passing the segments, PMML tree, and I/O information.

    Arguments:

        config_file (string or config.AugustusConfiguration):
            Path to the configuration file, or an already-loaded
            configuration object (used as-is).

        rethrowExceptions (bool):
            If true, any exception raised inside the event loop is
            re-raised to the caller.  Otherwise it is logged, the
            last model is written (when an exception filename is
            known), and the process exits through sys.exit.
    """
    # Get the configuration settings (as an XML instance)
    if isinstance(config_file, config.AugustusConfiguration):
        configRoot = config_file
    else:
        configRoot = xmlbase.loadfile(config_file, config.Config, lineNumbers=True)

    setupLogging(configRoot.matches(lambda x: x.tag in ("Logging", "Metadata")))
    logger = logging.getLogger()
    metadata = logging.getLogger('metadata')

    logger.info("Loading PMML model.")
    child = configRoot.child(config.ModelInput, exception=False)
    metadata.startTiming("Time to load PMML model")
    pmmlModel, pmmlFileName = getModel(child)
    metadata.stopTiming("Time to load PMML model")
    metadata.data["PMML model file"] = pmmlFileName

    logger.info("Setting up data input.")
    child = configRoot.child(config.DataInput, exception=False)
    dataStreamer = getDataStreamer(child)

    child = configRoot.child(config.ConsumerBlending, exception=False)
    consumerUpdateScheme = getUpdateScheme(child)

    child = configRoot.child(config.ModelSetup, exception=False)
    # Default Model setup parameters
    modelWriter = getModelWriter(None)
    engineSettings = dict(maturityThreshold=0)
    producerParameters = {}
    filenameOnException = None
    producerUpdateScheme = getUpdateScheme(None)
    segmentationScheme = SegmentationScheme(None, pmmlModel)
    aggregateUpdateFlag = updateFlag = False

    # Model setup
    if child:
        logger.info("Setting up model updating/producing.")
        modelWriter = getModelWriter(child)
        segmentationScheme = SegmentationScheme(
            child.child(config.SegmentationSchema, exception=False), pmmlModel)
        if modelWriter is not None:
            mode = child.attrib.get("mode", None)
            updateEvery = child.attrib.get("updateEvery", "event")
            engineSettings['lockAllSegments'] = mode == "lockExisting"
            producerParameters['resume'] = mode == "updateExisting"
            updateFlag = updateEvery in ("event", "both")
            aggregateUpdateFlag = updateEvery in ("aggregate", "both")
            filenameOnException = "".join(
                [modelWriter.baseName, _modelExceptionIdentifier, ".pmml"])

            # from here on, child is the <ProducerBlending> element (or None)
            child = child.child(config.ProducerBlending, exception=False)
            producerUpdateScheme = getUpdateScheme(child)
            if child and child.exists(config.MaturityThreshold):
                maturityConfig = child.child(config.MaturityThreshold)
                engineSettings['maturityThreshold'] = int(
                    maturityConfig.attrib.get("threshold", 1))
                if "lockingThreshold" in maturityConfig.attrib:
                    engineSettings['lockingThreshold'] = int(
                        maturityConfig["lockingThreshold"])
                else:
                    engineSettings['lockingThreshold'] = None

            # NOTE(review): the attribute is spelled `whiteList` here; confirm
            # it matches the name SegmentationScheme actually defines.
            if (engineSettings['lockAllSegments']
                    and not segmentationScheme._generic
                    and not segmentationScheme.whiteList):
                logger.warning("The model is locked and no new segments are specified...new model files will be unchanged.")
        else:
            logger.warning("There is no outputFile attribute in the ModelSetup; no new model file will be created.")

    # Set up output
    child = configRoot.child(config.Output, exception=False)
    outputWriter = getOutputWriter(child, pmmlFileName)
    child = configRoot.child(config.EventSettings, exception=False)
    if child is not None:
        logger.info("Setting up output.")
        # not in a dictionary to reduce the number of lookups while looping
        scoreFlag = child.attrib['score']
        outputFlag = child.attrib['output']
    else:
        scoreFlag = outputFlag = False
    child = configRoot.child(config.AggregationSettings, exception=False)
    if child is not None:
        aggregateScoreFlag = child.attrib['score']
        aggregateOutputFlag = child.attrib['output']
        aggregationSettings = child.attrib
    else:
        # keep the aggregate flags bound even without <AggregationSettings>
        aggregateScoreFlag = False
        aggregateOutputFlag = False
        aggregationSettings = None

    metadata.data['Update model'] = "true" if updateFlag or aggregateUpdateFlag else "false"

    # build engine once without a data stream
    engine = Engine(pmmlModel, None, producerUpdateScheme, consumerUpdateScheme,
                    segmentationScheme, **engineSettings)
    engine.initialize(producerParameters=producerParameters)
    if outputWriter:
        outputWriter.open()

    # score fake data from <ModelVerifications>
    modelVerificationConfig = configRoot.child(config.ModelVerification, exception=False)
    if modelVerificationConfig is not None:
        augustus.engine.verification.verify(modelVerificationConfig, engine, logger, outputWriter)

    # start of real data
    logger.info("Setting up Augustus's main engine.")
    engine.resetDataStream(dataStreamer)
    dataStreamer.start_streaming()

    metadata.data['Events'] = 0
    logger.info("Calculating.")
    metadata.startTiming("Run time")

    try:
        while True:
            try:
                score = engine.event(score=scoreFlag, update=updateFlag)
                metadata.data['Events'] += 1
                if outputWriter and outputFlag:
                    try:
                        outputWriter.write(score)
                    except IOError:
                        ## FIXME: this exception should be raised to the top level; I do not
                        ## understand why it is handled here, nor why a 'good' model is written...--tanya
                        if modelWriter:
                            modelWriter.write(pmmlModel)
                        break
                if modelWriter:
                    modelWriter.serialize(pmmlModel, metadata.data['Events'])

                if aggregationSettings:
                    if engine.checkPseudoeventReadiness(aggregationSettings):
                        score = engine.pseudoevent(score=aggregateScoreFlag, update=aggregateUpdateFlag)
                        # was `outputOnAggregate`, a name that is never defined
                        if outputWriter and aggregateOutputFlag:
                            outputWriter.write(score)

            except StopIteration:
                # end of the data stream: write the final model and leave the loop
                if modelWriter:
                    if modelWriter.serialization:
                        modelWriter.serialize(pmmlModel, metadata.data['Events'])
                    else:
                        modelWriter.write(pmmlModel)
                break

        if aggregationSettings is not None and aggregationSettings['atEnd']:
            score = engine.pseudoevent(score=aggregateScoreFlag, update=aggregateUpdateFlag)
            if outputWriter and aggregateOutputFlag:
                outputWriter.write(score)

    except (Exception, KeyboardInterrupt):
        if rethrowExceptions:
            raise

        logger.error("Shutting down on exception...")
        excinfo = sys.exc_info()
        logger.error("...%s" % excinfo[0])
        logger.error("...%s" % excinfo[1])
        logger.error("...%s" % traceback.format_exc())
        if filenameOnException:
            logger.error("Writing last model in location %s" % filenameOnException)
            pmmlModel.write(filenameOnException)
        sys.exit("Shutting down on exception; for more information check the logfile (if logging is enabled)...\n%s" % traceback.format_exc())