Example #1
0
def main(config_file=None, rethrowExceptions=False):
    """From the configuration, set up the Augustus engine and run it.

    Set up segments, PMML tree, I/O, and logging for the
    ProducerConsumer.  Identify what task is to be done:
    Producing, Consuming, or Automatic Incremental Model
    updates (AIM).  Identify the model type and start that
    model, passing the segments, PMML tree, and I/O information.
    Events are then processed until the data stream raises
    StopIteration.

    Arguments:

        config_file (string or config.AugustusConfiguration):
            Path to the configuration file, or an already-loaded
            configuration object, which is used as-is.

        rethrowExceptions (bool):
            If True, an exception raised while processing events is
            re-raised to the caller.  If False (the default), the
            exception is logged, the current model is written to the
            exception file when a model writer is configured, and the
            process exits through sys.exit().
    """
    # Get the configuration settings (as an XML instance)
    if isinstance(config_file, config.AugustusConfiguration):
        configRoot = config_file
    else:
        configRoot = xmlbase.loadfile(config_file, config.Config, lineNumbers=True)

    setupLogging(configRoot.matches(lambda x: x.tag in ("Logging", "Metadata")))
    logger = logging.getLogger()
    metadata = logging.getLogger('metadata')

    logger.info("Loading PMML model.")
    child = configRoot.child(config.ModelInput, exception=False)
    metadata.startTiming("Time to load PMML model")
    pmmlModel, pmmlFileName = getModel(child)
    metadata.stopTiming("Time to load PMML model")
    metadata.data["PMML model file"] = pmmlFileName

    logger.info("Setting up data input.")
    child = configRoot.child(config.DataInput, exception=False)
    dataStreamer = getDataStreamer(child)

    child = configRoot.child(config.ConsumerBlending, exception=False)
    consumerUpdateScheme = getUpdateScheme(child)

    child = configRoot.child(config.ModelSetup, exception=False)
    # Default Model setup parameters, used when there is no ModelSetup
    # element or it does not yield a model writer
    modelWriter = getModelWriter(None)
    engineSettings = dict(maturityThreshold=0)
    producerParameters = {}
    filenameOnException = None
    producerUpdateScheme = getUpdateScheme(None)
    segmentationScheme = SegmentationScheme(None, pmmlModel)
    aggregateUpdateFlag = updateFlag = False

    # Model setup
    if child:
        logger.info("Setting up model updating/producing.")
        modelWriter = getModelWriter(child)
        segmentationScheme = SegmentationScheme(child.child(config.SegmentationSchema, exception=False), pmmlModel)
        if modelWriter is not None:
            engineSettings['lockAllSegments'] = child.attrib.get("mode", None) == "lockExisting"
            producerParameters['resume'] = child.attrib.get("mode", None) == "updateExisting"
            updateFlag = child.attrib.get("updateEvery", "event") in ("event", "both")
            aggregateUpdateFlag = child.attrib.get("updateEvery", "event") in ("aggregate", "both")
            # Where the in-memory model is dumped if the event loop fails
            filenameOnException = "".join([modelWriter.baseName, _modelExceptionIdentifier, ".pmml"])
            child = child.child(config.ProducerBlending, exception=False)
            producerUpdateScheme = getUpdateScheme(child)
            if child and child.exists(config.MaturityThreshold):
                maturityConfig = child.child(config.MaturityThreshold)
                engineSettings['maturityThreshold'] = int(maturityConfig.attrib.get("threshold", 1))
                engineSettings['lockingThreshold'] = \
                    None if "lockingThreshold" not in \
                    maturityConfig.attrib \
                    else int(maturityConfig["lockingThreshold"])
            if engineSettings['lockAllSegments'] and not segmentationScheme._generic and not segmentationScheme.whiteList:
                logger.warning("The model is locked and no new segments are specified...new model files will be unchanged.")
        else:
            logger.warning("There is no outputFile attribute in the ModelSetup; no new model file will be created.")

    # Set up output
    child = configRoot.child(config.Output, exception=False)
    outputWriter = getOutputWriter(child, pmmlFileName)
    child = configRoot.child(config.EventSettings, exception=False)
    if child is not None:
        logger.info("Setting up output.")
        # not in a dictionary to reduce the number of lookups while looping
        scoreFlag = child.attrib['score']
        outputFlag = child.attrib['output']
    else:
        scoreFlag = outputFlag = False
    child = configRoot.child(config.AggregationSettings, exception=False)
    if child is not None:
        aggregateScoreFlag = child.attrib['score']
        aggregateOutputFlag = child.attrib['output']
        aggregationSettings = child.attrib
    else:
        # Keep the aggregate flags bound even when aggregation is not configured
        aggregateScoreFlag = aggregateOutputFlag = False
        aggregationSettings = None

    metadata.data['Update model'] = "true" if updateFlag or aggregateUpdateFlag else "false"

    # build engine once without a data stream
    engine = Engine(pmmlModel, None, producerUpdateScheme, consumerUpdateScheme, segmentationScheme, **engineSettings)
    engine.initialize(producerParameters=producerParameters)
    if outputWriter: outputWriter.open()

    # score fake data from <ModelVerifications>
    modelVerificationConfig = configRoot.child(config.ModelVerification, exception=False)
    if modelVerificationConfig is not None:
        augustus.engine.verification.verify(modelVerificationConfig, engine, logger, outputWriter)

    # start of real data
    logger.info("Setting up Augustus's main engine.")
    engine.resetDataStream(dataStreamer)
    dataStreamer.start_streaming()

    metadata.data['Events'] = 0
    logger.info("Calculating.")
    # NOTE(review): the "Run time" timer is started here but no matching
    # stopTiming call appears in this function -- confirm that is intended.
    metadata.startTiming("Run time")

    try:
        while True:
            try:
                score = engine.event(score=scoreFlag, update=updateFlag)
                metadata.data['Events'] += 1
                if outputWriter and outputFlag:
                    try:
                        outputWriter.write(score)
                    except IOError:
                        ## FIXME: this exception should be raised to the top level; I do not
                        ## understand why it is handled here, nor why a 'good' model is written...--tanya
                        if modelWriter:
                            modelWriter.write(pmmlModel)
                        break
                if modelWriter:
                    modelWriter.serialize(pmmlModel, metadata.data['Events'])

                if aggregationSettings:
                    if engine.checkPseudoeventReadiness(aggregationSettings):
                        score = engine.pseudoevent(score=aggregateScoreFlag, update=aggregateUpdateFlag)
                        # Same flag as the end-of-stream pseudoevent below
                        if outputWriter and aggregateOutputFlag:
                            outputWriter.write(score)

            except StopIteration:
                # End of the data stream: write the final model and stop
                if modelWriter:
                    if modelWriter.serialization:
                        modelWriter.serialize(pmmlModel, metadata.data['Events'])
                    else:
                        modelWriter.write(pmmlModel)
                break

        if aggregationSettings is not None and aggregationSettings['atEnd']:
            score = engine.pseudoevent(score=aggregateScoreFlag, update=aggregateUpdateFlag)
            if outputWriter and aggregateOutputFlag:
                outputWriter.write(score)

    except (Exception, KeyboardInterrupt):
        # KeyboardInterrupt is listed explicitly because it does not
        # derive from Exception.
        if rethrowExceptions: raise

        # Format the traceback once so the log and the exit message agree
        trace = traceback.format_exc()
        logger.error("Shutting down on exception...")
        excinfo = sys.exc_info()
        logger.error("...%s" % excinfo[0])
        logger.error("...%s" % excinfo[1])
        logger.error("...%s" % trace)
        if filenameOnException:
            logger.error("Writing last model in location %s" % filenameOnException)
            pmmlModel.write(filenameOnException)

        sys.exit("Shutting down on exception; for more information check the logfile (if logging is enabled)...\n%s" % trace)
Example #2
0
    def __init__(self, config_file=None, rethrowExceptions=False, dataStream=None):
        """Build a configured engine and keep its parts on the instance.

        Loads the configuration (unless a config.AugustusConfiguration
        is passed in), seeds the random number generators when a
        randomSeed attribute is present, then sets up logging, the PMML
        model, the data stream, the update schemes, the model and output
        writers, optional model verification and optional custom
        processing.  The resulting objects and flags are stored as
        attributes of the instance.

        Arguments:

            config_file (string or config.AugustusConfiguration):
                Path to the configuration file, or an already-loaded
                configuration object, which is used as-is.

            rethrowExceptions (bool):
                Stored on the instance unchanged; not otherwise used
                in this method.

            dataStream:
                Optional data stream used in place of the one described
                by the DataInput element of the configuration.
        """
        # A ready-made configuration object is used directly; anything
        # else is handed to the XML loader.
        if isinstance(config_file, config.AugustusConfiguration):
            configRoot = config_file
        else:
            configRoot = xmlbase.loadfile(config_file, config.Config, lineNumbers=True)

        # Both generators are seeded only when the configuration asks for it
        augustusRandomSeed = "unspecified"
        if "randomSeed" in configRoot.attrib:
            augustusRandomSeed = configRoot.attrib["randomSeed"]
            random.seed(augustusRandomSeed)
            numpy.random.seed(augustusRandomSeed + 1)

        setupLogging(configRoot.matches(lambda element: element.tag in ("Logging", "Metadata")))
        logger = logging.getLogger()
        metadata = logging.getLogger("metadata")

        def applyPhaseLevel(phase, recordEventLevel):
            # Switch both loggers to the level configured for this phase,
            # falling back to each logger's natural level.
            for log in (logger, metadata):
                if phase in log.differentLevel:
                    level = log.differentLevel[phase]
                else:
                    level = log.naturalLevel
                if recordEventLevel:
                    log.eventLogLevel = level
                log.setLevel(level)

        applyPhaseLevel("initialization", False)

        logger.info("Loading PMML model.")
        modelInput = configRoot.child(config.ModelInput, exception=False)
        metadata.startTiming("Time to load PMML model")
        pmmlModel, pmmlFileName = getModel(modelInput)
        metadata.stopTiming("Time to load PMML model")
        metadata.data["PMML model file"] = pmmlFileName

        logger.info("Setting up data input.")
        dataInput = configRoot.child(config.DataInput, exception=False)
        if dataStream is not None:
            # A caller-supplied stream takes precedence over the configuration
            dataStreamer = dataStream
        else:
            fromHTTP = dataInput.child(config.FromHTTP, exception=False)
            if fromHTTP is None:
                dataStreamer = getDataStreamer(dataInput)
            else:
                dataStreamer = AugustusHTTPDataStream(fromHTTP)

        consumerBlending = configRoot.child(config.ConsumerBlending, exception=False)
        consumerUpdateScheme = getUpdateScheme(consumerBlending)

        modelSetup = configRoot.child(config.ModelSetup, exception=False)
        # Defaults that apply when ModelSetup is absent or has no model writer
        modelWriter = getModelWriter(None)
        engineSettings = dict(
            maturityThreshold=modelInput.attrib.get("maturityThreshold", 0),
            augustusRandomSeed=augustusRandomSeed,
            hasProducer=True,
        )
        filenameOnException = None
        producerUpdateScheme = getUpdateScheme(None)
        segmentationScheme = SegmentationScheme(None, pmmlModel)
        updateFlag = False
        aggregateUpdateFlag = False

        # Start from the default producer algorithms, then apply any
        # ProducerAlgorithm elements found under ModelSetup.
        producerAlgorithm = config.producerAlgorithmDefaults
        for algorithm in producerAlgorithm.values():
            if algorithm.validate() is not None:
                raise Exception("Programmer error in producerAlgorithmDefaults")
        if modelSetup is not None:
            for algorithm in modelSetup.matches(config.ProducerAlgorithm):
                producerAlgorithm[algorithm.attrib["model"]] = algorithm

        # Model setup
        if modelSetup:
            logger.info("Setting up model updating/producing.")
            modelWriter = getModelWriter(modelSetup)
            segmentationSchema = modelSetup.child(config.SegmentationSchema, exception=False)
            segmentationScheme = SegmentationScheme(segmentationSchema, pmmlModel)
            if modelWriter is None:
                logger.warning("There is no outputFile attribute in the ModelSetup; no new model file will be created.")
            else:
                mode = modelSetup.attrib.get("mode", None)
                engineSettings["lockAllSegments"] = (mode == "lockExisting")
                if mode in ("updateExisting", "replaceExisting"):
                    updateExisting = (mode == "updateExisting")
                    for algorithm in producerAlgorithm.values():
                        algorithm.parameters["updateExisting"] = updateExisting

                updateEvery = modelSetup.attrib.get("updateEvery", "event")
                updateFlag = updateEvery in ("event", "both")
                aggregateUpdateFlag = updateEvery in ("aggregate", "both")
                filenameOnException = "".join([modelWriter.baseName, _modelExceptionIdentifier, ".pmml"])

                producerBlending = modelSetup.child(config.ProducerBlending, exception=False)
                producerUpdateScheme = getUpdateScheme(producerBlending)
                if producerBlending and producerBlending.exists(config.MaturityThreshold):
                    maturityConfig = producerBlending.child(config.MaturityThreshold)
                    engineSettings["maturityThreshold"] = int(maturityConfig.attrib.get("threshold", 1))
                    if "lockingThreshold" in maturityConfig.attrib:
                        lockingThreshold = int(maturityConfig["lockingThreshold"])
                    else:
                        lockingThreshold = None
                    engineSettings["lockingThreshold"] = lockingThreshold

                if (engineSettings["lockAllSegments"]
                        and segmentationScheme is not None
                        and not (segmentationScheme._generic or segmentationScheme._whiteList)):
                    logger.warning("The model is locked and no new segments are specified...new model files will be unchanged.")
        else:
            engineSettings["hasProducer"] = False

        # Set up output
        outputConfig = configRoot.child(config.Output, exception=False)
        outputWriter = getOutputWriter(outputConfig, pmmlFileName)

        eventSettings = configRoot.child(config.EventSettings, exception=False)
        if eventSettings is None:
            scoreFlag = False
            outputFlag = False
        else:
            logger.info("Setting up output.")
            # kept as plain variables to reduce the number of lookups while looping
            scoreFlag = eventSettings.attrib["score"]
            outputFlag = eventSettings.attrib["output"]

        aggregationConfig = configRoot.child(config.AggregationSettings, exception=False)
        if aggregationConfig is None:
            aggregateScoreFlag = False
            aggregateOutputFlag = False
            aggregationSettings = None
        else:
            aggregateScoreFlag = aggregationConfig.attrib["score"]
            aggregateOutputFlag = aggregationConfig.attrib["output"]
            aggregationSettings = aggregationConfig.attrib

        if updateFlag or aggregateUpdateFlag:
            metadata.data["Update model"] = "true"
        else:
            metadata.data["Update model"] = "false"

        # build engine once without a data stream
        engine = Engine(pmmlModel, None, producerUpdateScheme, consumerUpdateScheme, segmentationScheme, producerAlgorithm, **engineSettings)
        engine.initialize()
        if outputWriter:
            outputWriter.open()

        applyPhaseLevel("verification", True)

        # score fake data from <ModelVerifications>
        modelVerificationConfig = configRoot.child(config.ModelVerification, exception=False)
        if modelVerificationConfig is not None:
            augustus.engine.verification.verify(modelVerificationConfig, engine, logger, outputWriter)

        # verification can increment aggregate variables, but
        # aggregates should all start at zero at the start of real
        # processing, whether verification happened or not
        engine.flushAggregates()

        if isinstance(dataStreamer, AugustusHTTPDataStream):
            # Without an output writer the HTTP stream is told not to respond
            if outputWriter is None:
                dataStreamer.respond = False
            if dataStreamer.respond:
                dataStreamer.setupOutput(outputWriter)

        applyPhaseLevel("eventloop", True)

        # possibly set up custom processing
        customProcessing = configRoot.child(config.CustomProcessing, exception=False)
        if customProcessing is None:
            # only shut off circular garbage collection if there is no CustomProcessing or AugustusInterface
            gc.disable()
        else:
            # Look for the constants namespace inside the model's Extension
            # element; fall back to an empty read-only namespace.
            extension = engine.pmmlModel.child(pmml.Extension, exception=False)
            constantsElement = None
            if extension is not None:
                constantsElement = extension.child(pmml.X_ODG_CustomProcessingConstants, exception=False)
            if constantsElement is None:
                constants = NameSpaceReadOnly()
            else:
                constants = constantsElement.nameSpace

            atoms = {
                "INVALID": INVALID,
                "MISSING": MISSING,
                "IMMATURE": IMMATURE,
                "MATURE": MATURE,
                "LOCKED": LOCKED,
                "UNINITIALIZED": UNINITIALIZED,
            }
            # Add every Atom defined on the two output-field classes, keyed by its repr
            for fieldClass in (pmml.OutputField, pmml.X_ODG_OutputField):
                for thing in fieldClass.__dict__.values():
                    if isinstance(thing, Atom):
                        atoms[repr(thing)] = thing

            segmentNames = [s.userFriendly for s in engine.segmentRecords]
            customProcessing.initialize(pmmlModel, engine.pmmlModel, constants, segmentNames, atoms, logger, metadata, consumerUpdateScheme, producerUpdateScheme)
            engine.customProcessing = customProcessing
            engine.reinitialize()

        # Core objects
        self.engine = engine
        self.pmmlModel = pmmlModel
        self.dataStreamer = dataStreamer
        self.logger = logger
        self.metadata = metadata
        self.customProcessing = customProcessing
        self.rethrowExceptions = rethrowExceptions

        # Writers and the exception-recovery file name
        self.outputWriter = outputWriter
        self.modelWriter = modelWriter
        self.filenameOnException = filenameOnException

        # Per-event flags
        self.scoreFlag = scoreFlag
        self.updateFlag = updateFlag
        self.outputFlag = outputFlag

        # Aggregation settings and flags
        self.aggregationSettings = aggregationSettings
        self.aggregateScoreFlag = aggregateScoreFlag
        self.aggregateUpdateFlag = aggregateUpdateFlag
        self.aggregateOutputFlag = aggregateOutputFlag
Example #3
0
def main(config_file=None, rethrowExceptions=False):
    """From the configuration, set up the Augustus engine and run it.

    Set up segments, PMML tree, I/O, and logging for the
    ProducerConsumer.  Identify what task is to be done:
    Producing, Consuming, or Automatic Incremental Model
    updates (AIM).  Identify the model type and start that
    model, passing the segments, PMML tree, and I/O information.
    Events are then processed until the data stream raises
    StopIteration.

    Arguments:

        config_file (string or config.AugustusConfiguration):
            Path to the configuration file, or an already-loaded
            configuration object, which is used as-is.

        rethrowExceptions (bool):
            If True, an exception raised while processing events is
            re-raised to the caller.  If False (the default), the
            exception is logged, the current model is written to the
            exception file when a model writer is configured, and the
            process exits through sys.exit().
    """
    # Get the configuration settings (as an XML instance)
    if isinstance(config_file, config.AugustusConfiguration):
        configRoot = config_file
    else:
        configRoot = xmlbase.loadfile(config_file,
                                      config.Config,
                                      lineNumbers=True)

    setupLogging(
        configRoot.matches(lambda x: x.tag in ("Logging", "Metadata")))
    logger = logging.getLogger()
    metadata = logging.getLogger('metadata')

    logger.info("Loading PMML model.")
    child = configRoot.child(config.ModelInput, exception=False)
    metadata.startTiming("Time to load PMML model")
    pmmlModel, pmmlFileName = getModel(child)
    metadata.stopTiming("Time to load PMML model")
    metadata.data["PMML model file"] = pmmlFileName

    logger.info("Setting up data input.")
    child = configRoot.child(config.DataInput, exception=False)
    dataStreamer = getDataStreamer(child)

    child = configRoot.child(config.ConsumerBlending, exception=False)
    consumerUpdateScheme = getUpdateScheme(child)

    child = configRoot.child(config.ModelSetup, exception=False)
    # Default Model setup parameters, used when there is no ModelSetup
    # element or it does not yield a model writer
    modelWriter = getModelWriter(None)
    engineSettings = dict(maturityThreshold=0)
    producerParameters = {}
    filenameOnException = None
    producerUpdateScheme = getUpdateScheme(None)
    segmentationScheme = SegmentationScheme(None, pmmlModel)
    aggregateUpdateFlag = updateFlag = False

    # Model setup
    if child:
        logger.info("Setting up model updating/producing.")
        modelWriter = getModelWriter(child)
        segmentationScheme = SegmentationScheme(
            child.child(config.SegmentationSchema, exception=False), pmmlModel)
        if modelWriter is not None:
            engineSettings['lockAllSegments'] = child.attrib.get(
                "mode", None) == "lockExisting"
            producerParameters['resume'] = child.attrib.get(
                "mode", None) == "updateExisting"
            updateFlag = child.attrib.get("updateEvery",
                                          "event") in ("event", "both")
            aggregateUpdateFlag = child.attrib.get("updateEvery",
                                                   "event") in ("aggregate",
                                                                "both")
            # Where the in-memory model is dumped if the event loop fails
            filenameOnException = "".join(
                [modelWriter.baseName, _modelExceptionIdentifier, ".pmml"])
            child = child.child(config.ProducerBlending, exception=False)
            producerUpdateScheme = getUpdateScheme(child)
            if child and child.exists(config.MaturityThreshold):
                maturityConfig = child.child(config.MaturityThreshold)
                engineSettings['maturityThreshold'] = int(
                    maturityConfig.attrib.get("threshold", 1))
                engineSettings['lockingThreshold'] = \
                    None if "lockingThreshold" not in \
                    maturityConfig.attrib \
                    else int(maturityConfig["lockingThreshold"])
            if engineSettings[
                    'lockAllSegments'] and not segmentationScheme._generic and not segmentationScheme.whiteList:
                logger.warning(
                    "The model is locked and no new segments are specified...new model files will be unchanged."
                )
        else:
            logger.warning(
                "There is no outputFile attribute in the ModelSetup; no new model file will be created."
            )

    # Set up output
    child = configRoot.child(config.Output, exception=False)
    outputWriter = getOutputWriter(child, pmmlFileName)
    child = configRoot.child(config.EventSettings, exception=False)
    if child is not None:
        logger.info("Setting up output.")
        # not in a dictionary to reduce the number of lookups while looping
        scoreFlag = child.attrib['score']
        outputFlag = child.attrib['output']
    else:
        scoreFlag = outputFlag = False
    child = configRoot.child(config.AggregationSettings, exception=False)
    if child is not None:
        aggregateScoreFlag = child.attrib['score']
        aggregateOutputFlag = child.attrib['output']
        aggregationSettings = child.attrib
    else:
        # Keep the aggregate flags bound even when aggregation is not configured
        aggregateScoreFlag = aggregateOutputFlag = False
        aggregationSettings = None

    metadata.data[
        'Update model'] = "true" if updateFlag or aggregateUpdateFlag else "false"

    # build engine once without a data stream
    engine = Engine(pmmlModel, None, producerUpdateScheme,
                    consumerUpdateScheme, segmentationScheme, **engineSettings)
    engine.initialize(producerParameters=producerParameters)
    if outputWriter: outputWriter.open()

    # score fake data from <ModelVerifications>
    modelVerificationConfig = configRoot.child(config.ModelVerification,
                                               exception=False)
    if modelVerificationConfig is not None:
        augustus.engine.verification.verify(modelVerificationConfig, engine,
                                            logger, outputWriter)

    # start of real data
    logger.info("Setting up Augustus's main engine.")
    engine.resetDataStream(dataStreamer)
    dataStreamer.start_streaming()

    metadata.data['Events'] = 0
    logger.info("Calculating.")
    # NOTE(review): the "Run time" timer is started here but no matching
    # stopTiming call appears in this function -- confirm that is intended.
    metadata.startTiming("Run time")

    try:
        while True:
            try:
                score = engine.event(score=scoreFlag, update=updateFlag)
                metadata.data['Events'] += 1
                if outputWriter and outputFlag:
                    try:
                        outputWriter.write(score)
                    except IOError:
                        ## FIXME: this exception should be raised to the top level; I do not
                        ## understand why it is handled here, nor why a 'good' model is written...--tanya
                        if modelWriter:
                            modelWriter.write(pmmlModel)
                        break
                if modelWriter:
                    modelWriter.serialize(pmmlModel, metadata.data['Events'])

                if aggregationSettings:
                    if engine.checkPseudoeventReadiness(aggregationSettings):
                        score = engine.pseudoevent(score=aggregateScoreFlag,
                                                   update=aggregateUpdateFlag)
                        # Same flag as the end-of-stream pseudoevent below
                        if outputWriter and aggregateOutputFlag:
                            outputWriter.write(score)

            except StopIteration:
                # End of the data stream: write the final model and stop
                if modelWriter:
                    if modelWriter.serialization:
                        modelWriter.serialize(pmmlModel,
                                              metadata.data['Events'])
                    else:
                        modelWriter.write(pmmlModel)
                break

        if aggregationSettings is not None and aggregationSettings['atEnd']:
            score = engine.pseudoevent(score=aggregateScoreFlag,
                                       update=aggregateUpdateFlag)
            if outputWriter and aggregateOutputFlag:
                outputWriter.write(score)

    except (Exception, KeyboardInterrupt):
        # KeyboardInterrupt is listed explicitly because it does not
        # derive from Exception.
        if rethrowExceptions: raise

        # Format the traceback once so the log and the exit message agree
        trace = traceback.format_exc()
        logger.error("Shutting down on exception...")
        excinfo = sys.exc_info()
        logger.error("...%s" % excinfo[0])
        logger.error("...%s" % excinfo[1])
        logger.error("...%s" % trace)
        if filenameOnException:
            logger.error("Writing last model in location %s" %
                         filenameOnException)
            pmmlModel.write(filenameOnException)

        sys.exit(
            "Shutting down on exception; for more information check the logfile (if logging is enabled)...\n%s"
            % trace)