Example #1
0
    def __init__(self, configuration, dataStream=None, connect="", exceptions=True):
        """Build a MainLoop around a configuration, with this object as its callback.

        Arguments:

            configuration: a config.AugustusConfiguration (validated here),
                or any other value, which is handed to xmlbase.loadfile and,
                if that raises IOError, to xmlbase.load.

            dataStream: must be supplied exactly when the configuration's
                DataInput contains an <Interactive> block.

            connect: forwarded as config.PersistentStorage(connect=...).

            exceptions: forwarded to MainLoop as rethrowExceptions.
        """
        if not isinstance(configuration, config.AugustusConfiguration):
            # Not a configuration object: try it as a file name first, then as literal text.
            try:
                configuration = xmlbase.loadfile(configuration, config.Config, lineNumbers=True)
            except IOError:
                configuration = xmlbase.load(configuration, config.Config)
        else:
            configuration.validate(exception=True)

        # This class injects its own CustomProcessing element below.
        if configuration.exists(config.CustomProcessing):
            raise Exception("The Augustus class defines its own <CustomProcessing>; please leave this out of the configuration file.")

        self.dataStream = dataStream
        needsStream = configuration.child(config.DataInput).exists(config.Interactive)
        if needsStream and dataStream is None:
            raise Exception("If the configuration has a DataInput <Interactive> block, then a DataStream object must be provided.")
        if not needsStream and dataStream is not None:
            raise Exception("If the configuration has no DataInput <Interactive> block, then a DataStream object must not be provided.")

        storage = config.PersistentStorage(connect=connect)
        storage.validate()

        # A code-less CustomProcessing element whose callbacks are served by this instance.
        hook = config.CustomProcessing(storage)
        hook.code = None
        hook.callbackClass = self
        configuration.children.append(hook)

        self.configuration = configuration
        self.mainLoop = augustus.engine.mainloop.MainLoop(
            self.configuration,
            rethrowExceptions=exceptions,
            dataStream=self.dataStream)
Example #2
0
    def post_validate(self):
        """Load the PMML file named by self["fileName"] into self.data.

        If no "validate" entry is present in self.attrib it is set to True
        before loading, and its value is passed to xmlbase.loadfile as the
        ``validation`` argument.

        Raises:
            RuntimeError: if loading raises XMLValidationError; the message
                carries the file name and the original error text.
        """
        if "validate" not in self.attrib:
            self["validate"] = True

        try:
            self.data = xmlbase.loadfile(self["fileName"], pmml.X_ODG_PMML, validation=self["validate"])
        except XMLValidationError as err:
            # "as" and the call form of raise parse on Python 2.6+ and Python 3 alike.
            raise RuntimeError("PMML file %s failed validation: %s" % (self["fileName"], str(err)))
Example #3
0
    def post_validate(self):
        """Load the PMML file named by self["fileName"] into self.data.

        If no "validate" entry is present in self.attrib it is set to True
        before loading, and its value is passed to xmlbase.loadfile as the
        ``validation`` argument.

        Raises:
            RuntimeError: if loading raises XMLValidationError; the message
                carries the file name and the original error text.
        """
        if "validate" not in self.attrib:
            self["validate"] = True

        try:
            self.data = xmlbase.loadfile(self["fileName"], pmml.X_ODG_PMML, validation=self["validate"])
        except XMLValidationError as err:
            # "except ... as" parses on Python 2.6+ and Python 3; the comma form is Python 2 only.
            raise RuntimeError("PMML file %s failed validation: %s" % (self["fileName"], str(err)))
Example #4
0
    def __init__(self,
                 configuration,
                 dataStream=None,
                 connect="",
                 exceptions=True):
        """Build a MainLoop around a configuration, with this object as its callback.

        Arguments:

            configuration: a config.AugustusConfiguration (validated here),
                or any other value, which is handed to xmlbase.loadfile and,
                if that raises IOError, to xmlbase.load.

            dataStream: must be supplied exactly when the configuration's
                DataInput contains an <Interactive> block.

            connect: forwarded as config.PersistentStorage(connect=...).

            exceptions: forwarded to MainLoop as rethrowExceptions.
        """
        if not isinstance(configuration, config.AugustusConfiguration):
            # Not a configuration object: try it as a file name first, then as literal text.
            try:
                configuration = xmlbase.loadfile(configuration, config.Config, lineNumbers=True)
            except IOError:
                configuration = xmlbase.load(configuration, config.Config)
        else:
            configuration.validate(exception=True)

        # This class injects its own CustomProcessing element below.
        if configuration.exists(config.CustomProcessing):
            raise Exception("The Augustus class defines its own <CustomProcessing>; please leave this out of the configuration file.")

        self.dataStream = dataStream
        wantsInteractive = configuration.child(config.DataInput).exists(config.Interactive)
        if wantsInteractive and dataStream is None:
            raise Exception("If the configuration has a DataInput <Interactive> block, then a DataStream object must be provided.")
        if not wantsInteractive and dataStream is not None:
            raise Exception("If the configuration has no DataInput <Interactive> block, then a DataStream object must not be provided.")

        storage = config.PersistentStorage(connect=connect)
        storage.validate()

        # A code-less CustomProcessing element whose callbacks are served by this instance.
        callbackElement = config.CustomProcessing(storage)
        callbackElement.code = None
        callbackElement.callbackClass = self
        configuration.children.append(callbackElement)

        self.configuration = configuration
        loopOptions = dict(rethrowExceptions=exceptions, dataStream=self.dataStream)
        self.mainLoop = augustus.engine.mainloop.MainLoop(self.configuration, **loopOptions)
Example #5
0
def getModel(configOptions):
    """Load the PMML model named in the configuration.

    Arguments:

        configOptions (XML object, defined in xmlbase):
            The XML element <ModelInput>...</ModelInput> which
            contains the source location for the PMML model.

    Returns:

        A (model, filename) tuple: the object returned by
        xmlbase.loadfile and the file name that was actually loaded
        (after glob expansion and selection, for local files).

    Raises:

        RuntimeError: if the source is a local file name/glob and no
            file matches it.
    """
    #xsd: Assume FromFile/FromFifo, with file name required.
    sourceElement = configOptions.child(lambda x: x.tag.startswith("From"))
    filename = sourceElement["name"]
    if sourceElement.tag.endswith("File"):
        selectmode = sourceElement.attrib.get("selectmode", "lastAlphabetic")

        # URLs are passed to the loader untouched; only local names are globbed.
        if not filename.startswith(("http://", "https://")):
            filelist = glob.glob(filename)
            if len(filelist) > 1:
                # With several candidates, skip models that were written on exception.
                filelist = [
                    f for f in filelist if _modelExceptionIdentifier not in f]
            if not filelist:
                raise RuntimeError("no files matched the given filename/glob: %s" % filename)

            if selectmode == "mostRecent":
                filename = max(filelist, key=lambda x: os.stat(x).st_mtime)
            else:
                # Last name in sorted order.
                filename = max(filelist)

            if _modelExceptionIdentifier in filename:
                logging.getLogger().warning("Using a PMML model that was written on exception:File name: %s" % filename)

    try:
        # TODO: make lineNumbers optional (better diagnostics with them, better performance without them)
        model = xmlbase.loadfile(filename, pmml.X_ODG_PMML, lineNumbers=True)
    except:
        # Deliberately broad: whatever went wrong is logged and then re-raised unchanged.
        logging.getLogger().error("Error loading PMML model from %s." % filename)
        raise
    return model, filename
Example #6
0
import glob

from augustus.core.xmlbase import loadfile
import augustus.core.pmml41 as pmml
from cassius import *

# Gather the files matching _out/modelout*.pmml and order them by name.
modelFiles = glob.glob("_out/modelout*.pmml")
modelFiles.sort()

# Load each file with the project's loader, keeping the results in file-name order.
models = []
for modelFile in modelFiles:
    model = loadfile(modelFile, pmml.X_ODG_PMML)
    models.append(model)

# `model` here is the loop variable left over from above, so only the last
# loaded model is printed (and this raises NameError if the glob matched nothing).
print model.tree()

plots = []
for model in models:
    eventNumber = model.descendant(pmml.X_ODG_Eventstamp)["number"]

    discretize = model.descendant(pmml.Discretize)
    bins = []
    for discretizeBin in discretize.matches(pmml.DiscretizeBin):
        binName = discretizeBin["binValue"]
        leftMargin = discretizeBin.child(pmml.Interval)["leftMargin"]
        rightMargin = discretizeBin.child(pmml.Interval)["rightMargin"]
        bins.append((binName, leftMargin, rightMargin))
    bins.sort(lambda a, b: cmp(a[1], b[1]))

    h = HistogramNonUniform([(b[1], b[2]) for b in bins],
                            fillcolor="yellow",
Example #7
0
    def __init__(self,
                 configuration,
                 model=None,
                 dataStream=None,
                 rethrowExceptions=None):
        """Set up logging, the PMML model, the scoring engine, output and data input.

        Arguments:

            configuration: a config.AugustusConfiguration object, or a string
                that is first tried as a file name (xmlbase.loadfile) and, on
                IOError, as literal configuration text (xmlbase.load).

            model: None (take the model from the configuration's ModelInput),
                a pmml.PMML object, or a string treated like `configuration`
                (file name first, literal PMML on IOError).

            dataStream: an existing data stream, or None to build one from the
                configuration's DataInput.

            rethrowExceptions: stored on self; not otherwise used in this method.

        Raises:

            ConfigurationError: if `configuration` or `model` has an
                unsupported type, or if a model or data stream is neither
                passed in nor present in the configuration.

            IOError: if the ModelInput fileLocation matches no files.
        """
        self.model = model
        self.dataStream = dataStream
        self.rethrowExceptions = rethrowExceptions
        self.fileNameOnException = None

        # get the configuration, in whatever form you find it
        if isinstance(configuration, config.AugustusConfiguration):
            pass
        elif isinstance(configuration, basestring):
            try:
                configuration = xmlbase.loadfile(configuration,
                                                 config.Config,
                                                 lineNumbers=True)
            except IOError:
                configuration = xmlbase.load(configuration,
                                             config.Config,
                                             lineNumbers=True)
        else:
            raise ConfigurationError(
                "Configuration must be a pre-validated XML object, a fileName, or a literal configuration string."
            )

        # set up logging
        setupLogging(
            configuration.matches(
                lambda x: isinstance(x, (config.Logging, config.Metadata))))
        self.logger = logging.getLogger()
        self.metadata = logging.getLogger("metadata")

        # begin "initialization" phase
        for l in self.logger, self.metadata:
            if "initialization" in l.differentLevel:
                l.setLevel(l.differentLevel["initialization"])
            else:
                l.setLevel(l.naturalLevel)

        # get the model, in whatever form you find it
        self.logger.info("Loading PMML model.")
        self.metadata.startTiming("Time to load PMML model")
        # NOTE(review): modelFileName is only reassigned in the string-model
        # branch below; when the model comes from ModelInput it stays "(none)"
        # even though fileLocation is known -- confirm this is intended.
        modelFileName = "(none)"
        maturityThreshold = 0
        if self.model is None:
            # NOTE(review): exception=None here, exception=False elsewhere in this method -- confirm equivalent.
            modelInput = configuration.child(config.ModelInput, exception=None)
            if modelInput is None:
                raise ConfigurationError(
                    "If a model is not provided to MainLoop explicitly, it must be present in the configuration file."
                )

            fileLocation = modelInput["fileLocation"]
            # URLs are loaded as-is; anything else is treated as a glob pattern.
            if not fileLocation.startswith(
                    "http://") and not fileLocation.startswith("https://"):
                fileList = glob.glob(fileLocation)
                # With several candidates, drop the ones written on exception.
                if len(fileList) > 1:
                    fileList = [
                        f for f in fileList
                        if self._modelExceptionIdentifier not in f
                    ]
                if len(fileList) == 0:
                    raise IOError(
                        "No files matched the ModelInput fileLocation \"%s\"."
                        % fileLocation)

                selectmode = modelInput.attrib.get("selectmode",
                                                   "lastAlphabetic")
                if selectmode == "mostRecent":
                    fileLocation = max(fileList,
                                       key=lambda x: os.stat(x).st_mtime)
                elif selectmode == "lastAlphabetic":
                    fileList.sort()
                    fileLocation = fileList[-1]
                else:
                    # presumably ruled out by configuration validation; note that asserts vanish under -O
                    assert False

                if self._modelExceptionIdentifier in fileLocation:
                    self.logger.warning(
                        "Using a PMML model that was written on exception (fileName \"%s\")"
                        % fileLocation)

            self.model = xmlbase.loadfile(fileLocation,
                                          pmml.X_ODG_PMML,
                                          lineNumbers=True)

            if "maturityThreshold" in modelInput.attrib:
                maturityThreshold = modelInput["maturityThreshold"]

        elif isinstance(self.model, pmml.PMML):
            pass
        elif isinstance(self.model, basestring):
            # Try the string as a file name first; on IOError treat it as literal PMML.
            try:
                self.model, modelFileName = xmlbase.loadfile(
                    self.model, pmml.X_ODG_PMML, lineNumbers=True), self.model
            except IOError:
                self.model = xmlbase.load(self.model,
                                          pmml.X_ODG_PMML,
                                          lineNumbers=True)
        else:
            raise ConfigurationError(
                "Model must be a pre-validated XML object, a fileName, or a literal PMML string."
            )
        self.metadata.stopTiming("Time to load PMML model")
        self.metadata.data["PMML model file"] = modelFileName

        # globally set random number seeds
        if "randomSeed" in configuration.attrib:
            augustusRandomSeed = configuration["randomSeed"]
            random.seed(augustusRandomSeed)
            # numpy is seeded with an offset value, not the same seed as `random`
            numpy.random.seed(augustusRandomSeed + 1)
        else:
            augustusRandomSeed = "unspecified"

        # globally set numpy error handling
        numpy.seterr(divide="raise",
                     over="raise",
                     under="ignore",
                     invalid="raise")

        # update schemes (producerUpdateScheme may be redefined below)
        consumerUpdateScheme = self._getUpdateScheme(
            configuration.child(config.ConsumerBlending, exception=False))
        producerUpdateScheme = self._getUpdateScheme(None)

        # set up scoring output
        outputConfig = configuration.child(config.Output, exception=False)
        if outputConfig is None:
            self.outputWriter = None
        else:
            outputParams = {
                "pmmlFileName": modelFileName,
                "mode": outputConfig.destination.attrib.get("type",
                                                            "XML").lower()
            }

            if isinstance(outputConfig.destination, config.ToFile):
                # "overwrite" selects truncate ("w") versus append ("a")
                if outputConfig.destination.attrib.get("overwrite", False):
                    outputStream = codecs.open(
                        outputConfig.destination["name"],
                        "w",
                        encoding="utf-8")
                else:
                    outputStream = codecs.open(
                        outputConfig.destination["name"],
                        "a",
                        encoding="utf-8")
            elif isinstance(outputConfig.destination, config.ToStandardError):
                outputStream = sys.stderr
            elif isinstance(outputConfig.destination, config.ToStandardOut):
                outputStream = sys.stdout
            else:
                assert False

            reportTag = outputConfig.child("ReportTag", exception=False)
            if reportTag:
                outputParams["reportName"] = reportTag.attrib.get(
                    "name", "Report")

            eventTag = outputConfig.child("EventTag", exception=False)
            if eventTag:
                outputParams["eventName"] = eventTag.attrib.get(
                    "name", "Event")
                outputParams["pseudoEventName"] = eventTag.attrib.get(
                    "pseudoName", "pseudoEvent")

            self.outputWriter = OutputWriter(outputStream, **outputParams)

        # initialize for the case of no output model
        engineSettings = {
            "maturityThreshold": maturityThreshold,
            "augustusRandomSeed": augustusRandomSeed
        }
        self.modelWriter = None
        segmentationScheme = SegmentationScheme(None, self.model)
        self.updateFlag = False
        self.aggregateUpdateFlag = False

        # start from a copy of the defaults; ModelSetup entries may override them below
        producerAlgorithm = dict(config.producerAlgorithmDefaults)
        for pa in producerAlgorithm.values():
            validationResult = pa.validate()
            assert validationResult is None

        # set up output model, if present in the configuration
        modelSetup = configuration.child(config.ModelSetup, exception=False)
        engineSettings["hasProducer"] = modelSetup is not None
        if engineSettings["hasProducer"]:
            self.logger.info("Setting up model updating/producing.")

            producerBlending = modelSetup.child(config.ProducerBlending,
                                                exception=False)
            producerUpdateScheme = self._getUpdateScheme(producerBlending)
            if producerBlending is not None and producerBlending.contains(
                    config.MaturityThreshold):
                maturityConfig = producerBlending.child(
                    config.MaturityThreshold)
                engineSettings["maturityThreshold"] = int(
                    maturityConfig.attrib.get("threshold", 1))
                try:
                    engineSettings["lockingThreshold"] = int(
                        maturityConfig.attrib["lockingThreshold"])
                except KeyError:
                    engineSettings["lockingThreshold"] = None

            engineSettings["lockAllSegments"] = modelSetup.attrib.get(
                "mode", None) == "lockExisting"
            # NOTE(review): segmentationScheme here is still the default built above
            # (it is replaced from SegmentationSchema further down) -- confirm the check is meant to run first.
            if engineSettings[
                    "lockAllSegments"] and segmentationScheme is not None and not segmentationScheme._generic and not segmentationScheme._whiteList:
                self.logger.warning(
                    "The model is locked and no new segments are specified...new model files will be unchanged."
                )

            self.modelWriter = getModelWriter(modelSetup)
            if self.modelWriter is not None:
                if self.modelWriter.baseName is None:
                    self.fileNameOnException = self._modelExceptionIdentifier + ".pmml"
                else:
                    self.fileNameOnException = "".join([
                        self.modelWriter.baseName,
                        self._modelExceptionIdentifier, ".pmml"
                    ])
            else:
                self.logger.warning(
                    "There is no outputFile attribute in the ModelSetup; no new model file will be created."
                )

            segmentationScheme = SegmentationScheme(
                modelSetup.child(config.SegmentationSchema, exception=False),
                self.model)
            self.updateFlag = modelSetup.attrib.get("updateEvery",
                                                    "event") in ("event",
                                                                 "both")
            self.aggregateUpdateFlag = modelSetup.attrib.get(
                "updateEvery", "event") in ("aggregate", "both")

            for pa in modelSetup.matches(config.ProducerAlgorithm):
                producerAlgorithm[pa["model"]] = pa
            if modelSetup.attrib.get("mode", None) == "updateExisting":
                for pa in producerAlgorithm.values():
                    pa.parameters["updateExisting"] = True
            if modelSetup.attrib.get("mode", None) == "replaceExisting":
                for pa in producerAlgorithm.values():
                    pa.parameters["updateExisting"] = False

        # to score or not to score
        eventSettings = configuration.child(config.EventSettings,
                                            exception=False)
        if eventSettings is not None:
            self.logger.info("Setting up output.")
            self.scoreFlag = eventSettings["score"]
            self.outputFlag = eventSettings["output"]
        else:
            self.scoreFlag = False
            self.outputFlag = False

        aggregationConfig = configuration.child(config.AggregationSettings,
                                                exception=False)
        if aggregationConfig is not None:
            self.aggregateScoreFlag = aggregationConfig["score"]
            self.aggregateOutputFlag = aggregationConfig["output"]
            self.aggregationSettings = dict(aggregationConfig.attrib)
        else:
            self.aggregateScoreFlag = False
            self.aggregateOutputFlag = False
            self.aggregationSettings = None

        self.metadata.data[
            "Update model"] = "true" if self.updateFlag or self.aggregateUpdateFlag else "false"

        # build a scoring engine once without a dataStream (to evaluate any verification blocks)
        self.engine = Engine(self.model, None, producerUpdateScheme,
                             consumerUpdateScheme, segmentationScheme,
                             producerAlgorithm, **engineSettings)
        self.engine.initialize()
        if self.outputWriter is not None: self.outputWriter.open()

        # begin "verification" phase
        for l in self.logger, self.metadata:
            if "verification" in l.differentLevel:
                l.eventLogLevel = l.differentLevel["verification"]
                l.setLevel(l.differentLevel["verification"])
            else:
                l.eventLogLevel = l.naturalLevel
                l.setLevel(l.naturalLevel)

        # evaluate verification blocks
        modelVerificationConfig = configuration.child(config.ModelVerification,
                                                      exception=False)
        if modelVerificationConfig is not None:
            verify(modelVerificationConfig, self.engine, self.logger,
                   self.outputWriter)

        # verification can increment aggregate variables, but
        # aggregates should all start at zero at the start of real
        # processing, whether verification happened or not
        self.engine.flushAggregates()

        # get the dataStream, in whatever form you find it
        self.logger.info("Setting up data input.")
        if self.dataStream is None:
            configDataInput = configuration.child(config.DataInput,
                                                  exception=None)
            if configDataInput is None:
                raise ConfigurationError(
                    "If a dataStream is not provided to MainLoop explicitly, it must be present in the configuration file."
                )
            if configDataInput.contains(config.FromFile):
                self.dataStream = DataStreamer(
                    configDataInput.child(config.FromFile),
                    self.engine.pmmlModel)
            elif configDataInput.contains(config.FromStandardIn):
                self.dataStream = DataStreamer(
                    configDataInput.child(config.FromStandardIn),
                    self.engine.pmmlModel)
            elif configDataInput.contains(config.FromHTTP):
                self.dataStream = AugustusHTTPDataStream(
                    configDataInput.child(config.FromHTTP))
                # without an output writer there is nothing to respond with
                if self.outputWriter is None:
                    self.dataStream.respond = False
                if self.dataStream.respond:
                    self.dataStream.setupOutput(self.outputWriter)
            else:
                assert False

        # begin "eventLoop" phase
        # NOTE(review): the key looked up is lowercase "eventloop" -- confirm it matches how differentLevel is populated.
        for l in self.logger, self.metadata:
            if "eventloop" in l.differentLevel:
                l.eventLogLevel = l.differentLevel["eventloop"]
                l.setLevel(l.differentLevel["eventloop"])
            else:
                l.eventLogLevel = l.naturalLevel
                l.setLevel(l.naturalLevel)

        # possibly set up custom processing
        self.customProcessing = configuration.child(config.CustomProcessing,
                                                    exception=False)
        if self.customProcessing is not None:
            # constants: the name space of the model's X_ODG_CustomProcessingConstants
            # extension if present, otherwise an empty NameSpaceReadOnly
            constants = self.engine.pmmlModel.child(pmml.Extension,
                                                    exception=False)
            if constants is None:
                constants = NameSpaceReadOnly()
            else:
                constants = constants.child(
                    pmml.X_ODG_CustomProcessingConstants, exception=False)
                if constants is None:
                    constants = NameSpaceReadOnly()
                else:
                    constants = constants.nameSpace

            atoms = {
                "INVALID": INVALID,
                "MISSING": MISSING,
                "IMMATURE": IMMATURE,
                "MATURE": MATURE,
                "LOCKED": LOCKED,
                "UNINITIALIZED": UNINITIALIZED
            }
            # also expose every Atom defined as a class attribute of the two OutputField classes, keyed by repr
            for thing in pmml.OutputField.__dict__.values(
            ) + pmml.X_ODG_OutputField.__dict__.values():
                if isinstance(thing, Atom):
                    atoms[repr(thing)] = thing

            self.customProcessing.initialize(
                self.model, self.engine.pmmlModel, constants,
                [s.userFriendly
                 for s in self.engine.segmentRecords], atoms, self.logger,
                self.metadata, consumerUpdateScheme, producerUpdateScheme)
            self.engine.customProcessing = self.customProcessing
            self.engine.reinitialize()

        else:
            # only turn off circular garbage collection if there is no CustomProcessing or AugustusInterface
            gc.disable()
Example #8
0
 def post_validate(self):
     """Load the file named by self["fileName"] and keep the result in self.config."""
     loaded = xmlbase.loadfile(
         self["fileName"],
         augustus.applications.scoresAwk.root,
         lineNumbers=True)
     self.config = loaded
Example #9
0
 def post_validate(self):
     """Load the file named by self["fileName"] and keep the result in self.config."""
     loaded = xmlbase.loadfile(
         self["fileName"],
         augustus.core.config.Config,
         lineNumbers=True)
     self.config = loaded
Example #10
0
 def post_validate(self):
     """Load the PMML file named by self["fileName"] into self.data, with validation=False."""
     loadOptions = {"validation": False}
     pmmlData = xmlbase.loadfile(self["fileName"], pmml.X_ODG_PMML, **loadOptions)
     self.data = pmmlData
Example #11
0
 def post_validate(self):
     """Load the file named by self["fileName"] and keep the result in self.config."""
     loaded = xmlbase.loadfile(
         self["fileName"],
         augustus.applications.scoresAwk.root,
         lineNumbers=True)
     self.config = loaded
Example #12
0
def main(config_file=None, rethrowExceptions=False):
    """From the configuration, set up the Augustus engine.

    Set up segments, PMML tree, I/O, and logging for the
    ProducerConsumer.  Identify what task is to be done:
    Producing, Consuming, or Automatic Incremental Model
    updates (AIM).  Identify the model type and start that
    model, passing the segments, PMML tree, and I/O information.

    Arguments:

        config_file (string or config.AugustusConfiguration):
            Path to the configuration file, or an already-loaded
            configuration object, which is used as-is.

        rethrowExceptions (bool):
            If true, an exception raised while processing events
            propagates to the caller.  Otherwise it is logged, the
            current model is written to the on-exception file (when
            one was configured) and the process ends via sys.exit.
    """
    # Get the configuration settings (as an XML instance)
    if isinstance(config_file, config.AugustusConfiguration):
        configRoot = config_file
    else:
        configRoot = xmlbase.loadfile(config_file,
                                      config.Config,
                                      lineNumbers=True)

    setupLogging(
        configRoot.matches(lambda x: x.tag in ("Logging", "Metadata")))
    logger = logging.getLogger()
    metadata = logging.getLogger('metadata')

    logger.info("Loading PMML model.")
    child = configRoot.child(config.ModelInput, exception=False)
    metadata.startTiming("Time to load PMML model")
    pmmlModel, pmmlFileName = getModel(child)
    metadata.stopTiming("Time to load PMML model")
    metadata.data["PMML model file"] = pmmlFileName

    logger.info("Setting up data input.")
    child = configRoot.child(config.DataInput, exception=False)
    dataStreamer = getDataStreamer(child)

    child = configRoot.child(config.ConsumerBlending, exception=False)
    consumerUpdateScheme = getUpdateScheme(child)

    child = configRoot.child(config.ModelSetup, exception=False)
    # Default Model setup parameters
    modelWriter = getModelWriter(None)
    engineSettings = dict(maturityThreshold=0)
    producerParameters = {}
    filenameOnException = None
    producerUpdateScheme = getUpdateScheme(None)
    segmentationScheme = SegmentationScheme(None, pmmlModel)
    aggregateUpdateFlag = updateFlag = False

    # Model setup
    if child:
        logger.info("Setting up model updating/producing.")
        modelWriter = getModelWriter(child)
        segmentationScheme = SegmentationScheme(
            child.child(config.SegmentationSchema, exception=False), pmmlModel)
        if modelWriter is not None:
            engineSettings['lockAllSegments'] = child.attrib.get(
                "mode", None) == "lockExisting"
            producerParameters['resume'] = child.attrib.get(
                "mode", None) == "updateExisting"
            updateFlag = child.attrib.get("updateEvery",
                                          "event") in ("event", "both")
            aggregateUpdateFlag = child.attrib.get("updateEvery",
                                                   "event") in ("aggregate",
                                                                "both")
            filenameOnException = "".join(
                [modelWriter.baseName, _modelExceptionIdentifier, ".pmml"])
            # From here on `child` is the ProducerBlending element (or a false value), not ModelSetup.
            child = child.child(config.ProducerBlending, exception=False)
            producerUpdateScheme = getUpdateScheme(child)
            if child and child.exists(config.MaturityThreshold):
                maturityConfig = child.child(config.MaturityThreshold)
                engineSettings['maturityThreshold'] = int(
                    maturityConfig.attrib.get("threshold", 1))
                engineSettings['lockingThreshold'] = \
                    None if "lockingThreshold" not in \
                    maturityConfig.attrib \
                    else int(maturityConfig["lockingThreshold"])
            if engineSettings[
                    'lockAllSegments'] and not segmentationScheme._generic and not segmentationScheme.whiteList:
                logger.warning(
                    "The model is locked and no new segments are specified...new model files will be unchanged."
                )
        else:
            logger.warning(
                "There is no outputFile attribute in the ModelSetup; no new model file will be created."
            )

    # Set up output
    child = configRoot.child(config.Output, exception=False)
    outputWriter = getOutputWriter(child, pmmlFileName)
    child = configRoot.child(config.EventSettings, exception=False)
    if child is not None:
        logger.info("Setting up output.")
        # not in a dictionary to reduce the number of lookups while looping
        scoreFlag = child.attrib['score']
        outputFlag = child.attrib['output']
    else:
        scoreFlag = outputFlag = False
    child = configRoot.child(config.AggregationSettings, exception=False)
    if child is not None:
        aggregateScoreFlag = child.attrib['score']
        aggregateOutputFlag = child.attrib['output']
        aggregationSettings = child.attrib
    else:
        # Bind the flags in both branches so later reads can never hit an unbound name.
        aggregateScoreFlag = aggregateOutputFlag = False
        aggregationSettings = None

    metadata.data[
        'Update model'] = "true" if updateFlag or aggregateUpdateFlag else "false"

    # build engine once without a data stream
    engine = Engine(pmmlModel, None, producerUpdateScheme,
                    consumerUpdateScheme, segmentationScheme, **engineSettings)
    engine.initialize(producerParameters=producerParameters)
    if outputWriter: outputWriter.open()

    # score fake data from <ModelVerifications>
    modelVerificationConfig = configRoot.child(config.ModelVerification,
                                               exception=False)
    if modelVerificationConfig is not None:
        augustus.engine.verification.verify(modelVerificationConfig, engine,
                                            logger, outputWriter)

    # start of real data
    logger.info("Setting up Augustus's main engine.")
    engine.resetDataStream(dataStreamer)
    dataStreamer.start_streaming()

    metadata.data['Events'] = 0
    logger.info("Calculating.")
    metadata.startTiming("Run time")

    try:
        while True:
            try:
                score = engine.event(score=scoreFlag, update=updateFlag)
                metadata.data['Events'] += 1
                if outputWriter and outputFlag:
                    try:
                        outputWriter.write(score)
                    except IOError:
                        ## FIXME: this exception should be raised to the top level; I do not
                        ## understand why it is handled here, nor why a 'good' model is written...--tanya
                        if modelWriter:
                            modelWriter.write(pmmlModel)
                        break
                if modelWriter:
                    modelWriter.serialize(pmmlModel, metadata.data['Events'])

                if aggregationSettings:
                    if engine.checkPseudoeventReadiness(aggregationSettings):
                        score = engine.pseudoevent(score=aggregateScoreFlag,
                                                   update=aggregateUpdateFlag)
                        # Same flag as the end-of-stream pseudoevent below; the name
                        # previously used here was never defined in this function.
                        if outputWriter and aggregateOutputFlag:
                            outputWriter.write(score)

            except StopIteration:
                # End of the data stream: write the final model and leave the loop.
                if modelWriter:
                    if modelWriter.serialization:
                        modelWriter.serialize(pmmlModel,
                                              metadata.data['Events'])
                    else:
                        modelWriter.write(pmmlModel)
                break

        if aggregationSettings is not None and aggregationSettings['atEnd']:
            score = engine.pseudoevent(score=aggregateScoreFlag,
                                       update=aggregateUpdateFlag)
            if outputWriter and aggregateOutputFlag:
                outputWriter.write(score)

    except (Exception, KeyboardInterrupt):
        # The exception object itself is not needed: details come from sys.exc_info / traceback.
        if rethrowExceptions: raise

        logger.error("Shutting down on exception...")
        excinfo = sys.exc_info()
        logger.error("...%s" % excinfo[0])
        logger.error("...%s" % excinfo[1])
        logger.error("...%s" % traceback.format_exc())
        if filenameOnException:
            logger.error("Writing last model in location %s" %
                         filenameOnException)
            pmmlModel.write(filenameOnException)

        sys.exit(
            "Shutting down on exception; for more information check the logfile (if logging is enabled)...\n%s"
            % traceback.format_exc())
Example #13
0
        for line in input_file:
            number_of_rows += 1
            if  number_of_rows <= 3:
                print "\t", line.strip()
    print "\t", line
    checkForQuit()

    print "To load an XML file using Augustus' xmlbase library, first include the library"
    print "from augustus.core import xmlbase  # type this at the top of a script"
    print "\nthen load the file:"
    print """
filename = "../results/example_scores.xml"
root_element = xmlbase.loadfile(filename)
    """
    checkForQuit()
    root_element = xmlbase.loadfile(filename)
    
    
    print "To access a tag of an xml element, use 'element.tag' for example,"
    print "if we type:"
    print ">>> root_element.tag"
    checkForQuit()
    print "we get:"
    print root_element.tag
    checkForQuit()
    
    print "The element's attributes are stored as a Python dictionary."
    print "The dictionary is named 'attrib', so for example,"
    print ">>> root_element.attrib"
    checkForQuit()
    print "gets:"
Example #14
0
 def post_validate(self):
     """Load the PMML file named by this object's "fileName" entry.

     The file is read with xmlbase.loadfile as a pmml.X_ODG_PMML
     document, with validation switched off, and the result is kept
     in self.data.
     """
     pmmlFileName = self["fileName"]
     self.data = xmlbase.loadfile(pmmlFileName, pmml.X_ODG_PMML, validation=False)
Example #15
0
    def __init__(self, configuration, model=None, dataStream=None, rethrowExceptions=None):
        """Set up logging, the PMML model, output, the engine and the data stream.

        Arguments:

            configuration (config.AugustusConfiguration or string):
                An already-loaded configuration object, or a string.
                A string is first tried as a file name and, if that
                raises IOError, parsed as a literal configuration.

            model (None, pmml.PMML, or string):
                If None, the model is loaded from the configuration's
                ModelInput.  A string is first tried as a file name
                and, if that raises IOError, parsed as literal PMML.

            dataStream:
                If None, a data stream is built from the
                configuration's DataInput; otherwise it is kept as-is.

            rethrowExceptions:
                Stored on the instance; not otherwise used here.

        Raises:

            ConfigurationError: if the configuration or model is of an
                unsupported type, or if a model or data stream is
                neither passed in nor present in the configuration.

            IOError: if no file matches the ModelInput fileLocation.
        """
        self.model = model
        self.dataStream = dataStream
        self.rethrowExceptions = rethrowExceptions
        self.fileNameOnException = None

        # get the configuration, in whatever form you find it
        if isinstance(configuration, config.AugustusConfiguration):
            pass
        elif isinstance(configuration, basestring):
            try:
                configuration = xmlbase.loadfile(configuration, config.Config, lineNumbers=True)
            except IOError:
                configuration = xmlbase.load(configuration, config.Config, lineNumbers=True)
        else:
            raise ConfigurationError("Configuration must be a pre-validated XML object, a fileName, or a literal configuration string.")

        # set up logging
        setupLogging(configuration.matches(lambda x: isinstance(x, (config.Logging, config.Metadata))))
        self.logger = logging.getLogger()
        self.metadata = logging.getLogger("metadata")

        # begin "initialization" phase
        for l in self.logger, self.metadata:
            if "initialization" in l.differentLevel:
                l.setLevel(l.differentLevel["initialization"])
            else:
                l.setLevel(l.naturalLevel)

        # get the model, in whatever form you find it
        self.logger.info("Loading PMML model.")
        self.metadata.startTiming("Time to load PMML model")
        modelFileName = "(none)"
        maturityThreshold = 0
        if self.model is None:
            modelInput = configuration.child(config.ModelInput, exception=None)
            if modelInput is None:
                raise ConfigurationError("If a model is not provided to MainLoop explicitly, it must be present in the configuration file.")

            fileLocation = modelInput["fileLocation"]
            if not fileLocation.startswith("http://") and not fileLocation.startswith("https://"):
                fileList = glob.glob(fileLocation)
                # models written on exception are only used when nothing else matches
                if len(fileList) > 1:
                    fileList = [f for f in fileList if self._modelExceptionIdentifier not in f]
                if len(fileList) == 0:
                    raise IOError("No files matched the ModelInput fileLocation \"%s\"." % fileLocation)

                selectmode = modelInput.attrib.get("selectmode", "lastAlphabetic")
                if selectmode == "mostRecent":
                    fileLocation = max(fileList, key=lambda x: os.stat(x).st_mtime)
                elif selectmode == "lastAlphabetic":
                    fileList.sort()
                    fileLocation = fileList[-1]
                else:
                    assert False

                if self._modelExceptionIdentifier in fileLocation:
                    self.logger.warning("Using a PMML model that was written on exception (fileName \"%s\")" % fileLocation)

            self.model = xmlbase.loadfile(fileLocation, pmml.X_ODG_PMML, lineNumbers=True)
            # record which file was actually loaded, so that the metadata and
            # the output writer do not report "(none)" for a file-based model
            modelFileName = fileLocation

            if "maturityThreshold" in modelInput.attrib: maturityThreshold = modelInput["maturityThreshold"]

        elif isinstance(self.model, pmml.PMML):
            pass
        elif isinstance(self.model, basestring):
            try:
                self.model, modelFileName = xmlbase.loadfile(self.model, pmml.X_ODG_PMML, lineNumbers=True), self.model
            except IOError:
                self.model = xmlbase.load(self.model, pmml.X_ODG_PMML, lineNumbers=True)
        else:
            raise ConfigurationError("Model must be a pre-validated XML object, a fileName, or a literal PMML string.")
        self.metadata.stopTiming("Time to load PMML model")
        self.metadata.data["PMML model file"] = modelFileName

        # globally set random number seeds
        if "randomSeed" in configuration.attrib:
            augustusRandomSeed = configuration["randomSeed"]
            random.seed(augustusRandomSeed)
            numpy.random.seed(augustusRandomSeed + 1)
        else:
            augustusRandomSeed = "unspecified"

        # globally set numpy error handling
        numpy.seterr(divide="raise", over="raise", under="ignore", invalid="raise")

        # update schemes (producerUpdateScheme may be redefined below)
        consumerUpdateScheme = self._getUpdateScheme(configuration.child(config.ConsumerBlending, exception=False))
        producerUpdateScheme = self._getUpdateScheme(None)

        # set up scoring output
        outputConfig = configuration.child(config.Output, exception=False)
        if outputConfig is None:
            self.outputWriter = None
        else:
            outputParams = {"pmmlFileName": modelFileName, "mode": outputConfig.destination.attrib.get("type", "XML").lower()}

            if isinstance(outputConfig.destination, config.ToFile):
                # "overwrite" truncates the file; otherwise output is appended
                overwrite = outputConfig.destination.attrib.get("overwrite", False)
                outputStream = codecs.open(outputConfig.destination["name"], "w" if overwrite else "a", encoding="utf-8")
            elif isinstance(outputConfig.destination, config.ToStandardError):
                outputStream = sys.stderr
            elif isinstance(outputConfig.destination, config.ToStandardOut):
                outputStream = sys.stdout
            else:
                assert False

            reportTag = outputConfig.child("ReportTag", exception=False)
            if reportTag:
                outputParams["reportName"] = reportTag.attrib.get("name", "Report")

            eventTag = outputConfig.child("EventTag", exception=False)
            if eventTag:
                outputParams["eventName"] = eventTag.attrib.get("name", "Event")
                outputParams["pseudoEventName"] = eventTag.attrib.get("pseudoName", "pseudoEvent")

            self.outputWriter = OutputWriter(outputStream, **outputParams)

        # initialize for the case of no output model
        engineSettings = {"maturityThreshold": maturityThreshold, "augustusRandomSeed": augustusRandomSeed}
        self.modelWriter = None
        segmentationScheme = SegmentationScheme(None, self.model)
        self.updateFlag = False
        self.aggregateUpdateFlag = False

        producerAlgorithm = dict(config.producerAlgorithmDefaults)
        for pa in producerAlgorithm.values():
            validationResult = pa.validate()
            assert validationResult is None

        # set up output model, if present in the configuration
        modelSetup = configuration.child(config.ModelSetup, exception=False)
        engineSettings["hasProducer"] = modelSetup is not None
        if engineSettings["hasProducer"]:
            self.logger.info("Setting up model updating/producing.")

            producerBlending = modelSetup.child(config.ProducerBlending, exception=False)
            producerUpdateScheme = self._getUpdateScheme(producerBlending)
            if producerBlending is not None and producerBlending.contains(config.MaturityThreshold):
                maturityConfig = producerBlending.child(config.MaturityThreshold)
                engineSettings["maturityThreshold"] = int(maturityConfig.attrib.get("threshold", 1))
                try:
                    engineSettings["lockingThreshold"] = int(maturityConfig.attrib["lockingThreshold"])
                except KeyError:
                    engineSettings["lockingThreshold"] = None

            # build the configured segmentation scheme before checking it, so
            # that the warning below looks at the user's SegmentationSchema
            # rather than the empty default created above
            segmentationScheme = SegmentationScheme(modelSetup.child(config.SegmentationSchema, exception=False), self.model)

            engineSettings["lockAllSegments"] = modelSetup.attrib.get("mode", None) == "lockExisting"
            if engineSettings["lockAllSegments"] and segmentationScheme is not None and not segmentationScheme._generic and not segmentationScheme._whiteList:
                self.logger.warning("The model is locked and no new segments are specified...new model files will be unchanged.")

            self.modelWriter = getModelWriter(modelSetup)
            if self.modelWriter is not None:
                if self.modelWriter.baseName is None:
                    self.fileNameOnException = self._modelExceptionIdentifier + ".pmml"
                else:
                    self.fileNameOnException = "".join([self.modelWriter.baseName, self._modelExceptionIdentifier, ".pmml"])
            else:
                self.logger.warning("There is no outputFile attribute in the ModelSetup; no new model file will be created.")

            self.updateFlag = modelSetup.attrib.get("updateEvery", "event") in ("event", "both")
            self.aggregateUpdateFlag = modelSetup.attrib.get("updateEvery", "event") in ("aggregate", "both")

            for pa in modelSetup.matches(config.ProducerAlgorithm):
                producerAlgorithm[pa["model"]] = pa
            if modelSetup.attrib.get("mode", None) == "updateExisting":
                for pa in producerAlgorithm.values():
                    pa.parameters["updateExisting"] = True
            if modelSetup.attrib.get("mode", None) == "replaceExisting":
                for pa in producerAlgorithm.values():
                    pa.parameters["updateExisting"] = False

        # to score or not to score
        eventSettings = configuration.child(config.EventSettings, exception=False)
        if eventSettings is not None:
            self.logger.info("Setting up output.")
            self.scoreFlag = eventSettings["score"]
            self.outputFlag = eventSettings["output"]
        else:
            self.scoreFlag = False
            self.outputFlag = False

        aggregationConfig = configuration.child(config.AggregationSettings, exception=False)
        if aggregationConfig is not None:
            self.aggregateScoreFlag = aggregationConfig["score"]
            self.aggregateOutputFlag = aggregationConfig["output"]
            self.aggregationSettings = dict(aggregationConfig.attrib)
        else:
            self.aggregateScoreFlag = False
            self.aggregateOutputFlag = False
            self.aggregationSettings = None

        self.metadata.data["Update model"] = "true" if self.updateFlag or self.aggregateUpdateFlag else "false"

        # build a scoring engine once without a dataStream (to evaluate any verification blocks)
        self.engine = Engine(self.model, None, producerUpdateScheme, consumerUpdateScheme, segmentationScheme, producerAlgorithm, **engineSettings)
        self.engine.initialize()
        if self.outputWriter is not None: self.outputWriter.open()

        # begin "verification" phase
        for l in self.logger, self.metadata:
            if "verification" in l.differentLevel:
                l.eventLogLevel = l.differentLevel["verification"]
                l.setLevel(l.differentLevel["verification"])
            else:
                l.eventLogLevel = l.naturalLevel
                l.setLevel(l.naturalLevel)

        # evaluate verification blocks
        modelVerificationConfig = configuration.child(config.ModelVerification, exception=False)
        if modelVerificationConfig is not None:
            verify(modelVerificationConfig, self.engine, self.logger, self.outputWriter)

        # verification can increment aggregate variables, but
        # aggregates should all start at zero at the start of real
        # processing, whether verification happened or not
        self.engine.flushAggregates()

        # get the dataStream, in whatever form you find it
        self.logger.info("Setting up data input.")
        if self.dataStream is None:
            configDataInput = configuration.child(config.DataInput, exception=None)
            if configDataInput is None:
                raise ConfigurationError("If a dataStream is not provided to MainLoop explicitly, it must be present in the configuration file.")
            if configDataInput.contains(config.FromFile):
                self.dataStream = DataStreamer(configDataInput.child(config.FromFile), self.engine.pmmlModel)
            elif configDataInput.contains(config.FromStandardIn):
                self.dataStream = DataStreamer(configDataInput.child(config.FromStandardIn), self.engine.pmmlModel)
            elif configDataInput.contains(config.FromHTTP):
                self.dataStream = AugustusHTTPDataStream(configDataInput.child(config.FromHTTP))
                # without an output writer there is nothing to respond with
                if self.outputWriter is None:
                    self.dataStream.respond = False
                if self.dataStream.respond:
                    self.dataStream.setupOutput(self.outputWriter)
            else:
                assert False

        # begin "eventLoop" phase
        for l in self.logger, self.metadata:
            if "eventloop" in l.differentLevel:
                l.eventLogLevel = l.differentLevel["eventloop"]
                l.setLevel(l.differentLevel["eventloop"])
            else:
                l.eventLogLevel = l.naturalLevel
                l.setLevel(l.naturalLevel)

        # possibly set up custom processing
        self.customProcessing = configuration.child(config.CustomProcessing, exception=False)
        if self.customProcessing is not None:
            # constants come from the model's X_ODG_CustomProcessingConstants
            # extension when present, otherwise from an empty read-only namespace
            constants = self.engine.pmmlModel.child(pmml.Extension, exception=False)
            if constants is None:
                constants = NameSpaceReadOnly()
            else:
                constants = constants.child(pmml.X_ODG_CustomProcessingConstants, exception=False)
                if constants is None:
                    constants = NameSpaceReadOnly()
                else:
                    constants = constants.nameSpace

            atoms = {"INVALID": INVALID, "MISSING": MISSING, "IMMATURE": IMMATURE, "MATURE": MATURE, "LOCKED": LOCKED, "UNINITIALIZED": UNINITIALIZED}
            for thing in pmml.OutputField.__dict__.values() + pmml.X_ODG_OutputField.__dict__.values():
                if isinstance(thing, Atom):
                    atoms[repr(thing)] = thing

            self.customProcessing.initialize(self.model, self.engine.pmmlModel, constants, [s.userFriendly for s in self.engine.segmentRecords], atoms, self.logger, self.metadata, consumerUpdateScheme, producerUpdateScheme)
            self.engine.customProcessing = self.customProcessing
            self.engine.reinitialize()

        else:
            # only turn off circular garbage collection if there is no CustomProcessing or AugustusInterface
            gc.disable()
Example #16
0
def main(config_file=None, rethrowExceptions=False):
    """From the configuration, set up the Augustus engine and run it.

    Set up segments, PMML tree, I/O, and logging for the
    ProducerConsumer.  Identify what task is to be done:
    Producing, Consuming, or Automatic Incremental Model
    updates (AIM).  Identify the model type and start that
    model, passing the segments, PMML tree, and I/O information.
    Then run the event loop until the data stream raises
    StopIteration.

    Arguments:

        config_file (string or config.AugustusConfiguration):
            Path to the configuration file, or an already-loaded
            configuration object (used as-is).

        rethrowExceptions (bool):
            If true, exceptions raised in the event loop propagate
            to the caller.  Otherwise they are logged, the last
            model is written to the on-exception file name (if one
            was set up), and the process exits through sys.exit.
    """
    # Get the configuration settings (as an XML instance)
    if isinstance(config_file, config.AugustusConfiguration):
        configRoot = config_file
    else:
        configRoot = xmlbase.loadfile(config_file, config.Config, lineNumbers=True)

    setupLogging(configRoot.matches(lambda x: x.tag in ("Logging", "Metadata")))
    logger = logging.getLogger()
    metadata = logging.getLogger('metadata')

    logger.info("Loading PMML model.")
    child = configRoot.child(config.ModelInput, exception=False)
    metadata.startTiming("Time to load PMML model")
    pmmlModel, pmmlFileName = getModel(child)
    metadata.stopTiming("Time to load PMML model")
    metadata.data["PMML model file"] = pmmlFileName

    logger.info("Setting up data input.")
    child = configRoot.child(config.DataInput, exception=False)
    dataStreamer = getDataStreamer(child)

    child = configRoot.child(config.ConsumerBlending, exception=False)
    consumerUpdateScheme = getUpdateScheme(child)

    child = configRoot.child(config.ModelSetup, exception=False)
    # Default Model setup parameters
    modelWriter = getModelWriter(None)
    engineSettings = dict(maturityThreshold=0)
    producerParameters = {}
    filenameOnException = None
    producerUpdateScheme = getUpdateScheme(None)
    segmentationScheme = SegmentationScheme(None, pmmlModel)
    aggregateUpdateFlag = updateFlag = False

    # Model setup
    if child:
        logger.info("Setting up model updating/producing.")
        modelWriter = getModelWriter(child)
        segmentationScheme = SegmentationScheme(child.child(config.SegmentationSchema, exception=False), pmmlModel)
        if modelWriter is not None:
            engineSettings['lockAllSegments'] = child.attrib.get("mode", None) == "lockExisting"
            producerParameters['resume'] = child.attrib.get("mode", None) == "updateExisting"
            updateFlag = child.attrib.get("updateEvery", "event") in ("event", "both")
            aggregateUpdateFlag = child.attrib.get("updateEvery", "event") in ("aggregate", "both")
            # a writer without a base name would make "".join fail on None
            if modelWriter.baseName is None:
                filenameOnException = _modelExceptionIdentifier + ".pmml"
            else:
                filenameOnException = "".join([modelWriter.baseName, _modelExceptionIdentifier, ".pmml"])
            child = child.child(config.ProducerBlending, exception=False)
            producerUpdateScheme = getUpdateScheme(child)
            if child and child.exists(config.MaturityThreshold):
                maturityConfig = child.child(config.MaturityThreshold)
                engineSettings['maturityThreshold'] = int(maturityConfig.attrib.get("threshold", 1))
                engineSettings['lockingThreshold'] = \
                    None if "lockingThreshold" not in \
                    maturityConfig.attrib \
                    else int(maturityConfig["lockingThreshold"])
            if engineSettings['lockAllSegments'] and not segmentationScheme._generic and not segmentationScheme.whiteList:
                logger.warning("The model is locked and no new segments are specified...new model files will be unchanged.")
        else:
            logger.warning("There is no outputFile attribute in the ModelSetup; no new model file will be created.")

    # Set up output
    child = configRoot.child(config.Output, exception=False)
    outputWriter = getOutputWriter(child, pmmlFileName)
    child = configRoot.child(config.EventSettings, exception=False)
    if child is not None:
        logger.info("Setting up output.")
        # not in a dictionary to reduce the number of lookups while looping
        scoreFlag = child.attrib['score']
        outputFlag = child.attrib['output']
    else:
        scoreFlag = outputFlag = False
    child = configRoot.child(config.AggregationSettings, exception=False)
    if child is not None:
        aggregateScoreFlag = child.attrib['score']
        aggregateOutputFlag = child.attrib['output']
        aggregationSettings = child.attrib
    else:
        # keep the aggregate flags defined even when aggregation is off
        aggregateScoreFlag = False
        aggregateOutputFlag = False
        aggregationSettings = None

    metadata.data['Update model'] = "true" if updateFlag or aggregateUpdateFlag else "false"

    # build engine once without a data stream
    engine = Engine(pmmlModel, None, producerUpdateScheme, consumerUpdateScheme, segmentationScheme, **engineSettings)
    engine.initialize(producerParameters=producerParameters)
    if outputWriter: outputWriter.open()

    # score fake data from <ModelVerifications>
    modelVerificationConfig = configRoot.child(config.ModelVerification, exception=False)
    if modelVerificationConfig is not None:
        augustus.engine.verification.verify(modelVerificationConfig, engine, logger, outputWriter)

    # start of real data
    logger.info("Setting up Augustus's main engine.")
    engine.resetDataStream(dataStreamer)
    dataStreamer.start_streaming()

    metadata.data['Events'] = 0
    logger.info("Calculating.")
    metadata.startTiming("Run time")

    try:
        while True:
            try:
                score = engine.event(score=scoreFlag, update=updateFlag)
                metadata.data['Events'] += 1
                if outputWriter and outputFlag:
                    try:
                        outputWriter.write(score)
                    except IOError:
                        ## FIXME: this exception should be raised to the top level; I do not
                        ## understand why it is handled here, nor why a 'good' model is written...--tanya
                        if modelWriter:
                            modelWriter.write(pmmlModel)
                        break
                if modelWriter:
                    modelWriter.serialize(pmmlModel, metadata.data['Events'])

                if aggregationSettings:
                    if engine.checkPseudoeventReadiness(aggregationSettings):
                        score = engine.pseudoevent(score=aggregateScoreFlag, update=aggregateUpdateFlag)
                        # use the flag read from AggregationSettings; the name
                        # previously used here was never defined (NameError)
                        if outputWriter and aggregateOutputFlag:
                            outputWriter.write(score)

            except StopIteration:
                # end of the data stream: write the final model and stop
                if modelWriter:
                    if modelWriter.serialization:
                        modelWriter.serialize(pmmlModel, metadata.data['Events'])
                    else:
                        modelWriter.write(pmmlModel)
                break

        if aggregationSettings is not None and aggregationSettings['atEnd']:
            score = engine.pseudoevent(score=aggregateScoreFlag, update=aggregateUpdateFlag)
            if outputWriter and aggregateOutputFlag:
                outputWriter.write(score)

    except (Exception, KeyboardInterrupt):
        if rethrowExceptions: raise

        logger.error("Shutting down on exception...")
        excinfo = sys.exc_info()
        logger.error("...%s" % excinfo[0])
        logger.error("...%s" % excinfo[1])
        logger.error("...%s" % traceback.format_exc())
        if filenameOnException:
            logger.error("Writing last model in location %s" % filenameOnException)
            pmmlModel.write(filenameOnException)

        sys.exit("Shutting down on exception; for more information check the logfile (if logging is enabled)...\n%s" % traceback.format_exc())
Example #17
0
 def post_validate(self):
     """Load the configuration file named by this object's "fileName" entry.

     The file is read with xmlbase.loadfile as an
     augustus.core.config.Config document, with line numbers enabled,
     and the result is kept in self.config.
     """
     configFileName = self["fileName"]
     configClass = augustus.core.config.Config
     self.config = xmlbase.loadfile(configFileName, configClass, lineNumbers=True)
Example #18
0
        for line in input_file:
            number_of_rows += 1
            if number_of_rows <= 3:
                print "\t", line.strip()
    print "\t", line
    checkForQuit()

    print "To load an XML file using Augustus' xmlbase library, first include the library"
    print "from augustus.core import xmlbase  # type this at the top of a script"
    print "\nthen load the file:"
    print """
filename = "../results/example_scores.xml"
root_element = xmlbase.loadfile(filename)
    """
    checkForQuit()
    root_element = xmlbase.loadfile(filename)

    print "To access a tag of an xml element, use 'element.tag' for example,"
    print "if we type:"
    print ">>> root_element.tag"
    checkForQuit()
    print "we get:"
    print root_element.tag
    checkForQuit()

    print "The element's attributes are stored as a Python dictionary."
    print "The dictionary is named 'attrib', so for example,"
    print ">>> root_element.attrib"
    checkForQuit()
    print "gets:"
    print root_element.attrib
Example #19
0
    def __init__(self, config_file=None, rethrowExceptions=False, dataStream=None):
        """Set up logging, the PMML model, I/O and the engine from a configuration.

        Arguments:

            config_file (string or config.AugustusConfiguration):
                Path to the configuration file, or an already-loaded
                configuration object (used as-is).

            rethrowExceptions (bool):
                Stored on the instance; not otherwise used here.

            dataStream:
                If None, the data stream is built from the
                configuration's DataInput (an AugustusHTTPDataStream
                when it has a FromHTTP child, otherwise whatever
                getDataStreamer returns); otherwise used as given.

        Raises:

            Exception: if one of config.producerAlgorithmDefaults
                fails its own validation.
        """
        # Get the configuration settings (as an XML instance)
        if isinstance(config_file, config.AugustusConfiguration):
            configRoot = config_file
        else:
            configRoot = xmlbase.loadfile(config_file, config.Config, lineNumbers=True)

        if "randomSeed" in configRoot.attrib:
            augustusRandomSeed = configRoot.attrib["randomSeed"]
            random.seed(augustusRandomSeed)
            numpy.random.seed(augustusRandomSeed + 1)
        else:
            augustusRandomSeed = "unspecified"

        setupLogging(configRoot.matches(lambda x: x.tag in ("Logging", "Metadata")))
        logger = logging.getLogger()
        metadata = logging.getLogger("metadata")

        # begin "initialization" phase
        for l in logger, metadata:
            if "initialization" in l.differentLevel:
                l.setLevel(l.differentLevel["initialization"])
            else:
                l.setLevel(l.naturalLevel)

        logger.info("Loading PMML model.")
        modelInput = configRoot.child(config.ModelInput, exception=False)
        metadata.startTiming("Time to load PMML model")
        pmmlModel, pmmlFileName = getModel(modelInput)
        metadata.stopTiming("Time to load PMML model")
        metadata.data["PMML model file"] = pmmlFileName

        logger.info("Setting up data input.")
        child = configRoot.child(config.DataInput, exception=False)
        if dataStream is None:
            fromHTTP = child.child(config.FromHTTP, exception=False)
            if fromHTTP is None:
                dataStreamer = getDataStreamer(child)
            else:
                dataStreamer = AugustusHTTPDataStream(fromHTTP)
        else:
            dataStreamer = dataStream

        child = configRoot.child(config.ConsumerBlending, exception=False)
        consumerUpdateScheme = getUpdateScheme(child)

        child = configRoot.child(config.ModelSetup, exception=False)
        # Default Model setup parameters
        modelWriter = getModelWriter(None)
        engineSettings = {"maturityThreshold": modelInput.attrib.get("maturityThreshold", 0),
                          "augustusRandomSeed": augustusRandomSeed,
                          "hasProducer": True,
                          }
        filenameOnException = None
        producerUpdateScheme = getUpdateScheme(None)
        segmentationScheme = SegmentationScheme(None, pmmlModel)
        aggregateUpdateFlag = updateFlag = False

        # work on a copy, so that per-configuration ProducerAlgorithm entries
        # do not overwrite the module-level defaults for later instances
        producerAlgorithm = dict(config.producerAlgorithmDefaults)
        for pa in producerAlgorithm.values():
            if pa.validate() is not None:
                raise Exception("Programmer error in producerAlgorithmDefaults")
        if child is not None:
            for pa in child.matches(config.ProducerAlgorithm):
                producerAlgorithm[pa.attrib["model"]] = pa

        # Model setup
        if child:
            logger.info("Setting up model updating/producing.")
            modelWriter = getModelWriter(child)
            segmentationScheme = SegmentationScheme(child.child(config.SegmentationSchema, exception=False), pmmlModel)
            if modelWriter is not None:
                engineSettings["lockAllSegments"] = child.attrib.get("mode", None) == "lockExisting"
                if child.attrib.get("mode", None) == "updateExisting":
                    for pa in producerAlgorithm.values():
                        pa.parameters["updateExisting"] = True
                if child.attrib.get("mode", None) == "replaceExisting":
                    for pa in producerAlgorithm.values():
                        pa.parameters["updateExisting"] = False

                updateFlag = child.attrib.get("updateEvery", "event") in ("event", "both")
                aggregateUpdateFlag = child.attrib.get("updateEvery", "event") in ("aggregate", "both")
                # a writer without a base name would make "".join fail on None
                if modelWriter.baseName is None:
                    filenameOnException = _modelExceptionIdentifier + ".pmml"
                else:
                    filenameOnException = "".join([modelWriter.baseName, _modelExceptionIdentifier, ".pmml"])
                child = child.child(config.ProducerBlending, exception=False)
                producerUpdateScheme = getUpdateScheme(child)
                if child and child.exists(config.MaturityThreshold):
                    maturityConfig = child.child(config.MaturityThreshold)
                    engineSettings["maturityThreshold"] = int(maturityConfig.attrib.get("threshold", 1))
                    engineSettings["lockingThreshold"] = \
                        None if "lockingThreshold" not in \
                        maturityConfig.attrib \
                        else int(maturityConfig["lockingThreshold"])
                if engineSettings["lockAllSegments"] and segmentationScheme is not None and not segmentationScheme._generic and not segmentationScheme._whiteList:
                    logger.warning("The model is locked and no new segments are specified...new model files will be unchanged.")
            else:
                logger.warning("There is no outputFile attribute in the ModelSetup; no new model file will be created.")
        else:
            engineSettings["hasProducer"] = False

        # Set up output
        child = configRoot.child(config.Output, exception=False)
        outputWriter = getOutputWriter(child, pmmlFileName)
        child = configRoot.child(config.EventSettings, exception=False)
        if child is not None:
            logger.info("Setting up output.")
            # not in a dictionary to reduce the number of lookups while looping
            scoreFlag = child.attrib["score"]
            outputFlag = child.attrib["output"]
        else:
            scoreFlag = outputFlag = False
        child = configRoot.child(config.AggregationSettings, exception=False)
        if child is not None:
            aggregateScoreFlag = child.attrib["score"]
            aggregateOutputFlag = child.attrib["output"]
            aggregationSettings = child.attrib
        else:
            aggregateScoreFlag = False
            aggregateOutputFlag = False
            aggregationSettings = None

        metadata.data["Update model"] = "true" if updateFlag or aggregateUpdateFlag else "false"

        # build engine once without a data stream
        engine = Engine(pmmlModel, None, producerUpdateScheme, consumerUpdateScheme, segmentationScheme, producerAlgorithm, **engineSettings)
        engine.initialize()
        if outputWriter: outputWriter.open()

        # begin "verification" phase
        for l in logger, metadata:
            if "verification" in l.differentLevel:
                l.eventLogLevel = l.differentLevel["verification"]
                l.setLevel(l.differentLevel["verification"])
            else:
                l.eventLogLevel = l.naturalLevel
                l.setLevel(l.naturalLevel)

        # score fake data from <ModelVerifications>
        modelVerificationConfig = configRoot.child(config.ModelVerification, exception=False)
        if modelVerificationConfig is not None:
            augustus.engine.verification.verify(modelVerificationConfig, engine, logger, outputWriter)

        # verification can increment aggregate variables, but
        # aggregates should all start at zero at the start of real
        # processing, whether verification happened or not
        engine.flushAggregates()

        if isinstance(dataStreamer, AugustusHTTPDataStream):
            # without an output writer there is nothing to respond with
            if outputWriter is None:
                dataStreamer.respond = False
            if dataStreamer.respond:
                dataStreamer.setupOutput(outputWriter)

        # begin "eventloop" phase
        for l in logger, metadata:
            if "eventloop" in l.differentLevel:
                l.eventLogLevel = l.differentLevel["eventloop"]
                l.setLevel(l.differentLevel["eventloop"])
            else:
                l.eventLogLevel = l.naturalLevel
                l.setLevel(l.naturalLevel)

        # possibly set up custom processing
        customProcessing = configRoot.child(config.CustomProcessing, exception=False)
        if customProcessing is not None:
            # constants come from the model's X_ODG_CustomProcessingConstants
            # extension when present, otherwise from an empty read-only namespace
            constants = engine.pmmlModel.child(pmml.Extension, exception=False)
            if constants is None:
                constants = NameSpaceReadOnly()
            else:
                constants = constants.child(pmml.X_ODG_CustomProcessingConstants, exception=False)
                if constants is None:
                    constants = NameSpaceReadOnly()
                else:
                    constants = constants.nameSpace

            atoms = {"INVALID": INVALID, "MISSING": MISSING, "IMMATURE": IMMATURE, "MATURE": MATURE, "LOCKED": LOCKED, "UNINITIALIZED": UNINITIALIZED}
            for thing in pmml.OutputField.__dict__.values() + pmml.X_ODG_OutputField.__dict__.values():
                if isinstance(thing, Atom):
                    atoms[repr(thing)] = thing

            customProcessing.initialize(pmmlModel, engine.pmmlModel, constants, [s.userFriendly for s in engine.segmentRecords], atoms, logger, metadata, consumerUpdateScheme, producerUpdateScheme)
            engine.customProcessing = customProcessing
            engine.reinitialize()

        else:
            # only shut off circular garbage collection if there is no CustomProcessing or AugustusInterface
            gc.disable()

        # keep everything the event loop needs on the instance
        self.dataStreamer = dataStreamer
        self.logger = logger
        self.engine = engine
        self.metadata = metadata
        self.aggregationSettings = aggregationSettings
        self.rethrowExceptions = rethrowExceptions
        self.scoreFlag = scoreFlag
        self.updateFlag = updateFlag
        self.outputWriter = outputWriter
        self.outputFlag = outputFlag
        self.modelWriter = modelWriter
        self.filenameOnException = filenameOnException
        self.pmmlModel = pmmlModel
        self.aggregateScoreFlag = aggregateScoreFlag
        self.aggregateUpdateFlag = aggregateUpdateFlag
        self.aggregateOutputFlag = aggregateOutputFlag
        self.customProcessing = customProcessing
Example #20
0
def pmmlDiff(name1,
             name2,
             validate=False,
             numSigfigs=6,
             header=False,
             extensions=False):
    """Compare two PMML files and describe the first difference found.

    Both files are loaded with loadfile() and their elements are paired up
    in walk() order.  For each pair the tag, the set of attribute names, the
    attribute values, the "value" member (when either element has one) and,
    for elements with no matches(), the content() are compared in that order.

    Arguments:
        name1, name2: passed to loadfile() to load the two documents; also
            passed to _comparitor() when a difference is reported.
        validate: if true, both files are loaded as pmml.X_ODG_PMML;
            otherwise they are loaded without naming a document class.
        numSigfigs: passed to sigfigs() for pairs of float values before
            they are compared (presumably the number of significant
            figures that must agree).
        header: if false, the element tagged "Header" located by index()
            is deleted from each document before comparing.  A document
            in which index() finds no such element is left unchanged.
        extensions: selects which elements are walked; see the
            NOTE(review) in the body.

    Returns:
        None if no difference was found, otherwise a string describing the
        first difference, ending with the output of _comparitor().
    """
    if validate:
        file1 = loadfile(name1, pmml.X_ODG_PMML, lineNumbers=True)
        file2 = loadfile(name2, pmml.X_ODG_PMML, lineNumbers=True)
    else:
        file1 = loadfile(name1, lineNumbers=True)
        file2 = loadfile(name2, lineNumbers=True)

    if not header:
        # exception=False makes a missing Header a no-op instead of an error
        for tree in (file1, file2):
            index = tree.index(lambda x: x.tag == "Header", exception=False)
            if index is not None:
                del tree[index]

    # The leading None entry stands for the document root itself.
    if extensions:
        index1 = [None] + [i for i, x in file1.walk()]
        index2 = [None] + [i for i, x in file2.walk()]
    else:
        # NOTE(review): the predicate selects pmml.Extension instances while
        # extensions is False -- confirm whether walk() uses the predicate to
        # include or to exclude elements; this may be inverted.
        index1 = [None] + [
            i for i, x in file1.walk(lambda x: isinstance(x, pmml.Extension))
        ]
        index2 = [None] + [
            i for i, x in file2.walk(lambda x: isinstance(x, pmml.Extension))
        ]

    # Pad the shorter list with BROKEN so that zip() reaches the end of the
    # longer one and the surplus elements show up as a structure difference.
    if len(index1) < len(index2):
        index1 += [BROKEN] * (len(index2) - len(index1))
    if len(index2) < len(index1):
        index2 += [BROKEN] * (len(index1) - len(index2))

    # show problems in the order that they appear in the files
    for i1, i2 in zip(index1, index2):
        if i1 is None:
            elem1 = file1
        elif i1 is BROKEN:
            elem1 = BROKEN
        else:
            elem1 = file1[i1]

        if i2 is None:
            elem2 = file2
        elif i2 is BROKEN:
            elem2 = BROKEN
        else:
            elem2 = file2[i2]

        # if we have a structure problem
        if i1 != i2:
            return "Different structure:%s" % _comparitor(
                name1, elem1, name2, elem2)

        if elem1.tag != elem2.tag:
            return "Different tag: \"%s\" vs. \"%s\"%s" % (
                elem1.tag, elem2.tag,
                _comparitor(name1, elem1, name2, elem2))

        if set(elem1.attrib.keys()) != set(elem2.attrib.keys()):
            return "Different attributes: %s vs. %s%s" % (
                sorted(elem1.attrib.keys()), sorted(elem2.attrib.keys()),
                _comparitor(name1, elem1, name2, elem2))

        for k in sorted(elem1.attrib.keys()):
            value1 = elem1.attrib[k]
            value2 = elem2.attrib[k]

            if isinstance(value1, float) and isinstance(value2, float):
                value1 = sigfigs(value1, numSigfigs)
                value2 = sigfigs(value2, numSigfigs)

            if value1 != value2:
                return "Different attribute value for \"%s\": %s %s vs. %s %s%s" % (
                    k, value1, str(type(elem1.attrib[k])), value2,
                    str(type(elem2.attrib[k])),
                    _comparitor(name1, elem1, name2, elem2))

        v1 = getattr(elem1, "value", NOTFOUND)
        v2 = getattr(elem2, "value", NOTFOUND)
        if v1 is not NOTFOUND or v2 is not NOTFOUND:
            value1, value2 = v1, v2

            if isinstance(value1, (tuple, list)) and isinstance(
                    value2, (tuple, list)) and len(value1) == len(value2):
                # equal-length sequences: apply sigfigs() item by item
                out1, out2 = [], []
                for val1, val2 in zip(value1, value2):
                    if isinstance(val1, float) and isinstance(val2, float):
                        val1 = sigfigs(val1, numSigfigs)
                        val2 = sigfigs(val2, numSigfigs)
                    out1.append(val1)
                    out2.append(val2)
                value1, value2 = out1, out2

            elif isinstance(v1, float) and isinstance(v2, float):
                value1 = sigfigs(v1, numSigfigs)
                value2 = sigfigs(v2, numSigfigs)

            if value1 != value2:
                # Name the element by its tag (both tags are equal here).
                # The attribute-loop variable k must not be used: it is
                # stale, or unbound if no element so far had attributes.
                return "Different value for \"%s\": %s %s vs. %s %s%s" % (
                    elem1.tag, value1, str(type(v1)), value2, str(type(v2)),
                    _comparitor(name1, elem1, name2, elem2))

        # if both elements are leaves
        if len(elem1.matches()) == 0 and len(elem2.matches()) == 0:
            content1 = elem1.content()
            content2 = elem2.content()

            if content1 != content2:
                return "Different content: \"%s\" vs. \"%s\"%s" % (
                    content1, content2,
                    _comparitor(name1, elem1, name2, elem2))

    return None
Example #21
0
def pmmlDiff(name1, name2, validate=False, numSigfigs=6, header=False, extensions=False):
    """Compare two PMML files and describe the first difference found.

    Both files are loaded with loadfile(), optionally stripped of their
    Header and Extension elements, and then their elements are paired up in
    walk() order.  For each pair the tag, the set of attribute names, the
    attribute values, the "value" member (when either element has one) and,
    for elements with no matches(), the content() are compared in that order.

    Arguments:
        name1, name2: passed to loadfile() to load the two documents; also
            passed to _comparitor() when a difference is reported.
        validate: if true, both files are loaded as pmml.X_ODG_PMML;
            otherwise they are loaded without naming a document class.
        numSigfigs: passed to sigfigs() for pairs of float values before
            they are compared (presumably the number of significant
            figures that must agree).
        header: if false, the element tagged "Header" located by index()
            is deleted from each document, when there is one.
        extensions: if false, elements tagged "Extension" are deleted from
            each document, one at a time, until index() finds no more.

    Returns:
        None if no difference was found, otherwise a string describing the
        first difference, ending with the output of _comparitor().
    """
    if validate:
        file1 = loadfile(name1, pmml.X_ODG_PMML, lineNumbers=True)
        file2 = loadfile(name2, pmml.X_ODG_PMML, lineNumbers=True)
    else:
        file1 = loadfile(name1, lineNumbers=True)
        file2 = loadfile(name2, lineNumbers=True)

    if not header:
        for tree in (file1, file2):
            index = tree.index(lambda x: x.tag == "Header", exception=False)
            if index is not None:
                del tree[index]

    if not extensions:
        for tree in (file1, file2):
            # index() reports one match per call, so keep deleting until it
            # comes back empty
            while True:
                index = tree.index(lambda x: x.tag == "Extension", maxdepth=None, exception=False)
                if index is None:
                    break
                del tree[index]

    # The leading None entry stands for the document root itself.
    index1 = [None] + [i for i, x in file1.walk()]
    index2 = [None] + [i for i, x in file2.walk()]

    # Pad the shorter list with BROKEN so that zip() reaches the end of the
    # longer one and the surplus elements show up as a structure difference.
    if len(index1) < len(index2):
        index1 += [BROKEN] * (len(index2) - len(index1))
    if len(index2) < len(index1):
        index2 += [BROKEN] * (len(index1) - len(index2))

    # show problems in the order that they appear in the files
    for i1, i2 in zip(index1, index2):
        if i1 is None:
            elem1 = file1
        elif i1 is BROKEN:
            elem1 = BROKEN
        else:
            elem1 = file1[i1]

        if i2 is None:
            elem2 = file2
        elif i2 is BROKEN:
            elem2 = BROKEN
        else:
            elem2 = file2[i2]

        # if we have a structure problem
        if i1 != i2:
            return "Different structure:%s" % _comparitor(name1, elem1, name2, elem2)

        if elem1.tag != elem2.tag:
            return "Different tag: \"%s\" vs. \"%s\"%s" % (elem1.tag, elem2.tag, _comparitor(name1, elem1, name2, elem2))

        if set(elem1.attrib.keys()) != set(elem2.attrib.keys()):
            return "Different attributes: %s vs. %s%s" % (sorted(elem1.attrib.keys()), sorted(elem2.attrib.keys()), _comparitor(name1, elem1, name2, elem2))

        for k in sorted(elem1.attrib.keys()):
            value1 = elem1.attrib[k]
            value2 = elem2.attrib[k]

            if isinstance(value1, float) and isinstance(value2, float):
                value1 = sigfigs(value1, numSigfigs)
                value2 = sigfigs(value2, numSigfigs)

            if value1 != value2:
                return "Different attribute value for \"%s\": %s %s vs. %s %s%s" % (k, value1, str(type(elem1.attrib[k])), value2, str(type(elem2.attrib[k])), _comparitor(name1, elem1, name2, elem2))

        v1 = getattr(elem1, "value", NOTFOUND)
        v2 = getattr(elem2, "value", NOTFOUND)
        if v1 is not NOTFOUND or v2 is not NOTFOUND:
            value1, value2 = v1, v2

            if isinstance(value1, (tuple, list)) and isinstance(value2, (tuple, list)) and len(value1) == len(value2):
                # equal-length sequences: apply sigfigs() item by item
                out1, out2 = [], []
                for val1, val2 in zip(value1, value2):
                    if isinstance(val1, float) and isinstance(val2, float):
                        val1 = sigfigs(val1, numSigfigs)
                        val2 = sigfigs(val2, numSigfigs)
                    out1.append(val1)
                    out2.append(val2)
                value1, value2 = out1, out2

            elif isinstance(v1, float) and isinstance(v2, float):
                value1 = sigfigs(v1, numSigfigs)
                value2 = sigfigs(v2, numSigfigs)

            if value1 != value2:
                # Name the element by its tag (both tags are equal here).
                # The attribute-loop variable k must not be used: it is
                # stale, or unbound if no element so far had attributes.
                return "Different value for \"%s\": %s %s vs. %s %s%s" % (elem1.tag, value1, str(type(v1)), value2, str(type(v2)), _comparitor(name1, elem1, name2, elem2))

        # if both elements are leaves
        if len(elem1.matches()) == 0 and len(elem2.matches()) == 0:
            content1 = elem1.content()
            content2 = elem2.content()

            if content1 != content2:
                return "Different content: \"%s\" vs. \"%s\"%s" % (content1, content2, _comparitor(name1, elem1, name2, elem2))

    return None
import glob

from augustus.core.xmlbase import loadfile
import augustus.core.pmml41 as pmml
from cassius import *

modelFiles = glob.glob("_out/modelout*.pmml")
modelFiles.sort()

models = []
for modelFile in modelFiles:
    model = loadfile(modelFile, pmml.X_ODG_PMML)
    models.append(model)

print model.tree()

plots = []
for model in models:
    eventNumber = model.descendant(pmml.X_ODG_Eventstamp)["number"]

    discretize = model.descendant(pmml.Discretize)
    bins = []
    for discretizeBin in discretize.matches(pmml.DiscretizeBin):
        binName = discretizeBin["binValue"]
        leftMargin = discretizeBin.child(pmml.Interval)["leftMargin"]
        rightMargin = discretizeBin.child(pmml.Interval)["rightMargin"]
        bins.append((binName, leftMargin, rightMargin))
    bins.sort(lambda a, b: cmp(a[1], b[1]))

    h = HistogramNonUniform(
        [(b[1], b[2]) for b in bins],