def __init__(self, errorCatcher=errorCatcher, fromHTTP=False,
             interactive=False, isXML=True, runForever=False,
             filename=None, **kwargs):
    """Record the run options and resolve the list of input files.

    Arguments:

        errorCatcher: stored unchanged as self._errorCatcher.

        fromHTTP, interactive, isXML, runForever (booleans): stored
        together in the self._runOptions NameSpace.

        filename (string or None): '-' is kept as the single entry
        ['-']; any other value is expanded with glob.glob.  Only
        consulted when neither interactive nor fromHTTP is set.

        **kwargs: accepted but not used by this constructor.

    Raises:

        RuntimeError: if no input file matches filename.
    """
    self._errorCatcher = errorCatcher
    self._runOptions = NameSpace(fromHTTP=fromHTTP,
                                 interactive=interactive,
                                 isXML=isXML,
                                 runForever=runForever)
    self._fileList = filename  # None or else will become a list
    self.currentFileNumber = 0

    self._logger = logging.getLogger()
    # Cache which severities the root logger currently lets through.
    effectiveLevel = self._logger.getEffectiveLevel()
    self._logLevels = NameSpace(
        DEBUG=effectiveLevel <= logging.DEBUG,
        INFO=effectiveLevel <= logging.INFO,
        WARNING=effectiveLevel <= logging.WARNING,
        ERROR=effectiveLevel <= logging.ERROR)
    self._metadata = logging.getLogger('metadata')

    self._thread = None
    self._values = None
    self._tables = collections.deque()
    self._buffers = collections.deque()

    if interactive:
        self._reader = None
    elif fromHTTP:
        pass
    else:
        if filename == '-':
            self._fileList = ['-']
        else:
            import glob
            self._fileList = glob.glob(filename)
            # list.sort() works in place and returns None.  The previous
            # code assigned the result of an undefined sort() call, which
            # raised NameError and discarded the glob result.
            self._fileList.sort()
            self._fileList.reverse()
        if not self._fileList:
            raise RuntimeError(
                "No Data Input files matched %s" % filename)
def initialize(self, existingSegment=False, customProcessing=None,
               setModelMaturity=False):
    """Initialize the consumer, the producer, and start the maturity count.

    Arguments:

        existingSegment (boolean): if False the segment starts unlocked;
        if True the lock state is read from the model-maturity element.

        customProcessing: if not None, its persistentStorage.db is used
        to attach a per-segment NameSpace as self.userFriendly.db.

        setModelMaturity (boolean): for an existing segment, seed the
        update counter from the stored "numUpdates" attribute.  The same
        happens when producerParameters["updateExisting"] is True.
    """
    # The maturity counter follows the producer's UpdateScheme.
    self.updator = self.engine.producerUpdateScheme.updator(COUNT)

    if existingSegment:
        resumeCount = setModelMaturity or (
            "updateExisting" in self.producerParameters
            and self.producerParameters["updateExisting"] is True)
        if resumeCount:
            self.updator.initialize(
                {COUNT: self.pmmlModelMaturity.attrib["numUpdates"]})
        self.lock = self.pmmlModelMaturity.attrib["locked"]
    else:
        self.lock = False
        self.pmmlModelMaturity.attrib["locked"] = False

    self.consumerAlgorithm.initialize()
    self.producerAlgorithm.initialize(**self.producerParameters)

    # Custom-processing constants live two levels down, under
    # Extension / X-ODG-CustomProcessingConstants; either level may be
    # absent, in which case an empty read-only namespace is used.
    found = self.pmmlModel.child(pmml.Extension, exception=False)
    if found is not None:
        found = found.child(pmml.X_ODG_CustomProcessingConstants,
                            exception=False)
    if found is None:
        self.constants = NameSpaceReadOnly()
    else:
        self.constants = found.nameSpace

    # Reuse an existing userFriendly object if there is one.  The
    # fallback instance is built before the lookup, as before.
    fallback = new.instance(Segment)
    self.userFriendly = getattr(self, "userFriendly", fallback)
    view = self.userFriendly
    view.name = self.name()
    view.pmmlPredicate = self.pmmlPredicate
    view.expression = self.expressionTree
    view.evaluate = self.predicateMatches
    view.pmmlModel = self.pmmlModel
    view.consumer = self.consumerAlgorithm
    view.producer = self.producerAlgorithm
    view.const = self.constants
    view.state = self.state

    if customProcessing is None:
        return

    # Give the segment its own slot in persistent storage, creating
    # the slot on first use.
    storage = customProcessing.persistentStorage.db
    segmentName = view.name
    if segmentName not in storage:
        storage[segmentName] = NameSpace()
    view.db = storage[segmentName]
def __init__(self, baseName, serialization=None,
             timeformat="%Y-%m-%d_%H-%M-%S", indent="", linesep="",
             pickle=False):
    """Store the output settings and reset the write statistics.

    note: serialization, if present, is a dictionary that contains
    {'byEventNumber':True/False, 'rollover':integer_value}
    in which 'rollover' has units of events or seconds.

    Arguments:

        baseName, timeformat, indent, linesep, pickle: stored
        unchanged on the instance.

        serialization (dictionary or None): None disables
        serialization (self.serialization is None).  An empty
        dictionary selects an hourly, time-based rollover.  Otherwise
        'rollover' is required and 'byEventNumber' is optional
        (default False).

    Raises:

        KeyError: if serialization is non-empty but has no 'rollover'.
    """
    self.baseName = baseName
    self.timeformat = timeformat
    self.indent = indent
    self.linesep = linesep
    self.pickle = pickle
    self._logging = logging.getLogger()
    self._metadata = logging.getLogger('metadata')
    self.thread = None

    if serialization is None:
        self.serialization = None
    else:
        self.serialization = NameSpace(byEventNumber=False)
        if len(serialization) == 0:
            self.serialization['rollover'] = 3600  # hourly
            # NOTE(review): the hourly default is time-based, so the
            # start time is recorded here as on the explicit time-based
            # path below; confirm against the code that reads 'start'.
            self.serialization['start'] = int(time.time())
        else:
            # Honour the VALUE of 'byEventNumber'.  The previous code
            # only tested for the key, so {'byEventNumber': False}
            # switched event-based rollover on.
            self.serialization.byEventNumber = bool(
                'byEventNumber' in serialization
                and serialization['byEventNumber'])
            if not self.serialization.byEventNumber:
                self.serialization['start'] = int(time.time())
            self.serialization['rollover'] = serialization['rollover']

    self.nameCollisions = {}

    # statistics on the writing process
    self._metadata.data["Models written"] = 0
    self._metadata.data["Model write collisions"] = 0
    self._metadata.data["Time writing models"] = 0
    self._metadata.data["Time copying models"] = 0
    self._metadata.data["Time waiting for write thread to unblock"] = 0
def initialize(self):
    """Interpret PMML file, set up SegmentRecords list, and initialize all algorithms.

    Steps, in order:

      1. Refresh the PMML Header: replace any existing random seed,
         event stamp and timestamp with fresh ones.
      2. Select the model to run (the first top-level model, or the one
         whose "modelName" attribute equals self.modelName).
      3. Reconnect the data stream and clear the model's data context.
      4. Build self.segmentRecords: one record per Segment for a
         MiningModel, or a single record for any other model type.
      5. Call self.reinitialize().

    Raises:

        RuntimeError: if self.modelName is set but no top-level model
        carries that name.

        NotImplementedError: if Aggregate or X-ODG-AggregateReduce
        transformations appear in the TransformationDictionary or in a
        MiningModel's LocalTransformations, or if the segmentation's
        multipleModelMethod is neither "selectFirst" nor "selectAll".
    """
    self.firstSegment = True

    # set up the header, so that our models can be stamped with time and event number
    header = self.pmmlFile.child(pmml.Header)

    # Reuse the header's Extension if present; otherwise create one and
    # put it first among the header's children.
    if header.exists(pmml.Extension):
        headerExtension = header.child(pmml.Extension)
    else:
        headerExtension = pmml.Extension()
        header.children.insert(0, headerExtension)

    # Drop any stale random seed, then record the current one.
    if headerExtension.exists(pmml.X_ODG_RandomSeed):
        del headerExtension[headerExtension.index(pmml.X_ODG_RandomSeed)]
    augustusRandomSeed = pmml.X_ODG_RandomSeed(value=self.augustusRandomSeed)
    headerExtension.children.append(augustusRandomSeed)

    # Same for the event stamp, which restarts at zero.
    if headerExtension.exists(pmml.X_ODG_Eventstamp):
        del headerExtension[headerExtension.index(pmml.X_ODG_Eventstamp)]
    self.eventStamp = pmml.X_ODG_Eventstamp(number=0)
    headerExtension.children.append(self.eventStamp)

    # Same for the timestamp, which is set to the current local time.
    if header.exists(pmml.Timestamp):
        del header[header.index(pmml.Timestamp)]
    self.timeStamp = pmml.Timestamp(xmlbase.XMLText(datetime.datetime.today().isoformat()))
    header.children.append(self.timeStamp)

    # select the first model or select a model by name
    if self.modelName is None:
        self.pmmlModel = self.pmmlFile.topModels[0]
    else:
        self.pmmlModel = None
        for model in self.pmmlFile.topModels:
            if "modelName" in model.attrib and model.attrib["modelName"] == self.modelName:
                self.pmmlModel = model
                break
        if self.pmmlModel is None:
            raise RuntimeError, "No model named \"%s\" was found in the PMML file" % self.modelName

    # connect the dataContext to the dataStream, so that events will flow from the input file into the transformations
    self.resetDataStream(self.dataStream)

    # clear the cache of the model's DataContexts (initializes some dictionaries)
    self.pmmlModel.dataContext.clear()
    if self.pmmlModel.dataContext.transformationDictionary:
        self.metadata.data["Transformation dictionary elements"] = len(self.pmmlModel.dataContext.transformationDictionary.cast)
    else:
        self.metadata.data["Transformation dictionary elements"] = 0

    self.segmentRecords = []
    self._lookup = NameSpace(tuples={}, fields={}, other=[])
    # The thresholds are set on the SegmentRecord class itself, so they
    # are shared by every record.
    SegmentRecord.maturityThreshold = self.maturityThreshold
    SegmentRecord.lockingThreshold = self.lockingThreshold

    # Aggregates are rejected anywhere inside the TransformationDictionary.
    if self.pmmlFile.exists(pmml.TransformationDictionary):
        if self.pmmlFile.child(pmml.TransformationDictionary).exists(pmml.Aggregate, maxdepth=None):
            raise NotImplementedError, "Aggregate transformations in the TransformationDictionary are not supported"
        if self.pmmlFile.child(pmml.TransformationDictionary).exists(pmml.X_ODG_AggregateReduce, maxdepth=None):
            raise NotImplementedError, "X-ODG-AggregateReduce transformations in the TransformationDictionary are not supported"

    # MiningModels are special because we handle segmentation at the Engine level
    # Currently no support for MiningModels nested within MiningModels
    if isinstance(self.pmmlModel, pmml.MiningModel):
        self.pmmlOutput = self.pmmlModel.child(pmml.Output, exception=False)
        segmentation = self.pmmlModel.child(pmml.Segmentation, exception=False)
        # for now, assume a MiningModel without any segments will be populated through autosegmentation
        # NOTE(review): child(..., exception=False) is tested against None
        # further down in this method; if it can return None here too, the
        # segmentation.attrib lookup below would fail -- confirm.

        if self.pmmlModel.exists(pmml.LocalTransformations):
            if self.pmmlModel.child(pmml.LocalTransformations).exists(pmml.Aggregate, maxdepth=None):
                raise NotImplementedError, "Aggregate transformations in the MiningModel's LocalTransformations are not supported"
            if self.pmmlModel.child(pmml.LocalTransformations).exists(pmml.X_ODG_AggregateReduce, maxdepth=None):
                raise NotImplementedError, "X-ODG-AggregateReduce transformations in the MiningModel's LocalTransformations are not supported"

        if segmentation.attrib["multipleModelMethod"] == "selectFirst":
            self.multipleModelMethod = SELECTFIRST
        elif segmentation.attrib["multipleModelMethod"] == "selectAll":
            self.multipleModelMethod = SELECTALL
        else:
            raise NotImplementedError, "Only 'selectFirst', 'selectAll', and no segmentation have been implemented."
        self.metadata.data["Match all segments"] = self.multipleModelMethod != SELECTFIRST

        # One record per explicit Segment element.
        for pmmlSegment in segmentation.matches(pmml.Segment):
            self._makeSegmentRecord(pmmlSegment)

    else:
        # Any other model type is wrapped in a single segment record.
        self.multipleModelMethod = SELECTONLY

        segmentRecord = SegmentRecord(self.pmmlModel, None, None, self)

        # Consumer and producer algorithms are looked up by model class
        # (and, for the producer, by the configured algorithm name).
        modelClass = self.pmmlModel.__class__
        algoName = self.producerAlgorithm[modelClass.__name__].attrib["algorithm"]
        segmentRecord.consumerAlgorithm = consumerAlgorithmMap[modelClass](self, segmentRecord)
        segmentRecord.producerAlgorithm = producerAlgorithmMap[modelClass, algoName](self, segmentRecord)
        segmentRecord.producerParameters = self.producerAlgorithm[modelClass.__name__].parameters
        self.setProvenance(self.pmmlModel, algoName, segmentRecord.producerAlgorithm, segmentRecord.producerParameters)

        # Collect both kinds of aggregate from the model's
        # LocalTransformations, if it has any.
        localTransformations = self.pmmlModel.child(pmml.LocalTransformations, exception=False)
        if localTransformations is not None:
            segmentRecord.aggregates = localTransformations.matches(pmml.Aggregate, maxdepth=None)
            segmentRecord.aggregates.extend(localTransformations.matches(pmml.X_ODG_AggregateReduce, maxdepth=None))
        else:
            segmentRecord.aggregates = []

        for aggregate in segmentRecord.aggregates:
            aggregate.initialize(self.consumerUpdateScheme)

        self.segmentRecords.append(segmentRecord)
        self.metadata.data["First segment model type"] = segmentRecord.pmmlModel.tag

    self.reinitialize()
def __matchesPartition(matcher, partition):
    """Return True if matcher passes every bounded comparator in partition.

    partition is iterated as (bound, comparator) pairs.  A pair whose
    bound is None is skipped; otherwise comparator(matcher, bound) must
    be true.  An empty partition matches.
    """
    for bound, comparator in partition:
        if bound is not None and not comparator(matcher, bound):
            return False
    return True

# Comparison functions and predicate-type tests, collected in one
# NameSpace together with __matchesPartition.
_segmentHelpers = NameSpace(
    lessThan=lambda x, val: x < val,
    lessOrEqual=lambda x, val: x <= val,
    greaterThan=lambda x, val: x > val,
    greaterOrEqual=lambda x, val: x >= val,
    isCompoundAnd=lambda x: isinstance(x, pmml.CompoundPredicate) and x.attrib['booleanOperator'] == "and",
    isSimpleEqual=lambda x: isinstance(x, pmml.SimplePredicate) and x.attrib['operator'] == "equal",
    isComparator=lambda x: isinstance(x, pmml.SimplePredicate) and \
        x.attrib['operator'][0] in ('l', 'g'),  # less|greater + Than|OrEqual
    matchesPartition=__matchesPartition)

########################################################### Engine

class Engine:
    """Object called by Augustus main event loop to process one event/pseudoevent."""

    def __init__(self, pmmlFile, dataStream, producerUpdateScheme, consumerUpdateScheme, segmentationScheme, producerAlgorithm,
def __init__(self, fromHTTP=False, interactive=False, isXML=True,
             isCSV=False, runForever=False, maxsize=0, filename=None,
             **kwargs):
    """Set up the reading function and queue for the DataStreamer.

    DataStreamer's constructor is typically invoked by calling
    getDataStreamer(config_options), defined below.  Error checking
    for appropriate configuration settings, and for sufficient
    contents in **kwargs is presumed to be done during XSD validation.
    The reason this initialization function is separate is to allow
    an advanced user to call the streamer from a script and bypass
    having to make an XML object containing configuration settings.

    Arguments:

        fromHTTP (boolean; default False):
        If True, the reader will be an HTTPInterfaceServer; kwargs
        must then contain 'port' and 'url'.

        interactive (boolean; default False):
        If True, the reader will be None and the user will push
        data to the queue to score using self.enqueue(self, dictionary)
        in which dictionary is a dictionary or a UniRecord; a row
        in a UniTable.

        isXML (boolean; default True):
        If True, the reader will process the input stream as XML;
        otherwise it is read as a UniTable.

        isCSV (boolean; default False):
        Stored on the reader as its isCSV attribute (not used in
        interactive mode).

        runForever (boolean; default False):
        If True, run forever. Otherwise read all data and then exit.

        maxsize (integer; default 0):
        The maximum number of objects allowed in self.queue.
        If zero, the Queue can be arbitrarily long.

        filename (string; default None):
        '-' or a glob pattern naming the input file(s); only used
        when neither interactive nor fromHTTP is set.

        **kwargs (arguments for the Reader)

    Raises:

        RuntimeError: if no input file matches filename.
    """
    self._runOptions = NameSpace(
        fromHTTP=fromHTTP,
        interactive=interactive,
        isXML=isXML,
        runForever=runForever)
    self._fileList = filename  # None or else will become a list...
    self.currentFileNumber = 0
    self._logger = logging.getLogger()
    self._metadata = logging.getLogger('metadata')
    self._thread = None
    self._values = None
    self._queue = Queue.Queue(maxsize)

    callback = self._xmlCallback if isXML else self._unitableCallback

    if interactive:
        self._reader = None

    elif fromHTTP:
        def http_callback(data):
            # Each HTTP payload is parsed by its own short-lived Reader.
            wrapper = StringIO.StringIO(data)
            rdr = Reader(callback,
                         source=wrapper,
                         logger=self._logger,
                         magicheader=False,
                         unitable=not isXML,
                         wholeUniTable=not isXML)
            pipe = rdr.new_pipe()
            try:
                result = rdr.feed_pipe(None, pipe)
            except Exception:
                # Narrowed from a bare except so that KeyboardInterrupt
                # and SystemExit are no longer reported as I/O errors.
                raise IOError("Problem reading data over HTTP.")
            return result

        self._reader = HTTPInterfaceServer(
            ('', kwargs['port']), logger=logging.getLogger(''))
        self._reader.register_callback(kwargs['url'], http_callback)
        self._reader.isCSV = isCSV

    else:
        if filename == '-':
            self._fileList = ['-']
        else:
            import glob
            self._fileList = glob.glob(filename)
            # Descending order, so that pop() below hands the files
            # out in ascending order.
            self._fileList.sort()
            self._fileList.reverse()
        if not self._fileList:
            raise RuntimeError(
                "No Data Input files matched %s" % filename)

        self._reader = Reader(callback,
                              unitable=not isXML,
                              wholeUniTable=not isXML,
                              **kwargs)
        self._reader.source = self._fileList.pop()
        self._reader.isCSV = isCSV
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import os
import new
import string
import re
import math

import augustus.core.xmlbase as xmlbase
import augustus.core.pmml41 as pmml
from augustus.core.xmlbase import XMLValidationError, load_xsdType, load_xsdGroup, load_xsdElement
from augustus.core.defs import Atom, NameSpace

globalVariables = NameSpace()

class PmmlSed(xmlbase.XML):
    """XML element whose tag name is looked up from its class in classMap."""

    topTag = "PmmlSed"
    xsdType = {}
    xsdGroup = {}
    classMap = {}

    def __init__(self, *children, **attrib):
        """Build the element under the classMap name registered for this class.

        Raises Exception if the instance's class is not a value in classMap.
        """
        # Reverse lookup: take the first classMap key whose value is
        # this instance's class.
        for tagName, registeredClass in self.classMap.items():
            if registeredClass == self.__class__:
                break
        else:
            raise Exception("PmmlSed class is missing from the classMap (programmer error)")

        xmlbase.XML.__init__(self, tagName, *children, **attrib)