def do_POST(self):
    # Busy-wait until the data stream has consumed the previous request.
    while self.server.dataStream.request is not None:
        time.sleep(0)

    logger = logging.getLogger()
    logDebug = logger.getEffectiveLevel() <= logging.DEBUG

    try:
        plen = self.headers.getheader("Content-length")
        if plen is None:
            self.send_error(400, "Content-length header is missing")
            logger.error("HTTP request with no Content-length: %s" % str(self.headers))
            return

        data = self.rfile.read(int(plen))

        try:
            content = xmlbase.load(data)
        except (xmlbase.XMLError, xmlbase.XMLValidationError), err:
            self.send_error(400, "Content is not valid (%s: %s)" % (err.__class__.__name__, str(err)))
            logger.error("HTTP request with invalid content (%s: %s):\n%s" % (err.__class__.__name__, str(err), data))
            return

        if content.tag == "Event":
            if "id" not in content.attrib:
                self.send_error(400, "Event must have an 'id' attribute")
                logger.error("HTTP request without 'id': %s" % data)
                return
        else:
            self.send_error(400, "Request has unrecognized tag: %s" % content.tag)
            logger.error("HTTP request unrecognized tag: %s" % content.tag)
            return

    # The remainder of the handler (handing the validated event to the data
    # stream, plus the matching except clause) is not included in this excerpt;
    # the bare re-raise below is only a placeholder to keep the try block
    # syntactically complete.
    except Exception, err:
        raise
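# A minimal client-side sketch of posting an <Event> to this handler, assuming
# the AugustusHTTPDataStream server is listening on localhost port 8000; the
# host, port, and event payload below are illustrative, not taken from the
# source.

import urllib2

event = '<Event id="1"><feature>3.2</feature></Event>'
request = urllib2.Request("http://localhost:8000", data=event)
response = urllib2.urlopen(request)   # urllib2 sets Content-length automatically
print response.read()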
def __init__(self, configuration, dataStream=None, connect="", exceptions=True):
    if isinstance(configuration, config.AugustusConfiguration):
        configuration.validate(exception=True)
    else:
        try:
            configuration = xmlbase.loadfile(configuration, config.Config, lineNumbers=True)
        except IOError:
            configuration = xmlbase.load(configuration, config.Config)

    if configuration.exists(config.CustomProcessing):
        raise Exception("The Augustus class defines its own <CustomProcessing>; please leave this out of the configuration file.")

    self.dataStream = dataStream
    if configuration.child(config.DataInput).exists(config.Interactive):
        if dataStream is None:
            raise Exception("If the configuration has a DataInput <Interactive> block, then a DataStream object must be provided.")
    else:
        if dataStream is not None:
            raise Exception("If the configuration has no DataInput <Interactive> block, then a DataStream object must not be provided.")

    persistentStorage = config.PersistentStorage(connect=connect)
    persistentStorage.validate()

    customProcessing = config.CustomProcessing(persistentStorage)
    customProcessing.code = None
    customProcessing.callbackClass = self
    configuration.children.append(customProcessing)

    self.configuration = configuration
    self.mainLoop = augustus.engine.mainloop.MainLoop(self.configuration, rethrowExceptions=exceptions, dataStream=self.dataStream)
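# A minimal usage sketch, assuming the enclosing class is named Augustus (as
# its error messages suggest) and a configuration file without a DataInput
# <Interactive> block, so no DataStream object is passed. The file name is
# illustrative, and the run() call is an assumption about MainLoop's entry
# point, not something shown in this excerpt.

augustusInstance = Augustus("configuration.xcfg")
augustusInstance.mainLoop.run()   # assumed MainLoop entry point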
def post_validate(self):
    if "validate" not in self.attrib:
        self["validate"] = True
    try:
        self.data = xmlbase.load(sys.stdin.read(), pmml.X_ODG_PMML, validation=self["validate"])
    except XMLValidationError, err:
        raise RuntimeError("StandardInput PMML failed validation: %s" % str(err))
def __init__(self, configuration, model=None, dataStream=None, rethrowExceptions=None):
    self.model = model
    self.dataStream = dataStream
    self.rethrowExceptions = rethrowExceptions
    self.fileNameOnException = None

    # get the configuration, in whatever form you find it
    if isinstance(configuration, config.AugustusConfiguration):
        pass
    elif isinstance(configuration, basestring):
        try:
            configuration = xmlbase.loadfile(configuration, config.Config, lineNumbers=True)
        except IOError:
            configuration = xmlbase.load(configuration, config.Config, lineNumbers=True)
    else:
        raise ConfigurationError("Configuration must be a pre-validated XML object, a fileName, or a literal configuration string.")

    # set up logging
    setupLogging(configuration.matches(lambda x: isinstance(x, (config.Logging, config.Metadata))))
    self.logger = logging.getLogger()
    self.metadata = logging.getLogger("metadata")

    # begin "initialization" phase
    for l in self.logger, self.metadata:
        if "initialization" in l.differentLevel:
            l.setLevel(l.differentLevel["initialization"])
        else:
            l.setLevel(l.naturalLevel)

    # get the model, in whatever form you find it
    self.logger.info("Loading PMML model.")
    self.metadata.startTiming("Time to load PMML model")
    modelFileName = "(none)"
    maturityThreshold = 0

    if self.model is None:
        modelInput = configuration.child(config.ModelInput, exception=None)
        if modelInput is None:
            raise ConfigurationError("If a model is not provided to MainLoop explicitly, it must be present in the configuration file.")

        fileLocation = modelInput["fileLocation"]
        if not fileLocation.startswith("http://") and not fileLocation.startswith("https://"):
            fileList = glob.glob(fileLocation)
            if len(fileList) > 1:
                fileList = [f for f in fileList if self._modelExceptionIdentifier not in f]
            if len(fileList) == 0:
                raise IOError("No files matched the ModelInput fileLocation \"%s\"." % fileLocation)

            selectmode = modelInput.attrib.get("selectmode", "lastAlphabetic")
            if selectmode == "mostRecent":
                fileLocation = max(fileList, key=lambda x: os.stat(x).st_mtime)
            elif selectmode == "lastAlphabetic":
                fileList.sort()
                fileLocation = fileList[-1]
            else:
                assert False

            if self._modelExceptionIdentifier in fileLocation:
                self.logger.warning("Using a PMML model that was written on exception (fileName \"%s\")" % fileLocation)

        self.model = xmlbase.loadfile(fileLocation, pmml.X_ODG_PMML, lineNumbers=True)

        if "maturityThreshold" in modelInput.attrib:
            maturityThreshold = modelInput["maturityThreshold"]

    elif isinstance(self.model, pmml.PMML):
        pass
    elif isinstance(self.model, basestring):
        try:
            self.model, modelFileName = xmlbase.loadfile(self.model, pmml.X_ODG_PMML, lineNumbers=True), self.model
        except IOError:
            self.model = xmlbase.load(self.model, pmml.X_ODG_PMML, lineNumbers=True)
    else:
        raise ConfigurationError("Model must be a pre-validated XML object, a fileName, or a literal PMML string.")

    self.metadata.stopTiming("Time to load PMML model")
    self.metadata.data["PMML model file"] = modelFileName

    # globally set random number seeds
    if "randomSeed" in configuration.attrib:
        augustusRandomSeed = configuration["randomSeed"]
        random.seed(augustusRandomSeed)
        numpy.random.seed(augustusRandomSeed + 1)
    else:
        augustusRandomSeed = "unspecified"

    # globally set numpy error handling
    numpy.seterr(divide="raise", over="raise", under="ignore", invalid="raise")

    # update schemes (producerUpdateScheme may be redefined below)
    consumerUpdateScheme = self._getUpdateScheme(configuration.child(config.ConsumerBlending, exception=False))
    producerUpdateScheme = self._getUpdateScheme(None)

    # set up scoring output
    outputConfig = configuration.child(config.Output, exception=False)
    if outputConfig is None:
        self.outputWriter = None
    else:
        outputParams = {"pmmlFileName": modelFileName, "mode": outputConfig.destination.attrib.get("type", "XML").lower()}

        if isinstance(outputConfig.destination, config.ToFile):
            if outputConfig.destination.attrib.get("overwrite", False):
                outputStream = codecs.open(outputConfig.destination["name"], "w", encoding="utf-8")
            else:
                outputStream = codecs.open(outputConfig.destination["name"], "a", encoding="utf-8")
        elif isinstance(outputConfig.destination, config.ToStandardError):
            outputStream = sys.stderr
        elif isinstance(outputConfig.destination, config.ToStandardOut):
            outputStream = sys.stdout
        else:
            assert False

        reportTag = outputConfig.child("ReportTag", exception=False)
        if reportTag:
            outputParams["reportName"] = reportTag.attrib.get("name", "Report")

        eventTag = outputConfig.child("EventTag", exception=False)
        if eventTag:
            outputParams["eventName"] = eventTag.attrib.get("name", "Event")
            outputParams["pseudoEventName"] = eventTag.attrib.get("pseudoName", "pseudoEvent")

        self.outputWriter = OutputWriter(outputStream, **outputParams)

    # initialize for the case of no output model
    engineSettings = {"maturityThreshold": maturityThreshold, "augustusRandomSeed": augustusRandomSeed}
    self.modelWriter = None
    segmentationScheme = SegmentationScheme(None, self.model)
    self.updateFlag = False
    self.aggregateUpdateFlag = False

    producerAlgorithm = dict(config.producerAlgorithmDefaults)
    for pa in producerAlgorithm.values():
        validationResult = pa.validate()
        assert validationResult is None

    # set up output model, if present in the configuration
    modelSetup = configuration.child(config.ModelSetup, exception=False)
    engineSettings["hasProducer"] = modelSetup is not None
    if engineSettings["hasProducer"]:
        self.logger.info("Setting up model updating/producing.")

        producerBlending = modelSetup.child(config.ProducerBlending, exception=False)
        producerUpdateScheme = self._getUpdateScheme(producerBlending)
        if producerBlending is not None and producerBlending.contains(config.MaturityThreshold):
            maturityConfig = producerBlending.child(config.MaturityThreshold)
            engineSettings["maturityThreshold"] = int(maturityConfig.attrib.get("threshold", 1))
            try:
                engineSettings["lockingThreshold"] = int(maturityConfig.attrib["lockingThreshold"])
            except KeyError:
                engineSettings["lockingThreshold"] = None

        engineSettings["lockAllSegments"] = modelSetup.attrib.get("mode", None) == "lockExisting"
        if engineSettings["lockAllSegments"] and segmentationScheme is not None and not segmentationScheme._generic and not segmentationScheme._whiteList:
            self.logger.warning("The model is locked and no new segments are specified...new model files will be unchanged.")

        self.modelWriter = getModelWriter(modelSetup)
        if self.modelWriter is not None:
            if self.modelWriter.baseName is None:
                self.fileNameOnException = self._modelExceptionIdentifier + ".pmml"
            else:
                self.fileNameOnException = "".join([self.modelWriter.baseName, self._modelExceptionIdentifier, ".pmml"])
        else:
            self.logger.warning("There is no outputFile attribute in the ModelSetup; no new model file will be created.")

        segmentationScheme = SegmentationScheme(modelSetup.child(config.SegmentationSchema, exception=False), self.model)
        self.updateFlag = modelSetup.attrib.get("updateEvery", "event") in ("event", "both")
        self.aggregateUpdateFlag = modelSetup.attrib.get("updateEvery", "event") in ("aggregate", "both")

        for pa in modelSetup.matches(config.ProducerAlgorithm):
            producerAlgorithm[pa["model"]] = pa
        if modelSetup.attrib.get("mode", None) == "updateExisting":
            for pa in producerAlgorithm.values():
                pa.parameters["updateExisting"] = True
        if modelSetup.attrib.get("mode", None) == "replaceExisting":
            for pa in producerAlgorithm.values():
                pa.parameters["updateExisting"] = False

    # to score or not to score
    eventSettings = configuration.child(config.EventSettings, exception=False)
    if eventSettings is not None:
        self.logger.info("Setting up output.")
        self.scoreFlag = eventSettings["score"]
        self.outputFlag = eventSettings["output"]
    else:
        self.scoreFlag = False
        self.outputFlag = False

    aggregationConfig = configuration.child(config.AggregationSettings, exception=False)
    if aggregationConfig is not None:
        self.aggregateScoreFlag = aggregationConfig["score"]
        self.aggregateOutputFlag = aggregationConfig["output"]
        self.aggregationSettings = dict(aggregationConfig.attrib)
    else:
        self.aggregateScoreFlag = False
        self.aggregateOutputFlag = False
        self.aggregationSettings = None

    self.metadata.data["Update model"] = "true" if self.updateFlag or self.aggregateUpdateFlag else "false"

    # build a scoring engine once without a dataStream (to evaluate any verification blocks)
    self.engine = Engine(self.model, None, producerUpdateScheme, consumerUpdateScheme, segmentationScheme, producerAlgorithm, **engineSettings)
    self.engine.initialize()
    if self.outputWriter is not None:
        self.outputWriter.open()

    # begin "verification" phase
    for l in self.logger, self.metadata:
        if "verification" in l.differentLevel:
            l.eventLogLevel = l.differentLevel["verification"]
            l.setLevel(l.differentLevel["verification"])
        else:
            l.eventLogLevel = l.naturalLevel
            l.setLevel(l.naturalLevel)

    # evaluate verification blocks
    modelVerificationConfig = configuration.child(config.ModelVerification, exception=False)
    if modelVerificationConfig is not None:
        verify(modelVerificationConfig, self.engine, self.logger, self.outputWriter)

    # verification can increment aggregate variables, but aggregates should
    # all start at zero at the start of real processing, whether verification
    # happened or not
    self.engine.flushAggregates()

    # get the dataStream, in whatever form you find it
    self.logger.info("Setting up data input.")
    if self.dataStream is None:
        configDataInput = configuration.child(config.DataInput, exception=None)
        if configDataInput is None:
            raise ConfigurationError("If a dataStream is not provided to MainLoop explicitly, it must be present in the configuration file.")

        if configDataInput.contains(config.FromFile):
            self.dataStream = DataStreamer(configDataInput.child(config.FromFile), self.engine.pmmlModel)
        elif configDataInput.contains(config.FromStandardIn):
            self.dataStream = DataStreamer(configDataInput.child(config.FromStandardIn), self.engine.pmmlModel)
        elif configDataInput.contains(config.FromHTTP):
            self.dataStream = AugustusHTTPDataStream(configDataInput.child(config.FromHTTP))
            if self.outputWriter is None:
                self.dataStream.respond = False
            if self.dataStream.respond:
                self.dataStream.setupOutput(self.outputWriter)
        else:
            assert False

    # begin "eventLoop" phase
    for l in self.logger, self.metadata:
        if "eventloop" in l.differentLevel:
            l.eventLogLevel = l.differentLevel["eventloop"]
            l.setLevel(l.differentLevel["eventloop"])
        else:
            l.eventLogLevel = l.naturalLevel
            l.setLevel(l.naturalLevel)

    # possibly set up custom processing
    self.customProcessing = configuration.child(config.CustomProcessing, exception=False)
    if self.customProcessing is not None:
        constants = self.engine.pmmlModel.child(pmml.Extension, exception=False)
        if constants is None:
            constants = NameSpaceReadOnly()
        else:
            constants = constants.child(pmml.X_ODG_CustomProcessingConstants, exception=False)
            if constants is None:
                constants = NameSpaceReadOnly()
            else:
                constants = constants.nameSpace

        atoms = {"INVALID": INVALID, "MISSING": MISSING, "IMMATURE": IMMATURE, "MATURE": MATURE, "LOCKED": LOCKED, "UNINITIALIZED": UNINITIALIZED}
        for thing in pmml.OutputField.__dict__.values() + pmml.X_ODG_OutputField.__dict__.values():
            if isinstance(thing, Atom):
                atoms[repr(thing)] = thing

        self.customProcessing.initialize(self.model, self.engine.pmmlModel, constants, [s.userFriendly for s in self.engine.segmentRecords], atoms, self.logger, self.metadata, consumerUpdateScheme, producerUpdateScheme)
        self.engine.customProcessing = self.customProcessing
        self.engine.reinitialize()

    else:
        # only turn off circular garbage collection if there is no CustomProcessing or AugustusInterface
        gc.disable()
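# A minimal sketch of constructing MainLoop directly with an explicit,
# pre-loaded model instead of a ModelInput block. The file names are
# illustrative, and the run() call is an assumed event-loop entry point,
# not shown in this excerpt.

modelXML = xmlbase.loadfile("model.pmml", pmml.X_ODG_PMML, lineNumbers=True)
mainLoop = MainLoop("configuration.xcfg", model=modelXML, rethrowExceptions=True)
mainLoop.run()   # assumed entry point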
def readUniTable(fileLocation, format=None, sorter=None, pageSize=None, mapInvalid=None, mapMissing=None, **parameters):
    format = getformat(fileLocation, format)

    ################################################################ CSV
    if format == "CSV":
        csvInput = CSVStream(fileLocation, sorter, **parameters)

        if csvInput.types is not None:
            types = csvInput.types
        else:
            types = dict((f, "string") for f in csvInput.fields)

        _mapInvalid = dict((f, str("INVALID") if types[f] in ("category", "string") else -1000) for f in csvInput.fields)
        if mapInvalid is None:
            mapInvalid = _mapInvalid
        else:
            _mapInvalid.update(mapInvalid)
            mapInvalid = _mapInvalid

        _mapMissing = dict((f, str("MISSING") if types[f] in ("category", "string") else -1000) for f in csvInput.fields)
        if mapMissing is None:
            mapMissing = _mapMissing
        else:
            _mapMissing.update(mapMissing)
            mapMissing = _mapMissing

        table = UniTable(csvInput.fields, types)
        table.initMemory(pageSize)
        for record in csvInput:
            table.fill([mapInvalid[f] if r is INVALID else mapMissing[f] if r is MISSING else r for f, r in zip(csvInput.fields, record)])
        return table

    ################################################################ XML
    if format == "XML":
        xmlInput = XMLStream(fileLocation, sorter, **parameters)

        if xmlInput.types is not None:
            types = xmlInput.types
        else:
            types = dict((f, "string") for f in xmlInput.fields)

        _mapInvalid = dict((f, str("INVALID") if types[f] in ("category", "string") else -1000) for f in xmlInput.fields)
        if mapInvalid is None:
            mapInvalid = _mapInvalid
        else:
            _mapInvalid.update(mapInvalid)
            mapInvalid = _mapInvalid

        _mapMissing = dict((f, str("MISSING") if types[f] in ("category", "string") else -1000) for f in xmlInput.fields)
        if mapMissing is None:
            mapMissing = _mapMissing
        else:
            _mapMissing.update(mapMissing)
            mapMissing = _mapMissing

        table = UniTable(xmlInput.fields, types)
        table.initMemory(pageSize)
        for record in xmlInput:
            table.fill([mapInvalid[f] if r is INVALID else r for f, r in [(f, record.get(f, mapMissing[f])) for f in xmlInput.fields]])
        return table

    ################################################################ NAB
    elif format == "NAB":
        fileNames = getfiles(fileLocation, sorter)
        if len(fileNames) == 0:
            raise IOError("No files match \"%s\" (even with wildcards)" % fileLocation)

        fields = None
        types = None
        strings = {}
        args = {}
        for fileName in fileNames:
            file = open(fileName, "rb")
            header = file.readline().rstrip()
            file.close()

            headerfields = header.decode("utf-8").split()
            if headerfields[0] != "RecArray":
                raise BadlyFormattedInputData("NAB file \"%s\" does not begin with 'RecArray'" % fileName)

            args[fileName] = dict(asciistr(f).split("=") for f in headerfields[1:])
            # check the per-file header dict, not the outer dict keyed by file name
            if "masktype" in args[fileName].keys():
                raise NotImplementedError("No support yet for NAB files (such as \"%s\") with masked NumPy arrays" % fileName)
            if set(args[fileName].keys()) != set(["formats", "names"]):
                raise BadlyFormattedInputData("NAB file \"%s\" headers are %s, rather than set([\"formats\", \"names\"])" % (fileName, str(set(args[fileName].keys()))))

            thisfields = args[fileName]["names"].split(",")
            thistypes = args[fileName]["formats"].split(",")
            for i in xrange(len(thistypes)):
                if thistypes[i][0] == "a":
                    thistypes[i] = "string"
                    strings[thisfields[i]] = True
                else:
                    strings[thisfields[i]] = False

            if fields is None:
                fields = thisfields
                types = thistypes
            else:
                if fields != thisfields:
                    raise IncompatibleFilesInChain("NAB file \"%s\" header has fields %s, which differ from the first %s" % (fileName, str(thisfields), str(fields)))
                if types != thistypes:
                    raise IncompatibleFilesInChain("NAB file \"%s\" header has types %s, which differ from the first %s" % (fileName, str(thistypes), str(types)))

        table = UniTable(fields, dict(zip(fields, types)))
        table.pages = []
        table.starts = []
        table.length = 0

        for fileName in fileNames:
            file = open(fileName, "rb")
            file.readline()
            data = numpy.rec.fromfile(file, **args[fileName])
            table.pageSize = len(data)

            page = UniPage(table.fields, table.types)
            arrays = {}
            for f in table.fields:
                arr = data.field(f)
                if strings[f]:
                    arr = [i.decode("utf-8") for i in arr]
                arrays[f] = arr
            page.initExisting(table.pageSize, arrays, copy=False, stringToCategory=True)
            table.pages.append(page)
            table.starts.append(table.length)
            table.length += len(data)

        return table

    ################################################################ XTBL
    elif format == "XTBL":
        fileNames = getfiles(fileLocation, sorter)
        if len(fileNames) == 0:
            raise IOError("No files match \"%s\" (even with wildcards)" % fileLocation)

        limitGB = parameters.get("limitGB", None)
        memoryMap = parameters.get("memoryMap", False)

        # get the footers from each file (XML) and make sure they have identical DataDictionaries
        footers = []
        for i, fileName in enumerate(fileNames):
            fileSize = os.stat(fileName).st_size
            file = open(fileName, "rb")
            file.seek(max(0, fileSize - 1024))
            text = file.read()

            m = re.search("<SeekFooter\s+byteOffset=\"([0-9]+)\"\s+/>", text)
            if m is not None:
                textStart = int(m.group(1))
            else:
                raise IOError("File \"%s\" does not have the right format (the <SeekFooter /> element was not found in the last kilobyte)" % fileName)

            file.seek(textStart)
            footer = load(file.read(), xtbl.XTBL)
            footers.append(footer)

            if len(footers) > 1:
                thisDataDictionary = footer.child(xtbl.DataDictionary)
                firstDataDictionary = footers[0].child(xtbl.DataDictionary)
                if thisDataDictionary != firstDataDictionary:
                    for x in thisDataDictionary.matches(xtbl.LookupTable, maxdepth=None) + firstDataDictionary.matches(xtbl.LookupTable, maxdepth=None):
                        x.serialize()
                    raise IncompatibleFilesInChain("XTBL file \"%s\" is incompatible with the first file \"%s\":%s%s%s%s" % (fileNames[i], fileNames[0], os.linesep, thisDataDictionary.xml(), os.linesep, firstDataDictionary.xml()))

            file.close()

        # set up the UniTable's fields, types, pages, starts, and length
        fields = []
        types = {}
        dtypes = {}
        lookups = {}
        for dataField in footers[0].child(xtbl.DataDictionary).matches(xtbl.DataField):
            field = dataField.attrib["name"]
            fields.append(field)
            types[field] = dataField.attrib["type"]
            dtypes[field] = dataField.attrib["dtype"]

            lookup = dataField.child(xtbl.LookupTable, exception=False)
            if lookup is not None:
                lookups[field] = lookup.n_to_v
            else:
                lookups[field] = None

        categories = []
        for f in fields:
            n_to_v = lookups[f]
            if n_to_v is None:
                categories.append(None)
            else:
                v_to_n = dict((v, n) for n, v in n_to_v.items())
                categories.append((v_to_n, n_to_v))

        table = UniTable(fields, types)
        table.pages = []
        table.starts = []
        table.length = 0

        uniPageDiskCacheManager = UniPageDiskCacheManager(limitGB, memoryMap)

        for i, fileName in enumerate(fileNames):
            for xtblpage in footers[i].child(xtbl.Pages).matches(xtbl.Page):
                length = xtblpage.attrib["length"]

                byteOffsets = {}
                for pageFieldOffset in xtblpage.matches(xtbl.PageFieldOffset):
                    byteOffsets[pageFieldOffset.attrib["name"]] = pageFieldOffset.attrib["byteOffset"]

                uniPage = UniPageOnDisk(fields, table.types)
                uniPage.initDisk(length, fileName, byteOffsets, dtypes, categories, uniPageDiskCacheManager)

                table.pages.append(uniPage)
                table.starts.append(table.length)
                table.length += length

        return table
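# A minimal usage sketch, assuming a CSV source whose invalid cells in one
# numeric field should map to -999 instead of the default -1000; the file and
# field names are illustrative, not taken from the source.

table = readUniTable("events.csv", format="CSV", mapInvalid={"score": -999})
print table.fields   # field names taken from the CSV header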