Example #1
    def do_POST(self):
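        # Busy-wait (yielding the CPU each pass) until the previous request has been consumed.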
        while self.server.dataStream.request is not None:
            time.sleep(0)

        logger = logging.getLogger()
        logDebug = logger.getEffectiveLevel() <= logging.DEBUG

        try:
            plen = self.headers.getheader("Content-length")
            if plen is None:
                self.send_error(400, "Content-length header is missing")
                logger.error("HTTP request with no Content-length: %s" % str(self.headers))
                return

            data = self.rfile.read(int(plen))

            try:
                content = xmlbase.load(data)
            except (xmlbase.XMLError, xmlbase.XMLValidationError), err:
                self.send_error(400, "Content is not valid (%s: %s)" % (err.__class__.__name__, str(err)))
                logger.error("HTTP request with invalid content (%s: %s):\n%s" % (err.__class__.__name__, str(err), data))
                return

            if content.tag == "Event":
                if "id" not in content.attrib:
                    self.send_error(400, "Event must have an 'id' attribute")
                    logger.error("HTTP request without 'id': %s" % data)
                    return
            else:
                self.send_error(400, "Request has unrecognized tag: %s" % content.tag)
                logger.error("HTTP request unrecognized tag: %s" % content.tag)
                return
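
The handler above accepts only a well-formed <Event> element carrying an 'id' attribute and answers everything else with HTTP 400. Below is a minimal client sketch for exercising it; the host, port, and payload contents are assumptions, not taken from the snippet.

import httplib

# Hypothetical Event payload; only the <Event id="..."> shape is required by do_POST.
payload = '<Event id="event-1"><x>1.0</x></Event>'
conn = httplib.HTTPConnection("localhost", 8080)   # hypothetical host and port
conn.request("POST", "/", payload)                 # httplib supplies the Content-length header
response = conn.getresponse()
print response.status, response.reason             # 400 unless the body is a valid <Event id="...">
conn.close()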
Example #2
    def __init__(self, configuration, dataStream=None, connect="", exceptions=True):
        if isinstance(configuration, config.AugustusConfiguration):
            configuration.validate(exception=True)
        else:
            try:
                configuration = xmlbase.loadfile(configuration, config.Config, lineNumbers=True)
            except IOError:
                configuration = xmlbase.load(configuration, config.Config)

        if configuration.exists(config.CustomProcessing):
            raise Exception("The Augustus class defines its own <CustomProcessing>; please leave this out of the configuration file.")

        self.dataStream = dataStream
        if configuration.child(config.DataInput).exists(config.Interactive):
            if dataStream is None:
                raise Exception("If the configuration has a DataInput <Interactive> block, then a DataStream object must be provided.")
        else:
            if dataStream is not None:
                raise Exception("If the configuration has no DataInput <Interactive> block, then a DataStream object must not be provided.")

        persistentStorage = config.PersistentStorage(connect=connect)
        persistentStorage.validate()
        customProcessing = config.CustomProcessing(persistentStorage)
        customProcessing.code = None
        customProcessing.callbackClass = self
        configuration.children.append(customProcessing)

        self.configuration = configuration
        self.mainLoop = augustus.engine.mainloop.MainLoop(self.configuration, rethrowExceptions=exceptions, dataStream=self.dataStream)
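
The constructor above accepts the configuration as a pre-validated config.AugustusConfiguration object, a file name, or a literal XML string (the IOError fallback). A hedged construction sketch follows; the file name and myDataStream are assumptions, while the class name Augustus comes from the snippet's own error message.

# Hypothetical usage of the constructor above.
interface = Augustus("configuration.xcfg")
# If the configuration contains a DataInput <Interactive> block, a DataStream
# object must be supplied (and must be omitted otherwise):
interface = Augustus("configuration.xcfg", dataStream=myDataStream)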
Example #3
    def post_validate(self):
        if "validate" not in self.attrib:
            self["validate"] = True

        try:
            self.data = xmlbase.load(sys.stdin.read(), pmml.X_ODG_PMML, validation=self["validate"])
        except XMLValidationError, err:
            raise RuntimeError("StandardInput PMML failed validation: %s" % str(err))
Example #4
    def post_validate(self):
        if "validate" not in self.attrib:
            self["validate"] = True

        try:
            self.data = xmlbase.load(sys.stdin.read(), pmml.X_ODG_PMML, validation=self["validate"])
        except XMLValidationError, err:
            raise RuntimeError, "StandardInput PMML failed validation: %s" % str(err)
Example #5
    def __init__(self,
                 configuration,
                 dataStream=None,
                 connect="",
                 exceptions=True):
        if isinstance(configuration, config.AugustusConfiguration):
            configuration.validate(exception=True)
        else:
            try:
                configuration = xmlbase.loadfile(configuration,
                                                 config.Config,
                                                 lineNumbers=True)
            except IOError:
                configuration = xmlbase.load(configuration, config.Config)

        if configuration.exists(config.CustomProcessing):
            raise Exception(
                "The Augustus class defines its own <CustomProcessing>; please leave this out of the configuration file."
            )

        self.dataStream = dataStream
        if configuration.child(config.DataInput).exists(config.Interactive):
            if dataStream is None:
                raise Exception(
                    "If the configuration has a DataInput <Interactive> block, then a DataStream object must be provided."
                )
        else:
            if dataStream is not None:
                raise Exception(
                    "If the configuration has no DataInput <Interactive> block, then a DataStream object must not be provided."
                )

        persistentStorage = config.PersistentStorage(connect=connect)
        persistentStorage.validate()
        customProcessing = config.CustomProcessing(persistentStorage)
        customProcessing.code = None
        customProcessing.callbackClass = self
        configuration.children.append(customProcessing)

        self.configuration = configuration
        self.mainLoop = augustus.engine.mainloop.MainLoop(
            self.configuration,
            rethrowExceptions=exceptions,
            dataStream=self.dataStream)
Example #6
    def __init__(self,
                 configuration,
                 model=None,
                 dataStream=None,
                 rethrowExceptions=None):
        self.model = model
        self.dataStream = dataStream
        self.rethrowExceptions = rethrowExceptions
        self.fileNameOnException = None

        # get the configuration, in whatever form you find it
        if isinstance(configuration, config.AugustusConfiguration):
            pass
        elif isinstance(configuration, basestring):
            try:
                configuration = xmlbase.loadfile(configuration,
                                                 config.Config,
                                                 lineNumbers=True)
            except IOError:
                configuration = xmlbase.load(configuration,
                                             config.Config,
                                             lineNumbers=True)
        else:
            raise ConfigurationError(
                "Configuration must be a pre-validated XML object, a fileName, or a literal configuration string."
            )

        # set up logging
        setupLogging(
            configuration.matches(
                lambda x: isinstance(x, (config.Logging, config.Metadata))))
        self.logger = logging.getLogger()
        self.metadata = logging.getLogger("metadata")

        # begin "initialization" phase
        for l in self.logger, self.metadata:
            if "initialization" in l.differentLevel:
                l.setLevel(l.differentLevel["initialization"])
            else:
                l.setLevel(l.naturalLevel)

        # get the model, in whatever form you find it
        self.logger.info("Loading PMML model.")
        self.metadata.startTiming("Time to load PMML model")
        modelFileName = "(none)"
        maturityThreshold = 0
        if self.model is None:
            modelInput = configuration.child(config.ModelInput, exception=None)
            if modelInput is None:
                raise ConfigurationError(
                    "If a model is not provided to MainLoop explicitly, it must be present in the configuration file."
                )

            fileLocation = modelInput["fileLocation"]
            if not fileLocation.startswith("http://") and not fileLocation.startswith("https://"):
                fileList = glob.glob(fileLocation)
                if len(fileList) > 1:
                    fileList = [
                        f for f in fileList
                        if self._modelExceptionIdentifier not in f
                    ]
                if len(fileList) == 0:
                    raise IOError(
                        "No files matched the ModelInput fileLocation \"%s\"."
                        % fileLocation)

                selectmode = modelInput.attrib.get("selectmode",
                                                   "lastAlphabetic")
                if selectmode == "mostRecent":
                    fileLocation = max(fileList,
                                       key=lambda x: os.stat(x).st_mtime)
                elif selectmode == "lastAlphabetic":
                    fileList.sort()
                    fileLocation = fileList[-1]
                else:
                    assert False

                if self._modelExceptionIdentifier in fileLocation:
                    self.logger.warning(
                        "Using a PMML model that was written on exception (fileName \"%s\")"
                        % fileLocation)

            self.model = xmlbase.loadfile(fileLocation,
                                          pmml.X_ODG_PMML,
                                          lineNumbers=True)

            if "maturityThreshold" in modelInput.attrib:
                maturityThreshold = modelInput["maturityThreshold"]

        elif isinstance(self.model, pmml.PMML):
            pass
        elif isinstance(self.model, basestring):
            try:
                self.model, modelFileName = xmlbase.loadfile(
                    self.model, pmml.X_ODG_PMML, lineNumbers=True), self.model
            except IOError:
                self.model = xmlbase.load(self.model,
                                          pmml.X_ODG_PMML,
                                          lineNumbers=True)
        else:
            raise ConfigurationError(
                "Model must be a pre-validated XML object, a fileName, or a literal PMML string."
            )
        self.metadata.stopTiming("Time to load PMML model")
        self.metadata.data["PMML model file"] = modelFileName

        # globally set random number seeds
        if "randomSeed" in configuration.attrib:
            augustusRandomSeed = configuration["randomSeed"]
            random.seed(augustusRandomSeed)
            numpy.random.seed(augustusRandomSeed + 1)
        else:
            augustusRandomSeed = "unspecified"

        # globally set numpy error handling
        numpy.seterr(divide="raise",
                     over="raise",
                     under="ignore",
                     invalid="raise")

        # update schemes (producerUpdateScheme may be redefined below)
        consumerUpdateScheme = self._getUpdateScheme(
            configuration.child(config.ConsumerBlending, exception=False))
        producerUpdateScheme = self._getUpdateScheme(None)

        # set up scoring output
        outputConfig = configuration.child(config.Output, exception=False)
        if outputConfig is None:
            self.outputWriter = None
        else:
            outputParams = {
                "pmmlFileName": modelFileName,
                "mode": outputConfig.destination.attrib.get("type",
                                                            "XML").lower()
            }

            if isinstance(outputConfig.destination, config.ToFile):
                if outputConfig.destination.attrib.get("overwrite", False):
                    outputStream = codecs.open(
                        outputConfig.destination["name"],
                        "w",
                        encoding="utf-8")
                else:
                    outputStream = codecs.open(
                        outputConfig.destination["name"],
                        "a",
                        encoding="utf-8")
            elif isinstance(outputConfig.destination, config.ToStandardError):
                outputStream = sys.stderr
            elif isinstance(outputConfig.destination, config.ToStandardOut):
                outputStream = sys.stdout
            else:
                assert False

            reportTag = outputConfig.child("ReportTag", exception=False)
            if reportTag:
                outputParams["reportName"] = reportTag.attrib.get(
                    "name", "Report")

            eventTag = outputConfig.child("EventTag", exception=False)
            if eventTag:
                outputParams["eventName"] = eventTag.attrib.get(
                    "name", "Event")
                outputParams["pseudoEventName"] = eventTag.attrib.get(
                    "pseudoName", "pseudoEvent")

            self.outputWriter = OutputWriter(outputStream, **outputParams)

        # initialize for the case of no output model
        engineSettings = {
            "maturityThreshold": maturityThreshold,
            "augustusRandomSeed": augustusRandomSeed
        }
        self.modelWriter = None
        segmentationScheme = SegmentationScheme(None, self.model)
        self.updateFlag = False
        self.aggregateUpdateFlag = False

        producerAlgorithm = dict(config.producerAlgorithmDefaults)
        for pa in producerAlgorithm.values():
            validationResult = pa.validate()
            assert validationResult is None

        # set up output model, if present in the configuration
        modelSetup = configuration.child(config.ModelSetup, exception=False)
        engineSettings["hasProducer"] = modelSetup is not None
        if engineSettings["hasProducer"]:
            self.logger.info("Setting up model updating/producing.")

            producerBlending = modelSetup.child(config.ProducerBlending,
                                                exception=False)
            producerUpdateScheme = self._getUpdateScheme(producerBlending)
            if producerBlending is not None and producerBlending.contains(
                    config.MaturityThreshold):
                maturityConfig = producerBlending.child(
                    config.MaturityThreshold)
                engineSettings["maturityThreshold"] = int(
                    maturityConfig.attrib.get("threshold", 1))
                try:
                    engineSettings["lockingThreshold"] = int(
                        maturityConfig.attrib["lockingThreshold"])
                except KeyError:
                    engineSettings["lockingThreshold"] = None

            engineSettings["lockAllSegments"] = modelSetup.attrib.get(
                "mode", None) == "lockExisting"
            if (engineSettings["lockAllSegments"] and segmentationScheme is not None
                    and not segmentationScheme._generic and not segmentationScheme._whiteList):
                self.logger.warning(
                    "The model is locked and no new segments are specified...new model files will be unchanged."
                )

            self.modelWriter = getModelWriter(modelSetup)
            if self.modelWriter is not None:
                if self.modelWriter.baseName is None:
                    self.fileNameOnException = self._modelExceptionIdentifier + ".pmml"
                else:
                    self.fileNameOnException = "".join([
                        self.modelWriter.baseName,
                        self._modelExceptionIdentifier, ".pmml"
                    ])
            else:
                self.logger.warning(
                    "There is no outputFile attribute in the ModelSetup; no new model file will be created."
                )

            segmentationScheme = SegmentationScheme(
                modelSetup.child(config.SegmentationSchema, exception=False),
                self.model)
            self.updateFlag = modelSetup.attrib.get("updateEvery", "event") in ("event", "both")
            self.aggregateUpdateFlag = modelSetup.attrib.get("updateEvery", "event") in ("aggregate", "both")

            for pa in modelSetup.matches(config.ProducerAlgorithm):
                producerAlgorithm[pa["model"]] = pa
            if modelSetup.attrib.get("mode", None) == "updateExisting":
                for pa in producerAlgorithm.values():
                    pa.parameters["updateExisting"] = True
            if modelSetup.attrib.get("mode", None) == "replaceExisting":
                for pa in producerAlgorithm.values():
                    pa.parameters["updateExisting"] = False

        # to score or not to score
        eventSettings = configuration.child(config.EventSettings,
                                            exception=False)
        if eventSettings is not None:
            self.logger.info("Setting up output.")
            self.scoreFlag = eventSettings["score"]
            self.outputFlag = eventSettings["output"]
        else:
            self.scoreFlag = False
            self.outputFlag = False

        aggregationConfig = configuration.child(config.AggregationSettings,
                                                exception=False)
        if aggregationConfig is not None:
            self.aggregateScoreFlag = aggregationConfig["score"]
            self.aggregateOutputFlag = aggregationConfig["output"]
            self.aggregationSettings = dict(aggregationConfig.attrib)
        else:
            self.aggregateScoreFlag = False
            self.aggregateOutputFlag = False
            self.aggregationSettings = None

        self.metadata.data["Update model"] = "true" if self.updateFlag or self.aggregateUpdateFlag else "false"

        # build a scoring engine once without a dataStream (to evaluate any verification blocks)
        self.engine = Engine(self.model, None, producerUpdateScheme,
                             consumerUpdateScheme, segmentationScheme,
                             producerAlgorithm, **engineSettings)
        self.engine.initialize()
        if self.outputWriter is not None: self.outputWriter.open()

        # begin "verification" phase
        for l in self.logger, self.metadata:
            if "verification" in l.differentLevel:
                l.eventLogLevel = l.differentLevel["verification"]
                l.setLevel(l.differentLevel["verification"])
            else:
                l.eventLogLevel = l.naturalLevel
                l.setLevel(l.naturalLevel)

        # evaluate verification blocks
        modelVerificationConfig = configuration.child(config.ModelVerification,
                                                      exception=False)
        if modelVerificationConfig is not None:
            verify(modelVerificationConfig, self.engine, self.logger,
                   self.outputWriter)

        # verification can increment aggregate variables, but
        # aggregates should all start at zero at the start of real
        # processing, whether verification happened or not
        self.engine.flushAggregates()

        # get the dataStream, in whatever form you find it
        self.logger.info("Setting up data input.")
        if self.dataStream is None:
            configDataInput = configuration.child(config.DataInput,
                                                  exception=None)
            if configDataInput is None:
                raise ConfigurationError(
                    "If a dataStream is not provided to MainLoop explicitly, it must be present in the configuration file."
                )
            if configDataInput.contains(config.FromFile):
                self.dataStream = DataStreamer(
                    configDataInput.child(config.FromFile),
                    self.engine.pmmlModel)
            elif configDataInput.contains(config.FromStandardIn):
                self.dataStream = DataStreamer(
                    configDataInput.child(config.FromStandardIn),
                    self.engine.pmmlModel)
            elif configDataInput.contains(config.FromHTTP):
                self.dataStream = AugustusHTTPDataStream(
                    configDataInput.child(config.FromHTTP))
                if self.outputWriter is None:
                    self.dataStream.respond = False
                if self.dataStream.respond:
                    self.dataStream.setupOutput(self.outputWriter)
            else:
                assert False

        # begin "eventLoop" phase
        for l in self.logger, self.metadata:
            if "eventloop" in l.differentLevel:
                l.eventLogLevel = l.differentLevel["eventloop"]
                l.setLevel(l.differentLevel["eventloop"])
            else:
                l.eventLogLevel = l.naturalLevel
                l.setLevel(l.naturalLevel)

        # possibly set up custom processing
        self.customProcessing = configuration.child(config.CustomProcessing,
                                                    exception=False)
        if self.customProcessing is not None:
            constants = self.engine.pmmlModel.child(pmml.Extension,
                                                    exception=False)
            if constants is None:
                constants = NameSpaceReadOnly()
            else:
                constants = constants.child(
                    pmml.X_ODG_CustomProcessingConstants, exception=False)
                if constants is None:
                    constants = NameSpaceReadOnly()
                else:
                    constants = constants.nameSpace

            atoms = {
                "INVALID": INVALID,
                "MISSING": MISSING,
                "IMMATURE": IMMATURE,
                "MATURE": MATURE,
                "LOCKED": LOCKED,
                "UNINITIALIZED": UNINITIALIZED
            }
            for thing in pmml.OutputField.__dict__.values() + pmml.X_ODG_OutputField.__dict__.values():
                if isinstance(thing, Atom):
                    atoms[repr(thing)] = thing

            self.customProcessing.initialize(
                self.model, self.engine.pmmlModel, constants,
                [s.userFriendly for s in self.engine.segmentRecords],
                atoms, self.logger, self.metadata,
                consumerUpdateScheme, producerUpdateScheme)
            self.engine.customProcessing = self.customProcessing
            self.engine.reinitialize()

        else:
            # only turn off circular garbage collection if there is no CustomProcessing or AugustusInterface
            gc.disable()
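
MainLoop takes the configuration and the model "in whatever form you find it": a pre-validated XML object, a file name, or a literal XML string. A minimal construction sketch, mirroring how the Augustus wrapper in Example #2 builds it; the file names and keyword values are assumptions.

from augustus.engine.mainloop import MainLoop

mainLoop = MainLoop("configuration.xcfg",   # file name, literal string, or config object
                    model="model.pmml",     # optional; otherwise loaded via ModelInput
                    rethrowExceptions=True)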
Example #7
def readUniTable(fileLocation, format=None, sorter=None, pageSize=None, mapInvalid=None, mapMissing=None, **parameters):
    format = getformat(fileLocation, format)

    ################################################################ CSV
    if format == "CSV":
        csvInput = CSVStream(fileLocation, sorter, **parameters)

        if csvInput.types is not None:
            types = csvInput.types
        else:
            types = dict((f, "string") for f in csvInput.fields)

        _mapInvalid = dict((f, str("INVALID") if types[f] in ("category", "string") else -1000) for f in csvInput.fields)
        if mapInvalid is None:
            mapInvalid = _mapInvalid
        else:
            _mapInvalid.update(mapInvalid)
            mapInvalid = _mapInvalid

        _mapMissing = dict((f, str("MISSING") if types[f] in ("category", "string") else -1000) for f in csvInput.fields)
        if mapMissing is None:
            mapMissing = _mapMissing
        else:
            _mapMissing.update(mapMissing)
            mapMissing = _mapMissing

        table = UniTable(csvInput.fields, types)
        table.initMemory(pageSize)

        for record in csvInput:
            table.fill([mapInvalid[f] if r is INVALID else mapMissing[f] if r is MISSING else r for f, r in zip(csvInput.fields, record)])

        return table

    ################################################################ XML
    if format == "XML":
        xmlInput = XMLStream(fileLocation, sorter, **parameters)

        if xmlInput.types is not None:
            types = xmlInput.types
        else:
            types = dict((f, "string") for f in xmlInput.fields)

        _mapInvalid = dict((f, str("INVALID") if types[f] in ("category", "string") else -1000) for f in xmlInput.fields)
        if mapInvalid is None:
            mapInvalid = _mapInvalid
        else:
            _mapInvalid.update(mapInvalid)
            mapInvalid = _mapInvalid

        _mapMissing = dict((f, str("MISSING") if types[f] in ("category", "string") else -1000) for f in xmlInput.fields)
        if mapMissing is None:
            mapMissing = _mapMissing
        else:
            _mapMissing.update(mapMissing)
            mapMissing = _mapMissing

        table = UniTable(xmlInput.fields, types)
        table.initMemory(pageSize)

        for record in xmlInput:
            table.fill([mapInvalid[f] if r is INVALID else r for f, r in [(f, record.get(f, mapMissing[f])) for f in xmlInput.fields]])

        return table

    ################################################################ NAB
    elif format == "NAB":
        fileNames = getfiles(fileLocation, sorter)
        if len(fileNames) == 0:
            raise IOError("No files match \"%s\" (even with wildcards)" % fileLocation)

        fields = None
        types = None
        strings = {}
        args = {}
        for fileName in fileNames:
            file = open(fileName, "rb")
            header = file.readline().rstrip()
            file.close()

            headerfields = header.decode("utf-8").split()
            if headerfields[0] != "RecArray":
                raise BadlyFormattedInputData("NAB file \"%s\" does not begin with 'RecArray'" % fileName)

            args[fileName] = dict(asciistr(f).split("=") for f in headerfields[1:])

            if "masktype" in args.keys():
                raise NotImplementedError("No support yet for NAB files (such as \"%s\") with masked NumPy arrays" % fileName)

            if set(args[fileName].keys()) != set(["formats", "names"]):
                raise BadlyFormattedInputData("NAB file \"%s\" headers are %s, rather than set([\"formats\", \"names\"])" % (fileName, str(set(args[fileName].keys()))))

            thisfields = args[fileName]["names"].split(",")
            thistypes = args[fileName]["formats"].split(",")
            for i in xrange(len(thistypes)):
                if thistypes[i][0] == "a":
                    thistypes[i] = "string"
                    strings[thisfields[i]] = True
                else:
                    strings[thisfields[i]] = False

            if fields is None:
                fields = thisfields
                types = thistypes
            else:
                if fields != thisfields:
                    raise IncompatibleFilesInChain("NAB file \"%s\" header has fields %s, which differ from the first %s" % (fileName, str(thisfields), str(fields)))
                if types != thistypes:
                    raise IncompatibleFilesInChain("NAB file \"%s\" header has types %s, which differ from the first %s" % (fileName, str(thistypes), str(types)))

        table = UniTable(fields, dict(zip(fields, types)))
        table.pages = []
        table.starts = []
        table.length = 0

        for fileName in fileNames:
            file = open(fileName, "rb")
            file.readline()
            data = numpy.rec.fromfile(file, **args[fileName])
            
            table.pageSize = len(data)
            page = UniPage(table.fields, table.types)

            arrays = {}
            for f in table.fields:
                arr = data.field(f)
                if strings[f]:
                    arr = [i.decode("utf-8") for i in arr]
                arrays[f] = arr

            page.initExisting(table.pageSize, arrays, copy=False, stringToCategory=True)
            table.pages.append(page)
            table.starts.append(table.length)
            table.length += len(data)

        return table

    ################################################################ XTBL
    elif format == "XTBL":
        fileNames = getfiles(fileLocation, sorter)
        if len(fileNames) == 0:
            raise IOError("No files match \"%s\" (even with wildcards)" % fileLocation)

        limitGB = parameters.get("limitGB", None)
        memoryMap = parameters.get("memoryMap", False)

        # get the footers from each file (XML) and make sure they have identical DataDictionaries
        footers = []
        for i, fileName in enumerate(fileNames):
            fileSize = os.stat(fileName).st_size
            file = open(fileName, "rb")

            file.seek(max(0, fileSize - 1024))
            text = file.read()
            m = re.search("<SeekFooter\s+byteOffset=\"([0-9]+)\"\s+/>", text)
            if m is not None:
                textStart = int(m.group(1))
            else:
                raise IOError("File \"%s\" does not have the right format (the <SeekFooter /> element was not found in the last kilobyte)" % fileName)

            file.seek(textStart)

            footer = load(file.read(), xtbl.XTBL)
            footers.append(footer)
            if len(footers) > 1:
                thisDataDictionary = footer.child(xtbl.DataDictionary)
                firstDataDictionary = footers[0].child(xtbl.DataDictionary)

                if thisDataDictionary != firstDataDictionary:
                    for x in thisDataDictionary.matches(xtbl.LookupTable, maxdepth=None) + firstDataDictionary.matches(xtbl.LookupTable, maxdepth=None):
                        x.serialize()
                    raise IncompatibleFilesInChain("XTBL file \"%s\" is incompatible with the first file \"%s\":%s%s%s%s" % (fileNames[i], fileNames[0], os.linesep, thisDataDictionary.xml(), os.linesep, firstDataDictionary.xml()))

            file.close()

        # set up the UniTable's fields, types, pages, starts, and length
        fields = []
        types = {}
        dtypes = {}
        lookups = {}

        for dataField in footers[0].child(xtbl.DataDictionary).matches(xtbl.DataField):
            field = dataField.attrib["name"]
            fields.append(field)
            types[field] = dataField.attrib["type"]
            dtypes[field] = dataField.attrib["dtype"]

            lookup = dataField.child(xtbl.LookupTable, exception=False)
            if lookup is not None:
                lookups[field] = lookup.n_to_v
            else:
                lookups[field] = None

        categories = []
        for f in fields:
            n_to_v = lookups[f]
            if n_to_v is None:
                categories.append(None)
            else:
                v_to_n = dict((v, n) for n, v in n_to_v.items())
                categories.append((v_to_n, n_to_v))

        table = UniTable(fields, types)
        table.pages = []
        table.starts = []
        table.length = 0

        uniPageDiskCacheManager = UniPageDiskCacheManager(limitGB, memoryMap)

        for i, fileName in enumerate(fileNames):
            for xtblpage in footers[i].child(xtbl.Pages).matches(xtbl.Page):
                length = xtblpage.attrib["length"]

                byteOffsets = {}
                for pageFieldOffset in xtblpage.matches(xtbl.PageFieldOffset):
                    byteOffsets[pageFieldOffset.attrib["name"]] = pageFieldOffset.attrib["byteOffset"]

                uniPage = UniPageOnDisk(fields, table.types)
                uniPage.initDisk(length, fileName, byteOffsets, dtypes, categories, uniPageDiskCacheManager)

                table.pages.append(uniPage)
                table.starts.append(table.length)
                table.length += length

        return table
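
readUniTable dispatches on the detected format (CSV, XML, NAB, or XTBL) and returns a filled UniTable in every branch. A hedged call sketch; the file names and the mapMissing override are assumptions, and only the keyword names come from the signature above.

table = readUniTable("events.csv", format="CSV", pageSize=10000)
table = readUniTable("chunk*.nab", format="NAB")   # wildcards select a chain of NAB files
table = readUniTable("events.csv", mapMissing={"score": -999})   # per-field MISSING override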
Example #8
def readUniTable(fileLocation,
                 format=None,
                 sorter=None,
                 pageSize=None,
                 mapInvalid=None,
                 mapMissing=None,
                 **parameters):
    format = getformat(fileLocation, format)

    ################################################################ CSV
    if format == "CSV":
        csvInput = CSVStream(fileLocation, sorter, **parameters)

        if csvInput.types is not None:
            types = csvInput.types
        else:
            types = dict((f, "string") for f in csvInput.fields)

        _mapInvalid = dict(
            (f, str("INVALID") if types[f] in ("category",
                                               "string") else -1000)
            for f in csvInput.fields)
        if mapInvalid is None:
            mapInvalid = _mapInvalid
        else:
            _mapInvalid.update(mapInvalid)
            mapInvalid = _mapInvalid

        _mapMissing = dict(
            (f, str("MISSING") if types[f] in ("category",
                                               "string") else -1000)
            for f in csvInput.fields)
        if mapMissing is None:
            mapMissing = _mapMissing
        else:
            _mapMissing.update(mapMissing)
            mapMissing = _mapMissing

        table = UniTable(csvInput.fields, types)
        table.initMemory(pageSize)

        for record in csvInput:
            table.fill([
                mapInvalid[f]
                if r is INVALID else mapMissing[f] if r is MISSING else r
                for f, r in zip(csvInput.fields, record)
            ])

        return table

    ################################################################ XML
    if format == "XML":
        xmlInput = XMLStream(fileLocation, sorter, **parameters)

        if xmlInput.types is not None:
            types = xmlInput.types
        else:
            types = dict((f, "string") for f in xmlInput.fields)

        _mapInvalid = dict(
            (f, str("INVALID") if types[f] in ("category",
                                               "string") else -1000)
            for f in xmlInput.fields)
        if mapInvalid is None:
            mapInvalid = _mapInvalid
        else:
            _mapInvalid.update(mapInvalid)
            mapInvalid = _mapInvalid

        _mapMissing = dict(
            (f, str("MISSING") if types[f] in ("category",
                                               "string") else -1000)
            for f in xmlInput.fields)
        if mapMissing is None:
            mapMissing = _mapMissing
        else:
            _mapMissing.update(mapMissing)
            mapMissing = _mapMissing

        table = UniTable(xmlInput.fields, types)
        table.initMemory(pageSize)

        for record in xmlInput:
            table.fill([
                mapInvalid[f] if r is INVALID else r
                for f, r in [(f, record.get(f, mapMissing[f]))
                             for f in xmlInput.fields]
            ])

        return table

    ################################################################ NAB
    elif format == "NAB":
        fileNames = getfiles(fileLocation, sorter)
        if len(fileNames) == 0:
            raise IOError("No files match \"%s\" (even with wildcards)" %
                          fileLocation)

        fields = None
        types = None
        strings = {}
        args = {}
        for fileName in fileNames:
            file = open(fileName, "rb")
            header = file.readline().rstrip()
            file.close()

            headerfields = header.decode("utf-8").split()
            if headerfields[0] != "RecArray":
                raise BadlyFormattedInputData(
                    "NAB file \"%s\" does not begin with 'RecArray'" %
                    fileName)

            args[fileName] = dict(
                asciistr(f).split("=") for f in headerfields[1:])

            if "masktype" in args.keys():
                raise NotImplementedError(
                    "No support yet for NAB files (such as \"%s\") with masked NumPy arrays"
                    % fileName)

            if set(args[fileName].keys()) != set(["formats", "names"]):
                raise BadlyFormattedInputData(
                    "NAB file \"%s\" headers are %s, rather than set([\"formats\", \"names\"])"
                    % (fileName, str(set(args[fileName].keys()))))

            thisfields = args[fileName]["names"].split(",")
            thistypes = args[fileName]["formats"].split(",")
            for i in xrange(len(thistypes)):
                if thistypes[i][0] == "a":
                    thistypes[i] = "string"
                    strings[thisfields[i]] = True
                else:
                    strings[thisfields[i]] = False

            if fields is None:
                fields = thisfields
                types = thistypes
            else:
                if fields != thisfields:
                    raise IncompatibleFilesInChain(
                        "NAB file \"%s\" header has fields %s, which differ from the first %s"
                        % (fileName, str(thisfields), str(fields)))
                if types != thistypes:
                    raise IncompatibleFilesInChain(
                        "NAB file \"%s\" header has types %s, which differ from the first %s"
                        % (fileName, str(thistypes), str(types)))

        table = UniTable(fields, dict(zip(fields, types)))
        table.pages = []
        table.starts = []
        table.length = 0

        for fileName in fileNames:
            file = open(fileName, "rb")
            file.readline()
            data = numpy.rec.fromfile(file, **args[fileName])

            table.pageSize = len(data)
            page = UniPage(table.fields, table.types)

            arrays = {}
            for f in table.fields:
                arr = data.field(f)
                if strings[f]:
                    arr = [i.decode("utf-8") for i in arr]
                arrays[f] = arr

            page.initExisting(table.pageSize,
                              arrays,
                              copy=False,
                              stringToCategory=True)
            table.pages.append(page)
            table.starts.append(table.length)
            table.length += len(data)

        return table

    ################################################################ XTBL
    elif format == "XTBL":
        fileNames = getfiles(fileLocation, sorter)
        if len(fileNames) == 0:
            raise IOError("No files match \"%s\" (even with wildcards)" %
                          fileLocation)

        limitGB = parameters.get("limitGB", None)
        memoryMap = parameters.get("memoryMap", False)

        # get the footers from each file (XML) and make sure they have identical DataDictionaries
        footers = []
        for i, fileName in enumerate(fileNames):
            fileSize = os.stat(fileName).st_size
            file = open(fileName, "rb")

            file.seek(max(0, fileSize - 1024))
            text = file.read()
            m = re.search("<SeekFooter\s+byteOffset=\"([0-9]+)\"\s+/>", text)
            if m is not None:
                textStart = int(m.group(1))
            else:
                raise IOError(
                    "File \"%s\" does not have the right format (the <SeekFooter /> element was not found in the last kilobyte)"
                    % fileName)

            file.seek(textStart)

            footer = load(file.read(), xtbl.XTBL)
            footers.append(footer)
            if len(footers) > 1:
                thisDataDictionary = footer.child(xtbl.DataDictionary)
                firstDataDictionary = footers[0].child(xtbl.DataDictionary)

                if thisDataDictionary != firstDataDictionary:
                    for x in (thisDataDictionary.matches(xtbl.LookupTable, maxdepth=None) +
                              firstDataDictionary.matches(xtbl.LookupTable, maxdepth=None)):
                        x.serialize()
                    raise IncompatibleFilesInChain(
                        "XTBL file \"%s\" is incompatible with the first file \"%s\":%s%s%s%s"
                        % (fileNames[i], fileNames[0], os.linesep,
                           thisDataDictionary.xml(), os.linesep,
                           firstDataDictionary.xml()))

            file.close()

        # set up the UniTable's fields, types, pages, starts, and length
        fields = []
        types = {}
        dtypes = {}
        lookups = {}

        for dataField in footers[0].child(xtbl.DataDictionary).matches(
                xtbl.DataField):
            field = dataField.attrib["name"]
            fields.append(field)
            types[field] = dataField.attrib["type"]
            dtypes[field] = dataField.attrib["dtype"]

            lookup = dataField.child(xtbl.LookupTable, exception=False)
            if lookup is not None:
                lookups[field] = lookup.n_to_v
            else:
                lookups[field] = None

        categories = []
        for f in fields:
            n_to_v = lookups[f]
            if n_to_v is None:
                categories.append(None)
            else:
                v_to_n = dict((v, n) for n, v in n_to_v.items())
                categories.append((v_to_n, n_to_v))

        table = UniTable(fields, types)
        table.pages = []
        table.starts = []
        table.length = 0

        uniPageDiskCacheManager = UniPageDiskCacheManager(limitGB, memoryMap)

        for i, fileName in enumerate(fileNames):
            for xtblpage in footers[i].child(xtbl.Pages).matches(xtbl.Page):
                length = xtblpage.attrib["length"]

                byteOffsets = {}
                for pageFieldOffset in xtblpage.matches(xtbl.PageFieldOffset):
                    byteOffsets[pageFieldOffset.attrib["name"]] = pageFieldOffset.attrib["byteOffset"]

                uniPage = UniPageOnDisk(fields, table.types)
                uniPage.initDisk(length, fileName, byteOffsets, dtypes,
                                 categories, uniPageDiskCacheManager)

                table.pages.append(uniPage)
                table.starts.append(table.length)
                table.length += length

        return table
Example #9
    def __init__(self, configuration, model=None, dataStream=None, rethrowExceptions=None):
        self.model = model
        self.dataStream = dataStream
        self.rethrowExceptions = rethrowExceptions
        self.fileNameOnException = None

        # get the configuration, in whatever form you find it
        if isinstance(configuration, config.AugustusConfiguration):
            pass
        elif isinstance(configuration, basestring):
            try:
                configuration = xmlbase.loadfile(configuration, config.Config, lineNumbers=True)
            except IOError:
                configuration = xmlbase.load(configuration, config.Config, lineNumbers=True)
        else:
            raise ConfigurationError("Configuration must be a pre-validated XML object, a fileName, or a literal configuration string.")
    
        # set up logging
        setupLogging(configuration.matches(lambda x: isinstance(x, (config.Logging, config.Metadata))))
        self.logger = logging.getLogger()
        self.metadata = logging.getLogger("metadata")

        # begin "initialization" phase
        for l in self.logger, self.metadata:
            if "initialization" in l.differentLevel:
                l.setLevel(l.differentLevel["initialization"])
            else:
                l.setLevel(l.naturalLevel)

        # get the model, in whatever form you find it
        self.logger.info("Loading PMML model.")
        self.metadata.startTiming("Time to load PMML model")
        modelFileName = "(none)"
        maturityThreshold = 0
        if self.model is None:
            modelInput = configuration.child(config.ModelInput, exception=None)
            if modelInput is None:
                raise ConfigurationError("If a model is not provided to MainLoop explicitly, it must be present in the configuration file.")

            fileLocation = modelInput["fileLocation"]
            if not fileLocation.startswith("http://") and not fileLocation.startswith("https://"):
                fileList = glob.glob(fileLocation)
                if len(fileList) > 1:
                    fileList = [f for f in fileList if self._modelExceptionIdentifier not in f]
                if len(fileList) == 0:
                    raise IOError("No files matched the ModelInput fileLocation \"%s\"." % fileLocation)

                selectmode = modelInput.attrib.get("selectmode", "lastAlphabetic")
                if selectmode == "mostRecent":
                    fileLocation = max(fileList, key=lambda x: os.stat(x).st_mtime)
                elif selectmode == "lastAlphabetic":
                    fileList.sort()
                    fileLocation = fileList[-1]
                else:
                    assert False

                if self._modelExceptionIdentifier in fileLocation:
                    self.logger.warning("Using a PMML model that was written on exception (fileName \"%s\")" % fileLocation)

            self.model = xmlbase.loadfile(fileLocation, pmml.X_ODG_PMML, lineNumbers=True)

            if "maturityThreshold" in modelInput.attrib: maturityThreshold = modelInput["maturityThreshold"]

        elif isinstance(self.model, pmml.PMML):
            pass
        elif isinstance(self.model, basestring):
            try:
                self.model, modelFileName = xmlbase.loadfile(self.model, pmml.X_ODG_PMML, lineNumbers=True), self.model
            except IOError:
                self.model = xmlbase.load(self.model, pmml.X_ODG_PMML, lineNumbers=True)
        else:
            raise ConfigurationError("Model must be a pre-validated XML object, a fileName, or a literal PMML string.")
        self.metadata.stopTiming("Time to load PMML model")
        self.metadata.data["PMML model file"] = modelFileName

        # globally set random number seeds
        if "randomSeed" in configuration.attrib:
            augustusRandomSeed = configuration["randomSeed"]
            random.seed(augustusRandomSeed)
            numpy.random.seed(augustusRandomSeed + 1)
        else:
            augustusRandomSeed = "unspecified"

        # globally set numpy error handling
        numpy.seterr(divide="raise", over="raise", under="ignore", invalid="raise")

        # update schemes (producerUpdateScheme may be redefined below)
        consumerUpdateScheme = self._getUpdateScheme(configuration.child(config.ConsumerBlending, exception=False))
        producerUpdateScheme = self._getUpdateScheme(None)

        # set up scoring output
        outputConfig = configuration.child(config.Output, exception=False)
        if outputConfig is None:
            self.outputWriter = None
        else:
            outputParams = {"pmmlFileName": modelFileName, "mode": outputConfig.destination.attrib.get("type", "XML").lower()}

            if isinstance(outputConfig.destination, config.ToFile):
                if outputConfig.destination.attrib.get("overwrite", False):
                    outputStream = codecs.open(outputConfig.destination["name"], "w", encoding="utf-8")
                else:
                    outputStream = codecs.open(outputConfig.destination["name"], "a", encoding="utf-8")
            elif isinstance(outputConfig.destination, config.ToStandardError):
                outputStream = sys.stderr
            elif isinstance(outputConfig.destination, config.ToStandardOut):
                outputStream = sys.stdout
            else:
                assert False

            reportTag = outputConfig.child("ReportTag", exception=False)
            if reportTag:
                outputParams["reportName"] = reportTag.attrib.get("name", "Report")

            eventTag = outputConfig.child("EventTag", exception=False)
            if eventTag:
                outputParams["eventName"] = eventTag.attrib.get("name", "Event")
                outputParams["pseudoEventName"] = eventTag.attrib.get("pseudoName", "pseudoEvent")

            self.outputWriter = OutputWriter(outputStream, **outputParams)

        # initialize for the case of no output model
        engineSettings = {"maturityThreshold": maturityThreshold, "augustusRandomSeed": augustusRandomSeed}
        self.modelWriter = None
        segmentationScheme = SegmentationScheme(None, self.model)
        self.updateFlag = False
        self.aggregateUpdateFlag = False

        producerAlgorithm = dict(config.producerAlgorithmDefaults)
        for pa in producerAlgorithm.values():
            validationResult = pa.validate()
            assert validationResult is None

        # set up output model, if present in the configuration
        modelSetup = configuration.child(config.ModelSetup, exception=False)
        engineSettings["hasProducer"] = modelSetup is not None
        if engineSettings["hasProducer"]:
            self.logger.info("Setting up model updating/producing.")

            producerBlending = modelSetup.child(config.ProducerBlending, exception=False)
            producerUpdateScheme = self._getUpdateScheme(producerBlending)
            if producerBlending is not None and producerBlending.contains(config.MaturityThreshold):
                maturityConfig = producerBlending.child(config.MaturityThreshold)
                engineSettings["maturityThreshold"] = int(maturityConfig.attrib.get("threshold", 1))
                try:
                    engineSettings["lockingThreshold"] = int(maturityConfig.attrib["lockingThreshold"])
                except KeyError:
                    engineSettings["lockingThreshold"] = None

            engineSettings["lockAllSegments"] = modelSetup.attrib.get("mode", None) == "lockExisting"
            if engineSettings["lockAllSegments"] and segmentationScheme is not None and not segmentationScheme._generic and not segmentationScheme._whiteList:
                self.logger.warning("The model is locked and no new segments are specified...new model files will be unchanged.")

            self.modelWriter = getModelWriter(modelSetup)
            if self.modelWriter is not None:
                if self.modelWriter.baseName is None:
                    self.fileNameOnException = self._modelExceptionIdentifier + ".pmml"
                else:
                    self.fileNameOnException = "".join([self.modelWriter.baseName, self._modelExceptionIdentifier, ".pmml"])
            else:
                self.logger.warning("There is no outputFile attribute in the ModelSetup; no new model file will be created.")

            segmentationScheme = SegmentationScheme(modelSetup.child(config.SegmentationSchema, exception=False), self.model)
            self.updateFlag = modelSetup.attrib.get("updateEvery", "event") in ("event", "both")
            self.aggregateUpdateFlag = modelSetup.attrib.get("updateEvery", "event") in ("aggregate", "both")

            for pa in modelSetup.matches(config.ProducerAlgorithm):
                producerAlgorithm[pa["model"]] = pa
            if modelSetup.attrib.get("mode", None) == "updateExisting":
                for pa in producerAlgorithm.values():
                    pa.parameters["updateExisting"] = True
            if modelSetup.attrib.get("mode", None) == "replaceExisting":
                for pa in producerAlgorithm.values():
                    pa.parameters["updateExisting"] = False

        # to score or not to score
        eventSettings = configuration.child(config.EventSettings, exception=False)
        if eventSettings is not None:
            self.logger.info("Setting up output.")
            self.scoreFlag = eventSettings["score"]
            self.outputFlag = eventSettings["output"]
        else:
            self.scoreFlag = False
            self.outputFlag = False

        aggregationConfig = configuration.child(config.AggregationSettings, exception=False)
        if aggregationConfig is not None:
            self.aggregateScoreFlag = aggregationConfig["score"]
            self.aggregateOutputFlag = aggregationConfig["output"]
            self.aggregationSettings = dict(aggregationConfig.attrib)
        else:
            self.aggregateScoreFlag = False
            self.aggregateOutputFlag = False
            self.aggregationSettings = None

        self.metadata.data["Update model"] = "true" if self.updateFlag or self.aggregateUpdateFlag else "false"

        # build a scoring engine once without a dataStream (to evaluate any verification blocks)
        self.engine = Engine(self.model, None, producerUpdateScheme, consumerUpdateScheme, segmentationScheme, producerAlgorithm, **engineSettings)
        self.engine.initialize()
        if self.outputWriter is not None: self.outputWriter.open()

        # begin "verification" phase
        for l in self.logger, self.metadata:
            if "verification" in l.differentLevel:
                l.eventLogLevel = l.differentLevel["verification"]
                l.setLevel(l.differentLevel["verification"])
            else:
                l.eventLogLevel = l.naturalLevel
                l.setLevel(l.naturalLevel)

        # evaluate verification blocks
        modelVerificationConfig = configuration.child(config.ModelVerification, exception=False)
        if modelVerificationConfig is not None:
            verify(modelVerificationConfig, self.engine, self.logger, self.outputWriter)

        # verification can increment aggregate variables, but
        # aggregates should all start at zero at the start of real
        # processing, whether verification happened or not
        self.engine.flushAggregates()

        # get the dataStream, in whatever form you find it
        self.logger.info("Setting up data input.")
        if self.dataStream is None:
            configDataInput = configuration.child(config.DataInput, exception=None)
            if configDataInput is None:
                raise ConfigurationError("If a dataStream is not provided to MainLoop explicitly, it must be present in the configuration file.")
            if configDataInput.contains(config.FromFile):
                self.dataStream = DataStreamer(configDataInput.child(config.FromFile), self.engine.pmmlModel)
            elif configDataInput.contains(config.FromStandardIn):
                self.dataStream = DataStreamer(configDataInput.child(config.FromStandardIn), self.engine.pmmlModel)
            elif configDataInput.contains(config.FromHTTP):
                self.dataStream = AugustusHTTPDataStream(configDataInput.child(config.FromHTTP))
                if self.outputWriter is None:
                    self.dataStream.respond = False
                if self.dataStream.respond:
                    self.dataStream.setupOutput(self.outputWriter)
            else:
                assert False

        # begin "eventLoop" phase
        for l in self.logger, self.metadata:
            if "eventloop" in l.differentLevel:
                l.eventLogLevel = l.differentLevel["eventloop"]
                l.setLevel(l.differentLevel["eventloop"])
            else:
                l.eventLogLevel = l.naturalLevel
                l.setLevel(l.naturalLevel)

        # possibly set up custom processing
        self.customProcessing = configuration.child(config.CustomProcessing, exception=False)
        if self.customProcessing is not None:
            constants = self.engine.pmmlModel.child(pmml.Extension, exception=False)
            if constants is None:
                constants = NameSpaceReadOnly()
            else:
                constants = constants.child(pmml.X_ODG_CustomProcessingConstants, exception=False)
                if constants is None:
                    constants = NameSpaceReadOnly()
                else:
                    constants = constants.nameSpace

            atoms = {"INVALID": INVALID, "MISSING": MISSING, "IMMATURE": IMMATURE, "MATURE": MATURE, "LOCKED": LOCKED, "UNINITIALIZED": UNINITIALIZED}
            for thing in pmml.OutputField.__dict__.values() + pmml.X_ODG_OutputField.__dict__.values():
                if isinstance(thing, Atom):
                    atoms[repr(thing)] = thing

            self.customProcessing.initialize(self.model, self.engine.pmmlModel, constants, [s.userFriendly for s in self.engine.segmentRecords], atoms, self.logger, self.metadata, consumerUpdateScheme, producerUpdateScheme)
            self.engine.customProcessing = self.customProcessing
            self.engine.reinitialize()

        else:
            # only turn off circular garbage collection if there is no CustomProcessing or AugustusInterface
            gc.disable()