def __init__(self, pmmlModel, pmmlPredicate, parentPmmlOutput, engine, name=None):
    """Called by Engine when a PMML file is loaded or when a new segment is observed."""

    self.pmmlModel = pmmlModel
    self.pmmlPredicate = pmmlPredicate

    if pmmlPredicate is not None:
        streamlined = True
        if pmmlPredicate.exists(lambda x: isinstance(x, pmml.CompoundPredicate) and x.attrib["booleanOperator"] == "surrogate", maxdepth=None):
            streamlined = False
        self.predicateMatches = pmmlPredicate.createTest(streamlined)
        self.expressionTree = pmmlPredicate.expressionTree()
    else:
        self.predicateMatches = None
        self.expressionTree = None

    # merge this <Output> section with the parent's
    thisOutput = pmmlModel.child(pmml.Output, exception=False)
    if thisOutput is None:
        self.pmmlOutput = parentPmmlOutput
    elif parentPmmlOutput is None:
        self.pmmlOutput = thisOutput
    else:
        self.pmmlOutput = parentPmmlOutput.copy()
        for outputField in thisOutput.matches(pmml.OutputField):
            self.pmmlOutput.children.append(outputField)
        self.pmmlOutput.validate()

    self.engine = engine
    self.segmentNameRegistry.register(name, id(self))

    # make an X-ODG-ModelMaturity object to keep track of how many updates this segment has seen
    pmmlExtension = self.pmmlModel.child(pmml.Extension, exception=False)
    if pmmlExtension is not None:
        self.pmmlModelMaturity = pmmlExtension.child(pmml.X_ODG_ModelMaturity, exception=False)
        if self.pmmlModelMaturity is None:
            self.pmmlModelMaturity = pmml.X_ODG_ModelMaturity(numUpdates=0, locked=self.engine.lockAllSegments)
            pmmlExtension.children.append(self.pmmlModelMaturity)
        elif self.engine.lockAllSegments:
            # always lock if the user asked for it in the configuration file
            self.pmmlModelMaturity.attrib["locked"] = True
    else:
        pmmlExtension = pmml.Extension()
        self.pmmlModelMaturity = pmml.X_ODG_ModelMaturity(numUpdates=0, locked=self.engine.lockAllSegments)
        pmmlExtension.children.append(self.pmmlModelMaturity)
        self.pmmlModel.children.insert(0, pmmlExtension)
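
# A minimal sketch (not engine API; the helper name is hypothetical) of the
# <Output> merge rule implemented above, with plain lists standing in for
# pmml.Output children: a segment inherits the parent's OutputFields and
# appends its own.
def mergedOutputFields(parentFields, thisFields):
    if thisFields is None:
        return parentFields       # no <Output> here: inherit the parent's
    if parentFields is None:
        return thisFields         # no parent <Output>: use this model's
    return list(parentFields) + list(thisFields)  # parent copy, then this model's fields

# e.g. mergedOutputFields(["score"], ["segmentId"]) -> ["score", "segmentId"]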
def _updateDistribution_first(self):
    if isinstance(self.baseline, (pmml.PoissonDistribution, pmml.GaussianDistribution)):
        self.baselinePartialSums = self.baseline.descendant(pmml.X_ODG_PartialSums, exception=False, maxdepth=2)
        if self.baselinePartialSums is None:
            self.baselinePartialSums = pmml.X_ODG_PartialSums()
            if not self.baseline.exists(pmml.Extension):
                self.baseline.children.append(pmml.Extension())
            self.baseline.child(pmml.Extension).children.append(self.baselinePartialSums)

    if isinstance(self.baseline, pmml.PoissonDistribution):
        self.baselineUpdator = self.engine.producerUpdateScheme.updator(SUM1, SUMX)
        if self.updateExisting:
            self.baselineUpdator.initialize({COUNT: self.baselinePartialSums.attrib.get("COUNT", 0),
                                             SUM1: self.baselinePartialSums.attrib.get("SUM1", 0.),
                                             SUMX: self.baselinePartialSums.attrib.get("SUMX", 0.)})

        if COUNT in self.baselineUpdator.counters:
            self.baselinePartialSums.attrib["COUNT"] = self.baselineUpdator.counters[COUNT]
        self.baselinePartialSums.attrib["SUM1"] = self.baselineUpdator.counters[SUM1]
        self.baselinePartialSums.attrib["SUMX"] = self.baselineUpdator.counters[SUMX]

    elif isinstance(self.baseline, pmml.GaussianDistribution):
        self.baselineUpdator = self.engine.producerUpdateScheme.updator(SUM1, SUMX, SUMXX)
        if self.updateExisting:
            self.baselineUpdator.initialize({COUNT: self.baselinePartialSums.attrib.get("COUNT", 0),
                                             SUM1: self.baselinePartialSums.attrib.get("SUM1", 0.),
                                             SUMX: self.baselinePartialSums.attrib.get("SUMX", 0.),
                                             SUMXX: self.baselinePartialSums.attrib.get("SUMXX", 0.)})

        if COUNT in self.baselineUpdator.counters:
            self.baselinePartialSums.attrib["COUNT"] = self.baselineUpdator.counters[COUNT]
        self.baselinePartialSums.attrib["SUM1"] = self.baselineUpdator.counters[SUM1]
        self.baselinePartialSums.attrib["SUMX"] = self.baselineUpdator.counters[SUMX]
        self.baselinePartialSums.attrib["SUMXX"] = self.baselineUpdator.counters[SUMXX]

    elif isinstance(self.baseline, pmml.UniformDistribution):
        self.baselineUpdator = self.engine.producerUpdateScheme.updator(MIN, MAX)
        if self.updateExisting:
            self.baselineUpdator.initialize({MIN: self.baseline.attrib["lower"],
                                             MAX: self.baseline.attrib["upper"]})

    else:
        raise NotImplementedError, "Only production of Gaussian, Poisson, and Uniform distributions has been implemented."

    if self.alternateField is not None:
        if not testDistributions.exists(pmml.Alternate):
            raise RuntimeError, "alternateField requested but there is no <Alternate/> distribution in the PMML"
        self.alternate = testDistributions.child(pmml.Alternate).child()

        if isinstance(self.alternate, (pmml.PoissonDistribution, pmml.GaussianDistribution)):
            self.alternatePartialSums = self.alternate.descendant(pmml.X_ODG_PartialSums, exception=False, maxdepth=2)
            if self.alternatePartialSums is None:
                self.alternatePartialSums = pmml.X_ODG_PartialSums()
                if not self.alternate.exists(pmml.Extension):
                    self.alternate.children.append(pmml.Extension())
                self.alternate.child(pmml.Extension).children.append(self.alternatePartialSums)

        if isinstance(self.alternate, pmml.PoissonDistribution):
            self.alternateUpdator = self.engine.producerUpdateScheme.updator(SUM1, SUMX)
            if self.updateExisting:
                self.alternateUpdator.initialize({COUNT: self.alternatePartialSums.attrib.get("COUNT", 0),
                                                  SUM1: self.alternatePartialSums.attrib.get("SUM1", 0.),
                                                  SUMX: self.alternatePartialSums.attrib.get("SUMX", 0.)})
            if COUNT in self.alternateUpdator.counters:
                self.alternatePartialSums.attrib["COUNT"] = self.alternateUpdator.counters[COUNT]
            self.alternatePartialSums.attrib["SUM1"] = self.alternateUpdator.counters[SUM1]
            self.alternatePartialSums.attrib["SUMX"] = self.alternateUpdator.counters[SUMX]

        elif isinstance(self.alternate, pmml.GaussianDistribution):
            self.alternateUpdator = self.engine.producerUpdateScheme.updator(SUM1, SUMX, SUMXX)
            if self.updateExisting:
                self.alternateUpdator.initialize({COUNT: self.alternatePartialSums.attrib.get("COUNT", 0),
                                                  SUM1: self.alternatePartialSums.attrib.get("SUM1", 0.),
                                                  SUMX: self.alternatePartialSums.attrib.get("SUMX", 0.),
                                                  SUMXX: self.alternatePartialSums.attrib.get("SUMXX", 0.)})

            if COUNT in self.alternateUpdator.counters:
                self.alternatePartialSums.attrib["COUNT"] = self.alternateUpdator.counters[COUNT]
            self.alternatePartialSums.attrib["SUM1"] = self.alternateUpdator.counters[SUM1]
            self.alternatePartialSums.attrib["SUMX"] = self.alternateUpdator.counters[SUMX]
            self.alternatePartialSums.attrib["SUMXX"] = self.alternateUpdator.counters[SUMXX]

        elif isinstance(self.alternate, pmml.UniformDistribution):
            self.alternateUpdator = self.engine.producerUpdateScheme.updator(MIN, MAX)
            if self.updateExisting:
                self.alternateUpdator.initialize({MIN: self.alternate.attrib["lower"],
                                                  MAX: self.alternate.attrib["upper"]})

        else:
            raise NotImplementedError, "Only production of Gaussian, Poisson, and Uniform distributions has been implemented."

    else:
        self.alternate = None
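
# Sketch only: how the X-ODG-PartialSums attributes maintained above map back
# to distribution parameters.  SUM1 is the (possibly weighted) event count,
# SUMX the sum of values, SUMXX the sum of squares; these helper functions
# are hypothetical, not part of the engine.
def gaussianFromPartialSums(sum1, sumx, sumxx):
    mean = sumx / sum1                 # expects float partial sums, as written above
    variance = sumxx / sum1 - mean**2
    return mean, variance

def poissonFromPartialSums(sum1, sumx):
    return sumx / sum1                 # maximum-likelihood estimate of the Poisson mean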
def initialize(self):
    """Interpret PMML file, set up SegmentRecords list, and initialize all algorithms."""

    self.firstSegment = True

    # set up the header, so that our models can be stamped with time and event number
    header = self.pmmlFile.child(pmml.Header)

    if header.exists(pmml.Extension):
        headerExtension = header.child(pmml.Extension)
    else:
        headerExtension = pmml.Extension()
        header.children.insert(0, headerExtension)

    if headerExtension.exists(pmml.X_ODG_RandomSeed):
        del headerExtension[headerExtension.index(pmml.X_ODG_RandomSeed)]
    augustusRandomSeed = pmml.X_ODG_RandomSeed(value=self.augustusRandomSeed)
    headerExtension.children.append(augustusRandomSeed)

    if headerExtension.exists(pmml.X_ODG_Eventstamp):
        del headerExtension[headerExtension.index(pmml.X_ODG_Eventstamp)]
    self.eventStamp = pmml.X_ODG_Eventstamp(number=0)
    headerExtension.children.append(self.eventStamp)

    if header.exists(pmml.Timestamp):
        del header[header.index(pmml.Timestamp)]
    self.timeStamp = pmml.Timestamp(xmlbase.XMLText(datetime.datetime.today().isoformat()))
    header.children.append(self.timeStamp)

    # select the first model or select a model by name
    if self.modelName is None:
        self.pmmlModel = self.pmmlFile.topModels[0]
    else:
        self.pmmlModel = None
        for model in self.pmmlFile.topModels:
            if "modelName" in model.attrib and model.attrib["modelName"] == self.modelName:
                self.pmmlModel = model
                break
        if self.pmmlModel is None:
            raise RuntimeError, "No model named \"%s\" was found in the PMML file" % self.modelName

    # connect the dataContext to the dataStream, so that events will flow from the input file into the transformations
    self.resetDataStream(self.dataStream)

    # clear the cache of the model's DataContexts (initializes some dictionaries)
    self.pmmlModel.dataContext.clear()

    if self.pmmlModel.dataContext.transformationDictionary:
        self.metadata.data["Transformation dictionary elements"] = len(self.pmmlModel.dataContext.transformationDictionary.cast)
    else:
        self.metadata.data["Transformation dictionary elements"] = 0

    self.segmentRecords = []
    self._lookup = NameSpace(tuples={}, fields={}, other=[])
    SegmentRecord.maturityThreshold = self.maturityThreshold
    SegmentRecord.lockingThreshold = self.lockingThreshold

    if self.pmmlFile.exists(pmml.TransformationDictionary):
        if self.pmmlFile.child(pmml.TransformationDictionary).exists(pmml.Aggregate, maxdepth=None):
            raise NotImplementedError, "Aggregate transformations in the TransformationDictionary are not supported"
        if self.pmmlFile.child(pmml.TransformationDictionary).exists(pmml.X_ODG_AggregateReduce, maxdepth=None):
            raise NotImplementedError, "X-ODG-AggregateReduce transformations in the TransformationDictionary are not supported"

    # MiningModels are special because we handle segmentation at the Engine level
    # (currently no support for MiningModels nested within MiningModels)
    if isinstance(self.pmmlModel, pmml.MiningModel):
        self.pmmlOutput = self.pmmlModel.child(pmml.Output, exception=False)
        segmentation = self.pmmlModel.child(pmml.Segmentation, exception=False)
        # for now, assume a MiningModel without any segments will be populated through autosegmentation

        if self.pmmlModel.exists(pmml.LocalTransformations):
            if self.pmmlModel.child(pmml.LocalTransformations).exists(pmml.Aggregate, maxdepth=None):
                raise NotImplementedError, "Aggregate transformations in the MiningModel's LocalTransformations are not supported"
            if self.pmmlModel.child(pmml.LocalTransformations).exists(pmml.X_ODG_AggregateReduce, maxdepth=None):
                raise NotImplementedError, "X-ODG-AggregateReduce transformations in the MiningModel's LocalTransformations are not supported"
LocalTransformations are not supported" if segmentation.attrib["multipleModelMethod"] == "selectFirst": self.multipleModelMethod = SELECTFIRST elif segmentation.attrib["multipleModelMethod"] == "selectAll": self.multipleModelMethod = SELECTALL else: raise NotImplementedError, "Only 'selectFirst', 'selectAll', and no segmentation have been implemented." self.metadata.data[ "Match all segments"] = self.multipleModelMethod != SELECTFIRST for pmmlSegment in segmentation.matches(pmml.Segment): self._makeSegmentRecord(pmmlSegment) else: self.multipleModelMethod = SELECTONLY segmentRecord = SegmentRecord(self.pmmlModel, None, None, self) modelClass = self.pmmlModel.__class__ algoName = self.producerAlgorithm[ modelClass.__name__].attrib["algorithm"] segmentRecord.consumerAlgorithm = consumerAlgorithmMap[modelClass]( self, segmentRecord) segmentRecord.producerAlgorithm = producerAlgorithmMap[ modelClass, algoName](self, segmentRecord) segmentRecord.producerParameters = self.producerAlgorithm[ modelClass.__name__].parameters self.setProvenance(self.pmmlModel, algoName, segmentRecord.producerAlgorithm, segmentRecord.producerParameters) localTransformations = self.pmmlModel.child( pmml.LocalTransformations, exception=False) if localTransformations is not None: segmentRecord.aggregates = localTransformations.matches( pmml.Aggregate, maxdepth=None) segmentRecord.aggregates.extend( localTransformations.matches(pmml.X_ODG_AggregateReduce, maxdepth=None)) else: segmentRecord.aggregates = [] for aggregate in segmentRecord.aggregates: aggregate.initialize(self.consumerUpdateScheme) self.segmentRecords.append(segmentRecord) self.metadata.data[ "First segment model type"] = segmentRecord.pmmlModel.tag self.reinitialize()
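
# Sketch of the two implemented multipleModelMethod modes dispatched above,
# with predicates reduced to plain callables.  This helper and its selectFirst
# flag are illustrative stand-ins, not the engine's SELECTFIRST/SELECTALL
# machinery.
def matchingSegments(segments, event, selectFirst):
    matches = []
    for predicate, segment in segments:
        if predicate(event):
            matches.append(segment)
            if selectFirst:        # 'selectFirst': score only the first matching segment
                break
    return matches                 # 'selectAll': score every matching segment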
def initialize(self, **params):
    """Initialize a clustering model producer."""

    if "resume" in params:
        self.resume = pmml.boolCheck(params["resume"])
        del params["resume"]
    else:
        self.resume = False

    if "numberOfTrials" in params:
        self.numberOfTrials = int(params["numberOfTrials"])
        del params["numberOfTrials"]
    else:
        self.numberOfTrials = 10

    if "numberToKeep" in params:
        self.numberToKeep = int(params["numberToKeep"])
        del params["numberToKeep"]
    else:
        self.numberToKeep = 3

    if "maturityThreshold" in params:
        self.maturityThreshold = int(params["maturityThreshold"])
        del params["maturityThreshold"]
    else:
        self.maturityThreshold = 100

    if "initialStability" in params:
        self.initialStability = int(params["initialStability"])
        del params["initialStability"]
    else:
        self.initialStability = 100

    if "overrideSignificance" in params:
        self.overrideSignificance = float(params["overrideSignificance"])
        del params["overrideSignificance"]
        if self.overrideSignificance == 0.:
            self.overrideSignificance = None
    else:
        self.overrideSignificance = 5.

    self.model = self.segmentRecord.pmmlModel
    self.dataDistribution = self.engine.producerUpdateScheme.updator(COVARIANCE(self.model.numberOfFields))
    self.distanceMeasure = (self.model.child(pmml.ComparisonMeasure).attrib["kind"] == "distance")

    # put PartialSums in the model if they're not already there; pick up old values if resuming
    extension = self.model.child(pmml.Extension, exception=False)
    if extension is None:
        extension = pmml.Extension()
        self.model.children.append(extension)

    if self.resume:
        self.sumOfDistances = extension.child(lambda x: isinstance(x, pmml.X_ODG_PartialSums) and x.attrib.get("name", None) == "SumOfDistances", exception=False)
    else:
        index = extension.index(lambda x: isinstance(x, pmml.X_ODG_PartialSums) and x.attrib.get("name", None) == "SumOfDistances", exception=False)
        if index is not None:
            del extension[index]
        self.sumOfDistances = None

    if self.sumOfDistances is None:
        self.sumOfDistances = pmml.X_ODG_PartialSums(name="SumOfDistances", COUNT=0, SUM1=0., SUMX=0., SUMXX=0.)
        extension.children.append(self.sumOfDistances)

    self.partialSums = {}
    for theid, cluster in zip(self.model.ids, self.model.cluster):
        for i, field in enumerate(self.model.fields):
            fullname = "%s.%s" % (theid, field)

            if self.resume:
                partialSum = extension.child(lambda x: isinstance(x, pmml.X_ODG_PartialSums) and x.attrib.get("name", None) == fullname, exception=False)
            else:
                index = extension.index(lambda x: isinstance(x, pmml.X_ODG_PartialSums) and x.attrib.get("name", None) == fullname, exception=False)
                if index is not None:
                    del extension[index]
                partialSum = None

            if partialSum is None:
                partialSum = pmml.X_ODG_PartialSums(name=fullname, SUM1=1., SUMX=cluster.value[i])
                extension.children.append(partialSum)

            self.partialSums[fullname] = partialSum

    # create the first trial using the values constructed above (they come from the PMML file if resume is True)
    trialFromPmml = new.instance(TrialClusterSet)
    trialFromPmml.updator = self.engine.producerUpdateScheme.updator(COUNT, SUM1, SUMX, SUMXX)
    trialFromPmml.updator.initialize({COUNT: self.sumOfDistances.attrib["COUNT"],
                                      SUM1: self.sumOfDistances.attrib["SUM1"],
                                      SUMX: self.sumOfDistances.attrib["SUMX"],
                                      SUMXX: self.sumOfDistances.attrib["SUMXX"]})

    trialFromPmml.clusters = []
    for theid, cluster in zip(self.model.ids, self.model.cluster):
        trialCluster = new.instance(TrialCluster)
        trialCluster.fields = []
        trialCluster.initialPosition = []
        for field in self.model.fields:
            partialSum = self.partialSums["%s.%s" % (theid, field)]
            u = self.engine.producerUpdateScheme.updator(SUM1, SUMX)
            u.initialize({SUM1: partialSum.attrib["SUM1"], SUMX: partialSum.attrib["SUMX"]})
            trialCluster.fields.append(u)
        trialCluster.initialPosition = list(cluster.value)
        trialFromPmml.clusters.append(trialCluster)

    self.trials = [trialFromPmml]

    if len(params) > 0:
        raise TypeError, "Unrecognized parameters %s" % params
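
# Sketch of the SUM1/SUMX bookkeeping behind each trialCluster.fields updator
# above: a running (weighted) mean per cluster coordinate.  The real updators
# come from engine.producerUpdateScheme; this stand-in class is illustrative
# only.
class RunningMean(object):
    def __init__(self, sum1=0., sumx=0.):
        # sum1 and sumx would be seeded from X-ODG-PartialSums when resuming
        self.sum1, self.sumx = sum1, sumx
    def increment(self, x, weight=1.):
        self.sum1 += weight
        self.sumx += weight * x
    def mean(self):
        return self.sumx / self.sum1 if self.sum1 > 0. else None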
def produce(self):
    self.resetLoggerLevels()

    extension = self.model.child(pmml.Extension, exception=False)
    if extension is None:
        extension = pmml.Extension()
        self.model.children.append(extension)

    convergence = extension.child(pmml.X_ODG_Convergence, exception=False)
    if convergence is None:
        convergence = pmml.X_ODG_Convergence()
        extension.children.append(convergence)

    numRecords = len(self.buffer[self.SYNCNUMBER])
    if self.logDebug:
        self.logger.debug("KMeansClustering.produce: this segment has %d data records; setting up for cluster production." % numRecords)

    if numRecords == 0:
        self.logger.debug("KMeansClustering.produce: no data in this segment, so there are no clusters to produce.")
        return

    if self.numberOfClusters is not None:
        if self.numberOfClusters > numRecords:
            self.logger.info("KMeansClustering.produce: number of desired clusters (%d) exceeds number of data records (%d), reducing number of clusters to match." % (self.numberOfClusters, numRecords))
            self.model.changeNumberOfClusters(numRecords)
        elif self.numberOfClusters != self.model.numberOfClusters:
            self.model.changeNumberOfClusters(self.numberOfClusters)
    elif self.model.numberOfClusters > numRecords:
        self.logger.info("KMeansClustering.produce: number of desired clusters (%d) exceeds number of data records (%d), reducing number of clusters to match." % (self.model.numberOfClusters, numRecords))
        self.model.changeNumberOfClusters(numRecords)

    # special case that should be easy, but it can cause the standard k-means algorithm to loop forever:
    if self.model.numberOfClusters == numRecords:
        self.logger.debug("KMeansClustering.produce: number of records equals the number of clusters (%d), so we skip the standard algorithm and just assign data points to clusters" % numRecords)
        for i, pmmlCluster in enumerate(self.model.cluster):
            pmmlCluster.value = [self.buffer[field][i] for field in self.model.fields]
            pmmlCluster.attrib["n"] = len(pmmlCluster.value)
        return

    self.trans = numpy.matrix(numpy.identity(len(self.model.fields)))
    self.shift = numpy.matrix(numpy.zeros(len(self.model.fields))).T

    if self.distanceMeasure:
        # characterize the data so that we can generate random numbers with the same distribution
        try:
            covariance = self.dataDistribution.covariance()
        except ZeroDivisionError:
            covariance = INVALID

        if covariance is not INVALID:
            self.shift = self.dataDistribution.covmean()
            try:
                self.trans = numpy.linalg.cholesky(covariance)
            except numpy.linalg.LinAlgError:
                pass  # FIXME: at least make trans a diagonal matrix with stdev entries (or 1/stdev)!
    else:
        raise NotImplementedError("Currently, only clusters with ComparisonMeasure.kind == 'distance' metrics can be produced.")
    # make a new set of trials
    if self.seedSource is ProducerKMeans.RANDOM_DATAPOINTS:
        # pick a random point from the dataset
        def randomization():
            i = random.randint(0, len(self.buffer[self.SYNCNUMBER]) - 1)
            return [self.buffer[field][i] for field in self.model.fields if field is not self.SYNCNUMBER]
        self.randomization = randomization

    elif self.seedSource == ProducerKMeans.RANDOM_DATAWEIGHTED:
        # pick a random point from the dataset, weighted by the weight field
        sumOfWeights = numpy.cumsum(self.buffer[self.model.weightField])
        def randomization():
            x = random.uniform(0., sumOfWeights[-1])
            i = numpy.where(sumOfWeights > x)[0][0]
            return [self.buffer[field][i] for field in self.model.fields if field is not self.SYNCNUMBER]
        self.randomization = randomization

    elif self.seedSource == ProducerKMeans.RANDOM_DATACOVARIANCE:
        # generate a random point from a distribution with a covariance like the data
        self.randomization = lambda: ((self.trans * (numpy.matrix(numpy.random.randn(len(self.shift))).T)) + self.shift)

    elif self.seedSource == ProducerKMeans.RANDOM_UNITRECT:
        # generate a random point in the unit rectangle
        self.randomization = lambda: [random.random() for i in xrange(len(self.shift))]

    self.trials = [TrialClusterSet(self.model.numberOfClusters, self.randomization, self.engine.producerUpdateScheme) for i in xrange(self.numberOfTrials)]

    # prepare small subsamples to run first, to improve convergence when the whole dataset gets used
    allIndices = range(len(self.buffer[self.SYNCNUMBER]))
    quickConvergeSamples = []
    for numEvents in self.quickConvergeSteps:
        if numEvents > len(allIndices):
            numEvents = len(allIndices)
        quickConvergeSamples.append(numpy.array(random.sample(allIndices, numEvents)))

    allIndices = numpy.array(allIndices)
    for key in self.buffer:
        self.buffer[key] = numpy.array(self.buffer[key])

    for i, quickConvergenceSample in enumerate(quickConvergeSamples):
        if self.logDebug:
            self.logger.debug("KMeansClustering.produce: ===== quickConverge %d: preparing for k-means by clustering a random subset of %d events" % (i + 1, len(quickConvergenceSample)))
        self.iterations(quickConvergenceSample)

    self.logger.debug("KMeansClustering.produce: ===== starting k-means clustering algorithm (whole dataset)")
    convergence.attrib["iterations"] = self.iterations()

    # find the best one
    best = None
    for trial in self.trials:
        if trial.hasConverged:
            if best is None or trial.updator.mean() < best.updator.mean():
                best = trial

    convergence.attrib["converged"] = (best is not None)

    if best is None:
        self.logger.error("KMeansClustering.produce: no trial cluster-sets converged within the desired number of iterations (%s), using the best UNCONVERGED set instead." % (str(self.maxIterations) if self.maxIterations is not None else "unset"))
        for trial in self.trials:
            if best is None or trial.updator.mean() < best.updator.mean():
                best = trial

    # write it to the PMML file
    for bestCluster, pmmlCluster in zip(best.clusters, self.model.matches(pmml.Cluster)):
        pmmlCluster.attrib["size"] = bestCluster.count()
        theArray = pmmlCluster.child(pmml.Array)
        theArray.value = bestCluster.initialPosition
        theArray.attrib["n"] = len(theArray.value)
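
# Sketch of the RANDOM_DATACOVARIANCE seeding used above: draw one standard
# normal deviate per field and map it through the Cholesky factor of the data
# covariance, so that trans*z + shift has (approximately) the same covariance
# and mean as the data.  This standalone version uses only numpy; the engine's
# buffers and updators are not needed, and the helper name is hypothetical.
import numpy

def seedLikeData(data):
    # data: an (nEvents, nFields) array of training events
    shift = data.mean(axis=0)
    trans = numpy.linalg.cholesky(numpy.cov(data, rowvar=False))  # raises LinAlgError if singular
    z = numpy.random.randn(data.shape[1])
    return numpy.dot(trans, z) + shift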