def getUpdateScheme(configOptions):
    """Return an UpdateScheme as specified by the user configOptions.

    Arguments:

        configOptions (XML object, defined in xmlbase): An XML element
            of type "Blending"; either <ConsumerBlending/> or
            <ProducerBlending/>; containing the weightings and default
            settings for the model update schemes.
    """
    # no configuration at all: fall back to the plain unweighted scheme
    if configOptions is None:
        return UpdateScheme("unweighted")

    # copy the attributes so the XML element itself is left untouched
    params = dict(configOptions.attrib)

    # "method" selects the scheme; it is consumed here rather than
    # forwarded to the UpdateScheme constructor
    scheme = params.pop("method", "unweighted")
    if scheme == "computerTimeWindowSeconds":
        raise NotImplementedError
    if scheme == "eventTimeWindow":
        scheme = "synchronized"

    # every scheme other than unweighted/exponential is windowed and
    # needs an explicit lag; default it to zero when unspecified
    if scheme not in ("unweighted", "exponential") and "windowLag" not in params:
        params["windowLag"] = 0

    return UpdateScheme(scheme, **params)
def _getUpdateScheme(self, configuration):
    """Return an UpdateScheme as specified by the user configuration.

    Arguments:

        configuration (XML object, defined in xmlbase): An XML element
            of type "Blending"; either <ConsumerBlending/> or
            <ProducerBlending/>; containing the weightings and default
            settings for the model update schemes.
    """
    # absent configuration means the plain unweighted scheme
    if configuration is None:
        return UpdateScheme("unweighted")

    # work on a copy so the XML element's attributes stay intact
    params = configuration.attrib.copy()

    # "method" picks the scheme and is removed from the kwargs
    scheme = params.pop("method", "unweighted")
    if scheme == "eventTimeWindow":
        scheme = "synchronized"

    # windowed schemes require a lag; default to zero when unspecified
    if scheme in ("window", "synchronized"):
        params.setdefault("windowLag", 0)

    return UpdateScheme(scheme, **params)
def initialize(self):
    """Initialize a baseline consumer.

    Unlike other consumers, this creates the score function
    dynamically, depending on the type of testStatistic: self.score is
    bound to the method that handles this segment's statistic.
    """
    testDistributions = self.segmentRecord.pmmlModel.child(pmml.TestDistributions)
    self.field = testDistributions.attrib["field"]
    testStatistic = testDistributions.attrib["testStatistic"]

    # updating can be configured in the Augustus configuration file and in the "windowSize" attribute in this segment
    # I will assume that the "windowSize" attribute can override CUSUM and GLR only
    # (the only other baseline consumer that has an intermediate state is chiSquareDistribution, which only makes
    # sense as UpdateScheme("synchronized"), and that one depends on the configuration of syncNumber, not in PMML)

    # the general case:
    self.updateScheme = self.engine.consumerUpdateScheme

    # the special case: a non-zero "windowSize" in this PMML segment overrides the engine-wide scheme
    if testStatistic in ("CUSUM", "GLR"):
        if "windowSize" in testDistributions.attrib and testDistributions.attrib["windowSize"] != 0:
            self.updateScheme = UpdateScheme("window", windowSize=testDistributions.attrib["windowSize"], windowLag=0)

    if testStatistic == "CUSUM":
        # cumulative-sum test: needs both a baseline and an alternate distribution
        self.baseline = testDistributions.child(pmml.Baseline).child()
        self.alternate = testDistributions.child(pmml.Alternate).child()
        self.updator = self.updateScheme.updator(CUSUM)
        self.updator.resetValue = testDistributions.attrib["resetValue"]
        self.score = self.scoreCUSUM

        # optional X-ODG-CUSUMInitialization extension seeds the updator's state
        extension = testDistributions.child(pmml.Extension, exception=False)
        if extension is not None:
            init = extension.child(pmml.X_ODG_CUSUMInitialization, exception=False)
            if init is not None:
                self.updator.initialize({CUSUM: [init.attrib["value"]]})

        self.pseudoField = self.field
        self.pseudoOutputAll = True

    elif testStatistic == "zValue":
        self.baseline = testDistributions.child(pmml.Baseline).child()
        # Gaussian baselines get the closed-form z-value; other
        # distributions go through the generic CDF-based path
        if isinstance(self.baseline, pmml.GaussianDistribution):
            self.score = self.scoreZValueGaussian
        else:
            self.score = self.scoreZValue
        self.pseudoField = self.field
        self.pseudoOutputAll = True

    elif testStatistic in ("chiSquareDistribution", "scalarProduct"):
        # histogram-comparison tests: observed bin weights accumulate in self.updators
        self.updators = {}
        self.countTable = testDistributions.child(pmml.Baseline).child()

        if "weightField" in testDistributions.attrib:
            self.weightField = testDistributions.attrib["weightField"]
        else:
            self.weightField = None

        # NOTE(review): an unrecognized normalizationScheme value leaves
        # self.normalizationScheme unset here — presumably the attribute
        # is validated upstream by the PMML loader; confirm.
        if "normalizationScheme" not in testDistributions.attrib:
            self.normalizationScheme = None
        elif testDistributions.attrib["normalizationScheme"] == "Independent":
            self.normalizationScheme = self.INDEPENDENT
        elif testDistributions.attrib["normalizationScheme"] == "SizeWeighted":
            self.normalizationScheme = self.SIZEWEIGHTED

        self.testStatistic = {"chiSquareDistribution": self.CHISQUAREDISTRIBUTION, "scalarProduct": self.SCALARPRODUCT, }[testStatistic]
        self.score = self.scoreHistogram
        self.pseudoField = (self.field, self.weightField)
        self.pseudoOutputAll = False

        # ODG extensions
        self.binsOfInterest = testDistributions.descendant(pmml.X_ODG_BinsOfInterest, exception=False)

    elif testStatistic == "chiSquareIndependence":
        # fields and countTable are discovered lazily on the first scored event
        self.baseline = testDistributions.child(pmml.Baseline)
        self.fields = None
        self.countTable = None
        self.score = self.scoreChiSquareIndependence
        self.pseudoField = None
        self.pseudoOutputAll = True

    # ODG extensions
    elif testStatistic == "GLR":
        self.baseline = testDistributions.child(pmml.Baseline).child()
        if not isinstance(self.baseline, (pmml.GaussianDistribution, pmml.PoissonDistribution)):
            raise NotImplementedError, "GLR has only been implemented for Gaussian and Poisson distributions"
        self.updator = self.updateScheme.updator(GLR)
        self.score = self.scoreGLR
        self.pseudoField = self.field
        self.pseudoOutputAll = False
class ConsumerBaselineModel(ConsumerAlgorithm):
    # Atoms used as symbolic constants for the histogram test statistic
    # and its normalization scheme (compared with `is` in scoreHistogram).
    CHISQUAREDISTRIBUTION = Atom("chiSquareDistribution")
    SCALARPRODUCT = Atom("scalarProduct")
    INDEPENDENT = Atom("Independent")
    SIZEWEIGHTED = Atom("SizeWeighted")

    def initialize(self):
        """Initialize a baseline consumer.

        Unlike other consumers, this creates the score function
        dynamically, depending on the type of testStatistic: self.score
        is bound to the method that handles this segment's statistic.
        """
        testDistributions = self.segmentRecord.pmmlModel.child(pmml.TestDistributions)
        self.field = testDistributions.attrib["field"]
        testStatistic = testDistributions.attrib["testStatistic"]

        # updating can be configured in the Augustus configuration file and in the "windowSize" attribute in this segment
        # I will assume that the "windowSize" attribute can override CUSUM and GLR only
        # (the only other baseline consumer that has an intermediate state is chiSquareDistribution, which only makes
        # sense as UpdateScheme("synchronized"), and that one depends on the configuration of syncNumber, not in PMML)

        # the general case:
        self.updateScheme = self.engine.consumerUpdateScheme

        # the special case: a non-zero "windowSize" in PMML overrides the engine-wide scheme
        if testStatistic in ("CUSUM", "GLR"):
            if "windowSize" in testDistributions.attrib and testDistributions.attrib["windowSize"] != 0:
                self.updateScheme = UpdateScheme("window", windowSize=testDistributions.attrib["windowSize"], windowLag=0)

        if testStatistic == "CUSUM":
            # cumulative-sum test: needs both a baseline and an alternate distribution
            self.baseline = testDistributions.child(pmml.Baseline).child()
            self.alternate = testDistributions.child(pmml.Alternate).child()
            self.updator = self.updateScheme.updator(CUSUM)
            self.updator.resetValue = testDistributions.attrib["resetValue"]
            self.score = self.scoreCUSUM

            # optional X-ODG-CUSUMInitialization extension seeds the updator's state
            extension = testDistributions.child(pmml.Extension, exception=False)
            if extension is not None:
                init = extension.child(pmml.X_ODG_CUSUMInitialization, exception=False)
                if init is not None:
                    self.updator.initialize({CUSUM: [init.attrib["value"]]})

            self.pseudoField = self.field
            self.pseudoOutputAll = True

        elif testStatistic == "zValue":
            self.baseline = testDistributions.child(pmml.Baseline).child()
            # Gaussian baselines get the closed-form z-value; others use the generic CDF path
            if isinstance(self.baseline, pmml.GaussianDistribution):
                self.score = self.scoreZValueGaussian
            else:
                self.score = self.scoreZValue
            self.pseudoField = self.field
            self.pseudoOutputAll = True

        elif testStatistic in ("chiSquareDistribution", "scalarProduct"):
            # histogram-comparison tests: observed bin weights accumulate in self.updators
            self.updators = {}
            self.countTable = testDistributions.child(pmml.Baseline).child()

            if "weightField" in testDistributions.attrib:
                self.weightField = testDistributions.attrib["weightField"]
            else:
                self.weightField = None

            # NOTE(review): an unrecognized normalizationScheme value leaves
            # self.normalizationScheme unset — presumably validated upstream; confirm.
            if "normalizationScheme" not in testDistributions.attrib:
                self.normalizationScheme = None
            elif testDistributions.attrib["normalizationScheme"] == "Independent":
                self.normalizationScheme = self.INDEPENDENT
            elif testDistributions.attrib["normalizationScheme"] == "SizeWeighted":
                self.normalizationScheme = self.SIZEWEIGHTED

            self.testStatistic = {"chiSquareDistribution": self.CHISQUAREDISTRIBUTION, "scalarProduct": self.SCALARPRODUCT, }[testStatistic]
            self.score = self.scoreHistogram
            self.pseudoField = (self.field, self.weightField)
            self.pseudoOutputAll = False

            # ODG extensions
            self.binsOfInterest = testDistributions.descendant(pmml.X_ODG_BinsOfInterest, exception=False)

        elif testStatistic == "chiSquareIndependence":
            # fields and countTable are discovered lazily on the first scored event
            self.baseline = testDistributions.child(pmml.Baseline)
            self.fields = None
            self.countTable = None
            self.score = self.scoreChiSquareIndependence
            self.pseudoField = None
            self.pseudoOutputAll = True

        # ODG extensions
        elif testStatistic == "GLR":
            self.baseline = testDistributions.child(pmml.Baseline).child()
            if not isinstance(self.baseline, (pmml.GaussianDistribution, pmml.PoissonDistribution)):
                raise NotImplementedError, "GLR has only been implemented for Gaussian and Poisson distributions"
            self.updator = self.updateScheme.updator(GLR)
            self.score = self.scoreGLR
            self.pseudoField = self.field
            self.pseudoOutputAll = False

    ######################################## CUSUM

    def scoreCUSUM(self, syncNumber, get):
        """Score one event with a CUSUM testStatistic.

        Feeds the log-likelihood ratio (alternate vs. baseline) of the
        event into the CUSUM updator and returns the current cusum value.
        """
        value = get(self.field)
        if value is INVALID or value is MISSING:
            return INVALID
        self.updator.increment(syncNumber, self.alternate.logpdf(value) - self.baseline.logpdf(value))
        return {SCORE_predictedValue: self.updator.cusum()}

    ######################################## zValue

    def scoreZValueGaussian(self, syncNumber, get):
        """Score one event with a zValue testStatistic (Gaussian)."""
        value = get(self.field)
        if value is INVALID or value is MISSING:
            return INVALID
        # degenerate/invalid variance cases are handled before dividing
        if self.baseline.attrib["variance"] == 0.:
            return {SCORE_predictedValue: float("inf"), SCORE_pValue: 0.}
        elif self.baseline.attrib["variance"] < 0.:
            return INVALID
        zValue = (value - self.baseline.attrib["mean"]) / math.sqrt(self.baseline.attrib["variance"])
        probability = self.baseline.cdf(value)
        # two-sided p-value from the CDF
        pValue = 1. - 2. * abs(probability - 0.5)
        return {SCORE_predictedValue: zValue, SCORE_pValue: pValue}

    def scoreZValue(self, syncNumber, get):
        """Score one event with a zValue testStatistic (non-Gaussian).

        Maps the baseline CDF value to an equivalent Gaussian z-value
        via the inverse error function, clamped to +/-10 near the tails.
        """
        value = get(self.field)
        if value is INVALID or value is MISSING:
            return INVALID
        probability = self.baseline.cdf(value)
        if probability <= 1e-16:
            zValue = -10.
        elif probability >= 1. - 1e-16:
            zValue = 10.
        else:
            zValue = math.sqrt(2.) * erfinv(2. * probability - 1.)
        pValue = 1. - 2. * abs(probability - 0.5)
        return {SCORE_predictedValue: zValue, SCORE_pValue: pValue}

    ######################################## chiSquareDistribution and scalarProduct

    def scoreHistogram(self, syncNumber, get):
        """Score one event with a chiSquareDistribution or scalarProduct."""
        value = get(self.field)
        if self.weightField is None:
            weight = 1.
        else:
            weight = get(self.weightField)

        # we can still calculate the consistency of the *accumulated* distribution, even if this datapoint is invalid
        if value is INVALID or value is MISSING or weight is INVALID or weight is MISSING:
            pass
        else:
            # for histograms, increment all bins, but only the correct bin gets a non-zero value
            found = False
            for bin, updator in self.updators.items():
                if bin == value:
                    updator.increment(syncNumber, weight)
                    found = True
                else:
                    updator.increment(syncNumber, 0.)
            # this might be a new bin
            if not found:
                updator = self.updateScheme.updator(SUMX)
                updator.increment(syncNumber, weight)
                self.updators[value] = updator

        # expected counts come from the PMML count table
        fieldValueCounts = self.countTable.matches(pmml.FieldValueCount, maxdepth=None)

        # chiSquareDistribution
        if self.testStatistic == self.CHISQUAREDISTRIBUTION:
            expectedTotal = 0.
            expectedValues = {}
            for fieldValueCount in fieldValueCounts:
                bin = fieldValueCount.attrib["value"]
                count = fieldValueCount.attrib["count"]
                expectedTotal += count
                expectedValues[bin] = count

            observedTotal = 0.
            for bin, updator in self.updators.items():
                observedTotal += updator.sum()

            # cannot normalize empty distributions (or a non-positive sample size)
            if expectedTotal <= 0. or observedTotal <= 0. or (isinstance(self.countTable, pmml.NormalizedCountTable) and self.countTable.attrib["sample"] <= 0.):
                return INVALID

            chi2 = 0.
            if self.binsOfInterest is None:
                ndf = -1  # normalization removes one degree of freedom
            else:
                ndf = 0

            # iterate over the union of expected and observed bins
            for bin in set(expectedValues.keys()).union(set(self.updators.keys())):
                if self.binsOfInterest is not None:
                    # restrict to the ODG-specified bins of interest
                    if bin not in self.binsOfInterest:
                        continue
                expected = expectedValues.get(bin, 0.)
                updator = self.updators.get(bin, None)
                if updator is not None:
                    observed = updator.sum()
                else:
                    observed = 0.
                if expected > 0. or observed > 0.:
                    if isinstance(self.countTable, pmml.CountTable):
                        chi2 += (expected / expectedTotal - observed / observedTotal)**2 / (expected / expectedTotal**2 + observed / observedTotal**2)
                    elif isinstance(self.countTable, pmml.NormalizedCountTable):
                        # normalized tables carry an explicit sample size
                        sample = self.countTable.attrib["sample"]
                        chi2 += (expected / expectedTotal - observed / observedTotal)**2 / (expected / expectedTotal / sample + observed / observedTotal**2)
                    ndf += 1

            if ndf > 0:
                probability = chiSquare_cdf(chi2, ndf)
                pValue = 1. - probability
                return {SCORE_predictedValue: probability, SCORE_pValue: pValue, SCORE_chiSquare: chi2, SCORE_degreesOfFreedom: ndf}
            else:
                return INVALID

        # scalarProduct
        elif self.testStatistic == self.SCALARPRODUCT:
            expectedNorm2 = 0.
            dotProduct = 0.
            for fieldValueCount in fieldValueCounts:
                expected = fieldValueCount.attrib["count"]
                expectedNorm2 += expected**2
                bin = fieldValueCount.attrib["value"]
                if expected > 0. and bin in self.updators:
                    observed = self.updators[bin].sum()
                    dotProduct += expected * observed

            observedNorm2 = 0.
            for updator in self.updators.values():
                observed = updator.sum()
                observedNorm2 += observed**2

            if expectedNorm2 > 0. and observedNorm2 > 0.:
                if self.normalizationScheme is None:
                    return {SCORE_predictedValue: dotProduct}
                elif self.normalizationScheme is self.INDEPENDENT:
                    # NOTE(review): this guard is unreachable here (outer
                    # condition already requires both norms > 0) — harmless
                    if expectedNorm2 <= 0. or observedNorm2 <= 0.:
                        return INVALID
                    return {SCORE_predictedValue: dotProduct / math.sqrt(expectedNorm2) / math.sqrt(observedNorm2)}
                elif self.normalizationScheme is self.SIZEWEIGHTED:
                    if expectedNorm2 + observedNorm2 <= 0.:
                        return INVALID
                    return {SCORE_predictedValue: 2. * dotProduct / (expectedNorm2 + observedNorm2)}
            else:
                return INVALID

    ######################################## chiSquareIndependence

    def _chiSquareIndependence_add(self, pmmlNode, fieldValues, totals):
        # Recursively walk the nested count table, accumulating the grand
        # total in totals[None] and per-field marginal totals in
        # totals[field][value].
        if isinstance(pmmlNode, (pmml.CountTable, pmml.NormalizedCountTable, pmml.FieldValue)):
            for child in pmmlNode:
                self._chiSquareIndependence_add(child, fieldValues + [child.attrib["value"]], totals)

        elif isinstance(pmmlNode, pmml.FieldValueCount):
            count = pmmlNode.attrib["count"]
            totals[None] += count
            for f, v in zip(self.fields, fieldValues):
                if v not in totals[f]:
                    totals[f][v] = 0.
                totals[f][v] += count

    def _chiSquareIndependence_chi2(self, pmmlNode, fieldValues, totals):
        # Recursively sum the chi-square contribution of every leaf cell;
        # returns None when the statistic is undefined (zero totals).
        if isinstance(pmmlNode, (pmml.CountTable, pmml.NormalizedCountTable, pmml.FieldValue)):
            output = 0.
            for child in pmmlNode:
                subchi2 = self._chiSquareIndependence_chi2(child, fieldValues + [child.attrib["value"]], totals)
                if subchi2 is None:
                    return None
                output += subchi2
            return output

        elif isinstance(pmmlNode, pmml.FieldValueCount):
            observed = pmmlNode.attrib["count"]
            if totals[None] == 0:
                return None
            else:
                if isinstance(self.countTable, pmml.NormalizedCountTable):
                    # rescale normalized counts back to the stated sample size
                    scale = self.countTable.attrib["sample"] / totals[None]
                else:
                    scale = 1.
                # expected cell count under independence: product of marginals over total^(k-1)
                expected = 1. / (totals[None] * scale)**(len(self.fields) - 1)
                for f, v in zip(self.fields, fieldValues):
                    expected *= (totals[f][v] * scale)
                if expected == 0.:
                    return None
                else:
                    return (expected - (observed * scale))**2 / expected

    def scoreChiSquareIndependence(self, syncNumber, get):
        """Score one event with a chiSquareIndependence testStatistic.

        This reads from the multi-dimensional CountTable in PMML and
        ignores the data!  Data are only used to make the CountTable, so
        be sure to be running the producer if you want
        chiSquareIndependence.
        """
        # expect a CountTable (if it doesn't exist, the producer will make it)
        self.countTable = self.baseline.child()
        if not isinstance(self.countTable, (pmml.CountTable, pmml.NormalizedCountTable)):
            return INVALID  # the "first" time doesn't happen until we see a count table

        # discover the field names by walking one branch down to a leaf
        self.fields = []
        dimension = self.countTable.child(pmml.nonExtension)
        while True:
            self.fields.append(dimension.attrib["field"])
            if isinstance(dimension, pmml.FieldValueCount):
                break
            dimension = dimension.child(pmml.nonExtension)

        totals = {None: 0.}
        for f in self.fields:
            totals[f] = {}

        # every time: add up the n-field margins (which are "rows and columns" in 2-field case)
        self._chiSquareIndependence_add(self.countTable, [], totals)

        chi2 = self._chiSquareIndependence_chi2(self.countTable, [], totals)

        # degrees of freedom: product of (levels - 1) over all fields
        ndf = 1
        for f, tot in totals.items():
            if f is not None:
                ndf *= (len(tot) - 1)

        if chi2 is not None and ndf > 0:
            probability = chiSquare_cdf(chi2, ndf)
            pValue = 1. - probability
            return {SCORE_predictedValue: probability, SCORE_pValue: pValue, SCORE_chiSquare: chi2, SCORE_degreesOfFreedom: ndf}
        else:
            return INVALID

    ######################################## ODG-extension: GLR

    def _scoreGLR_GaussianDistribution(self, s, N):
        # Gaussian GLR kernel: squared deviation of the windowed sum s
        # (over N events) from N times the baseline mean.
        return (s - N * self.baseline.attrib["mean"])**2 / N

    def _scoreGLR_PoissonDistribution(self, s, N):
        # Poisson GLR kernel; the s*log(s/N) term is dropped when s == 0
        # (its limit is zero there, and log(0) would fail).
        if s > 0.:
            return -math.log(self.baseline.attrib["mean"]) * s + math.log(s / N) * s + N * self.baseline.attrib["mean"] - s
        else:
            return -math.log(self.baseline.attrib["mean"]) * s + N * self.baseline.attrib["mean"] - s

    def scoreGLR(self, syncNumber, get):
        """Score one event with a GLR testStatistic.

        Output is the *current* best-guess of the turn-around time (as
        the corresponding syncNumber) and its log-likelihood ratio.
        """
        # Eq. 2.4.40 in Basseville and Nikiforov: http://www.irisa.fr/sisthem/kniga/ (partly in eventweighting.py)
        value = get(self.field)
        if value is not INVALID and value is not MISSING:
            self.updator.increment(syncNumber, value)

        if isinstance(self.baseline, pmml.GaussianDistribution):
            maximum_syncNumber, maximum = self.updator.glr(self._scoreGLR_GaussianDistribution)
            if maximum is None or self.baseline.attrib["variance"] < 0.:
                return INVALID
            elif self.baseline.attrib["variance"] == 0.:
                return {SCORE_predictedValue: float("inf"), SCORE_thresholdTime: maximum_syncNumber}
            else:
                return {SCORE_predictedValue: maximum / 2. / self.baseline.attrib["variance"], SCORE_thresholdTime: maximum_syncNumber}

        elif isinstance(self.baseline, pmml.PoissonDistribution):
            maximum_syncNumber, maximum = self.updator.glr(self._scoreGLR_PoissonDistribution)
            if maximum is None:
                return INVALID
            else:
                return {SCORE_predictedValue: maximum, SCORE_thresholdTime: maximum_syncNumber}
def initialize(self):
    """Initialize a baseline consumer.

    Unlike other consumers, this creates the score function
    dynamically, depending on the type of testStatistic.
    """
    dists = self.segmentRecord.pmmlModel.child(pmml.TestDistributions)
    attrs = dists.attrib
    self.field = attrs["field"]
    statistic = attrs["testStatistic"]

    # updating can be configured in the Augustus configuration file and in the
    # "windowSize" attribute in this segment; the "windowSize" attribute can
    # override CUSUM and GLR only (the only other baseline consumer that has
    # an intermediate state is chiSquareDistribution, which only makes sense
    # as UpdateScheme("synchronized") and depends on syncNumber, not PMML)

    # default: the engine-wide consumer update scheme
    self.updateScheme = self.engine.consumerUpdateScheme

    # override: a non-zero "windowSize" in this segment's PMML
    if statistic in ("CUSUM", "GLR") and attrs.get("windowSize", 0) != 0:
        self.updateScheme = UpdateScheme("window", windowSize=attrs["windowSize"], windowLag=0)

    if statistic == "CUSUM":
        # cumulative-sum test: baseline and alternate distributions required
        self.baseline = dists.child(pmml.Baseline).child()
        self.alternate = dists.child(pmml.Alternate).child()
        self.updator = self.updateScheme.updator(CUSUM)
        self.updator.resetValue = attrs["resetValue"]
        self.score = self.scoreCUSUM

        # an optional extension may seed the CUSUM state
        extension = dists.child(pmml.Extension, exception=False)
        if extension is not None:
            init = extension.child(pmml.X_ODG_CUSUMInitialization, exception=False)
            if init is not None:
                self.updator.initialize({CUSUM: [init.attrib["value"]]})

        self.pseudoField = self.field
        self.pseudoOutputAll = True

    elif statistic == "zValue":
        self.baseline = dists.child(pmml.Baseline).child()
        # closed-form path for Gaussian baselines, generic CDF path otherwise
        self.score = self.scoreZValueGaussian if isinstance(self.baseline, pmml.GaussianDistribution) else self.scoreZValue
        self.pseudoField = self.field
        self.pseudoOutputAll = True

    elif statistic in ("chiSquareDistribution", "scalarProduct"):
        # histogram-comparison tests accumulate observed weights per bin
        self.updators = {}
        self.countTable = dists.child(pmml.Baseline).child()
        self.weightField = attrs.get("weightField", None)

        if "normalizationScheme" not in attrs:
            self.normalizationScheme = None
        elif attrs["normalizationScheme"] == "Independent":
            self.normalizationScheme = self.INDEPENDENT
        elif attrs["normalizationScheme"] == "SizeWeighted":
            self.normalizationScheme = self.SIZEWEIGHTED

        self.testStatistic = {
            "chiSquareDistribution": self.CHISQUAREDISTRIBUTION,
            "scalarProduct": self.SCALARPRODUCT,
        }[statistic]
        self.score = self.scoreHistogram
        self.pseudoField = (self.field, self.weightField)
        self.pseudoOutputAll = False

        # ODG extensions
        self.binsOfInterest = dists.descendant(pmml.X_ODG_BinsOfInterest, exception=False)

    elif statistic == "chiSquareIndependence":
        # fields and countTable are resolved lazily at scoring time
        self.baseline = dists.child(pmml.Baseline)
        self.fields = None
        self.countTable = None
        self.score = self.scoreChiSquareIndependence
        self.pseudoField = None
        self.pseudoOutputAll = True

    # ODG extensions
    elif statistic == "GLR":
        self.baseline = dists.child(pmml.Baseline).child()
        if not isinstance(self.baseline, (pmml.GaussianDistribution, pmml.PoissonDistribution)):
            raise NotImplementedError("GLR has only been implemented for Gaussian and Poisson distributions")
        self.updator = self.updateScheme.updator(GLR)
        self.score = self.scoreGLR
        self.pseudoField = self.field
        self.pseudoOutputAll = False
class ConsumerBaselineModel(ConsumerAlgorithm): CHISQUAREDISTRIBUTION = Atom("chiSquareDistribution") SCALARPRODUCT = Atom("scalarProduct") INDEPENDENT = Atom("Independent") SIZEWEIGHTED = Atom("SizeWeighted") def initialize(self): """Initialize a baseline consumer. Unlike other consumers, this creates the score function dynamically, depending on the type of testStatistic. """ testDistributions = self.segmentRecord.pmmlModel.child(pmml.TestDistributions) self.field = testDistributions.attrib["field"] testStatistic = testDistributions.attrib["testStatistic"] # updating can be configured in the Augustus configuration file and in the "windowSize" attribute in this segment # I will assume that the "windowSize" attribute can override CUSUM and GLR only # (the only other baseline consumer that has an intermediate state is chiSquareDistribution, which only makes # sense as UpdateScheme("synchronized"), and that one depends on the configuration of syncNumber, not in PMML) # the general case: self.updateScheme = self.engine.consumerUpdateScheme # the special case: if testStatistic in ("CUSUM", "GLR"): if "windowSize" in testDistributions.attrib and testDistributions.attrib["windowSize"] != 0: self.updateScheme = UpdateScheme("window", windowSize=testDistributions.attrib["windowSize"], windowLag=0) if testStatistic == "CUSUM": self.baseline = testDistributions.child(pmml.Baseline).child() self.alternate = testDistributions.child(pmml.Alternate).child() self.updator = self.updateScheme.updator(CUSUM) self.updator.resetValue = testDistributions.attrib["resetValue"] self.score = self.scoreCUSUM extension = testDistributions.child(pmml.Extension, exception=False) if extension is not None: init = extension.child(pmml.X_ODG_CUSUMInitialization, exception=False) if init is not None: self.updator.initialize({CUSUM: [init.attrib["value"]]}) self.pseudoField = self.field self.pseudoOutputAll = True elif testStatistic == "zValue": self.baseline = 
testDistributions.child(pmml.Baseline).child() if isinstance(self.baseline, pmml.GaussianDistribution): self.score = self.scoreZValueGaussian else: self.score = self.scoreZValue self.pseudoField = self.field self.pseudoOutputAll = True elif testStatistic in ("chiSquareDistribution", "scalarProduct"): self.updators = {} self.countTable = testDistributions.child(pmml.Baseline).child() if "weightField" in testDistributions.attrib: self.weightField = testDistributions.attrib["weightField"] else: self.weightField = None if "normalizationScheme" not in testDistributions.attrib: self.normalizationScheme = None elif testDistributions.attrib["normalizationScheme"] == "Independent": self.normalizationScheme = self.INDEPENDENT elif testDistributions.attrib["normalizationScheme"] == "SizeWeighted": self.normalizationScheme = self.SIZEWEIGHTED self.testStatistic = {"chiSquareDistribution": self.CHISQUAREDISTRIBUTION, "scalarProduct": self.SCALARPRODUCT, }[testStatistic] self.score = self.scoreHistogram self.pseudoField = (self.field, self.weightField) self.pseudoOutputAll = False # ODG extensions self.binsOfInterest = testDistributions.descendant(pmml.X_ODG_BinsOfInterest, exception=False) elif testStatistic == "chiSquareIndependence": self.baseline = testDistributions.child(pmml.Baseline) self.fields = None self.countTable = None self.score = self.scoreChiSquareIndependence self.pseudoField = None self.pseudoOutputAll = True # ODG extensions elif testStatistic == "GLR": self.baseline = testDistributions.child(pmml.Baseline).child() if not isinstance(self.baseline, (pmml.GaussianDistribution, pmml.PoissonDistribution)): raise NotImplementedError, "GLR has only been implemented for Gaussian and Poisson distributions" self.updator = self.updateScheme.updator(GLR) self.score = self.scoreGLR self.pseudoField = self.field self.pseudoOutputAll = False ######################################## CUSUM def scoreCUSUM(self, syncNumber, get): """Score one event with a CUSUM 
testStatistic.""" self.resetLoggerLevels() value = get(self.field) if value is INVALID or value is MISSING: self.lastScore = INVALID self.logger.debug("scoreCUSUM: returning INVALID score") return self.lastScore self.updator.increment(syncNumber, self.alternate.logpdf(value) - self.baseline.logpdf(value)) self.lastScore = {SCORE_predictedValue: self.updator.cusum()} return self.lastScore ######################################## zValue def scoreZValueGaussian(self, syncNumber, get): """Score one event with a zValue testStatistic (Gaussian).""" self.resetLoggerLevels() value = get(self.field) if value is INVALID or value is MISSING: self.lastScore = INVALID self.logger.debug("scoreZValueGaussian: returning INVALID score") return self.lastScore if self.baseline.attrib["variance"] == 0.: self.lastScore = {SCORE_predictedValue: float("inf"), SCORE_pValue: 0.} self.logger.debug("scoreZValueGaussian: returning infinite score") return self.lastScore elif self.baseline.attrib["variance"] < 0.: self.logger.debug("scoreZValueGaussian: returning INVALID score") self.lastScore = INVALID return self.lastScore zValue = (value - self.baseline.attrib["mean"]) / math.sqrt(self.baseline.attrib["variance"]) probability = self.baseline.cdf(value) pValue = 1. - 2.*abs(probability - 0.5) self.lastScore = {SCORE_predictedValue: zValue, SCORE_pValue: pValue} return self.lastScore def scoreZValue(self, syncNumber, get): """Score one event with a zValue testStatistic (non-Gaussian).""" self.resetLoggerLevels() value = get(self.field) if value is INVALID or value is MISSING: self.lastScore = INVALID self.logger.debug("scoreZValue: returning INVALID score") return self.lastScore probability = self.baseline.cdf(value) if probability <= 1e-16: zValue = -10. elif probability >= 1. - 1e-16: zValue = 10. else: zValue = math.sqrt(2.)*erfinv(2.*probability - 1.) pValue = 1. 
- 2.*abs(probability - 0.5) self.lastScore = {SCORE_predictedValue: zValue, SCORE_pValue: pValue} return self.lastScore ######################################## chiSquareDistribution and scalarProduct def scoreHistogram(self, syncNumber, get): """Score one event with a chiSquareDistribution or scalarProduct.""" self.resetLoggerLevels() value = get(self.field) if self.weightField is None: weight = 1. else: weight = get(self.weightField) # we can still calculate the consistency of the *accumulated* distribution, even if this datapoint is invalid if value is INVALID or value is MISSING or weight is INVALID or weight is MISSING: pass else: # for histograms, increment all bins, but only the correct bin gets a non-zero value found = False for bin, updator in self.updators.items(): if bin == value: updator.increment(syncNumber, weight) found = True else: updator.increment(syncNumber, 0.) # this might be a new bin if not found: updator = self.updateScheme.updator(SUMX) updator.increment(syncNumber, weight) self.updators[value] = updator fieldValueCounts = self.countTable.matches(pmml.FieldValueCount, maxdepth=None) # chiSquareDistribution if self.testStatistic == self.CHISQUAREDISTRIBUTION: expectedTotal = 0. expectedValues = {} for fieldValueCount in fieldValueCounts: bin = fieldValueCount.attrib["value"] count = fieldValueCount.attrib["count"] expectedTotal += count expectedValues[bin] = count observedTotal = 0. for bin, updator in self.updators.items(): observedTotal += updator.sum() if expectedTotal <= 0. or observedTotal <= 0. or (isinstance(self.countTable, pmml.NormalizedCountTable) and self.countTable.attrib["sample"] <= 0.): self.lastScore = INVALID self.logger.debug("scoreHistogram: returning INVALID score") return self.lastScore chi2 = 0. 
            # Degrees of freedom: one per contributing bin.  When every bin
            # participates, normalization removes one degree of freedom.
            if self.binsOfInterest is None:
                ndf = -1  # normalization removes one degree of freedom
            else:
                ndf = 0

            # Loop over the union of baseline (expected) and observed bins.
            for bin in set(expectedValues.keys()).union(set(self.updators.keys())):
                if self.binsOfInterest is not None:
                    if bin not in self.binsOfInterest:
                        continue  # restrict the test to the configured bins only

                expected = expectedValues.get(bin, 0.)
                updator = self.updators.get(bin, None)
                if updator is not None:
                    observed = updator.sum()
                else:
                    observed = 0.

                if expected > 0. or observed > 0.:
                    # Accumulate this bin's chi^2 term; the variance term in the
                    # denominator differs for raw versus normalized count tables.
                    if isinstance(self.countTable, pmml.CountTable):
                        chi2 += (expected/expectedTotal - observed/observedTotal)**2 / (expected/expectedTotal**2 + observed/observedTotal**2)

                    elif isinstance(self.countTable, pmml.NormalizedCountTable):
                        # "sample" rescales the normalized counts to a real sample size
                        sample = self.countTable.attrib["sample"]
                        chi2 += (expected/expectedTotal - observed/observedTotal)**2 / (expected/expectedTotal/sample + observed/observedTotal**2)

                    ndf += 1

            if ndf > 0:
                probability = chiSquare_cdf(chi2, ndf)
                pValue = 1. - probability
                self.lastScore = {SCORE_predictedValue: probability, SCORE_pValue: pValue, SCORE_chiSquare: chi2, SCORE_degreesOfFreedom: ndf}
            else:
                self.lastScore = INVALID
                self.logger.debug("scoreHistogram: returning INVALID score")
            return self.lastScore

        # scalarProduct
        elif self.testStatistic == self.SCALARPRODUCT:
            # Dot product between the expected (baseline) histogram and the
            # observed histogram, with optional normalization below.
            expectedNorm2 = 0.
            dotProduct = 0.
            for fieldValueCount in fieldValueCounts:
                expected = fieldValueCount.attrib["count"]
                expectedNorm2 += expected**2
                bin = fieldValueCount.attrib["value"]
                if expected > 0. and bin in self.updators:
                    observed = self.updators[bin].sum()
                    dotProduct += expected * observed

            observedNorm2 = 0.
            for updator in self.updators.values():
                observed = updator.sum()
                observedNorm2 += observed**2

            if expectedNorm2 > 0. and observedNorm2 > 0.:
                if self.normalizationScheme is None:
                    self.lastScore = {SCORE_predictedValue: dotProduct}

                elif self.normalizationScheme is self.INDEPENDENT:
                    # NOTE(review): this guard looks unreachable -- the
                    # enclosing branch already requires both norms > 0.
                    if expectedNorm2 <= 0. or observedNorm2 <= 0.:
                        self.lastScore = INVALID
                        self.logger.debug("scoreHistogram: returning INVALID score")
                        return self.lastScore
                    # cosine-style normalization by each histogram's norm
                    self.lastScore = {SCORE_predictedValue: dotProduct/math.sqrt(expectedNorm2)/math.sqrt(observedNorm2)}

                elif self.normalizationScheme is self.SIZEWEIGHTED:
                    if expectedNorm2 + observedNorm2 <= 0.:
                        self.lastScore = INVALID
                        self.logger.debug("scoreHistogram: returning INVALID score")
                        return self.lastScore
                    self.lastScore = {SCORE_predictedValue: 2.*dotProduct/(expectedNorm2 + observedNorm2)}

            else:
                self.lastScore = INVALID
                self.logger.debug("scoreHistogram: returning INVALID score")
            return self.lastScore

    ######################################## chiSquareIndependence

    def _chiSquareIndependence_add(self, pmmlNode, fieldValues, totals):
        """Recursively accumulate the grand total and per-field marginal
        totals from a (possibly nested) count table.

        Arguments:
            pmmlNode: current node of the CountTable/NormalizedCountTable tree.
            fieldValues (list): field values on the path from the root down
                to pmmlNode.
            totals (dict): {None: grandTotal, fieldName: {value: marginal}};
                mutated in place.
        """
        if isinstance(pmmlNode, (pmml.CountTable, pmml.NormalizedCountTable, pmml.FieldValue)):
            # interior node: recurse into each child, extending the value path
            for child in pmmlNode:
                self._chiSquareIndependence_add(child, fieldValues + [child.attrib["value"]], totals)

        elif isinstance(pmmlNode, pmml.FieldValueCount):
            # leaf cell: add its count to the grand total and to every marginal
            count = pmmlNode.attrib["count"]
            totals[None] += count
            for f, v in zip(self.fields, fieldValues):
                if v not in totals[f]:
                    totals[f][v] = 0.
                totals[f][v] += count

    def _chiSquareIndependence_chi2(self, pmmlNode, fieldValues, totals):
        """Recursively sum the chi^2 contribution of every leaf cell, using
        the marginals collected by _chiSquareIndependence_add.

        Returns None when the statistic is undefined (zero grand total or a
        zero expected count anywhere in the table).
        """
        if isinstance(pmmlNode, (pmml.CountTable, pmml.NormalizedCountTable, pmml.FieldValue)):
            output = 0.
            for child in pmmlNode:
                subchi2 = self._chiSquareIndependence_chi2(child, fieldValues + [child.attrib["value"]], totals)
                if subchi2 is None:
                    return None  # propagate "undefined" upward
                output += subchi2
            return output

        elif isinstance(pmmlNode, pmml.FieldValueCount):
            observed = pmmlNode.attrib["count"]

            if totals[None] == 0:
                return None
            else:
                # NormalizedCountTable entries are rescaled to the real sample size
                if isinstance(self.countTable, pmml.NormalizedCountTable):
                    scale = self.countTable.attrib["sample"]/totals[None]
                else:
                    scale = 1.

                # expected cell count under independence: product of the
                # marginals divided by total^(numberOfFields - 1)
                expected = 1./(totals[None] * scale)**(len(self.fields) - 1)
                for f, v in zip(self.fields, fieldValues):
                    expected *= (totals[f][v] * scale)

                if expected == 0.:
                    return None
                else:
                    return (expected - (observed*scale))**2 / expected

    def scoreChiSquareIndependence(self, syncNumber, get):
        """Score one event with a chiSquareIndependence testStatistic.

        This reads from the multi-dimensional CountTable in PMML and ignores
        the data!  Data are only used to make the CountTable, so be sure to
        be running the producer if you want chiSquareIndependence.
        """
        self.resetLoggerLevels()

        # expect a CountTable (if it doesn't exist, the producer will make it)
        self.countTable = self.baseline.child()
        if not isinstance(self.countTable, (pmml.CountTable, pmml.NormalizedCountTable)):
            self.lastScore = INVALID  # the "first" time doesn't happen until we see a count table
            return self.lastScore

        # walk down the nested table collecting one field name per level,
        # stopping at the leaf (FieldValueCount)
        self.fields = []
        dimension = self.countTable.child(pmml.nonExtension)
        while True:
            self.fields.append(dimension.attrib["field"])
            if isinstance(dimension, pmml.FieldValueCount):
                break
            dimension = dimension.child(pmml.nonExtension)

        totals = {None: 0.}
        for f in self.fields:
            totals[f] = {}

        # every time: add up the n-field margins (which are "rows and columns" in 2-field case)
        self._chiSquareIndependence_add(self.countTable, [], totals)

        chi2 = self._chiSquareIndependence_chi2(self.countTable, [], totals)

        # ndf = product over fields of (number of observed categories - 1)
        ndf = 1
        for f, tot in totals.items():
            if f is not None:
                ndf *= (len(tot) - 1)

        if chi2 is not None and ndf > 0:
            probability = chiSquare_cdf(chi2, ndf)
            pValue = 1. - probability
            self.lastScore = {SCORE_predictedValue: probability, SCORE_pValue: pValue, SCORE_chiSquare: chi2, SCORE_degreesOfFreedom: ndf}
        else:
            self.lastScore = INVALID
            self.logger.debug("scoreChiSquareIndependence: returning INVALID score")
        return self.lastScore

    ######################################## ODG-extension: GLR

    def _scoreGLR_GaussianDistribution(self, s, N):
        """GLR term for a Gaussian baseline: (s - N*mean)^2 / N, where s is
        presumably a cumulative sum over N events (maximized over candidate
        change points by self.updator.glr -- confirm in eventweighting.py)."""
        return (s - N*self.baseline.attrib["mean"])**2 / N

    def _scoreGLR_PoissonDistribution(self, s, N):
        """GLR term for a Poisson baseline; the s*log(s/N) term is dropped
        when s == 0 (its limit as s -> 0+ is zero)."""
        if s > 0.:
            return -math.log(self.baseline.attrib["mean"])*s + math.log(s/N)*s + N*self.baseline.attrib["mean"] - s
        else:
            return -math.log(self.baseline.attrib["mean"])*s + N*self.baseline.attrib["mean"] - s

    def scoreGLR(self, syncNumber, get):
        """Score one event with a GLR testStatistic.

        Output is the *current* best-guess of the turn-around time (as the
        corresponding syncNumber) and its log-likelihood ratio.
        """
        self.resetLoggerLevels()

        # Eq. 2.4.40 in Basseville and Nikiforov: http://www.irisa.fr/sisthem/kniga/ (partly in eventweighting.py)

        value = get(self.field)
        if value is not INVALID and value is not MISSING:
            self.updator.increment(syncNumber, value)

        if isinstance(self.baseline, pmml.GaussianDistribution):
            maximum_syncNumber, maximum = self.updator.glr(self._scoreGLR_GaussianDistribution)

            if maximum is None or self.baseline.attrib["variance"] < 0.:
                self.lastScore = INVALID
                self.logger.debug("scoreGLR: returning INVALID score")
            elif self.baseline.attrib["variance"] == 0.:
                # zero variance: any deviation is infinitely significant
                self.lastScore = {SCORE_predictedValue: float("inf"), SCORE_thresholdTime: maximum_syncNumber}
            else:
                self.lastScore = {SCORE_predictedValue: maximum/2./self.baseline.attrib["variance"], SCORE_thresholdTime: maximum_syncNumber}
            return self.lastScore

        elif isinstance(self.baseline, pmml.PoissonDistribution):
            maximum_syncNumber, maximum = self.updator.glr(self._scoreGLR_PoissonDistribution)

            if maximum is None:
                self.lastScore = INVALID
                self.logger.debug("scoreGLR: returning INVALID score")
            else:
                self.lastScore = {SCORE_predictedValue: maximum, SCORE_thresholdTime: maximum_syncNumber}
            return self.lastScore