class ConsumerBaselineModel(ConsumerAlgorithm):
    CHISQUAREDISTRIBUTION = Atom("chiSquareDistribution")
    SCALARPRODUCT = Atom("scalarProduct")
    INDEPENDENT = Atom("Independent")
    SIZEWEIGHTED = Atom("SizeWeighted")

    def initialize(self):
        """Initialize a baseline consumer.

        Unlike other consumers, this creates the score function
        dynamically, depending on the type of testStatistic.
        """
        testDistributions = self.segmentRecord.pmmlModel.child(pmml.TestDistributions)
        self.field = testDistributions.attrib["field"]
        testStatistic = testDistributions.attrib["testStatistic"]

        # Updating can be configured in the Augustus configuration file and in the
        # "windowSize" attribute in this segment.  Assume that the "windowSize"
        # attribute can override CUSUM and GLR only (the only other baseline
        # consumer that has an intermediate state is chiSquareDistribution, which
        # only makes sense as UpdateScheme("synchronized"), and that one depends
        # on the configuration of syncNumber, not on PMML).

        # the general case:
        self.updateScheme = self.engine.consumerUpdateScheme
        # the special case:
        if testStatistic in ("CUSUM", "GLR"):
            if "windowSize" in testDistributions.attrib and testDistributions.attrib["windowSize"] != 0:
                self.updateScheme = UpdateScheme("window",
                                                 windowSize=testDistributions.attrib["windowSize"],
                                                 windowLag=0)

        if testStatistic == "CUSUM":
            self.baseline = testDistributions.child(pmml.Baseline).child()
            self.alternate = testDistributions.child(pmml.Alternate).child()
            self.updator = self.updateScheme.updator(CUSUM)
            self.updator.resetValue = testDistributions.attrib["resetValue"]
            self.score = self.scoreCUSUM

            extension = testDistributions.child(pmml.Extension, exception=False)
            if extension is not None:
                init = extension.child(pmml.X_ODG_CUSUMInitialization, exception=False)
                if init is not None:
                    self.updator.initialize({CUSUM: [init.attrib["value"]]})

            self.pseudoField = self.field
            self.pseudoOutputAll = True

        elif testStatistic == "zValue":
            self.baseline = testDistributions.child(pmml.Baseline).child()
            if isinstance(self.baseline, pmml.GaussianDistribution):
                self.score = self.scoreZValueGaussian
            else:
                self.score = self.scoreZValue

            self.pseudoField = self.field
            self.pseudoOutputAll = True

        elif testStatistic in ("chiSquareDistribution", "scalarProduct"):
            self.updators = {}
            self.countTable = testDistributions.child(pmml.Baseline).child()

            if "weightField" in testDistributions.attrib:
                self.weightField = testDistributions.attrib["weightField"]
            else:
                self.weightField = None

            if "normalizationScheme" not in testDistributions.attrib:
                self.normalizationScheme = None
            elif testDistributions.attrib["normalizationScheme"] == "Independent":
                self.normalizationScheme = self.INDEPENDENT
            elif testDistributions.attrib["normalizationScheme"] == "SizeWeighted":
                self.normalizationScheme = self.SIZEWEIGHTED

            self.testStatistic = {"chiSquareDistribution": self.CHISQUAREDISTRIBUTION,
                                  "scalarProduct": self.SCALARPRODUCT,
                                  }[testStatistic]

            self.score = self.scoreHistogram
            self.pseudoField = (self.field, self.weightField)
            self.pseudoOutputAll = False

            # ODG extensions
            self.binsOfInterest = testDistributions.descendant(pmml.X_ODG_BinsOfInterest,
                                                               exception=False)

        elif testStatistic == "chiSquareIndependence":
            self.baseline = testDistributions.child(pmml.Baseline)
            self.fields = None
            self.countTable = None
            self.score = self.scoreChiSquareIndependence

            self.pseudoField = None
            self.pseudoOutputAll = True

        # ODG extensions
        elif testStatistic == "GLR":
            self.baseline = testDistributions.child(pmml.Baseline).child()
            if not isinstance(self.baseline, (pmml.GaussianDistribution, pmml.PoissonDistribution)):
                raise NotImplementedError, "GLR has only been implemented for Gaussian and Poisson distributions"
            self.updator = self.updateScheme.updator(GLR)
            self.score = self.scoreGLR

            self.pseudoField = self.field
            self.pseudoOutputAll = False

    ######################################## CUSUM

    def scoreCUSUM(self, syncNumber, get):
        """Score one event with a CUSUM testStatistic."""
        value = get(self.field)
        if value is INVALID or value is MISSING:
            return INVALID

        self.updator.increment(syncNumber,
                               self.alternate.logpdf(value) - self.baseline.logpdf(value))
        return {SCORE_predictedValue: self.updator.cusum()}

    ######################################## zValue

    def scoreZValueGaussian(self, syncNumber, get):
        """Score one event with a zValue testStatistic (Gaussian)."""
        value = get(self.field)
        if value is INVALID or value is MISSING:
            return INVALID

        if self.baseline.attrib["variance"] == 0.:
            return {SCORE_predictedValue: float("inf"), SCORE_pValue: 0.}
        elif self.baseline.attrib["variance"] < 0.:
            return INVALID

        zValue = (value - self.baseline.attrib["mean"]) / math.sqrt(self.baseline.attrib["variance"])
        probability = self.baseline.cdf(value)
        pValue = 1. - 2. * abs(probability - 0.5)

        return {SCORE_predictedValue: zValue, SCORE_pValue: pValue}

    def scoreZValue(self, syncNumber, get):
        """Score one event with a zValue testStatistic (non-Gaussian)."""
        value = get(self.field)
        if value is INVALID or value is MISSING:
            return INVALID

        probability = self.baseline.cdf(value)
        if probability <= 1e-16:
            zValue = -10.
        elif probability >= 1. - 1e-16:
            zValue = 10.
        else:
            zValue = math.sqrt(2.) * erfinv(2. * probability - 1.)
        pValue = 1. - 2. * abs(probability - 0.5)

        return {SCORE_predictedValue: zValue, SCORE_pValue: pValue}

    ######################################## chiSquareDistribution and scalarProduct

    def scoreHistogram(self, syncNumber, get):
        """Score one event with a chiSquareDistribution or scalarProduct."""
        value = get(self.field)
        if self.weightField is None:
            weight = 1.
        else:
            weight = get(self.weightField)

        # we can still calculate the consistency of the *accumulated*
        # distribution, even if this datapoint is invalid
        if value is INVALID or value is MISSING or weight is INVALID or weight is MISSING:
            pass
        else:
            # for histograms, increment all bins, but only the correct bin gets a non-zero value
            found = False
            for bin, updator in self.updators.items():
                if bin == value:
                    updator.increment(syncNumber, weight)
                    found = True
                else:
                    updator.increment(syncNumber, 0.)

            # this might be a new bin
            if not found:
                updator = self.updateScheme.updator(SUMX)
                updator.increment(syncNumber, weight)
                self.updators[value] = updator

        fieldValueCounts = self.countTable.matches(pmml.FieldValueCount, maxdepth=None)

        # chiSquareDistribution
        if self.testStatistic == self.CHISQUAREDISTRIBUTION:
            expectedTotal = 0.
            expectedValues = {}
            for fieldValueCount in fieldValueCounts:
                bin = fieldValueCount.attrib["value"]
                count = fieldValueCount.attrib["count"]
                expectedTotal += count
                expectedValues[bin] = count

            observedTotal = 0.
            for bin, updator in self.updators.items():
                observedTotal += updator.sum()

            if expectedTotal <= 0. or observedTotal <= 0. or \
               (isinstance(self.countTable, pmml.NormalizedCountTable) and
                self.countTable.attrib["sample"] <= 0.):
                return INVALID

            chi2 = 0.
            if self.binsOfInterest is None:
                ndf = -1  # normalization removes one degree of freedom
            else:
                ndf = 0

            for bin in set(expectedValues.keys()).union(set(self.updators.keys())):
                if self.binsOfInterest is not None:
                    if bin not in self.binsOfInterest:
                        continue

                expected = expectedValues.get(bin, 0.)
                updator = self.updators.get(bin, None)
                if updator is not None:
                    observed = updator.sum()
                else:
                    observed = 0.

                if expected > 0. or observed > 0.:
                    if isinstance(self.countTable, pmml.CountTable):
                        chi2 += (expected / expectedTotal - observed / observedTotal)**2 / \
                                (expected / expectedTotal**2 + observed / observedTotal**2)
                    elif isinstance(self.countTable, pmml.NormalizedCountTable):
                        sample = self.countTable.attrib["sample"]
                        chi2 += (expected / expectedTotal - observed / observedTotal)**2 / \
                                (expected / expectedTotal / sample + observed / observedTotal**2)
                    ndf += 1

            if ndf > 0:
                probability = chiSquare_cdf(chi2, ndf)
                pValue = 1. - probability
                return {SCORE_predictedValue: probability,
                        SCORE_pValue: pValue,
                        SCORE_chiSquare: chi2,
                        SCORE_degreesOfFreedom: ndf}
            else:
                return INVALID

        # scalarProduct
        elif self.testStatistic == self.SCALARPRODUCT:
            expectedNorm2 = 0.
            dotProduct = 0.
            for fieldValueCount in fieldValueCounts:
                expected = fieldValueCount.attrib["count"]
                expectedNorm2 += expected**2

                bin = fieldValueCount.attrib["value"]
                if expected > 0. and bin in self.updators:
                    observed = self.updators[bin].sum()
                    dotProduct += expected * observed

            observedNorm2 = 0.
            for updator in self.updators.values():
                observed = updator.sum()
                observedNorm2 += observed**2

            if expectedNorm2 > 0. and observedNorm2 > 0.:
                if self.normalizationScheme is None:
                    return {SCORE_predictedValue: dotProduct}

                elif self.normalizationScheme is self.INDEPENDENT:
                    if expectedNorm2 <= 0. or observedNorm2 <= 0.:
                        return INVALID
                    return {SCORE_predictedValue:
                            dotProduct / math.sqrt(expectedNorm2) / math.sqrt(observedNorm2)}

                elif self.normalizationScheme is self.SIZEWEIGHTED:
                    if expectedNorm2 + observedNorm2 <= 0.:
                        return INVALID
                    return {SCORE_predictedValue:
                            2. * dotProduct / (expectedNorm2 + observedNorm2)}
            else:
                return INVALID

    ######################################## chiSquareIndependence

    def _chiSquareIndependence_add(self, pmmlNode, fieldValues, totals):
        if isinstance(pmmlNode, (pmml.CountTable, pmml.NormalizedCountTable, pmml.FieldValue)):
            for child in pmmlNode:
                self._chiSquareIndependence_add(child, fieldValues + [child.attrib["value"]], totals)

        elif isinstance(pmmlNode, pmml.FieldValueCount):
            count = pmmlNode.attrib["count"]
            totals[None] += count
            for f, v in zip(self.fields, fieldValues):
                if v not in totals[f]:
                    totals[f][v] = 0.
                totals[f][v] += count

    def _chiSquareIndependence_chi2(self, pmmlNode, fieldValues, totals):
        if isinstance(pmmlNode, (pmml.CountTable, pmml.NormalizedCountTable, pmml.FieldValue)):
            output = 0.
            for child in pmmlNode:
                subchi2 = self._chiSquareIndependence_chi2(child, fieldValues + [child.attrib["value"]], totals)
                if subchi2 is None:
                    return None
                output += subchi2
            return output

        elif isinstance(pmmlNode, pmml.FieldValueCount):
            observed = pmmlNode.attrib["count"]

            if totals[None] == 0:
                return None
            else:
                if isinstance(self.countTable, pmml.NormalizedCountTable):
                    scale = self.countTable.attrib["sample"] / totals[None]
                else:
                    scale = 1.

                expected = 1. / (totals[None] * scale)**(len(self.fields) - 1)
                for f, v in zip(self.fields, fieldValues):
                    expected *= (totals[f][v] * scale)

                if expected == 0.:
                    return None
                else:
                    return (expected - (observed * scale))**2 / expected

    def scoreChiSquareIndependence(self, syncNumber, get):
        """Score one event with a chiSquareIndependence testStatistic.

        This reads from the multi-dimensional CountTable in PMML and
        ignores the data!  Data are only used to make the CountTable,
        so be sure to be running the producer if you want
        chiSquareIndependence.
        """
        # expect a CountTable (if it doesn't exist, the producer will make it)
        self.countTable = self.baseline.child()
        if not isinstance(self.countTable, (pmml.CountTable, pmml.NormalizedCountTable)):
            return INVALID  # the "first" time doesn't happen until we see a count table

        self.fields = []
        dimension = self.countTable.child(pmml.nonExtension)
        while True:
            self.fields.append(dimension.attrib["field"])
            if isinstance(dimension, pmml.FieldValueCount):
                break
            dimension = dimension.child(pmml.nonExtension)

        totals = {None: 0.}
        for f in self.fields:
            totals[f] = {}

        # every time: add up the n-field margins (which are "rows and columns" in the 2-field case)
        self._chiSquareIndependence_add(self.countTable, [], totals)

        chi2 = self._chiSquareIndependence_chi2(self.countTable, [], totals)

        ndf = 1
        for f, tot in totals.items():
            if f is not None:
                ndf *= (len(tot) - 1)

        if chi2 is not None and ndf > 0:
            probability = chiSquare_cdf(chi2, ndf)
            pValue = 1. - probability
            return {SCORE_predictedValue: probability,
                    SCORE_pValue: pValue,
                    SCORE_chiSquare: chi2,
                    SCORE_degreesOfFreedom: ndf}
        else:
            return INVALID

    ######################################## ODG-extension: GLR

    def _scoreGLR_GaussianDistribution(self, s, N):
        return (s - N * self.baseline.attrib["mean"])**2 / N

    def _scoreGLR_PoissonDistribution(self, s, N):
        if s > 0.:
            return -math.log(self.baseline.attrib["mean"]) * s + math.log(s / N) * s + \
                   N * self.baseline.attrib["mean"] - s
        else:
            return -math.log(self.baseline.attrib["mean"]) * s + N * self.baseline.attrib["mean"] - s

    def scoreGLR(self, syncNumber, get):
        """Score one event with a GLR testStatistic.

        Output is the *current* best-guess of the turn-around time (as
        the corresponding syncNumber) and its log-likelihood ratio.
        """
        # Eq. 2.4.40 in Basseville and Nikiforov: http://www.irisa.fr/sisthem/kniga/
        # (partly in eventweighting.py)

        value = get(self.field)
        if value is not INVALID and value is not MISSING:
            self.updator.increment(syncNumber, value)

        if isinstance(self.baseline, pmml.GaussianDistribution):
            maximum_syncNumber, maximum = self.updator.glr(self._scoreGLR_GaussianDistribution)

            if maximum is None or self.baseline.attrib["variance"] < 0.:
                return INVALID
            elif self.baseline.attrib["variance"] == 0.:
                return {SCORE_predictedValue: float("inf"),
                        SCORE_thresholdTime: maximum_syncNumber}
            else:
                return {SCORE_predictedValue: maximum / 2. / self.baseline.attrib["variance"],
                        SCORE_thresholdTime: maximum_syncNumber}

        elif isinstance(self.baseline, pmml.PoissonDistribution):
            maximum_syncNumber, maximum = self.updator.glr(self._scoreGLR_PoissonDistribution)

            if maximum is None:
                return INVALID
            else:
                return {SCORE_predictedValue: maximum,
                        SCORE_thresholdTime: maximum_syncNumber}
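# Hedged sketch of the zValue arithmetic used by scoreZValue above: a CDF
# probability is mapped to a z-value via the inverse error function and to a
# two-sided p-value.  erfinv is assumed to live in augustus.core.extramath
# (the module this codebase uses for its other math helpers); wrapped in a
# function so it stays inert at import time.
def _demoZValueArithmetic(probability=0.975):
    import math
    from augustus.core.extramath import erfinv
    zValue = math.sqrt(2.) * erfinv(2. * probability - 1.)  # ~1.96 for p = 0.975
    pValue = 1. - 2. * abs(probability - 0.5)               # 0.05: two-sided tail area
    return zValue, pValue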
                                         augustus.algorithms.trees.ProducerTreeModel,
    (pmml.RuleSetModel, "iterative"):    augustus.algorithms.trees.ProducerIterative,
    (pmml.RuleSetModel, "c45"):          augustus.algorithms.trees.ProducerC45,
    (pmml.RuleSetModel, "cart"):         augustus.algorithms.trees.ProducerCART,
    (pmml.RegressionModel, "streaming"): augustus.algorithms.regression.ProducerRegressionModel,
    (pmml.NaiveBayesModel, "streaming"): augustus.algorithms.naivebayes.ProducerNaiveBayesModel,
    }

########################################################### for faster segment lookup

MATCHRANGES = Atom("MatchRanges")

def __matchesPartition(matcher, partition):
    for bound, comparator in partition:
        if bound is not None and not comparator(matcher, bound):
            return False
    return True

_segmentHelpers = NameSpace(
    lessThan=lambda x, val: x < val,
    lessOrEqual=lambda x, val: x <= val,
    greaterThan=lambda x, val: x > val,
    greaterOrEqual=lambda x, val: x >= val,
    isCompoundAnd=lambda x: isinstance(x, pmml.CompoundPredicate) and
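# Hedged illustration of the partition test above: __matchesPartition ANDs a
# list of (bound, comparator) pairs, skipping None bounds.  Values invented.
def _demoMatchesPartition():
    partition = [(5.0, _segmentHelpers.greaterOrEqual),   # value >= 5.0
                 (10.0, _segmentHelpers.lessThan),        # value < 10.0
                 (None, _segmentHelpers.lessThan)]        # None bound: always passes
    return (__matchesPartition(7.2, partition),    # True: 7.2 is inside [5, 10)
            __matchesPartition(12.0, partition))   # False: fails value < 10.0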
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Represents a segment, maintains the producer and consumer
algorithms for this segment, and has pointers to everything relevant.
Maintained by Engine."""

from augustus.algorithms.eventweighting import COUNT
import augustus.core.pmml41 as pmml
from augustus.core.defs import Atom, IMMATURE, MATURE, LOCKED, UNINITIALIZED

SELECTFIRST = Atom("SelectFirst")
SELECTALL = Atom("SelectAll")
SELECTONLY = Atom("SelectOnly")

########################################################### SegmentRecord

class SegmentNameRegistry:
    """Gives new segments IDs if they don't already have one and
    enforces uniqueness of IDs.  Only one object of this class should
    exist.

    Maintains a double-referenced lookup table (dict <-> dict).
    """

    def __init__(self):
        """Called by SegmentRecord (the class, not an instance) when Python starts up."""
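# The "double-referenced lookup table (dict <-> dict)" that the docstring
# describes is not shown in this excerpt.  A minimal sketch of the idea, with
# invented names -- not the actual SegmentNameRegistry implementation:
class _DoubleLookupSketch:
    def __init__(self):
        self.idToSegment = {}   # forward map: segment ID -> segment
        self.segmentToId = {}   # reverse map: id(segment) -> segment ID

    def register(self, segmentId, segment):
        if segmentId in self.idToSegment:
            raise ValueError("segment ID %r is already taken" % segmentId)
        self.idToSegment[segmentId] = segment
        self.segmentToId[id(segment)] = segmentId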
class Feature:
    CATEGORICAL = Atom("Categorical")
    CONTINUOUS = Atom("Continuous")
    ORDINALSTRING = Atom("OrdinalString")

    STRING = Atom("String")
    INTEGER = Atom("Integer")
    FLOAT = Atom("Float")

    def __init__(self, name, optype, dataType, producerUpdateScheme):
        self.name = name

        if optype == "categorical":
            self.values = set()
            self.optype = self.CATEGORICAL
            self.dataType = self.STRING if dataType == "string" else dataType
            self.mature = False
            self.maturityCounter = 0

        elif optype == "continuous":
            self.updator = producerUpdateScheme.updator(SUM1, SUMX, SUMXX)
            self.optype = self.CONTINUOUS
            self.dataType = {"integer": self.INTEGER,
                             "float": self.FLOAT,
                             "double": self.FLOAT}.get(dataType, dataType)
            self.mature = False
            self.maturityCounter = 0

        else:
            self.values = map(optype, optype.values)
            self.optype = self.ORDINALSTRING
            self.dataType = self.STRING if dataType == "string" else dataType
            self.mature = True

    def increment(self, syncValue, get):
        value = get(self.name)
        if value is not INVALID and value is not MISSING:
            if self.optype is self.CATEGORICAL:
                self.values.add(value)

                if self.maturityCounter < self.maturityThreshold:
                    self.maturityCounter += 1
                else:
                    self.mature = True

            elif self.optype is self.CONTINUOUS:
                self.updator.increment(syncValue, value)

                if self.maturityCounter < self.maturityThreshold:
                    self.maturityCounter += 1
                else:
                    self.mature = True

    def randomSplit(self):
        if self.optype is self.CATEGORICAL:
            return SplitEqual(self.name, random.choice(tuple(self.values)))

        elif self.optype is self.CONTINUOUS:
            if self.dataType is self.INTEGER:
                return SplitGreaterThan(self.name,
                                        int(round(random.gauss(self.updator.mean(),
                                                               math.sqrt(self.updator.variance())))))
            else:
                return SplitGreaterThan(self.name,
                                        random.gauss(self.updator.mean(),
                                                     math.sqrt(self.updator.variance())))

        elif self.optype is self.ORDINALSTRING:
            return SplitGreaterThan(self.name, random.choice(tuple(self.values)))
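# Hedged sketch of the continuous branch of randomSplit above: a candidate
# threshold is drawn from a Gaussian whose mean and variance are the feature's
# running statistics.  The moments here are invented for illustration.
def _demoContinuousSplit(runningMean=10.0, runningVariance=4.0):
    import math, random
    threshold = random.gauss(runningMean, math.sqrt(runningVariance))
    return threshold  # e.g. proposes "x > 11.3" as a candidate split for the tree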
class ProducerTreeModel(ProducerAlgorithm):
    TREEMODEL = Atom("TreeModel")
    RULESETMODEL = Atom("RuleSetModel")

    def initialize(self, **params):
        """An event-based tree-producing algorithm.

        Although it does not iterate over the data as the standard
        CART algorithm does, it converges to an approximate tree by
        keeping alternate hypotheses in mind and collecting data for
        all active hypotheses.
        """
        if "resume" in params:
            self.resume = int(params["resume"])
            del params["resume"]
        else:
            self.resume = False

        if self.resume:
            raise NotImplementedError, "Updating from existing TreeModels/RuleSetModels not implemented; use mode='replaceExisting'"

        if "featureMaturityThreshold" in params:
            self.featureMaturityThreshold = int(params["featureMaturityThreshold"])
            del params["featureMaturityThreshold"]
        else:
            self.featureMaturityThreshold = 10

        if "splitMaturityThreshold" in params:
            self.splitMaturityThreshold = int(params["splitMaturityThreshold"])
            del params["splitMaturityThreshold"]
        else:
            self.splitMaturityThreshold = 30

        if "trialsToKeep" in params:
            self.trialsToKeep = int(params["trialsToKeep"])
            del params["trialsToKeep"]
        else:
            self.trialsToKeep = 50

        if "worldsToSplit" in params:
            self.worldsToSplit = int(params["worldsToSplit"])
            del params["worldsToSplit"]
        else:
            self.worldsToSplit = 3

        if "treeDepth" in params:
            self.treeDepth = int(params["treeDepth"])
            del params["treeDepth"]
        else:
            self.treeDepth = 3

        if "classification" in params:
            # the classification is a field name, so no int() cast here
            self.classification = params["classification"]
            del params["classification"]
        else:
            self.classification = None

        self.model = self.segmentRecord.pmmlModel

        if isinstance(self.model, pmml.TreeModel):
            self.modelType = self.TREEMODEL
            self.nodeIndex = self.model.index(pmml.Node)

        elif isinstance(self.model, pmml.RuleSetModel):
            self.ruleSet = self.model.child(pmml.RuleSet)
            self.modelType = self.RULESETMODEL
            self.nodeIndex = self.ruleSet.index(lambda x: isinstance(x, (pmml.SimpleRule, pmml.CompoundRule)),
                                                exception=False)
            if self.nodeIndex is None:
                self.nodeIndex = len(self.ruleSet.children)
                self.ruleSet.children.append(None)

        self.features = []
        self.predicted = []
        for miningField in self.model.child(pmml.MiningSchema).matches(pmml.MiningField):
            name = miningField.attrib["name"]
            usageType = miningField.attrib.get("usageType", "active")
            if usageType == "active":
                dataType = self.model.dataContext.dataType[name]
                optype = self.model.dataContext.optype[name]
                if optype == "ordinal" and dataType == "string":
                    optype = self.model.dataContext.cast[name]

                feature = Feature(name, optype, dataType, self.engine.producerUpdateScheme)
                feature.maturityThreshold = self.featureMaturityThreshold
                self.features.append(feature)

            if usageType == "predicted":
                self.predicted.append(name)

        if len(self.predicted) == 0:
            self.classification = INVALID
        else:
            if self.classification is None:
                # by default, take the first 'predicted' feature
                self.classification = self.predicted[0]
            else:
                if self.classification not in self.predicted:
                    raise RuntimeError, "Classification feature not found among the 'predicted' features in the decision tree's MiningSchema%s" % self.model.child(pmml.MiningSchema).fileAndLine()

        self.topWorld = World(0, None)
        self.counts = {}

    def update(self, syncNumber, get):
        if self.classification is INVALID:
            raise RuntimeError, "Cannot produce a decision tree with no 'predicted' features in the MiningSchema%s" % self.model.child(pmml.MiningSchema).fileAndLine()

        values = [get(feature.name) for feature in self.features]
        if INVALID in values or MISSING in values:
            return False

        classification = get(self.classification)
        if classification not in self.counts:
            self.counts[classification] = 0
        self.counts[classification] += 1

        bestClassification = None
        bestCount = None
        for c, count in self.counts.items():
            if bestClassification is None or count > bestCount:
                bestClassification = c
                bestCount = count

        matureFeatures = []
        for feature in self.features:
            feature.increment(syncNumber, get)
            if feature.mature:
                matureFeatures.append(feature)

        if len(matureFeatures) > 0:
            self.topWorld.increment(syncNumber, get, classification, matureFeatures, self)

        if self.modelType is self.TREEMODEL:
            self.topWorld.bestTree(self.model, bestClassification, self)
        elif self.modelType is self.RULESETMODEL:
            self.topWorld.bestRule(self.ruleSet, bestClassification, self)

        return True
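# Standalone illustration of the majority-vote bookkeeping in update() above:
# self.counts is a running histogram of class labels, and the label with the
# highest count becomes the default prediction.  Labels invented.
def _demoMajorityVote(labels=("ok", "fraud", "ok", "ok")):
    counts = {}
    for classification in labels:
        if classification not in counts:
            counts[classification] = 0
        counts[classification] += 1

    bestClassification, bestCount = None, None
    for c, count in counts.items():
        if bestClassification is None or count > bestCount:
            bestClassification, bestCount = c, count
    return bestClassification  # "ok" (3 of 4 events)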
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Defines the way all consumer algorithms store their states, and
how events are weighted or blended.  Can be expanded to handle model
production in a parallelized system."""

import numpy
import numpy.linalg

from augustus.core.defs import Atom, INVALID
from augustus.core.extramath import MINFLOAT

########################################################### Atoms

COUNT = Atom("Count")
SUM1 = Atom("Sum1")
SUMX = Atom("SumX")
SUMXX = Atom("SumXX")
RUNMEAN = Atom("RunMean")
RUNSN = Atom("RunSN")
MIN = Atom("Min")
MAX = Atom("Max")
CUSUM = Atom("CUSUM")
GLR = Atom("GLR")

class COVARIANCE(Atom):
    """Atom (isotope?) for covariance calculations.

    The dimension of this object depends on the data, and is given at
    initialization."""

    def __init__(self, dimension):
        self.name = "Covariance"
        self.dimension = dimension
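# Hedged usage sketch for these atoms, based on how producers elsewhere in
# this codebase call them (updateScheme is assumed to be an UpdateScheme as
# used there): statistics are requested by atom, and the same atoms key the
# updator's internal state.
def _demoUpdatorUsage(updateScheme, syncNumber, value):
    updator = updateScheme.updator(SUM1, SUMX, SUMXX)  # request three statistics
    updator.increment(syncNumber, value)               # blend in one event
    return updator.mean(), updator.variance()          # read back running moments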
class OutputWriter(object):
    """Writes all scoring output.

    Opened at the beginning of a job, written to with each
    event/pseudoevent, and closed at the end of a job.
    """

    XML = Atom("xml")
    JSON = Atom("json")
    identifier = None
    address = None

    def __init__(self, fileName, mode="xml", reportName=None, pmmlFileName=None,
                 eventName="Event", pseudoEventName="pseudoEvent", segmentName="Segment",
                 segmentExpressionName="SegmentExpression", outputName="Output",
                 matchingSegmentsName="MatchingSegments"):
        """Create an OutputWriter with specified tag names."""
        self.fileName = fileName
        if mode == "xml":
            self.mode = self.XML
        elif mode == "json":
            self.mode = self.JSON
        else:
            raise NotImplementedError("Only 'xml' and 'json' output modes have been implemented")

        self.pmmlFileName = pmmlFileName
        self.reportName = reportName
        self.eventName = eventName
        self.pseudoEventName = pseudoEventName
        self.segmentName = segmentName
        self.segmentExpressionName = segmentExpressionName
        self.outputName = outputName
        self.matchingSegmentsName = matchingSegmentsName
        self.streams = []

    def open(self, append=True):
        """Open an output file for writing.

        If a reportName is given, open the outermost XML or JSON object.
        """
        if isinstance(self.fileName, basestring):
            self.ostream = codecs.open(self.fileName, "a" if append else "w", encoding="utf-8")
        else:
            self.ostream = self.fileName
            self.fileName = self.ostream.name if hasattr(self.ostream, "name") else "Untitled"
        self.streams.append(self.ostream)

        if self.reportName is not None:
            if self.mode is self.XML:
                label = dict(timestamp=datetime.datetime.now())
                if self.pmmlFileName is not None:
                    label["model"] = self.pmmlFileName
                label = " ".join(["%s=%s" % (k, quoteattr(str(v))) for k, v in label.iteritems()])
                for stream in self.streams:
                    stream.write("<%s %s>%s" % (self.reportName, label, os.linesep))

            elif self.mode is self.JSON:
                for stream in self.streams:
                    stream.write("{\"%s\": [%s" % (self.reportName, os.linesep))
                self.needsComma = False

    def write(self, outputRecord, eventTags=None, eventName=None, pseudoEventName=None,
              segmentName=None, segmentExpressionName=None, outputName=None,
              matchingSegmentsName=None):
        """Write one record to the output file."""
        if outputRecord is None:
            return

        if isinstance(outputRecord, basestring):
            for stream in self.streams:
                stream.write(outputRecord)
            return

        if self.identifier is not None:
            if eventTags is None:
                eventTags = []
            eventTags.append(("id", self.identifier))

        if self.address is not None:
            if eventTags is None:
                eventTags = []
            eventTags.append(("address", self.address))

        if self.mode is self.XML:
            for stream in self.streams:
                stream.write(outputRecord.xml(
                    eventTags,
                    self.eventName if eventName is None else eventName,
                    self.pseudoEventName if pseudoEventName is None else pseudoEventName,
                    self.segmentName if segmentName is None else segmentName,
                    self.segmentExpressionName if segmentExpressionName is None else segmentExpressionName,
                    self.outputName if outputName is None else outputName,
                    self.matchingSegmentsName if matchingSegmentsName is None else matchingSegmentsName))
                stream.write(os.linesep)

        elif self.mode is self.JSON:
            if self.reportName is not None:
                if self.needsComma:
                    for stream in self.streams:
                        stream.write(",")
                        stream.write(os.linesep)

            for stream in self.streams:
                stream.write(outputRecord.json(
                    eventTags,
                    self.eventName if eventName is None else eventName,
                    self.pseudoEventName if pseudoEventName is None else pseudoEventName,
                    self.segmentName if segmentName is None else segmentName,
                    self.segmentExpressionName if segmentExpressionName is None else segmentExpressionName,
                    self.outputName if outputName is None else outputName,
                    self.matchingSegmentsName if matchingSegmentsName is None else matchingSegmentsName))

            if self.reportName is not None:
                self.needsComma = True
            else:
                for stream in self.streams:
                    stream.write(os.linesep)

        for stream in self.streams:
            if hasattr(stream, "finishedEvent"):
                stream.finishedEvent()

    def close(self):
        """Close the output record.

        If a reportName is given, close the outermost XML or JSON object.
        """
        if self.reportName is not None:
            if self.mode is self.XML:
                for stream in self.streams:
                    stream.write("</%s>%s" % (self.reportName, os.linesep))
            elif self.mode is self.JSON:
                for stream in self.streams:
                    stream.write("]}")
                    stream.write(os.linesep)
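# Hedged usage sketch: file and tag names are illustrative, and scored
# records normally come from the engine (write() expects them to provide
# .xml()/.json() methods); raw strings are passed through verbatim.
def _demoOutputWriter():
    writer = OutputWriter("scores.xml", mode="xml", reportName="Report",
                          pmmlFileName="model.pmml")
    writer.open(append=False)   # writes the opening <Report ...> tag
    writer.write("<!-- a raw string is written as-is -->")
    writer.close()              # writes the closing </Report> tag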
class ProducerKMeans(ProducerAlgorithm):
    """The standard k-means clustering algorithm."""

    SYNCNUMBER = Atom("SyncNumber")
    RANDOM_DATAPOINTS = Atom("Random_DataPoints")
    RANDOM_DATAWEIGHTED = Atom("Random_DataWeighted")
    RANDOM_DATACOVARIANCE = Atom("Random_DataCovariance")
    RANDOM_UNITRECT = Atom("Random_UnitRect")

    defaultParams = {"updateExisting": "false", "quickConvergeSteps": "()",
                     "numberOfClusters": "unset", "seedSource": "dataPoints",
                     "numberOfTrials": "20", "numberToConverge": "5",
                     "maxIterations": "unset", "closeEnough": "0"}

    def initialize(self, **params):
        if "updateExisting" in params:
            self.updateExisting = pmml.boolCheck(params["updateExisting"])
            del params["updateExisting"]
            if self.updateExisting:
                raise NotImplementedError("Updating from existing ClusterModels using 'kmeans' not implemented; use mode='replaceExisting'")
        else:
            self.updateExisting = pmml.boolCheck(self.defaultParams["updateExisting"])

        if "quickConvergeSteps" in params:
            try:
                self.quickConvergeSteps = eval(params["quickConvergeSteps"])
                if not isinstance(self.quickConvergeSteps, tuple):
                    raise RuntimeError
                self.quickConvergeSteps = map(int, self.quickConvergeSteps)
            except Exception:
                raise RuntimeError("quickConvergeSteps must be a tuple of numbers of events")
            del params["quickConvergeSteps"]
        else:
            self.quickConvergeSteps = eval(self.defaultParams["quickConvergeSteps"])

        if "numberOfClusters" in params:
            self.numberOfClusters = params["numberOfClusters"]
            del params["numberOfClusters"]
        else:
            self.numberOfClusters = self.defaultParams["numberOfClusters"]
        try:
            self.numberOfClusters = int(self.numberOfClusters)
            if self.numberOfClusters <= 0:
                raise ValueError
        except ValueError:
            if self.numberOfClusters == "unset":
                self.numberOfClusters = None
            else:
                raise RuntimeError("numberOfClusters must be a positive integer or \"unset\", not \"%s\"" % self.numberOfClusters)

        if "seedSource" in params:
            self.seedSource = params["seedSource"]
            del params["seedSource"]
        else:
            self.seedSource = self.defaultParams["seedSource"]
        if self.seedSource == "dataPoints":
            self.seedSource = self.RANDOM_DATAPOINTS
        elif self.seedSource == "dataWeighted":
            self.seedSource = self.RANDOM_DATAWEIGHTED
        elif self.seedSource == "dataCovariance":
            self.seedSource = self.RANDOM_DATACOVARIANCE
        elif self.seedSource == "unitRect":
            self.seedSource = self.RANDOM_UNITRECT
        else:
            raise NotImplementedError("The seedSource must be one of 'dataPoints', 'dataWeighted', 'dataCovariance', 'unitRect'")

        if "numberOfTrials" in params:
            self.numberOfTrials = int(params["numberOfTrials"])
            del params["numberOfTrials"]
        else:
            self.numberOfTrials = int(self.defaultParams["numberOfTrials"])

        if "numberToConverge" in params:
            self.numberToConverge = int(params["numberToConverge"])
            del params["numberToConverge"]
        else:
            self.numberToConverge = int(self.defaultParams["numberToConverge"])
        if self.numberToConverge > self.numberOfTrials:
            raise RuntimeError("numberToConverge (%d) must not be greater than numberOfTrials (%d)" % (self.numberToConverge, self.numberOfTrials))

        if "maxIterations" in params:
            self.maxIterations = params["maxIterations"]
            del params["maxIterations"]
        else:
            self.maxIterations = self.defaultParams["maxIterations"]
        try:
            self.maxIterations = int(self.maxIterations)
            if self.maxIterations <= 0:
                raise ValueError
        except ValueError:
            if self.maxIterations == "unset":
                self.maxIterations = None
            else:
                raise RuntimeError("maxIterations must be a positive integer or \"unset\", not \"%s\"" % self.maxIterations)

        if "closeEnough" in params:
            self.closeEnough = float(params["closeEnough"])
            del params["closeEnough"]
        else:
            self.closeEnough = float(self.defaultParams["closeEnough"])

        self.model = self.segmentRecord.pmmlModel
        self.dataDistribution = self.engine.producerUpdateScheme.updator(COVARIANCE(self.model.numberOfFields))
        self.distanceMeasure = (self.model.child(pmml.ComparisonMeasure).attrib["kind"] == "distance")

        if self.seedSource == self.RANDOM_DATAWEIGHTED and self.model.weightField is None:
            self.seedSource = self.RANDOM_DATAPOINTS

        # get rid of any PartialSums objects, since they would be misleading
        # (this algorithm doesn't use them)
        extension = self.model.child(pmml.Extension, exception=False)
        if extension is not None:
            newChildren = []
            for child in extension.children:
                if not isinstance(child, pmml.X_ODG_PartialSums):
                    newChildren.append(child)
            extension.children = newChildren

        self.buffer = {self.SYNCNUMBER: []}
        for field in self.model.fields:
            self.buffer[field] = []
        if self.model.weightField is not None:
            self.buffer[self.model.weightField] = []

        if len(params) > 0:
            raise TypeError("Unrecognized parameters %s" % params)

    def update(self, syncNumber, get):
        self.resetLoggerLevels()
        vector = [get(field) for field in self.model.fields]
        if INVALID in vector:
            self.logger.debug("KMeansClustering.update: returning False (INVALID data)")
            return False

        if self.model.weightField is not None:
            weight = get(self.model.weightField)
            if weight is INVALID or weight is MISSING:
                self.logger.debug("KMeansClustering.update: returning False (INVALID or MISSING weight)")
                return False
            self.buffer[self.model.weightField].append(weight)

        self.buffer[self.SYNCNUMBER].append(syncNumber)
        for i, field in enumerate(self.model.fields):
            self.buffer[field].append(vector[i])

        if self.distanceMeasure and MISSING not in vector:
            self.dataDistribution.increment(syncNumber, vector)

        return True

    def produce(self):
        self.resetLoggerLevels()

        extension = self.model.child(pmml.Extension, exception=False)
        if extension is None:
            extension = pmml.Extension()
            self.model.children.append(extension)

        convergence = extension.child(pmml.X_ODG_Convergence, exception=False)
        if convergence is None:
            convergence = pmml.X_ODG_Convergence()
            extension.children.append(convergence)

        numRecords = len(self.buffer[self.SYNCNUMBER])
        if self.logDebug:
            self.logger.debug("KMeansClustering.produce: this segment has %d data records; setting up for cluster production." % numRecords)

        if numRecords == 0:
            self.logger.debug("KMeansClustering.produce: no data in this segment, so there are no clusters to produce.")
            return

        if self.numberOfClusters is not None:
            if self.numberOfClusters > numRecords:
                self.logger.info("KMeansClustering.produce: number of desired clusters (%d) exceeds number of data records (%d), reducing number of clusters to match." % (self.numberOfClusters, numRecords))
                self.model.changeNumberOfClusters(numRecords)
            elif self.numberOfClusters != self.model.numberOfClusters:
                self.model.changeNumberOfClusters(self.numberOfClusters)

        elif self.model.numberOfClusters > numRecords:
            self.logger.info("KMeansClustering.produce: number of desired clusters (%d) exceeds number of data records (%d), reducing number of clusters to match." % (self.model.numberOfClusters, numRecords))
            self.model.changeNumberOfClusters(numRecords)

        # special case that should be easy, but it can cause the standard
        # k-means algorithm to infinite loop:
        if self.model.numberOfClusters == numRecords:
            self.logger.debug("KMeansClustering.produce: number of records equals the number of clusters (%d), so we skip the standard algorithm and just assign data points to clusters" % numRecords)
            for i, pmmlCluster in enumerate(self.model.cluster):
                pmmlCluster.value = [self.buffer[field][i] for field in self.model.fields]
                pmmlCluster.attrib["n"] = len(pmmlCluster.value)
            return

        self.trans = numpy.matrix(numpy.identity(len(self.model.fields)))
        self.shift = numpy.matrix(numpy.zeros(len(self.model.fields))).T

        if self.distanceMeasure:
            # characterize the data so that you can generate random numbers
            # with the same distribution
            try:
                covariance = self.dataDistribution.covariance()
            except ZeroDivisionError:
                covariance = INVALID

            if covariance is not INVALID:
                self.shift = self.dataDistribution.covmean()
                try:
                    self.trans = numpy.linalg.cholesky(covariance)
                except numpy.linalg.LinAlgError:
                    pass  # FIXME: at least make trans a diagonal matrix with stdev entries (or 1/stdev)!
        else:
            raise NotImplementedError("Currently, only clusters with ComparisonMeasure.kind == 'distance' metrics can be produced.")

        # make a new set of trials
        if self.seedSource is ProducerKMeans.RANDOM_DATAPOINTS:
            # pick a random point from the dataset
            def randomization():
                i = random.randint(0, len(self.buffer[self.SYNCNUMBER]) - 1)
                return [self.buffer[field][i] for field in self.model.fields if field is not self.SYNCNUMBER]
            self.randomization = randomization

        elif self.seedSource == ProducerKMeans.RANDOM_DATAWEIGHTED:
            # pick a random point from the dataset, weighted by their weights
            sumOfWeights = numpy.cumsum(self.buffer[self.model.weightField])
            def randomization():
                x = random.uniform(0., sumOfWeights[-1])
                i = numpy.where(sumOfWeights > x)[0][0]
                return [self.buffer[field][i] for field in self.model.fields if field is not self.SYNCNUMBER]
            self.randomization = randomization

        elif self.seedSource == ProducerKMeans.RANDOM_DATACOVARIANCE:
            # generate a random point from a distribution with a covariance like the data
            self.randomization = lambda: ((self.trans * (numpy.matrix(numpy.random.randn(len(self.shift))).T)) + self.shift)

        elif self.seedSource == ProducerKMeans.RANDOM_UNITRECT:
            # generate a random point in the unit rectangle
            self.randomization = lambda: [random.random() for i in xrange(len(self.shift))]

        self.trials = [TrialClusterSet(self.model.numberOfClusters, self.randomization,
                                       self.engine.producerUpdateScheme)
                       for i in xrange(self.numberOfTrials)]

        # prepare small subsamples to run first to improve convergence
        # when the whole dataset gets used
        allIndices = range(len(self.buffer[self.SYNCNUMBER]))
        quickConvergeSamples = []
        for numEvents in self.quickConvergeSteps:
            if numEvents > len(allIndices):
                numEvents = len(allIndices)
            quickConvergeSamples.append(numpy.array(random.sample(allIndices, numEvents)))

        allIndices = numpy.array(allIndices)
        for key in self.buffer:
            self.buffer[key] = numpy.array(self.buffer[key])

        for i, quickConvergenceSample in enumerate(quickConvergeSamples):
            if self.logDebug:
                self.logger.debug("KMeansClustering.produce: ===== quickConverge %d: preparing for k-means by clustering a random subset of %d events" % (i + 1, len(quickConvergenceSample)))
            self.iterations(quickConvergenceSample)

        self.logger.debug("KMeansClustering.produce: ===== starting k-means clustering algorithm (whole dataset)")

        convergence.attrib["iterations"] = self.iterations()

        # find the best one
        best = None
        for trial in self.trials:
            if trial.hasConverged:
                if best is None or trial.updator.mean() < best.updator.mean():
                    best = trial

        convergence.attrib["converged"] = (best is not None)

        if best is None:
            self.logger.error("KMeansClustering.produce: no trial cluster-sets converged within the desired number of iterations (%s), using the best UNCONVERGED set instead." % (str(self.maxIterations) if self.maxIterations is not None else "unset"))
            for trial in self.trials:
                if best is None or trial.updator.mean() < best.updator.mean():
                    best = trial

        # write it to the PMML file
        for bestCluster, pmmlCluster in zip(best.clusters, self.model.matches(pmml.Cluster)):
            pmmlCluster.attrib["size"] = bestCluster.count()

            theArray = pmmlCluster.child(pmml.Array)
            theArray.value = bestCluster.initialPosition
            theArray.attrib["n"] = len(theArray.value)

    def iterations(self, indices=None):
        if indices is None:
            dataset = self.buffer
        else:
            dataset = {}
            for key, value in self.buffer.items():
                dataset[key] = value[indices]

        # loop over the data many times until a subset of trials converge
        iteration = 0
        while True:
            iteration += 1

            # set "initialPosition" to the mean within each cluster
            for trial in self.trials:
                trial.reset()

            if self.logDebug:
                self.logger.debug("KMeansClustering.produce: iteration %d" % iteration)

            # loop over data (pre-calculated, including all derived fields)
            for i in xrange(len(dataset[self.SYNCNUMBER])):
                if self.logDebug and i % 10000 == 0:
                    self.logger.debug("    event %d/%d = %g%%" % (i, len(dataset[self.SYNCNUMBER]), 100. * i / len(dataset[self.SYNCNUMBER])))

                syncNumber = dataset[self.SYNCNUMBER][i]
                vector = [dataset[field][i] for field in self.model.fields]

                weight = None
                if self.model.weightField is not None:
                    weight = dataset[self.model.weightField][i]

                for trial in self.trials:
                    trial.update(syncNumber, vector, self.model, False, weight)

            if self.logDebug:
                self.logger.debug("    event %d/%d = 100%%" % (len(dataset[self.SYNCNUMBER]), len(dataset[self.SYNCNUMBER])))
                self.logger.debug("KMeansClustering.produce: about to sort the trials")

            self.trials.sort(lambda a, b: -cmp(a.updator.mean(), b.updator.mean()))

            self.logger.debug("KMeansClustering.produce: about to check convergence of the trials")

            numConverged = 0
            for trial in self.trials:
                if trial.converged(self.closeEnough):
                    trial.hasConverged = True
                    numConverged += 1
                else:
                    trial.hasConverged = False

            if self.logDebug:
                self.logger.debug("KMeansClustering.produce: iteration %d has %d converged cluster-set trials" % (iteration, numConverged))

                best = None
                for trial in self.trials:
                    if trial.hasConverged:
                        if best is None or trial.updator.mean() < best.updator.mean():
                            best = trial
                if best is not None:
                    self.logger.debug("    best CONVERGED so far: %s" % " ".join(map(repr, best.clusters)))
                else:
                    best = None
                    for trial in self.trials:
                        if best is None or trial.updator.mean() < best.updator.mean():
                            best = trial
                    if best is not None:
                        self.logger.debug("    best so far: %s" % " ".join(map(repr, best.clusters)))

            for trial in self.trials:
                # self.logger.debug("    show all: %s%s" % (" ".join(map(repr, trial.clusters)), " (converged)" if trial.hasConverged else ""))
                trial.rethrowInvalid(self.randomization, self.engine.producerUpdateScheme)

            if numConverged >= self.numberToConverge:
                return iteration

            if self.maxIterations is not None and iteration >= self.maxIterations:
                return iteration
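# Standalone sketch of the covariance-matched seeding used above: draw a
# standard-normal vector, map it through the Cholesky factor of the data
# covariance, and shift by the data mean.  The covariance and mean values
# here are invented for illustration.
def _demoCovarianceSeed():
    import numpy
    cov = numpy.matrix([[4.0, 1.2], [1.2, 1.0]])   # must be positive definite
    mean = numpy.matrix([[10.0], [-3.0]])
    trans = numpy.linalg.cholesky(cov)             # lower-triangular L with L*L.T == cov
    return trans * numpy.matrix(numpy.random.randn(2)).T + mean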
import augustus.core.pmml41 as pmml

def sigfigs(num, n):
    """Round a number to n significant figures and return the result as a string."""
    # stolen from Cassius:
    if num == 0.:
        level = 1
    else:
        level = n - int(math.ceil(math.log10(abs(num))))
    num = round(num, level)
    format = "%." + str(max(level, 0)) + "f"
    return format % num

BROKEN = Atom("Broken")
NOTFOUND = Atom("NotFound")

# def _show(index, pmmlFile, index_width=20):
#     if index is None:
#         return "%s %s" % (("%%-%ds" % index_width) % "index", repr(pmmlFile))
#     if index is BROKEN:
#         return "%s %s" % (("%%-%ds" % index_width) % "???", "???")
#     return "%s %s%s" % (("%%-%ds" % index_width) % repr(index), ". . " * len(index), repr(pmmlFile[index]))

# def _showUpTo(i, index1, file1, index2, file2):
#     output = []
#     for j, (j1, j2) in enumerate(zip(index1, index2)):
#         if j < i:
#             output.append("%-70s versus %-70s" % (_show(j1, file1)[:70], _show(j2, file2)[:70]))
#         elif j == i:
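# A few worked calls of sigfigs (results follow from the formula above):
#     sigfigs(123.456, 2)    -> "120"      (level = 2 - 3 = -1, "%.0f" of 120.0)
#     sigfigs(0.0001234, 2)  -> "0.00012"  (level = 2 - (-3) = 5, "%.5f")
#     sigfigs(0., 3)         -> "0.0"      (zero is special-cased to level 1)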
class PythonFunction(ScoresAwk):
    xsd = load_xsdElement(ScoresAwk, """
<xs:element name="PythonFunction">
    <xs:complexType>
        <xs:sequence>
            <xs:element minOccurs="0" maxOccurs="unbounded" ref="Context"/>
        </xs:sequence>
        <xs:attribute name="condition" type="xs:string" use="optional"/>
        <xs:attribute name="action" type="xs:string" use="required"/>
    </xs:complexType>
</xs:element>
""")

    BEGIN = Atom("Begin")
    EVENT = Atom("Event")
    END = Atom("End")

    def post_validate(self):
        context = {"g": globalVariables}
        for c in self.matches(Context):
            context.update(c.context)

        cdatas = [i for i in self.children if isinstance(i, xmlbase.XMLCDATA)]
        if len(cdatas) != 1:
            raise XMLValidationError, "A PythonFunction object must contain exactly one CDATA"

        theCode = "".join(cdatas[0].text).lstrip().rstrip()

        ## CAREFUL: evaluates whatever you give it!
        try:
            exec theCode in context
        except SyntaxError as err:
            raise XMLValidationError, "PythonFunction could not be evaluated: %s" % str(err)

        if "condition" in self.attrib:
            if self["condition"] == "BEGIN":
                self.condition = self.BEGIN
            elif self["condition"] == "END":
                self.condition = self.END
            else:
                try:
                    self.condition = context[self["condition"]]
                    if not callable(self.condition):
                        raise KeyError
                except KeyError:
                    raise XMLValidationError, "PythonFunction does not contain a condition function called \"%s\"" % self["condition"]
        else:
            self.condition = self.EVENT

        try:
            self.action = context[self["action"]]
            if not callable(self.action):
                raise KeyError
        except KeyError:
            raise XMLValidationError, "PythonFunction does not contain an action function called \"%s\"" % self["action"]

    def begin(self):
        if self.condition is self.BEGIN:
            return self.action()

    def evaluate(self, event):
        if self.condition is self.EVENT:
            result = True
        else:
            result = self.condition(event)

        if result is True:
            return self.action(event)
        elif result is False:
            return None
        else:
            if not isinstance(result, (list, tuple)):
                try:
                    result = list(result)
                except TypeError:
                    raise RuntimeError, "A PythonFunction's condition must return True, False, or a list of objects to act upon; result of %s is %s" % (self.condition, result)

            output = []
            for r in result:
                output.append(self.action(r))
            return output

    def end(self):
        if self.condition is self.END:
            return self.action()
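# Hedged example of the XML this class validates; the function names and body
# are invented.  The single CDATA block is exec'd, then "condition" and
# "action" must name callables defined there (or the special BEGIN/END):
#
#     <PythonFunction condition="isInteresting" action="report">
#         <![CDATA[
#     def isInteresting(event):
#         return True
#
#     def report(event):
#         return event
#         ]]>
#     </PythonFunction>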