def _updateDistribution_first(self):
    """Set up the updators for the baseline and (optionally) alternate distributions.

    Reads self.baseline, self.alternateField, self.updateExisting and
    self.engine.producerUpdateScheme.

    Sets self.baselineUpdator.  When self.alternateField is not None it also
    sets self.alternate (taken from the pmml.Alternate element) and
    self.alternateUpdator; otherwise self.alternate is set to None.

    For Poisson and Gaussian distributions, self.baselinePartialSums /
    self.alternatePartialSums are set to the pmml.X_ODG_PartialSums element
    found under (or newly attached to) the distribution.  For Uniform
    distributions those attributes are left untouched.

    Raises NotImplementedError for any distribution that is not Poisson,
    Gaussian or Uniform, and RuntimeError if an alternate field was requested
    but there is no pmml.Alternate element.
    """

    def prepare(distribution):
        """Return (partialSums, updator) for one distribution; partialSums is None for Uniform."""
        newUpdator = self.engine.producerUpdateScheme.updator

        # Pairs of (updator counter, attribute name on the partial-sums element).
        if isinstance(distribution, pmml.PoissonDistribution):
            tracked = ((SUM1, "SUM1"), (SUMX, "SUMX"))

        elif isinstance(distribution, pmml.GaussianDistribution):
            tracked = ((SUM1, "SUM1"), (SUMX, "SUMX"), (SUMXX, "SUMXX"))

        elif isinstance(distribution, pmml.UniformDistribution):
            # Uniform distributions keep their state in the "lower"/"upper"
            # attributes, so no partial-sums element is involved.
            updator = newUpdator(MIN, MAX)
            if self.updateExisting:
                updator.initialize({
                    MIN: distribution.attrib["lower"],
                    MAX: distribution.attrib["upper"],
                })
            return None, updator

        else:
            raise NotImplementedError("Only production of Gaussian, Poisson, and Uniform distributions has been implemented.")

        # Find the partial sums stored with the distribution, or attach a
        # fresh element (creating the Extension container if it is missing).
        partialSums = distribution.descendant(pmml.X_ODG_PartialSums, exception=False, maxdepth=2)
        if partialSums is None:
            partialSums = pmml.X_ODG_PartialSums()
            if not distribution.exists(pmml.Extension):
                distribution.children.append(pmml.Extension())
            distribution.child(pmml.Extension).children.append(partialSums)

        updator = newUpdator(*[counter for counter, key in tracked])

        if self.updateExisting:
            # Seed the updator from whatever the PMML already holds.
            initial = {COUNT: partialSums.attrib.get("COUNT", 0)}
            for counter, key in tracked:
                initial[counter] = partialSums.attrib.get(key, 0.)
            updator.initialize(initial)

        # Write the updator's starting counters back to the PMML element.
        # NOTE(review): nesting reconstructed from whitespace-collapsed source;
        # only COUNT is treated as optional here -- confirm.
        if COUNT in updator.counters:
            partialSums.attrib["COUNT"] = updator.counters[COUNT]
        for counter, key in tracked:
            partialSums.attrib[key] = updator.counters[counter]

        return partialSums, updator

    partialSums, self.baselineUpdator = prepare(self.baseline)
    if partialSums is not None:
        self.baselinePartialSums = partialSums

    if self.alternateField is None:
        self.alternate = None
        return

    # NOTE(review): testDistributions is not bound anywhere in this function;
    # it has to be resolvable from an enclosing or module scope -- confirm.
    if not testDistributions.exists(pmml.Alternate):
        raise RuntimeError("alternateField requested but there is no <Alternate/> distribution in the PMML")

    self.alternate = testDistributions.child(pmml.Alternate).child()

    partialSums, self.alternateUpdator = prepare(self.alternate)
    if partialSums is not None:
        self.alternatePartialSums = partialSums
def initialize(self, **params):
    """Initialize a clustering model producer.

    Recognized keyword parameters:
      resume               -- converted with pmml.boolCheck (default False)
      numberOfTrials       -- int (default 10)
      numberToKeep         -- int (default 3)
      maturityThreshold    -- int (default 100)
      initialStability     -- int (default 100)
      overrideSignificance -- float (default 5.); 0 is stored as None

    Each value is stored on self under the same name.  The method then sets
    self.model, self.dataDistribution, self.distanceMeasure,
    self.sumOfDistances, self.partialSums and self.trials, and makes sure the
    model's pmml.Extension holds a pmml.X_ODG_PartialSums element for
    "SumOfDistances" and for every "<cluster id>.<field>" pair.

    Raises TypeError if any other keyword parameter is passed.
    """

    if "resume" in params:
        self.resume = pmml.boolCheck(params.pop("resume"))
    else:
        self.resume = False

    self.numberOfTrials = int(params.pop("numberOfTrials", 10))
    self.numberToKeep = int(params.pop("numberToKeep", 3))
    self.maturityThreshold = int(params.pop("maturityThreshold", 100))
    self.initialStability = int(params.pop("initialStability", 100))

    self.overrideSignificance = float(params.pop("overrideSignificance", 5.))
    if self.overrideSignificance == 0.:
        self.overrideSignificance = None

    # Reject unknown options before the PMML model is touched, so that a
    # misspelled option cannot leave the model partially rewritten.
    if params:
        raise TypeError("Unrecognized parameters %s" % params)

    self.model = self.segmentRecord.pmmlModel
    self.dataDistribution = self.engine.producerUpdateScheme.updator(COVARIANCE(self.model.numberOfFields))
    self.distanceMeasure = (self.model.child(pmml.ComparisonMeasure).attrib["kind"] == "distance")

    # put PartialSums in the model if they're not already there; pick up old values if you're resuming
    extension = self.model.child(pmml.Extension, exception=False)
    if extension is None:
        extension = pmml.Extension()
        self.model.children.append(extension)

    def storedPartialSums(name):
        """When resuming, return the partial sums called name (or None); otherwise delete them and return None."""
        def matches(node):
            return isinstance(node, pmml.X_ODG_PartialSums) and node.attrib.get("name", None) == name

        if self.resume:
            return extension.child(matches, exception=False)

        index = extension.index(matches, exception=False)
        if index is not None:
            del extension[index]
        return None

    self.sumOfDistances = storedPartialSums("SumOfDistances")
    if self.sumOfDistances is None:
        self.sumOfDistances = pmml.X_ODG_PartialSums(name="SumOfDistances", COUNT=0, SUM1=0., SUMX=0., SUMXX=0.)
        extension.children.append(self.sumOfDistances)

    self.partialSums = {}
    for theid, cluster in zip(self.model.ids, self.model.cluster):
        for i, field in enumerate(self.model.fields):
            fullname = "%s.%s" % (theid, field)

            partialSum = storedPartialSums(fullname)
            if partialSum is None:
                # Start from the cluster's current coordinate with unit weight.
                partialSum = pmml.X_ODG_PartialSums(name=fullname, SUM1=1., SUMX=cluster.value[i])
                extension.children.append(partialSum)

            self.partialSums[fullname] = partialSum

    # create the first trial using the values constructed above (they come from the PMML file if resume is True)
    trialFromPmml = new.instance(TrialClusterSet)
    trialFromPmml.updator = self.engine.producerUpdateScheme.updator(COUNT, SUM1, SUMX, SUMXX)
    trialFromPmml.updator.initialize({
        COUNT: self.sumOfDistances.attrib["COUNT"],
        SUM1: self.sumOfDistances.attrib["SUM1"],
        SUMX: self.sumOfDistances.attrib["SUMX"],
        SUMXX: self.sumOfDistances.attrib["SUMXX"],
    })

    trialFromPmml.clusters = []
    for theid, cluster in zip(self.model.ids, self.model.cluster):
        trialCluster = new.instance(TrialCluster)

        # One (SUM1, SUMX) updator per field, seeded from the partial sums above.
        trialCluster.fields = []
        for field in self.model.fields:
            partialSum = self.partialSums["%s.%s" % (theid, field)]
            u = self.engine.producerUpdateScheme.updator(SUM1, SUMX)
            u.initialize({SUM1: partialSum.attrib["SUM1"], SUMX: partialSum.attrib["SUMX"]})
            trialCluster.fields.append(u)

        trialCluster.initialPosition = list(cluster.value)
        trialFromPmml.clusters.append(trialCluster)

    self.trials = [trialFromPmml]