Esempio n. 1
0
    def _updateDistribution_first(self):
        """Create the updators used to produce the baseline and alternate distributions.

        For ``self.baseline`` and, when ``self.alternateField`` is not None,
        for the distribution found under ``<Alternate/>``:

          * Poisson: an updator over SUM1 and SUMX is created.
          * Gaussian: an updator over SUM1, SUMX, and SUMXX is created.
          * Uniform: an updator over MIN and MAX is created.

        If ``self.updateExisting`` is true, each updator is seeded from the
        PMML: from the X-ODG-PartialSums attributes for Poisson/Gaussian
        (missing attributes default to 0), or from the "lower"/"upper"
        attributes for Uniform.  For Poisson/Gaussian the updator's counters
        are then written back to the PartialSums attributes.

        Attributes set: ``baselineUpdator``, ``baselinePartialSums``
        (Poisson/Gaussian only), ``alternate`` (None when no alternate field
        was requested), ``alternateUpdator``, and ``alternatePartialSums``
        (Poisson/Gaussian only).

        Raises:
            NotImplementedError: a distribution is not Gaussian, Poisson,
                or Uniform.
            RuntimeError: ``alternateField`` is set but the PMML has no
                ``<Alternate/>`` element.
        """
        partialSums, self.baselineUpdator = self._makeDistributionUpdator(
            self.baseline)
        if partialSums is not None:
            # Uniform distributions carry no PartialSums, so the attribute
            # is only assigned for Poisson and Gaussian (as before).
            self.baselinePartialSums = partialSums

        if self.alternateField is None:
            self.alternate = None
            return

        # NOTE(review): ``testDistributions`` is not bound anywhere in this
        # method; unless it is a module-level name, the two uses below raise
        # NameError.  Confirm where it is supposed to come from.
        if not testDistributions.exists(pmml.Alternate):
            raise RuntimeError(
                "alternateField requested but there is no <Alternate/> distribution in the PMML"
            )

        self.alternate = testDistributions.child(pmml.Alternate).child()

        # Sharing the helper with the baseline removes the copy-pasted branch
        # that misspelled ``self.alternate``/``self.alternateUpdator`` and
        # failed with AttributeError for Uniform alternates.
        partialSums, self.alternateUpdator = self._makeDistributionUpdator(
            self.alternate)
        if partialSums is not None:
            self.alternatePartialSums = partialSums

    def _findOrCreatePartialSums(self, distribution):
        """Return the distribution's X-ODG-PartialSums, adding one inside an Extension if absent."""
        partialSums = distribution.descendant(
            pmml.X_ODG_PartialSums, exception=False, maxdepth=2)
        if partialSums is None:
            partialSums = pmml.X_ODG_PartialSums()
            if not distribution.exists(pmml.Extension):
                distribution.children.append(pmml.Extension())
            distribution.child(pmml.Extension).children.append(partialSums)
        return partialSums

    def _makeDistributionUpdator(self, distribution):
        """Return ``(partialSums, updator)`` for one distribution; partialSums is None for Uniform."""
        # Each pair is (updator counter key, PartialSums attribute name).
        if isinstance(distribution, pmml.PoissonDistribution):
            moments = ((SUM1, "SUM1"), (SUMX, "SUMX"))

        elif isinstance(distribution, pmml.GaussianDistribution):
            moments = ((SUM1, "SUM1"), (SUMX, "SUMX"), (SUMXX, "SUMXX"))

        elif isinstance(distribution, pmml.UniformDistribution):
            updator = self.engine.producerUpdateScheme.updator(MIN, MAX)
            if self.updateExisting:
                updator.initialize({
                    MIN: distribution.attrib["lower"],
                    MAX: distribution.attrib["upper"],
                })
            return None, updator

        else:
            raise NotImplementedError(
                "Only production of Gaussian, Poisson, and Uniform distributions has been implemented."
            )

        partialSums = self._findOrCreatePartialSums(distribution)
        updator = self.engine.producerUpdateScheme.updator(
            *[key for key, attribName in moments])

        if self.updateExisting:
            # Start from whatever the PMML already holds; absent values are 0.
            seed = {COUNT: partialSums.attrib.get("COUNT", 0)}
            for key, attribName in moments:
                seed[key] = partialSums.attrib.get(attribName, 0.)
            updator.initialize(seed)

        # Write the updator's current counters back into the PMML element.
        # COUNT is only copied when the updator actually tracks it.
        if COUNT in updator.counters:
            partialSums.attrib["COUNT"] = updator.counters[COUNT]
        for key, attribName in moments:
            partialSums.attrib[attribName] = updator.counters[key]

        return partialSums, updator
Esempio n. 2
0
    def initialize(self, **params):
        """Initialize a clustering model producer.

        Recognized keyword parameters (each one is consumed from ``params``):

          * ``resume``: parsed with ``pmml.boolCheck``; default False.
          * ``numberOfTrials``: int, default 10.
          * ``numberToKeep``: int, default 3.
          * ``maturityThreshold``: int, default 100.
          * ``initialStability``: int, default 100.
          * ``overrideSignificance``: float, default 5.; a value equal to 0
            is stored as None.

        After reading the parameters, the method makes sure the model has an
        Extension holding X-ODG-PartialSums elements (one named
        "SumOfDistances" plus one per "<cluster id>.<field>"), reusing the
        existing elements when ``resume`` is true and replacing them
        otherwise, and then builds the first trial from those values.

        Raises:
            TypeError: unrecognized parameters remain; this is checked last,
                after all of the setup above has been performed.
        """
        # --- parameters -----------------------------------------------------
        if "resume" in params:
            self.resume = pmml.boolCheck(params.pop("resume"))
        else:
            self.resume = False

        self.numberOfTrials = int(params.pop("numberOfTrials", 10))
        self.numberToKeep = int(params.pop("numberToKeep", 3))
        self.maturityThreshold = int(params.pop("maturityThreshold", 100))
        self.initialStability = int(params.pop("initialStability", 100))

        self.overrideSignificance = float(params.pop("overrideSignificance", 5.))
        if self.overrideSignificance == 0.:
            self.overrideSignificance = None

        # --- model-level state ----------------------------------------------
        model = self.segmentRecord.pmmlModel
        self.model = model
        self.dataDistribution = self.engine.producerUpdateScheme.updator(
            COVARIANCE(model.numberOfFields))

        comparisonMeasure = model.child(pmml.ComparisonMeasure)
        self.distanceMeasure = (comparisonMeasure.attrib["kind"] == "distance")

        # PartialSums live in the model's Extension; add one if it is missing.
        extension = model.child(pmml.Extension, exception=False)
        if extension is None:
            extension = pmml.Extension()
            model.children.append(extension)

        def storedPartialSums(name):
            # When resuming, return the existing PartialSums with this name
            # (or None if there is none); otherwise delete any existing one
            # and return None so that the caller creates a fresh element.
            def hasName(x):
                return isinstance(x, pmml.X_ODG_PartialSums) and x.attrib.get("name", None) == name

            if self.resume:
                return extension.child(hasName, exception=False)

            position = extension.index(hasName, exception=False)
            if position is not None:
                del extension[position]
            return None

        self.sumOfDistances = storedPartialSums("SumOfDistances")
        if self.sumOfDistances is None:
            self.sumOfDistances = pmml.X_ODG_PartialSums(
                name="SumOfDistances", COUNT=0, SUM1=0., SUMX=0., SUMXX=0.)
            extension.children.append(self.sumOfDistances)

        self.partialSums = {}
        for clusterId, cluster in zip(model.ids, model.cluster):
            for position, fieldName in enumerate(model.fields):
                fullname = "%s.%s" % (clusterId, fieldName)

                partialSum = storedPartialSums(fullname)
                if partialSum is None:
                    # A new element starts with weight 1 at the cluster's
                    # current coordinate for this field.
                    partialSum = pmml.X_ODG_PartialSums(
                        name=fullname, SUM1=1., SUMX=cluster.value[position])
                    extension.children.append(partialSum)

                self.partialSums[fullname] = partialSum

        # --- first trial, built from the values gathered above --------------
        # (they come from the PMML file if resume is True)
        trialFromPmml = new.instance(TrialClusterSet)
        trialFromPmml.updator = self.engine.producerUpdateScheme.updator(
            COUNT, SUM1, SUMX, SUMXX)
        distances = self.sumOfDistances.attrib
        trialFromPmml.updator.initialize({
            COUNT: distances["COUNT"],
            SUM1: distances["SUM1"],
            SUMX: distances["SUMX"],
            SUMXX: distances["SUMXX"],
        })

        trialFromPmml.clusters = []
        for clusterId, cluster in zip(model.ids, model.cluster):
            trialCluster = new.instance(TrialCluster)
            trialCluster.fields = []
            for fieldName in model.fields:
                stored = self.partialSums["%s.%s" % (clusterId, fieldName)].attrib
                fieldUpdator = self.engine.producerUpdateScheme.updator(SUM1, SUMX)
                fieldUpdator.initialize({SUM1: stored["SUM1"], SUMX: stored["SUMX"]})
                trialCluster.fields.append(fieldUpdator)
            trialCluster.initialPosition = list(cluster.value)
            trialFromPmml.clusters.append(trialCluster)

        self.trials = [trialFromPmml]

        # Anything still left in params was not recognized above.
        if params:
            raise TypeError("Unrecognized parameters %s" % params)