Example #1
0
    def initialize(self, **params):
        """Configure this producer from keyword parameters and the segment's PMML model.

        Recognized parameters are "updateExisting" and "dependentField"; a
        parameter that is not supplied falls back to the entry of the same
        name in self.defaultParams.

        Raises:
            NotImplementedError: if updateExisting evaluates to true.
            RuntimeError: if the requested dependentField is not one of the
                MiningSchema's 'predicted' fields.
            TypeError: if any parameter other than the recognized ones is
                left over.
        """
        if "updateExisting" in params:
            updateFlag = params.pop("updateExisting")
        else:
            updateFlag = self.defaultParams["updateExisting"]
        self.updateExisting = pmml.boolCheck(updateFlag)
        if self.updateExisting:
            raise NotImplementedError(
                "Updating from existing RegressionModels not implemented; use mode='replaceExisting'"
            )

        if "dependentField" in params:
            self.dependentField = params.pop("dependentField")
        else:
            self.dependentField = self.defaultParams["dependentField"]
        # an empty string means "no explicit choice"
        if self.dependentField == "":
            self.dependentField = None

        self.model = self.segmentRecord.pmmlModel

        # collect the names of every MiningField whose usageType is 'predicted'
        # (usageType defaults to 'active' when the attribute is absent)
        self.predicted = []
        miningFields = self.model.child(pmml.MiningSchema).matches(
            pmml.MiningField)
        for entry in miningFields:
            fieldName = entry.attrib["name"]
            if entry.attrib.get("usageType", "active") == "predicted":
                self.predicted.append(fieldName)

        if not self.predicted:
            self.dependentField = INVALID
        elif self.dependentField is None:
            # by default, take the first 'predicted' feature
            self.dependentField = self.predicted[0]
        elif self.dependentField not in self.predicted:
            raise RuntimeError(
                "DependentField feature not found among the 'predicted' features in the RegressionModel's MiningSchema%s"
                % self.model.child(pmml.MiningSchema).fileAndLine())

        self.regressionTable = self.model.regressionTables[0]

        # one more than the total number of terms in the first regression
        # table (presumably the extra one is the intercept -- confirm)
        termCount = (len(self.regressionTable.numericTerms) +
                     len(self.regressionTable.categoricalTerms) +
                     len(self.regressionTable.predictorTerms))
        self.updator = self.engine.producerUpdateScheme.updator(
            OLS(1 + termCount))

        if params:
            raise TypeError("Unrecognized parameters %s" % params)
Example #2
0
    def initialize(self, **params):
        """Read producer parameters and prepare an OLS updator for the regression model.

        "updateExisting" and "dependentField" are taken from params when
        present and from self.defaultParams otherwise.

        Raises:
            NotImplementedError: when updateExisting evaluates to true.
            RuntimeError: when dependentField is given but is not among the
                'predicted' fields of the model's MiningSchema.
            TypeError: when params contains anything besides the two
                recognized names.
        """
        if "updateExisting" in params:
            rawUpdateExisting = params.pop("updateExisting")
        else:
            rawUpdateExisting = self.defaultParams["updateExisting"]
        self.updateExisting = pmml.boolCheck(rawUpdateExisting)
        if self.updateExisting:
            raise NotImplementedError("Updating from existing RegressionModels not implemented; use mode='replaceExisting'")

        if "dependentField" in params:
            self.dependentField = params.pop("dependentField")
        else:
            self.dependentField = self.defaultParams["dependentField"]
        if self.dependentField == "":
            # empty string is treated the same as "not specified"
            self.dependentField = None

        self.model = self.segmentRecord.pmmlModel

        # names of the MiningFields marked usageType="predicted"
        self.predicted = []
        for schemaField in self.model.child(pmml.MiningSchema).matches(pmml.MiningField):
            attributes = schemaField.attrib
            fieldName = attributes["name"]
            if attributes.get("usageType", "active") == "predicted":
                self.predicted.append(fieldName)

        if not self.predicted:
            self.dependentField = INVALID
        elif self.dependentField is None:
            # by default, take the first 'predicted' feature
            self.dependentField = self.predicted[0]
        elif self.dependentField not in self.predicted:
            raise RuntimeError("DependentField feature not found among the 'predicted' features in the RegressionModel's MiningSchema%s" % self.model.child(pmml.MiningSchema).fileAndLine())

        self.regressionTable = self.model.regressionTables[0]

        # size of the OLS problem: every term of the first table, plus one
        numberOfTerms = sum((
            len(self.regressionTable.numericTerms),
            len(self.regressionTable.categoricalTerms),
            len(self.regressionTable.predictorTerms),
        ))
        self.updator = self.engine.producerUpdateScheme.updator(OLS(1 + numberOfTerms))

        if params:
            raise TypeError("Unrecognized parameters %s" % params)
Example #3
0
    def initialize(self, **params):
        """Configure the k-means producer from keyword parameters and the PMML model.

        Recognized parameters are "updateExisting", "numberOfTrials" and
        "numberToConverge"; any that is omitted is read from
        self.defaultParams instead.

        Raises:
            NotImplementedError: if "updateExisting" is passed explicitly and
                evaluates to true.
            TypeError: if any unrecognized parameter remains.
        """
        if "updateExisting" in params:
            self.updateExisting = pmml.boolCheck(params["updateExisting"])
            del params["updateExisting"]
            if self.updateExisting:
                # call-style raise: valid on both Python 2 and Python 3
                raise NotImplementedError(
                    "Updating from existing ClusterModels using 'kmeans' not implemented; use mode='replaceExisting'")
        else:
            # NOTE(review): a true *default* is accepted here; only an
            # explicitly passed value is rejected -- confirm this is intended
            self.updateExisting = pmml.boolCheck(
                self.defaultParams["updateExisting"])

        if "numberOfTrials" in params:
            self.numberOfTrials = int(params["numberOfTrials"])
            del params["numberOfTrials"]
        else:
            self.numberOfTrials = int(self.defaultParams["numberOfTrials"])

        if "numberToConverge" in params:
            self.numberToConverge = int(params["numberToConverge"])
            del params["numberToConverge"]
        else:
            self.numberToConverge = int(self.defaultParams["numberToConverge"])

        self.model = self.segmentRecord.pmmlModel
        self.dataDistribution = self.engine.producerUpdateScheme.updator(
            COVARIANCE(self.model.numberOfFields))
        # True when the model's ComparisonMeasure has kind="distance"
        self.distanceMeasure = (self.model.child(
            pmml.ComparisonMeasure).attrib["kind"] == "distance")

        # get rid of any PartialSums objects, since they would be misleading (this algorithm doesn't use them)
        extension = self.model.child(pmml.Extension, exception=False)
        if extension is not None:
            extension.children = [
                child for child in extension.children
                if not isinstance(child, pmml.X_ODG_PartialSums)
            ]

        # one empty list per model field, plus one under the SYNCNUMBER key
        self.buffer = {self.SYNCNUMBER: []}
        for field in self.model.fields:
            self.buffer[field] = []

        if len(params) > 0:
            raise TypeError("Unrecognized parameters %s" % params)
Example #4
0
    def initialize(self, **params):
        """Read producer parameters and cache the model's input/output field names.

        "updateExisting" comes from params when given, otherwise from
        self.defaultParams. Raises TypeError if any other parameter is
        supplied.
        """
        if "updateExisting" in params:
            rawFlag = params.pop("updateExisting")
        else:
            rawFlag = self.defaultParams["updateExisting"]
        self.updateExisting = pmml.boolCheck(rawFlag)

        self.model = self.segmentRecord.pmmlModel

        ### field names for get()
        self.inputFields = [entry.attrib["fieldName"] for entry in self.model.bayesInputs]
        self.outputField = self.model.bayesOutput.attrib["fieldName"]

        self.first = True

        if params:
            raise TypeError("Unrecognized parameters %s" % params)
Example #5
0
    def initialize(self, **params):
        """Set up the producer: parse "updateExisting" and record the Bayes field names.

        The flag is looked up in params first and in self.defaultParams as a
        fallback. Any leftover parameter causes a TypeError.
        """
        source = params if "updateExisting" in params else self.defaultParams
        self.updateExisting = pmml.boolCheck(source["updateExisting"])
        # drop the consumed key so the leftover check below only sees unknowns
        params.pop("updateExisting", None)

        self.model = self.segmentRecord.pmmlModel

        ### field names for get()
        self.inputFields = [
            bayesInput.attrib["fieldName"]
            for bayesInput in self.model.bayesInputs
        ]
        self.outputField = self.model.bayesOutput.attrib["fieldName"]

        self.first = True

        if params:
            raise TypeError("Unrecognized parameters %s" % params)
Example #6
0
    def initialize(self, **params):
        """Configure the k-means producer from keyword parameters and the PMML model.

        Recognized parameters: "updateExisting", "numberOfTrials",
        "numberToConverge". Missing ones are taken from self.defaultParams.

        Raises:
            NotImplementedError: if "updateExisting" is passed explicitly and
                evaluates to true.
            TypeError: if any unrecognized parameter remains.
        """
        if "updateExisting" in params:
            self.updateExisting = pmml.boolCheck(params["updateExisting"])
            del params["updateExisting"]
            if self.updateExisting:
                # call-style raise: valid on both Python 2 and Python 3
                raise NotImplementedError("Updating from existing ClusterModels using 'kmeans' not implemented; use mode='replaceExisting'")
        else:
            # NOTE(review): a true *default* is accepted here; only an
            # explicitly passed value is rejected -- confirm this is intended
            self.updateExisting = pmml.boolCheck(self.defaultParams["updateExisting"])

        if "numberOfTrials" in params:
            self.numberOfTrials = int(params["numberOfTrials"])
            del params["numberOfTrials"]
        else:
            self.numberOfTrials = int(self.defaultParams["numberOfTrials"])

        if "numberToConverge" in params:
            self.numberToConverge = int(params["numberToConverge"])
            del params["numberToConverge"]
        else:
            self.numberToConverge = int(self.defaultParams["numberToConverge"])

        self.model = self.segmentRecord.pmmlModel
        self.dataDistribution = self.engine.producerUpdateScheme.updator(COVARIANCE(self.model.numberOfFields))
        # True when the model's ComparisonMeasure has kind="distance"
        self.distanceMeasure = (self.model.child(pmml.ComparisonMeasure).attrib["kind"] == "distance")

        # get rid of any PartialSums objects, since they would be misleading (this algorithm doesn't use them)
        extension = self.model.child(pmml.Extension, exception=False)
        if extension is not None:
            extension.children = [child for child in extension.children if not isinstance(child, pmml.X_ODG_PartialSums)]

        # one empty list per model field, plus one under the SYNCNUMBER key
        self.buffer = {self.SYNCNUMBER: []}
        for field in self.model.fields:
            self.buffer[field] = []

        if len(params) > 0:
            raise TypeError("Unrecognized parameters %s" % params)
Example #7
0
    def initialize(self, **params):
        """Configure the seeded k-means producer from keyword parameters and the PMML model.

        Recognized parameters: "updateExisting", "quickConvergeSteps",
        "seedSource", "numberOfTrials", "numberToConverge". Missing ones are
        taken from self.defaultParams.

        Raises:
            NotImplementedError: if "updateExisting" is passed explicitly and
                evaluates to true, or if seedSource is not one of
                'dataPoints', 'dataCovariance', 'unitRect'.
            RuntimeError: if an explicitly passed "quickConvergeSteps" does
                not evaluate to a tuple of values convertible to int.
            TypeError: if any unrecognized parameter remains.
        """
        if "updateExisting" in params:
            self.updateExisting = pmml.boolCheck(params["updateExisting"])
            del params["updateExisting"]
            if self.updateExisting:
                raise NotImplementedError("Updating from existing ClusterModels using 'kmeans' not implemented; use mode='replaceExisting'")
        else:
            # NOTE(review): a true *default* is accepted here; only an
            # explicitly passed value is rejected -- confirm this is intended
            self.updateExisting = pmml.boolCheck(self.defaultParams["updateExisting"])

        if "quickConvergeSteps" in params:
            try:
                # SECURITY NOTE(review): eval() executes arbitrary code; this is
                # only acceptable if producer parameters come from a trusted
                # configuration -- confirm, or switch to a literal parser.
                self.quickConvergeSteps = eval(params["quickConvergeSteps"])
                if not isinstance(self.quickConvergeSteps, tuple):
                    raise RuntimeError
                # build a real list inside the try so conversion errors are
                # reported below (map() would be lazy on Python 3)
                self.quickConvergeSteps = [int(step) for step in self.quickConvergeSteps]
            except Exception:
                # the original `except err:` named an undefined identifier, so
                # any failure surfaced as a NameError instead of this message
                raise RuntimeError("quickConvergeSteps must be a tuple of numbers of events")
            del params["quickConvergeSteps"]
        else:
            # the default is used as evaluated, without the tuple/int checks
            self.quickConvergeSteps = eval(self.defaultParams["quickConvergeSteps"])

        if "seedSource" in params:
            self.seedSource = params["seedSource"]
            del params["seedSource"]
        else:
            self.seedSource = self.defaultParams["seedSource"]
        # translate the textual choice into the corresponding class constant
        if self.seedSource == "dataPoints":
            self.seedSource = self.RANDOM_DATAPOINTS
        elif self.seedSource == "dataCovariance":
            self.seedSource = self.RANDOM_DATACOVARIANCE
        elif self.seedSource == "unitRect":
            self.seedSource = self.RANDOM_UNITRECT
        else:
            raise NotImplementedError("The seedSource must be one of 'dataPoints', 'dataCovariance', 'unitRect'")

        if "numberOfTrials" in params:
            self.numberOfTrials = int(params["numberOfTrials"])
            del params["numberOfTrials"]
        else:
            self.numberOfTrials = int(self.defaultParams["numberOfTrials"])

        if "numberToConverge" in params:
            self.numberToConverge = int(params["numberToConverge"])
            del params["numberToConverge"]
        else:
            self.numberToConverge = int(self.defaultParams["numberToConverge"])

        self.model = self.segmentRecord.pmmlModel
        self.dataDistribution = self.engine.producerUpdateScheme.updator(COVARIANCE(self.model.numberOfFields))
        # True when the model's ComparisonMeasure has kind="distance"
        self.distanceMeasure = (self.model.child(pmml.ComparisonMeasure).attrib["kind"] == "distance")

        # get rid of any PartialSums objects, since they would be misleading (this algorithm doesn't use them)
        extension = self.model.child(pmml.Extension, exception=False)
        if extension is not None:
            extension.children = [child for child in extension.children if not isinstance(child, pmml.X_ODG_PartialSums)]

        # one empty list per model field, plus one under the SYNCNUMBER key
        self.buffer = {self.SYNCNUMBER: []}
        for field in self.model.fields:
            self.buffer[field] = []

        if len(params) > 0:
            raise TypeError("Unrecognized parameters %s" % params)
Example #8
0
    def initialize(self, **params):
        """Initialize a clustering model producer.

        Recognized parameters: "updateExisting", "numberOfTrials",
        "numberToKeep", "maturityThreshold", "initialStability" and
        "overrideSignificance". Missing ones are taken from
        self.defaultParams.

        The model's Extension element is created if absent, and populated
        with X-ODG-PartialSums elements: one named "SumOfDistances" and one
        per (cluster id, field) pair named "<id>.<field>". When
        updateExisting is true, existing elements are reused; otherwise they
        are deleted and recreated. The first trial in self.trials is built
        from these values.

        Raises:
            TypeError: if any unrecognized parameter remains.
        """

        if "updateExisting" in params:
            self.updateExisting = pmml.boolCheck(params["updateExisting"])
            del params["updateExisting"]
        else:
            self.updateExisting = pmml.boolCheck(self.defaultParams["updateExisting"])

        if "numberOfTrials" in params:
            self.numberOfTrials = int(params["numberOfTrials"])
            del params["numberOfTrials"]
        else:
            self.numberOfTrials = int(self.defaultParams["numberOfTrials"])

        if "numberToKeep" in params:
            self.numberToKeep = int(params["numberToKeep"])
            del params["numberToKeep"]
        else:
            self.numberToKeep = int(self.defaultParams["numberToKeep"])

        if "maturityThreshold" in params:
            self.maturityThreshold = int(params["maturityThreshold"])
            del params["maturityThreshold"]
        else:
            self.maturityThreshold = int(self.defaultParams["maturityThreshold"])

        if "initialStability" in params:
            self.initialStability = int(params["initialStability"])
            del params["initialStability"]
        else:
            self.initialStability = int(self.defaultParams["initialStability"])

        if "overrideSignificance" in params:
            self.overrideSignificance = float(params["overrideSignificance"])
            del params["overrideSignificance"]
            # an explicit zero is stored as None
            if self.overrideSignificance == 0.:
                self.overrideSignificance = None
        else:
            # NOTE(review): the zero -> None normalization is not applied to
            # the default value -- confirm whether that asymmetry is intended
            self.overrideSignificance = float(self.defaultParams["overrideSignificance"])

        self.model = self.segmentRecord.pmmlModel
        self.dataDistribution = self.engine.producerUpdateScheme.updator(COVARIANCE(self.model.numberOfFields))

        # True when the model's ComparisonMeasure has kind="distance"
        self.distanceMeasure = (self.model.child(pmml.ComparisonMeasure).attrib["kind"] == "distance")

        # put PartialSums in the model if they're not already there; pick up old values if you're resuming
        extension = self.model.child(pmml.Extension, exception=False)
        if extension is None:
            extension = pmml.Extension()
            self.model.children.append(extension)

        if self.updateExisting:
            self.sumOfDistances = extension.child(lambda x: isinstance(x, pmml.X_ODG_PartialSums) and x.attrib.get("name", None) == "SumOfDistances", exception=False)
        else:
            # not resuming: discard any stale element so a fresh one is made below
            index = extension.index(lambda x: isinstance(x, pmml.X_ODG_PartialSums) and x.attrib.get("name", None) == "SumOfDistances", exception=False)
            if index is not None:
                del extension[index]
            self.sumOfDistances = None

        if self.sumOfDistances is None:
            self.sumOfDistances = pmml.X_ODG_PartialSums(name="SumOfDistances", COUNT=0, SUM1=0., SUMX=0., SUMXX=0.)
            extension.children.append(self.sumOfDistances)

        # one PartialSums element per (cluster id, field), keyed "<id>.<field>"
        self.partialSums = {}
        for theid, cluster in zip(self.model.ids, self.model.cluster):
            for i, field in enumerate(self.model.fields):
                fullname = "%s.%s" % (theid, field)

                # the lambdas below are called immediately, so capturing the
                # loop variable `fullname` is safe
                if self.updateExisting:
                    partialSum = extension.child(lambda x: isinstance(x, pmml.X_ODG_PartialSums) and x.attrib.get("name", None) == fullname, exception=False)
                else:
                    index = extension.index(lambda x: isinstance(x, pmml.X_ODG_PartialSums) and x.attrib.get("name", None) == fullname, exception=False)
                    if index is not None:
                        del extension[index]
                    partialSum = None

                if partialSum is None:
                    # seed with the cluster's current coordinate for this field
                    partialSum = pmml.X_ODG_PartialSums(name=fullname, SUM1=1., SUMX=cluster.value[i])
                    extension.children.append(partialSum)

                self.partialSums[fullname] = partialSum

        # create the first trial using the values constructed above (they come from the PMML file if updateExisting is True)
        trialFromPmml = new.instance(TrialClusterSet)
        trialFromPmml.updator = self.engine.producerUpdateScheme.updator(COUNT, SUM1, SUMX, SUMXX)
        trialFromPmml.updator.initialize({COUNT: self.sumOfDistances.attrib["COUNT"], SUM1: self.sumOfDistances.attrib["SUM1"], SUMX: self.sumOfDistances.attrib["SUMX"], SUMXX: self.sumOfDistances.attrib["SUMXX"]})

        trialFromPmml.clusters = []
        for theid, cluster in zip(self.model.ids, self.model.cluster):
            trialCluster = new.instance(TrialCluster)
            trialCluster.fields = []
            trialCluster.initialPosition = []
            for field in self.model.fields:
                partialSum = self.partialSums["%s.%s" % (theid, field)]
                u = self.engine.producerUpdateScheme.updator(SUM1, SUMX)
                u.initialize({SUM1: partialSum.attrib["SUM1"], SUMX: partialSum.attrib["SUMX"]})
                trialCluster.fields.append(u)
            trialCluster.initialPosition = list(cluster.value)
            trialFromPmml.clusters.append(trialCluster)

        self.trials = [trialFromPmml]

        if len(params) > 0:
            # call-style raise: valid on both Python 2 and Python 3
            raise TypeError("Unrecognized parameters %s" % params)
Example #9
0
    def initialize(self, **params):
        """Initialize a baseline producer.

        Unlike other producers, this creates the update function
        dynamically, depending on the testStatistic.

        Recognized parameters: "updateExisting" (falls back to
        self.defaultParams) and, for the CUSUM test statistic only,
        "alternateField" and "alternateValue" (both default to None).

        Raises:
            NotImplementedError: if "alternateField" or "alternateValue" is
                supplied for a test statistic other than CUSUM.
            TypeError: if any unrecognized parameter remains.
        """

        testDistributions = self.model.child(pmml.TestDistributions)
        self.field = testDistributions.attrib["field"]

        if "updateExisting" in params:
            self.updateExisting = pmml.boolCheck(params["updateExisting"])
            del params["updateExisting"]
        else:
            self.updateExisting = pmml.boolCheck(self.defaultParams["updateExisting"])

        self.first = True

        # reuse the element fetched above instead of looking it up again
        testStatistic = testDistributions.attrib["testStatistic"]
        if testStatistic in ("CUSUM", "zValue", "GLR"):
            self.baseline = testDistributions.child(pmml.Baseline).child()
            self.update = self.updateDistribution

            if testStatistic == "CUSUM":
                # pop() consumes the parameter so the leftover checks at the
                # end only see parameters that were not used
                self.alternateField = params.pop("alternateField", None)
                self.alternateValue = params.pop("alternateValue", None)

                extension = testDistributions.child(pmml.Extension, exception=False)
                if extension is None:
                    extension = pmml.newInstance("Extension")
                    testDistributions.children.append(extension)
                extension.extender("ODG")

                self.cusumInitialization = extension.child(pmml.X_ODG_CUSUMInitialization, exception=False)
                if self.cusumInitialization is None:
                    self.cusumInitialization = pmml.newInstance("X-ODG-CUSUMInitialization", attrib={"value": 0.}, base=pmml.X_ODG_PMML)
                    extension.children.append(self.cusumInitialization)
                elif not self.updateExisting:
                    # not resuming: reset the stored value
                    self.cusumInitialization.attrib["value"] = 0.

            else:
                self.alternateField = None
                self.alternateValue = None
                self.cusumInitialization = None

        elif testStatistic in ("chiSquareDistribution", "scalarProduct"):
            self.weightField = testDistributions.attrib.get("weightField", None)
            self.countTable = testDistributions.child(pmml.Baseline).child(lambda x: isinstance(x, (pmml.CountTable, pmml.NormalizedCountTable)))
            self.update = self.updateHistogram

        elif testStatistic == "chiSquareIndependence":
            self.baseline = testDistributions.child(pmml.Baseline)
            self.fields = None
            self.countTable = None

            self.updators = {}
            self.total_updator = self.engine.producerUpdateScheme.updator(SUMX)
            self.update = self.updateChiSquareIndependence

        # NOTE(review): any other testStatistic leaves self.update unassigned
        # by this method -- confirm that this is handled elsewhere

        # call-style raises: valid on both Python 2 and Python 3
        if "alternateField" in params:
            raise NotImplementedError("The 'alternateField' producerParameter is only used by CUSUM")
        if "alternateValue" in params:
            raise NotImplementedError("The 'alternateValue' producerParameter is only used by CUSUM")

        if len(params) > 0:
            raise TypeError("Unrecognized parameters %s" % params)
Example #10
0
    def initialize(self, **params):
        """Configure the seeded k-means producer from keyword parameters and the PMML model.

        Recognized parameters: "updateExisting", "quickConvergeSteps",
        "seedSource", "numberOfTrials", "numberToConverge". Missing ones are
        taken from self.defaultParams.

        Raises:
            NotImplementedError: if "updateExisting" is passed explicitly and
                evaluates to true, or if seedSource is not one of
                'dataPoints', 'dataCovariance', 'unitRect'.
            RuntimeError: if an explicitly passed "quickConvergeSteps" does
                not evaluate to a tuple of values convertible to int.
            TypeError: if any unrecognized parameter remains.
        """
        if "updateExisting" in params:
            self.updateExisting = pmml.boolCheck(params["updateExisting"])
            del params["updateExisting"]
            if self.updateExisting:
                raise NotImplementedError(
                    "Updating from existing ClusterModels using 'kmeans' not implemented; use mode='replaceExisting'")
        else:
            # NOTE(review): a true *default* is accepted here; only an
            # explicitly passed value is rejected -- confirm this is intended
            self.updateExisting = pmml.boolCheck(
                self.defaultParams["updateExisting"])

        if "quickConvergeSteps" in params:
            try:
                # SECURITY NOTE(review): eval() executes arbitrary code; this is
                # only acceptable if producer parameters come from a trusted
                # configuration -- confirm, or switch to a literal parser.
                self.quickConvergeSteps = eval(params["quickConvergeSteps"])
                if not isinstance(self.quickConvergeSteps, tuple):
                    raise RuntimeError
                # build a real list inside the try so conversion errors are
                # reported below (map() would be lazy on Python 3)
                self.quickConvergeSteps = [
                    int(step) for step in self.quickConvergeSteps
                ]
            except Exception:
                # the original `except err:` named an undefined identifier, so
                # any failure surfaced as a NameError instead of this message
                raise RuntimeError(
                    "quickConvergeSteps must be a tuple of numbers of events")
            del params["quickConvergeSteps"]
        else:
            # the default is used as evaluated, without the tuple/int checks
            self.quickConvergeSteps = eval(
                self.defaultParams["quickConvergeSteps"])

        if "seedSource" in params:
            self.seedSource = params["seedSource"]
            del params["seedSource"]
        else:
            self.seedSource = self.defaultParams["seedSource"]
        # translate the textual choice into the corresponding class constant
        if self.seedSource == "dataPoints":
            self.seedSource = self.RANDOM_DATAPOINTS
        elif self.seedSource == "dataCovariance":
            self.seedSource = self.RANDOM_DATACOVARIANCE
        elif self.seedSource == "unitRect":
            self.seedSource = self.RANDOM_UNITRECT
        else:
            raise NotImplementedError(
                "The seedSource must be one of 'dataPoints', 'dataCovariance', 'unitRect'")

        if "numberOfTrials" in params:
            self.numberOfTrials = int(params["numberOfTrials"])
            del params["numberOfTrials"]
        else:
            self.numberOfTrials = int(self.defaultParams["numberOfTrials"])

        if "numberToConverge" in params:
            self.numberToConverge = int(params["numberToConverge"])
            del params["numberToConverge"]
        else:
            self.numberToConverge = int(self.defaultParams["numberToConverge"])

        self.model = self.segmentRecord.pmmlModel
        self.dataDistribution = self.engine.producerUpdateScheme.updator(
            COVARIANCE(self.model.numberOfFields))
        # True when the model's ComparisonMeasure has kind="distance"
        self.distanceMeasure = (self.model.child(
            pmml.ComparisonMeasure).attrib["kind"] == "distance")

        # get rid of any PartialSums objects, since they would be misleading (this algorithm doesn't use them)
        extension = self.model.child(pmml.Extension, exception=False)
        if extension is not None:
            extension.children = [
                child for child in extension.children
                if not isinstance(child, pmml.X_ODG_PartialSums)
            ]

        # one empty list per model field, plus one under the SYNCNUMBER key
        self.buffer = {self.SYNCNUMBER: []}
        for field in self.model.fields:
            self.buffer[field] = []

        if len(params) > 0:
            raise TypeError("Unrecognized parameters %s" % params)
Example #11
0
    def initialize(self, **params):
        """Initialize a baseline producer.

        Unlike other producers, this creates the update function
        dynamically, depending on the testStatistic.

        Recognized parameters: "resume" (defaults to False) and, for the
        CUSUM test statistic only, "alternateField" and "alternateValue"
        (both default to None).

        Raises:
            NotImplementedError: if "alternateField" or "alternateValue" is
                supplied for a test statistic other than CUSUM.
            TypeError: if any unrecognized parameter remains.
        """

        testDistributions = self.segmentRecord.pmmlModel.child(pmml.TestDistributions)
        self.field = testDistributions.attrib["field"]

        if "resume" in params:
            self.resume = pmml.boolCheck(params["resume"])
            del params["resume"]
        else:
            self.resume = False

        self.first = True

        # reuse the element fetched above instead of looking it up again
        testStatistic = testDistributions.attrib["testStatistic"]
        if testStatistic in ("CUSUM", "zValue", "GLR"):
            self.baseline = testDistributions.child(pmml.Baseline).child()
            self.update = self.updateDistribution

            if testStatistic == "CUSUM":
                # pop() consumes the parameter so the leftover checks at the
                # end only see parameters that were not used
                self.alternateField = params.pop("alternateField", None)
                self.alternateValue = params.pop("alternateValue", None)

            else:
                self.alternateField = None
                self.alternateValue = None

        elif testStatistic in ("chiSquareDistribution", "scalarProduct"):
            self.weightField = testDistributions.attrib.get("weightField", None)
            self.countTable = testDistributions.child(pmml.Baseline).child(
                lambda x: isinstance(x, (pmml.CountTable, pmml.NormalizedCountTable))
            )
            self.update = self.updateHistogram

        elif testStatistic == "chiSquareIndependence":
            self.baseline = testDistributions.child(pmml.Baseline)
            self.fields = None
            self.countTable = None

            self.updators = {}
            self.total_updator = self.engine.producerUpdateScheme.updator(SUMX)
            self.update = self.updateChiSquareIndependence

        # NOTE(review): any other testStatistic leaves self.update unassigned
        # by this method -- confirm that this is handled elsewhere

        # call-style raises: valid on both Python 2 and Python 3
        if "alternateField" in params:
            raise NotImplementedError("The 'alternateField' producerParameter is only used by CUSUM")
        if "alternateValue" in params:
            raise NotImplementedError("The 'alternateValue' producerParameter is only used by CUSUM")

        if len(params) > 0:
            raise TypeError("Unrecognized parameters %s" % params)
Example #12
0
    def initialize(self, **params):
        """Set up a baseline producer from its parameters and PMML model.

        Unlike other producers, the method bound to self.update is chosen
        here at run time, according to the model's testStatistic.

        "updateExisting" falls back to self.defaultParams; "alternateField"
        and "alternateValue" are accepted only for the CUSUM statistic and
        default to None. NotImplementedError is raised if either is supplied
        for another statistic, TypeError for any other leftover parameter.
        """

        testDistributions = self.model.child(pmml.TestDistributions)
        self.field = testDistributions.attrib["field"]

        if "updateExisting" in params:
            updateFlag = params.pop("updateExisting")
        else:
            updateFlag = self.defaultParams["updateExisting"]
        self.updateExisting = pmml.boolCheck(updateFlag)

        self.first = True

        testStatistic = self.model.child(pmml.TestDistributions).attrib["testStatistic"]

        if testStatistic in ("CUSUM", "zValue", "GLR"):
            self.baseline = testDistributions.child(pmml.Baseline).child()
            self.update = self.updateDistribution

            if testStatistic != "CUSUM":
                self.alternateField = None
                self.alternateValue = None
                self.cusumInitialization = None

            else:
                # consume the CUSUM-only parameters (None when not given)
                self.alternateField = params.pop("alternateField", None)
                self.alternateValue = params.pop("alternateValue", None)

                # make sure an Extension element exists under TestDistributions
                extension = testDistributions.child(pmml.Extension, exception=False)
                if extension is None:
                    extension = pmml.newInstance("Extension")
                    testDistributions.children.append(extension)

                cusumInit = extension.child(pmml.X_ODG_CUSUMInitialization, exception=False)
                if cusumInit is None:
                    cusumInit = pmml.newInstance("X-ODG-CUSUMInitialization", attrib={"value": 0.}, base=pmml.X_ODG_PMML)
                    extension.children.append(cusumInit)
                elif not self.updateExisting:
                    # starting over: reset the stored value
                    cusumInit.attrib["value"] = 0.
                self.cusumInitialization = cusumInit

        elif testStatistic in ("chiSquareDistribution", "scalarProduct"):
            self.weightField = testDistributions.attrib.get("weightField", None)
            isCountTable = lambda node: isinstance(node, (pmml.CountTable, pmml.NormalizedCountTable))
            self.countTable = testDistributions.child(pmml.Baseline).child(isCountTable)
            self.update = self.updateHistogram

        elif testStatistic == "chiSquareIndependence":
            self.baseline = testDistributions.child(pmml.Baseline)
            self.fields = None
            self.countTable = None

            self.updators = {}
            self.total_updator = self.engine.producerUpdateScheme.updator(SUMX)
            self.update = self.updateChiSquareIndependence

        # anything CUSUM-specific still present was passed for another statistic
        if "alternateField" in params:
            raise NotImplementedError("The 'alternateField' producerParameter is only used by CUSUM")
        if "alternateValue" in params:
            raise NotImplementedError("The 'alternateValue' producerParameter is only used by CUSUM")

        if params:
            raise TypeError("Unrecognized parameters %s" % params)
Example #13
0
    def initialize(self, **params):
        """Configure the tree / rule-set producer from keyword parameters and the PMML model.

        Recognized parameters: "updateExisting", "treeMaxDepth" and
        "classification". Missing ones are taken from self.defaultParams; an
        empty "classification" string means "not specified".

        For every 'active' MiningField an empty column is created in
        self.data (a typed array for boolean/integer/float/double data, a
        plain list otherwise); a list column is also created for the
        classification field.

        Raises:
            NotImplementedError: if updateExisting evaluates to true.
            RuntimeError: if the requested classification is not among the
                'predicted' fields of the MiningSchema.
            TypeError: if any unrecognized parameter remains.
        """
        if "updateExisting" in params:
            self.updateExisting = pmml.boolCheck(params["updateExisting"])
            del params["updateExisting"]
        else:
            self.updateExisting = pmml.boolCheck(
                self.defaultParams["updateExisting"])

        if self.updateExisting:
            # call-style raise: valid on both Python 2 and Python 3
            raise NotImplementedError(
                "Updating from existing TreeModels/RuleSetModels not implemented; use mode='replaceExisting'")

        if "treeMaxDepth" in params:
            self.treeMaxDepth = int(params["treeMaxDepth"])
            del params["treeMaxDepth"]
        else:
            self.treeMaxDepth = int(self.defaultParams["treeMaxDepth"])

        if "classification" in params:
            self.classification = params["classification"]
            del params["classification"]
        else:
            self.classification = self.defaultParams["classification"]
        if self.classification == "":
            self.classification = None

        self.model = self.segmentRecord.pmmlModel

        if isinstance(self.model, pmml.TreeModel):
            self.nodeIndex = self.model.index(pmml.Node)
            self.model.attrib["splitCharacteristic"] = "binarySplit"

        elif isinstance(self.model, pmml.RuleSetModel):
            self.ruleSet = self.model.child(pmml.RuleSet)
            self.nodeIndex = self.ruleSet.index(
                lambda x: isinstance(x, (pmml.SimpleRule, pmml.CompoundRule)),
                exception=False)
            if self.nodeIndex is None:
                # no rule yet: reserve a slot at the end of the RuleSet
                self.nodeIndex = len(self.ruleSet.children)
                self.ruleSet.children.append(None)

        self.features = []
        self.categorical = {}
        self.predicted = []
        self.data = {}
        for miningField in self.model.child(pmml.MiningSchema).matches(
                pmml.MiningField):
            name = miningField.attrib["name"]
            # usageType defaults to 'active' when the attribute is absent
            usageType = miningField.attrib.get("usageType", "active")
            if usageType == "active":
                dataType = self.model.dataContext.dataType[name]
                optype = self.model.dataContext.optype[name]

                self.features.append(name)
                self.categorical[name] = (optype == "categorical")

                # typed arrays for primitive data types, a list for the rest
                if dataType == "boolean":
                    self.data[name] = array.array("b")
                elif dataType == "integer":
                    self.data[name] = array.array("l")
                elif dataType in ("float", "double"):
                    self.data[name] = array.array("d")
                else:
                    self.data[name] = []

            if usageType == "predicted":
                self.predicted.append(name)

        if len(self.predicted) == 0:
            self.classification = INVALID

        else:
            if self.classification is None:
                # by default, take the first 'predicted' feature
                self.classification = self.predicted[0]
            else:
                if self.classification not in self.predicted:
                    raise RuntimeError(
                        "Classification feature not found among the 'predicted' features in the decision tree's MiningSchema%s"
                        % self.model.child(pmml.MiningSchema).fileAndLine())

        self.data[self.classification] = []

        if len(params) > 0:
            raise TypeError("Unrecognized parameters %s" % params)
Example #14
0
    def initialize(self, **params):
        """An event-based tree-producing algorithm.

        Although it does not iterate over the data as the standard
        CART algorithm does, it converges to an approximate tree by
        keeping alternate hypotheses in mind and collecting data for
        all active hypotheses.

        Recognized keys in ``params`` (each falls back to
        ``self.defaultParams`` when absent and is removed once consumed):
        ``updateExisting``, ``featureMaturityThreshold``,
        ``splitMaturityThreshold``, ``trialsToKeep``, ``worldsToSplit``,
        ``treeDepth`` and ``classification`` (an empty string is treated
        as "not specified").

        Raises:
            NotImplementedError: if ``updateExisting`` is true.
            RuntimeError: if the requested classification field is not one
                of the model's "predicted" fields.
            TypeError: if unrecognized parameters remain.
        """

        if "updateExisting" in params:
            self.updateExisting = pmml.boolCheck(params.pop("updateExisting"))
        else:
            self.updateExisting = pmml.boolCheck(
                self.defaultParams["updateExisting"])

        if self.updateExisting:
            raise NotImplementedError(
                "Updating from existing TreeModels/RuleSetModels not implemented; use mode='replaceExisting'"
            )

        # integer-valued parameters, each stored as an attribute of the
        # same name
        for key in ("featureMaturityThreshold", "splitMaturityThreshold",
                    "trialsToKeep", "worldsToSplit", "treeDepth"):
            if key in params:
                value = params.pop(key)
            else:
                value = self.defaultParams[key]
            setattr(self, key, int(value))

        if "classification" in params:
            self.classification = params.pop("classification")
        else:
            self.classification = self.defaultParams["classification"]
        if self.classification == "":
            self.classification = None

        self.model = self.segmentRecord.pmmlModel

        if isinstance(self.model, pmml.TreeModel):
            self.modelType = self.TREEMODEL
            self.nodeIndex = self.model.index(pmml.Node)

        elif isinstance(self.model, pmml.RuleSetModel):
            self.ruleSet = self.model.child(pmml.RuleSet)
            self.modelType = self.RULESETMODEL
            self.nodeIndex = self.ruleSet.index(
                lambda x: isinstance(x, (pmml.SimpleRule, pmml.CompoundRule)),
                exception=False)
            if self.nodeIndex is None:
                # no rule yet: reserve a placeholder slot at the end
                self.nodeIndex = len(self.ruleSet.children)
                self.ruleSet.children.append(None)

        self.features = []
        self.predicted = []
        miningSchema = self.model.child(pmml.MiningSchema)
        for miningField in miningSchema.matches(pmml.MiningField):
            name = miningField.attrib["name"]
            usageType = miningField.attrib.get("usageType", "active")
            if usageType == "active":
                dataType = self.model.dataContext.dataType[name]
                optype = self.model.dataContext.optype[name]
                if optype == "ordinal" and dataType == "string":
                    # ordinal strings carry their cast in place of the optype
                    optype = self.model.dataContext.cast[name]

                feature = Feature(name, optype, dataType,
                                  self.engine.producerUpdateScheme)
                feature.maturityThreshold = self.featureMaturityThreshold
                self.features.append(feature)

            if usageType == "predicted":
                self.predicted.append(name)

        if not self.predicted:
            self.classification = INVALID
        elif self.classification is None:
            # by default, take the first 'predicted' feature
            self.classification = self.predicted[0]
        elif self.classification not in self.predicted:
            raise RuntimeError(
                "Classification feature not found among the 'predicted' features in the decision tree's MiningSchema%s"
                % self.model.child(pmml.MiningSchema).fileAndLine())

        self.topWorld = World(0, None)
        self.counts = {}

        if params:
            raise TypeError("Unrecognized parameters %s" % params)
Example #15
0
    def initialize(self, **params):
        """An event-based tree-producing algorithm.

        Although it does not iterate over the data as the standard
        CART algorithm does, it converges to an approximate tree by
        keeping alternate hypotheses in mind and collecting data for
        all active hypotheses.

        Every producerParameter is optional; a missing one is read from
        ``self.defaultParams``.  Consumed keys are removed from ``params``
        so that anything left over can be reported as unrecognized.

        Raises:
            NotImplementedError: if ``updateExisting`` is true.
            RuntimeError: if ``classification`` names a field that is not
                a "predicted" field of the MiningSchema.
            TypeError: if ``params`` contains unrecognized keys.
        """

        if "updateExisting" in params:
            requested = params["updateExisting"]
            del params["updateExisting"]
        else:
            requested = self.defaultParams["updateExisting"]
        self.updateExisting = pmml.boolCheck(requested)

        if self.updateExisting:
            raise NotImplementedError("Updating from existing TreeModels/RuleSetModels not implemented; use mode='replaceExisting'")

        # all of these are converted with int() and stored under their own name
        integerParams = ["featureMaturityThreshold", "splitMaturityThreshold", "trialsToKeep", "worldsToSplit", "treeDepth"]
        for paramName in integerParams:
            if paramName in params:
                setattr(self, paramName, int(params[paramName]))
                del params[paramName]
            else:
                setattr(self, paramName, int(self.defaultParams[paramName]))

        if "classification" in params:
            self.classification = params["classification"]
            del params["classification"]
        else:
            self.classification = self.defaultParams["classification"]
        if self.classification == "":
            self.classification = None

        self.model = self.segmentRecord.pmmlModel

        if isinstance(self.model, pmml.TreeModel):
            self.modelType = self.TREEMODEL
            self.nodeIndex = self.model.index(pmml.Node)

        elif isinstance(self.model, pmml.RuleSetModel):
            self.ruleSet = self.model.child(pmml.RuleSet)
            self.modelType = self.RULESETMODEL
            self.nodeIndex = self.ruleSet.index(lambda x: isinstance(x, (pmml.SimpleRule, pmml.CompoundRule)), exception=False)
            if self.nodeIndex is None:
                # the RuleSet has no rule yet; append a placeholder for it
                self.nodeIndex = len(self.ruleSet.children)
                self.ruleSet.children.append(None)

        self.features = []
        self.predicted = []
        dataContext = self.model.dataContext
        for miningField in self.model.child(pmml.MiningSchema).matches(pmml.MiningField):
            name = miningField.attrib["name"]
            usageType = miningField.attrib.get("usageType", "active")

            if usageType == "predicted":
                self.predicted.append(name)

            elif usageType == "active":
                dataType = dataContext.dataType[name]
                optype = dataContext.optype[name]
                if optype == "ordinal" and dataType == "string":
                    optype = dataContext.cast[name]

                feature = Feature(name, optype, dataType, self.engine.producerUpdateScheme)
                feature.maturityThreshold = self.featureMaturityThreshold
                self.features.append(feature)

        if len(self.predicted) == 0:
            self.classification = INVALID

        elif self.classification is None:
            # by default, take the first 'predicted' feature
            self.classification = self.predicted[0]

        elif self.classification not in self.predicted:
            raise RuntimeError("Classification feature not found among the 'predicted' features in the decision tree's MiningSchema%s" % self.model.child(pmml.MiningSchema).fileAndLine())

        self.topWorld = World(0, None)
        self.counts = {}

        if len(params) > 0:
            raise TypeError("Unrecognized parameters %s" % params)
Example #16
0
    def initialize(self, **params):
        """Configure the 'kmeans' producer from its parameters and the model.

        Recognized keys in ``params`` (each falls back to
        ``self.defaultParams`` when absent and is removed once consumed):
        ``updateExisting``, ``quickConvergeSteps``, ``numberOfClusters``,
        ``seedSource``, ``numberOfTrials``, ``numberToConverge``,
        ``maxIterations`` and ``closeEnough``.  ``numberOfClusters`` and
        ``maxIterations`` accept a positive integer or the string "unset"
        (stored as ``None``).

        Raises:
            NotImplementedError: if ``updateExisting`` is passed as true, or
                if ``seedSource`` is not a supported value.
            RuntimeError: if ``quickConvergeSteps``, ``numberOfClusters`` or
                ``maxIterations`` is malformed, or if ``numberToConverge``
                exceeds ``numberOfTrials``.
            TypeError: if unrecognized parameters remain.
        """
        if "updateExisting" in params:
            self.updateExisting = pmml.boolCheck(params["updateExisting"])
            del params["updateExisting"]
            if self.updateExisting:
                raise NotImplementedError(
                    "Updating from existing ClusterModels using 'kmeans' not implemented; use mode='replaceExisting'"
                )
        else:
            self.updateExisting = pmml.boolCheck(
                self.defaultParams["updateExisting"])

        if "quickConvergeSteps" in params:
            try:
                # NOTE(review): eval() executes arbitrary code from the
                # parameter string; only safe if the configuration is trusted.
                steps = eval(params["quickConvergeSteps"])
                if not isinstance(steps, tuple):
                    raise RuntimeError
                # build the list eagerly so that a bad entry fails inside
                # this try block (map() is lazy on Python 3)
                self.quickConvergeSteps = [int(step) for step in steps]
            except Exception:
                # was "except err:", which raised NameError instead of
                # reporting the malformed parameter
                raise RuntimeError(
                    "quickConvergeSteps must be a tuple of numbers of events")
            del params["quickConvergeSteps"]
        else:
            self.quickConvergeSteps = eval(
                self.defaultParams["quickConvergeSteps"])

        if "numberOfClusters" in params:
            self.numberOfClusters = params["numberOfClusters"]
            del params["numberOfClusters"]
        else:
            self.numberOfClusters = self.defaultParams["numberOfClusters"]
        try:
            self.numberOfClusters = int(self.numberOfClusters)
            if self.numberOfClusters <= 0:
                raise ValueError
        except ValueError:
            if self.numberOfClusters == "unset":
                self.numberOfClusters = None
            else:
                raise RuntimeError(
                    "numberOfClusters must be a positive integer or \"unset\", not \"%s\""
                    % self.numberOfClusters)

        if "seedSource" in params:
            self.seedSource = params["seedSource"]
            del params["seedSource"]
        else:
            self.seedSource = self.defaultParams["seedSource"]
        if self.seedSource == "dataPoints":
            self.seedSource = self.RANDOM_DATAPOINTS
        elif self.seedSource == "dataWeighted":
            self.seedSource = self.RANDOM_DATAWEIGHTED
        elif self.seedSource == "dataCovariance":
            self.seedSource = self.RANDOM_DATACOVARIANCE
        elif self.seedSource == "unitRect":
            self.seedSource = self.RANDOM_UNITRECT
        else:
            raise NotImplementedError(
                "The seedSource must be one of 'dataPoints', 'dataWeighted', 'dataCovariance', 'unitRect'"
            )

        if "numberOfTrials" in params:
            self.numberOfTrials = int(params["numberOfTrials"])
            del params["numberOfTrials"]
        else:
            self.numberOfTrials = int(self.defaultParams["numberOfTrials"])

        if "numberToConverge" in params:
            self.numberToConverge = int(params["numberToConverge"])
            del params["numberToConverge"]
        else:
            self.numberToConverge = int(self.defaultParams["numberToConverge"])

        if self.numberToConverge > self.numberOfTrials:
            raise RuntimeError(
                "numberToConverge (%d) must not be greater than numberOfTrials (%d)"
                % (self.numberToConverge, self.numberOfTrials))

        if "maxIterations" in params:
            self.maxIterations = params["maxIterations"]
            del params["maxIterations"]
        else:
            self.maxIterations = self.defaultParams["maxIterations"]
        try:
            self.maxIterations = int(self.maxIterations)
            if self.maxIterations <= 0:
                raise ValueError
        except ValueError:
            if self.maxIterations == "unset":
                self.maxIterations = None
            else:
                raise RuntimeError(
                    "maxIterations must be a positive integer or \"unset\", not \"%s\""
                    % self.maxIterations)

        if "closeEnough" in params:
            self.closeEnough = float(params["closeEnough"])
            del params["closeEnough"]
        else:
            self.closeEnough = float(self.defaultParams["closeEnough"])

        self.model = self.segmentRecord.pmmlModel
        self.dataDistribution = self.engine.producerUpdateScheme.updator(
            COVARIANCE(self.model.numberOfFields))
        self.distanceMeasure = (self.model.child(
            pmml.ComparisonMeasure).attrib["kind"] == "distance")

        # weighted seeding needs a weight field; without one, fall back to
        # unweighted data points
        if self.seedSource == self.RANDOM_DATAWEIGHTED and self.model.weightField is None:
            self.seedSource = self.RANDOM_DATAPOINTS

        # get rid of any PartialSums objects, since they would be misleading (this algorithm doesn't use them)
        extension = self.model.child(pmml.Extension, exception=False)
        if extension is not None:
            extension.children = [
                child for child in extension.children
                if not isinstance(child, pmml.X_ODG_PartialSums)
            ]

        self.buffer = {self.SYNCNUMBER: []}
        for field in self.model.fields:
            self.buffer[field] = []

        if self.model.weightField is not None:
            self.buffer[self.model.weightField] = []

        if len(params) > 0:
            raise TypeError("Unrecognized parameters %s" % params)
Example #17
0
    def initialize(self, **params):
        """Read the producerParameters and prepare per-field data buffers.

        ``updateExisting``, ``treeMaxDepth`` and ``classification`` are taken
        from ``params`` when present (and removed from it), otherwise from
        ``self.defaultParams``.  An empty ``classification`` string means
        that no classification field was requested.

        After this call ``self.data`` holds one empty buffer per "active"
        MiningField plus one for the classification field, and
        ``self.categorical`` records which active fields have optype
        "categorical".

        Raises:
            NotImplementedError: if ``updateExisting`` is true.
            RuntimeError: if ``classification`` names a field that is not
                a "predicted" field of the MiningSchema.
            TypeError: if ``params`` contains unrecognized keys.
        """
        if "updateExisting" in params:
            requested = params["updateExisting"]
            del params["updateExisting"]
        else:
            requested = self.defaultParams["updateExisting"]
        self.updateExisting = pmml.boolCheck(requested)

        if self.updateExisting:
            raise NotImplementedError("Updating from existing TreeModels/RuleSetModels not implemented; use mode='replaceExisting'")

        if "treeMaxDepth" in params:
            self.treeMaxDepth = int(params["treeMaxDepth"])
            del params["treeMaxDepth"]
        else:
            self.treeMaxDepth = int(self.defaultParams["treeMaxDepth"])

        if "classification" in params:
            self.classification = params["classification"]
            del params["classification"]
        else:
            self.classification = self.defaultParams["classification"]
        if self.classification == "":
            self.classification = None

        self.model = self.segmentRecord.pmmlModel

        if isinstance(self.model, pmml.TreeModel):
            self.nodeIndex = self.model.index(pmml.Node)
            self.model.attrib["splitCharacteristic"] = "binarySplit"

        elif isinstance(self.model, pmml.RuleSetModel):
            self.ruleSet = self.model.child(pmml.RuleSet)
            self.nodeIndex = self.ruleSet.index(lambda x: isinstance(x, (pmml.SimpleRule, pmml.CompoundRule)), exception=False)
            if self.nodeIndex is None:
                # the RuleSet has no rule yet; append a placeholder for it
                self.nodeIndex = len(self.ruleSet.children)
                self.ruleSet.children.append(None)

        self.features = []
        self.categorical = {}
        self.predicted = []
        self.data = {}
        dataContext = self.model.dataContext
        for miningField in self.model.child(pmml.MiningSchema).matches(pmml.MiningField):
            name = miningField.attrib["name"]
            usageType = miningField.attrib.get("usageType", "active")

            if usageType == "predicted":
                self.predicted.append(name)

            elif usageType == "active":
                dataType = dataContext.dataType[name]
                optype = dataContext.optype[name]

                self.features.append(name)
                self.categorical[name] = (optype == "categorical")

                # primitive types go into compact typed arrays, everything
                # else into a plain list
                if dataType == "boolean":
                    self.data[name] = array.array("b")
                elif dataType == "integer":
                    self.data[name] = array.array("l")
                elif dataType in ("float", "double"):
                    self.data[name] = array.array("d")
                else:
                    self.data[name] = []

        if len(self.predicted) == 0:
            self.classification = INVALID

        elif self.classification is None:
            # by default, take the first 'predicted' feature
            self.classification = self.predicted[0]

        elif self.classification not in self.predicted:
            raise RuntimeError("Classification feature not found among the 'predicted' features in the decision tree's MiningSchema%s" % self.model.child(pmml.MiningSchema).fileAndLine())

        self.data[self.classification] = []

        if len(params) > 0:
            raise TypeError("Unrecognized parameters %s" % params)
Example #18
0
    def initialize(self, **params):
        """Initialize a clustering model producer.

        Recognized keys in ``params`` (removed once consumed), with the
        value used when a key is absent: ``resume`` (False),
        ``numberOfTrials`` (10), ``numberToKeep`` (3), ``maturityThreshold``
        (100), ``initialStability`` (100) and ``overrideSignificance`` (5.0;
        an explicit 0 is stored as ``None``).

        The model's Extension element is created if missing and populated
        with X_ODG_PartialSums elements; existing ones are reused only when
        ``resume`` is true.  The first entry of ``self.trials`` is built
        from those partial sums.

        Raises:
            TypeError: if unrecognized parameters remain.
        """

        if "resume" in params:
            self.resume = pmml.boolCheck(params.pop("resume"))
        else:
            self.resume = False

        if "numberOfTrials" in params:
            self.numberOfTrials = int(params.pop("numberOfTrials"))
        else:
            self.numberOfTrials = 10

        if "numberToKeep" in params:
            self.numberToKeep = int(params.pop("numberToKeep"))
        else:
            self.numberToKeep = 3

        if "maturityThreshold" in params:
            self.maturityThreshold = int(params.pop("maturityThreshold"))
        else:
            self.maturityThreshold = 100

        if "initialStability" in params:
            self.initialStability = int(params.pop("initialStability"))
        else:
            self.initialStability = 100

        if "overrideSignificance" in params:
            self.overrideSignificance = float(params.pop("overrideSignificance"))
            if self.overrideSignificance == 0.:
                self.overrideSignificance = None
        else:
            self.overrideSignificance = 5.

        self.model = self.segmentRecord.pmmlModel
        self.dataDistribution = self.engine.producerUpdateScheme.updator(COVARIANCE(self.model.numberOfFields))

        self.distanceMeasure = (self.model.child(pmml.ComparisonMeasure).attrib["kind"] == "distance")

        def partialSumsNamed(name):
            # predicate matching the X_ODG_PartialSums element with this
            # name; binding ``name`` here avoids closing over a loop variable
            return lambda x: isinstance(x, pmml.X_ODG_PartialSums) and x.attrib.get("name", None) == name

        # put PartialSums in the model if they're not already there; pick up old values if you're resuming
        extension = self.model.child(pmml.Extension, exception=False)
        if extension is None:
            extension = pmml.Extension()
            self.model.children.append(extension)

        isSumOfDistances = partialSumsNamed("SumOfDistances")
        if self.resume:
            self.sumOfDistances = extension.child(isSumOfDistances, exception=False)
        else:
            # not resuming: discard any stale element and start afresh
            index = extension.index(isSumOfDistances, exception=False)
            if index is not None:
                del extension[index]
            self.sumOfDistances = None

        if self.sumOfDistances is None:
            self.sumOfDistances = pmml.X_ODG_PartialSums(name="SumOfDistances", COUNT=0, SUM1=0., SUMX=0., SUMXX=0.)
            extension.children.append(self.sumOfDistances)

        self.partialSums = {}
        for theid, cluster in zip(self.model.ids, self.model.cluster):
            for i, field in enumerate(self.model.fields):
                fullname = "%s.%s" % (theid, field)
                isThisSum = partialSumsNamed(fullname)

                if self.resume:
                    partialSum = extension.child(isThisSum, exception=False)
                else:
                    index = extension.index(isThisSum, exception=False)
                    if index is not None:
                        del extension[index]
                    partialSum = None

                if partialSum is None:
                    # seed with a single unit-weight point at the cluster's
                    # current coordinate
                    partialSum = pmml.X_ODG_PartialSums(name=fullname, SUM1=1., SUMX=cluster.value[i])
                    extension.children.append(partialSum)

                self.partialSums[fullname] = partialSum

        # create the first trial using the values constructed above (they come from the PMML file if resume is True)
        # NOTE(review): ``new`` is the Python 2-only standard-library module;
        # new.instance() builds the object without running __init__.  It must
        # be replaced before this can run on Python 3 -- confirm how
        # TrialClusterSet/TrialCluster are declared first.
        trialFromPmml = new.instance(TrialClusterSet)
        trialFromPmml.updator = self.engine.producerUpdateScheme.updator(COUNT, SUM1, SUMX, SUMXX)
        trialFromPmml.updator.initialize({COUNT: self.sumOfDistances.attrib["COUNT"], SUM1: self.sumOfDistances.attrib["SUM1"], SUMX: self.sumOfDistances.attrib["SUMX"], SUMXX: self.sumOfDistances.attrib["SUMXX"]})

        trialFromPmml.clusters = []
        for theid, cluster in zip(self.model.ids, self.model.cluster):
            trialCluster = new.instance(TrialCluster)
            trialCluster.fields = []
            for field in self.model.fields:
                partialSum = self.partialSums["%s.%s" % (theid, field)]
                u = self.engine.producerUpdateScheme.updator(SUM1, SUMX)
                u.initialize({SUM1: partialSum.attrib["SUM1"], SUMX: partialSum.attrib["SUMX"]})
                trialCluster.fields.append(u)
            trialCluster.initialPosition = list(cluster.value)
            trialFromPmml.clusters.append(trialCluster)

        self.trials = [trialFromPmml]

        if params:
            raise TypeError("Unrecognized parameters %s" % params)
Example #19
0
    def initialize(self, **params):
        """Initialize a baseline producer.

        Unlike other producers, this creates the update function
        dynamically, depending on the testStatistic.

        ``updateExisting`` is taken from ``params`` or, when absent, from
        ``self.defaultParams``.  ``alternateField`` and ``alternateValue``
        are accepted only when the testStatistic is "CUSUM".  Consumed keys
        are removed from ``params``.

        Raises:
            NotImplementedError: if ``alternateField`` or ``alternateValue``
                is given for a testStatistic other than "CUSUM".
            TypeError: if unrecognized parameters remain.
        """

        testDistributions = self.segmentRecord.pmmlModel.child(
            pmml.TestDistributions)
        self.field = testDistributions.attrib["field"]

        if "updateExisting" in params:
            self.updateExisting = pmml.boolCheck(params.pop("updateExisting"))
        else:
            self.updateExisting = pmml.boolCheck(
                self.defaultParams["updateExisting"])

        self.first = True

        # reuse the element looked up above instead of searching for it again
        testStatistic = testDistributions.attrib["testStatistic"]
        if testStatistic in ("CUSUM", "zValue", "GLR"):
            self.baseline = testDistributions.child(pmml.Baseline).child()
            self.update = self.updateDistribution

            # only CUSUM consumes the alternate* parameters; for the other
            # statistics they stay in params and are rejected below
            self.alternateField = None
            self.alternateValue = None
            if testStatistic == "CUSUM":
                if "alternateField" in params:
                    self.alternateField = params.pop("alternateField")
                if "alternateValue" in params:
                    self.alternateValue = params.pop("alternateValue")

        elif testStatistic in ("chiSquareDistribution", "scalarProduct"):
            self.weightField = testDistributions.attrib.get(
                "weightField", None)
            self.countTable = testDistributions.child(
                pmml.Baseline).child(lambda x: isinstance(
                    x, (pmml.CountTable, pmml.NormalizedCountTable)))
            self.update = self.updateHistogram

        elif testStatistic == "chiSquareIndependence":
            self.baseline = testDistributions.child(pmml.Baseline)
            self.fields = None
            self.countTable = None

            self.updators = {}
            self.total_updator = self.engine.producerUpdateScheme.updator(SUMX)
            self.update = self.updateChiSquareIndependence

        if "alternateField" in params:
            raise NotImplementedError(
                "The 'alternateField' producerParameter is only used by CUSUM")
        if "alternateValue" in params:
            raise NotImplementedError(
                "The 'alternateValue' producerParameter is only used by CUSUM")

        if params:
            raise TypeError("Unrecognized parameters %s" % params)
Example #20
0
    def initialize(self, **params):
        """Read the 'kmeans' producerParameters and set up the event buffers.

        Every parameter is optional; a missing one is read from
        ``self.defaultParams``.  Consumed keys are removed from ``params``
        so that anything left over can be reported as unrecognized.
        ``numberOfClusters`` and ``maxIterations`` may be a positive integer
        or the string "unset", which is stored as ``None``.

        Raises:
            NotImplementedError: if ``updateExisting`` is passed as true, or
                if ``seedSource`` is not a supported value.
            RuntimeError: if ``quickConvergeSteps``, ``numberOfClusters`` or
                ``maxIterations`` is malformed, or if ``numberToConverge``
                exceeds ``numberOfTrials``.
            TypeError: if ``params`` contains unrecognized keys.
        """
        if "updateExisting" in params:
            self.updateExisting = pmml.boolCheck(params["updateExisting"])
            del params["updateExisting"]
            if self.updateExisting:
                raise NotImplementedError("Updating from existing ClusterModels using 'kmeans' not implemented; use mode='replaceExisting'")
        else:
            self.updateExisting = pmml.boolCheck(self.defaultParams["updateExisting"])

        if "quickConvergeSteps" in params:
            # NOTE(review): eval() runs arbitrary code from the parameter
            # string; acceptable only for trusted configuration.
            try:
                evaluated = eval(params["quickConvergeSteps"])
                if not isinstance(evaluated, tuple):
                    raise RuntimeError
                # converted eagerly: a lazy map() would defer int() errors
                # until after this try block on Python 3
                self.quickConvergeSteps = [int(numberOfEvents) for numberOfEvents in evaluated]
            except Exception:
                # the original "except err:" referenced an undefined name and
                # so raised NameError instead of this message
                raise RuntimeError("quickConvergeSteps must be a tuple of numbers of events")
            del params["quickConvergeSteps"]
        else:
            self.quickConvergeSteps = eval(self.defaultParams["quickConvergeSteps"])

        if "numberOfClusters" in params:
            self.numberOfClusters = params["numberOfClusters"]
            del params["numberOfClusters"]
        else:
            self.numberOfClusters = self.defaultParams["numberOfClusters"]
        try:
            self.numberOfClusters = int(self.numberOfClusters)
            if self.numberOfClusters <= 0:
                raise ValueError
        except ValueError:
            if self.numberOfClusters == "unset":
                self.numberOfClusters = None
            else:
                raise RuntimeError("numberOfClusters must be a positive integer or \"unset\", not \"%s\"" % self.numberOfClusters)

        if "seedSource" in params:
            self.seedSource = params["seedSource"]
            del params["seedSource"]
        else:
            self.seedSource = self.defaultParams["seedSource"]
        if self.seedSource == "dataPoints":
            self.seedSource = self.RANDOM_DATAPOINTS
        elif self.seedSource == "dataWeighted":
            self.seedSource = self.RANDOM_DATAWEIGHTED
        elif self.seedSource == "dataCovariance":
            self.seedSource = self.RANDOM_DATACOVARIANCE
        elif self.seedSource == "unitRect":
            self.seedSource = self.RANDOM_UNITRECT
        else:
            raise NotImplementedError("The seedSource must be one of 'dataPoints', 'dataWeighted', 'dataCovariance', 'unitRect'")

        if "numberOfTrials" in params:
            self.numberOfTrials = int(params["numberOfTrials"])
            del params["numberOfTrials"]
        else:
            self.numberOfTrials = int(self.defaultParams["numberOfTrials"])

        if "numberToConverge" in params:
            self.numberToConverge = int(params["numberToConverge"])
            del params["numberToConverge"]
        else:
            self.numberToConverge = int(self.defaultParams["numberToConverge"])

        if self.numberToConverge > self.numberOfTrials:
            raise RuntimeError("numberToConverge (%d) must not be greater than numberOfTrials (%d)" % (self.numberToConverge, self.numberOfTrials))

        if "maxIterations" in params:
            self.maxIterations = params["maxIterations"]
            del params["maxIterations"]
        else:
            self.maxIterations = self.defaultParams["maxIterations"]
        try:
            self.maxIterations = int(self.maxIterations)
            if self.maxIterations <= 0:
                raise ValueError
        except ValueError:
            if self.maxIterations == "unset":
                self.maxIterations = None
            else:
                raise RuntimeError("maxIterations must be a positive integer or \"unset\", not \"%s\"" % self.maxIterations)

        if "closeEnough" in params:
            self.closeEnough = float(params["closeEnough"])
            del params["closeEnough"]
        else:
            self.closeEnough = float(self.defaultParams["closeEnough"])

        self.model = self.segmentRecord.pmmlModel
        self.dataDistribution = self.engine.producerUpdateScheme.updator(COVARIANCE(self.model.numberOfFields))
        self.distanceMeasure = (self.model.child(pmml.ComparisonMeasure).attrib["kind"] == "distance")

        # a weighted seed source is downgraded to plain data points when the
        # model declares no weight field
        if self.seedSource == self.RANDOM_DATAWEIGHTED and self.model.weightField is None:
            self.seedSource = self.RANDOM_DATAPOINTS

        # get rid of any PartialSums objects, since they would be misleading (this algorithm doesn't use them)
        extension = self.model.child(pmml.Extension, exception=False)
        if extension is not None:
            extension.children = [child for child in extension.children if not isinstance(child, pmml.X_ODG_PartialSums)]

        self.buffer = {self.SYNCNUMBER: []}
        for field in self.model.fields:
            self.buffer[field] = []

        if self.model.weightField is not None:
            self.buffer[self.model.weightField] = []

        if len(params) > 0:
            raise TypeError("Unrecognized parameters %s" % params)