Beispiel #1
0
    def setProvenance(self, model, algoName, algorithm, userParameters):
        """Record the algorithm name and its effective parameters on ``model``.

        ``model.attrib["algorithmName"]`` is set to ``algoName``.  The
        effective parameters are ``algorithm.defaultParams`` overridden by
        ``userParameters``.  Any ``X_ODG_AlgorithmParameter`` children already
        present in the model's ``Extension`` are dropped, and one
        ``X-ODG-AlgorithmParameter`` element per effective parameter is
        appended in sorted key order.

        Returns None.
        """
        model.attrib["algorithmName"] = algoName

        # User-supplied values take precedence over the algorithm's defaults.
        parameters = dict(algorithm.defaultParams)
        parameters.update(userParameters)

        extension = model.child(pmml.Extension, exception=False)
        if extension is None:
            # NOTE(review): this freshly created Extension is not added to
            # model.children anywhere in this method, so the parameters
            # appended below are not reachable from the model -- confirm
            # whether it should be attached here.
            extension = pmml.newInstance("Extension")
        else:
            # Discard parameters recorded by an earlier call; keep the rest.
            extension.children = [
                c for c in extension.children
                if not isinstance(c, pmml.X_ODG_AlgorithmParameter)
            ]
        extension.extender("ODG")

        # sorted() gives the same ordering as keys() + sort() but does not
        # rely on keys() returning a list (it is a view on Python 3).
        for key in sorted(parameters):
            ap = pmml.newInstance("X-ODG-AlgorithmParameter",
                                  attrib={
                                      "name": key,
                                      "value": parameters[key]
                                  },
                                  base=pmml.X_ODG_PMML)
            extension.children.append(ap)
 def _compoundAnd(self, *predicates):
     if len(predicates) == 0:
         raise Exception("Encountered a list of zero predicates in SegmentationScheme's _compoundAnd; this should not ever be possible.")
     elif len(predicates) == 1:
         return predicates[0]
     else:
         return pmml.newInstance("CompoundPredicate", attrib={"booleanOperator": "and"}, children=predicates)
Beispiel #3
0
    def setProvenance(self, model, algoName, algorithm, userParameters):
        """Record the algorithm name and its effective parameters on ``model``.

        ``model.attrib["algorithmName"]`` is set to ``algoName``.  The
        effective parameters are ``algorithm.defaultParams`` overridden by
        ``userParameters``.  Any ``X_ODG_AlgorithmParameter`` children already
        present in the model's ``Extension`` are dropped, and one
        ``X-ODG-AlgorithmParameter`` element per effective parameter is
        appended in sorted key order.

        Returns None.
        """
        model.attrib["algorithmName"] = algoName

        # User-supplied values take precedence over the algorithm's defaults.
        parameters = dict(algorithm.defaultParams)
        parameters.update(userParameters)

        extension = model.child(pmml.Extension, exception=False)
        if extension is None:
            # NOTE(review): this freshly created Extension is not added to
            # model.children anywhere in this method, so the parameters
            # appended below are not reachable from the model -- confirm
            # whether it should be attached here.
            extension = pmml.newInstance("Extension")
        else:
            # Discard parameters recorded by an earlier call; keep the rest.
            extension.children = [c for c in extension.children if not isinstance(c, pmml.X_ODG_AlgorithmParameter)]

        # sorted() gives the same ordering as keys() + sort() but does not
        # rely on keys() returning a list (it is a view on Python 3).
        for key in sorted(parameters):
            ap = pmml.newInstance("X-ODG-AlgorithmParameter", attrib={"name": key, "value": parameters[key]}, base=pmml.X_ODG_PMML)
            extension.children.append(ap)
 def _simplePredicate(self, field, value, operator):
     """Create a ``SimplePredicate`` element and run its post-validation.

     The element's ``field``, ``value`` and ``operator`` attributes are taken
     directly from the arguments of the same names.  Returns the new element.
     """
     attributes = {"field": field, "value": value, "operator": operator}
     predicate = pmml.newInstance("SimplePredicate", attrib=attributes)
     predicate.post_validate()
     return predicate
Beispiel #5
0
 def _compoundAnd(self, *predicates):
     if len(predicates) == 0:
         raise Exception, " ".join([
             "Encountered a list of zero predicates in",
             "SegmentationScheme's _compoundAnd; this",
             "should not ever be possible."])
     elif len(predicates) == 1:
         return predicates[0]
     else:
         return pmml.newInstance("CompoundPredicate", attrib={"booleanOperator": "and"}, children=predicates)
 def _compoundRange(self, rangeTuple):
     f, low, high, closure = rangeTuple
     opL = "greaterOrEqual" if closure.startswith('c') else "greaterThan"
     opH = "lessOrEqual" if closure.endswith('d') else "lessThan"
     if high is None:
         return self._simplePredicate(field=f, operator=opL, value=low)
     elif low is None:
         return self._simplePredicate(field=f, operator=opH, value=high)
     else:
         p1 = self._simplePredicate(field=f, operator=opL, value=low)
         p2 = self._simplePredicate(field=f, operator=opH, value=high)
         return pmml.newInstance("CompoundPredicate", attrib={"booleanOperator": "and"}, children=[p1, p2])
Beispiel #7
0
 def _compoundRange(self, rangeTuple):
     f, low, high, closure = rangeTuple
     opL = "greaterOrEqual" if closure.startswith('c') else "greaterThan"
     opH = "lessOrEqual" if closure.endswith('d') else "lessThan"
     if high is None:
         return self._simplePredicate(field=f, operator=opL, value=low)
     elif low is None:
         return self._simplePredicate(field=f, operator=opH, value=high)
     else:
         p1 = self._simplePredicate(field=f, operator=opL, value=low)
         p2 = self._simplePredicate(field=f, operator=opH, value=high)
         return pmml.newInstance("CompoundPredicate", attrib={"booleanOperator": "and"}, children=[p1, p2])
Beispiel #8
0
    def updateHistogram(self, syncNumber, get):
        """Fold one event into a binned-histogram baseline.

        Used for the chiSquareDistribution and scalarProduct test statistics.

        ``get`` is called with a field name and returns that field's value
        for the current event; ``syncNumber`` is forwarded to every
        updator's ``increment``.

        Returns False, without updating any bin, when the field value or
        (if ``self.weightField`` is set) the weight is INVALID or MISSING.
        Returns True after the bin counts and the sample total were updated.
        """
        self.resetLoggerLevels()
        if self.first:
            self._updateHistogram_first()
            self.first = False

        value = get(self.field)
        if value is INVALID or value is MISSING:
            self.logger.debug("updateHistogram: returning False (INVALID or MISSING data)")
            return False

        # Events are unweighted unless a weight field is configured.
        weight = 1.
        if self.weightField is not None:
            weight = get(self.weightField)
            if weight is INVALID or weight is MISSING:
                self.logger.debug("updateHistogram: returning False (INVALID or MISSING weight)")
                return False

        # First time this value is seen: create its bin and its updator.
        if value not in self.pmmlEntries:
            # FIXME: should field=self.field???
            entryAttributes = {"field": self.field, "value": value, "count": 0}
            entry = pmml.newInstance("FieldValueCount", attrib=entryAttributes)

            self.countTable.children.append(entry)
            self.pmmlEntries[value] = entry
            self.updators[value] = self.engine.producerUpdateScheme.updator(SUMX)

        # Every bin is incremented on every event; only the matching bin
        # receives the (non-zero) weight.
        for binValue, updator in self.updators.items():
            amount = weight if binValue == value else 0.
            updator.increment(syncNumber, amount)
            self.pmmlEntries[binValue].attrib["count"] = updator.sum()

        self.total_updator.increment(syncNumber, weight)
        self.countTable.attrib["sample"] = self.total_updator.sum()

        return True
Beispiel #9
0
    def updateHistogram(self, syncNumber, get):
        """Fold one event into a binned-histogram baseline.

        Used for the chiSquareDistribution and scalarProduct test statistics.

        ``get`` is called with a field name and returns that field's value
        for the current event; ``syncNumber`` is forwarded to every
        updator's ``increment``.

        Returns False, without updating any bin, when the field value or
        (if ``self.weightField`` is set) the weight is INVALID or MISSING.
        Returns True after the bin counts and the sample total were updated.
        """
        self.resetLoggerLevels()
        if self.first:
            self._updateHistogram_first()
            self.first = False

        value = get(self.field)
        if value is INVALID or value is MISSING:
            self.logger.debug(
                "updateHistogram: returning False (INVALID or MISSING data)")
            return False

        # Events are unweighted unless a weight field is configured.
        weight = 1.
        if self.weightField is not None:
            weight = get(self.weightField)
            if weight is INVALID or weight is MISSING:
                self.logger.debug(
                    "updateHistogram: returning False (INVALID or MISSING weight)")
                return False

        # First time this value is seen: create its bin and its updator.
        if value not in self.pmmlEntries:
            # FIXME: should field=self.field???
            entryAttributes = {
                "field": self.field,
                "value": value,
                "count": 0,
            }
            entry = pmml.newInstance("FieldValueCount", attrib=entryAttributes)

            self.countTable.children.append(entry)
            self.pmmlEntries[value] = entry
            self.updators[value] = self.engine.producerUpdateScheme.updator(
                SUMX)

        # Every bin is incremented on every event; only the matching bin
        # receives the (non-zero) weight.
        for binValue, updator in self.updators.items():
            amount = weight if binValue == value else 0.
            updator.increment(syncNumber, amount)
            self.pmmlEntries[binValue].attrib["count"] = updator.sum()

        self.total_updator.increment(syncNumber, weight)
        self.countTable.attrib["sample"] = self.total_updator.sum()

        return True
 def _simplePredicate(self, field, value, operator):
     """Create a ``SimplePredicate`` element and run its post-validation.

     The element's ``field``, ``value`` and ``operator`` attributes are taken
     directly from the arguments of the same names.  Returns the new element.
     """
     attributes = {
         "field": field,
         "value": value,
         "operator": operator,
     }
     predicate = pmml.newInstance("SimplePredicate", attrib=attributes)
     predicate.post_validate()
     return predicate
Beispiel #11
0
    def update(self, syncNumber, get):
        """Fold one event into the naive Bayes counts.

        ``get`` is called with a field name to fetch the output value and is
        also handed to each Bayes input's ``evaluate``; ``syncNumber`` is
        forwarded to every updator's ``increment``.

        Each count is kept in three places that are updated together: the
        updator objects, the PMML elements, and the model's consumer cache.

        Returns False when the output value is INVALID or MISSING (nothing
        is counted), or when any evaluated Bayes input is INVALID (the
        output-value counts have already been updated by then, but no
        input counts are).  Returns True otherwise; inputs that are MISSING
        are skipped individually.
        """
        self.resetLoggerLevels()
        if self.first: self.firstUpdate()

        ### get the output value
        outputValue = get(self.outputField)
        if outputValue is INVALID or outputValue is MISSING:
            self.logger.debug(
                "NaiveBayes.update: returning False (INVALID or MISSING data)")
            return False
        # output values are compared as strings because that is how they're referenced by TargetValueCount["value"] and OutputField["value"]
        outputValue = str(outputValue)

        ### if we have not seen this output value, make a new element in all representations
        ### this happens relatively rarely
        if outputValue not in self.outputUpdators:
            # updator
            self.outputUpdators[
                outputValue] = self.engine.producerUpdateScheme.updator(SUMX)

            # PMML
            tvc = pmml.newInstance("TargetValueCount",
                                   attrib={
                                       "value": outputValue,
                                       "count": 0.
                                   })
            targetValueCounts = self.model.bayesOutput.targetValueCounts
            targetValueCounts.tvcMap[outputValue] = tvc
            targetValueCounts.children.append(tvc)

            # consumer cache
            self.model.targetIndex[outputValue] = len(
                self.model.targetCategories)
            self.model.targetCategories.append(outputValue)
            # numpy.append returns a new array, so the attribute is rebound
            self.model.targetCounts = numpy.append(self.model.targetCounts, 0.)

        ### update the output values histogram
        ### this happens very frequently
        tvcMap = self.model.bayesOutput.targetValueCounts.tvcMap
        targetCounts = self.model.targetCounts
        targetIndex = self.model.targetIndex
        # every known output value is incremented on every event; only the
        # one that matches this event gets a non-zero increment
        for value, updator in self.outputUpdators.items():
            # updator
            if value == outputValue:
                updator.increment(syncNumber, 1.)
            else:
                updator.increment(syncNumber, 0.)
            newcount = updator.sum()

            # PMML
            tvcMap[value].attrib["count"] = newcount

            # consumer cache
            targetCounts[targetIndex[value]] = newcount

        ### get the input value; INVALID input -> skip all input fields, MISSING input -> skip only the missing field
        inputValues = [bi.evaluate(get) for bi in self.model.bayesInputs]
        # NOTE: the output-value counts above have already been updated when
        # this early return is taken
        if INVALID in inputValues:
            self.logger.debug(
                "NaiveBayes.update: returning False (INVALID Bayes input fields)"
            )
            return False

        for inputField, inputValue in zip(self.inputFields, inputValues):
            if inputValue is not MISSING:
                bayesInput = self.model.bayesInput[inputField]
                inputPairUpdator = self.inputPairUpdators[inputField]
                inputDenomUpdator = self.inputDenomUpdators[inputField]

                ### if we have not seen this input value, make a new element in all representations
                ### this happens relatively rarely
                if inputValue not in inputPairUpdator:
                    # updator
                    inputPairUpdator[inputValue] = {}

                    # PMML
                    tv = pmml.newInstance("TargetValueCounts")
                    tv.tvcMap = {}

                    pc = pmml.newInstance("PairCounts",
                                          attrib={"value": inputValue},
                                          children=[tv])
                    pc.targetValueCounts = tv

                    bayesInput.pcMap[inputValue] = pc
                    bayesInput.tvcMap[inputValue] = {}
                    bayesInput.children.append(pc)

                    # consumer cache
                    bayesInput.pairCounts[inputValue] = {}

                ### advance local pointers one level deeper
                # from here on these locals refer to the per-input-value
                # structures; tvcMap no longer refers to the output-level map
                inputPairUpdator = inputPairUpdator[inputValue]
                pcMap = bayesInput.pcMap[inputValue]
                tvcMap = bayesInput.tvcMap[inputValue]
                pairCounts = bayesInput.pairCounts[inputValue]

                ### if we have not seen this input value/output value combination, make a new element
                if outputValue not in inputPairUpdator:
                    # updator
                    inputPairUpdator[
                        outputValue] = self.engine.producerUpdateScheme.updator(
                            SUMX)

                    # PMML
                    tvc = pmml.newInstance("TargetValueCount",
                                           attrib={
                                               "value": outputValue,
                                               "count": 0.
                                           })
                    tvcMap[outputValue] = tvc
                    pcMap.targetValueCounts.children.append(tvc)

                    # consumer cache
                    pairCounts[outputValue] = 0.

                ### update the output values histogram for this input value
                ### this happens very frequently
                for value, updator in inputPairUpdator.items():
                    # updator
                    if value == outputValue:
                        updator.increment(syncNumber, 1.)
                    else:
                        updator.increment(syncNumber, 0.)
                    newcount = updator.sum()

                    # PMML
                    tvcMap[value].attrib["count"] = newcount

                    # consumer cache
                    pairCounts[value] = newcount

                ### if this inputField has not seen this outputValue, make new elements (there is no corresponding PMML)
                denominator = bayesInput.denominators
                if outputValue not in inputDenomUpdator:
                    # updator
                    inputDenomUpdator[
                        outputValue] = self.engine.producerUpdateScheme.updator(
                            SUMX)

                    # consumer cache
                    denominator[outputValue] = 0.

                ### update the denominator histogram for this inputField (there is no corresponding PMML)
                ### this happens very frequently
                for value, updator in inputDenomUpdator.items():
                    # updator
                    if value == outputValue:
                        updator.increment(syncNumber, 1.)
                    else:
                        updator.increment(syncNumber, 0.)

                    # consumer cache
                    denominator[value] = updator.sum()

        # Disabled debugging output (Python 2 print statements), kept for reference:
        # print "outputValue", outputValue
        # print "inputValues", dict(zip(self.inputFields, inputValues))
        # print
        # for bi in self.model.bayesInputs:
        #     print bi.xml()
        #     print "pairCounts", bi.pairCounts
        #     print "denominators", bi.denominators
        #     print
        # print self.model.bayesOutput.xml()
        # raw_input()

        return True
Beispiel #12
0
    def update(self, syncNumber, get):
        """Fold one event into the naive Bayes counts.

        ``get`` is called with a field name to fetch the output value and is
        also handed to each Bayes input's ``evaluate``; ``syncNumber`` is
        forwarded to every updator's ``increment``.

        Each count is kept in three places that are updated together: the
        updator objects, the PMML elements, and the model's consumer cache.

        Returns False when the output value is INVALID or MISSING (nothing
        is counted), or when any evaluated Bayes input is INVALID (the
        output-value counts have already been updated by then, but no
        input counts are).  Returns True otherwise; inputs that are MISSING
        are skipped individually.
        """
        self.resetLoggerLevels()
        if self.first: self.firstUpdate()

        ### get the output value
        outputValue = get(self.outputField)
        if outputValue is INVALID or outputValue is MISSING:
            self.logger.debug("NaiveBayes.update: returning False (INVALID or MISSING data)")
            return False
        # output values are compared as strings because that is how they're referenced by TargetValueCount["value"] and OutputField["value"]
        outputValue = str(outputValue)

        ### if we have not seen this output value, make a new element in all representations
        ### this happens relatively rarely
        if outputValue not in self.outputUpdators:
            # updator
            self.outputUpdators[outputValue] = self.engine.producerUpdateScheme.updator(SUMX)

            # PMML
            tvc = pmml.newInstance("TargetValueCount", attrib={"value": outputValue, "count": 0.})
            targetValueCounts = self.model.bayesOutput.targetValueCounts
            targetValueCounts.tvcMap[outputValue] = tvc
            targetValueCounts.children.append(tvc)

            # consumer cache
            self.model.targetIndex[outputValue] = len(self.model.targetCategories)
            self.model.targetCategories.append(outputValue)
            # numpy.append returns a new array, so the attribute is rebound
            self.model.targetCounts = numpy.append(self.model.targetCounts, 0.)

        ### update the output values histogram
        ### this happens very frequently
        tvcMap = self.model.bayesOutput.targetValueCounts.tvcMap
        targetCounts = self.model.targetCounts
        targetIndex = self.model.targetIndex
        # every known output value is incremented on every event; only the
        # one that matches this event gets a non-zero increment
        for value, updator in self.outputUpdators.items():
            # updator
            if value == outputValue:
                updator.increment(syncNumber, 1.)
            else:
                updator.increment(syncNumber, 0.)
            newcount = updator.sum()

            # PMML
            tvcMap[value].attrib["count"] = newcount

            # consumer cache
            targetCounts[targetIndex[value]] = newcount

        ### get the input value; INVALID input -> skip all input fields, MISSING input -> skip only the missing field
        inputValues = [bi.evaluate(get) for bi in self.model.bayesInputs]
        # NOTE: the output-value counts above have already been updated when
        # this early return is taken
        if INVALID in inputValues:
            self.logger.debug("NaiveBayes.update: returning False (INVALID Bayes input fields)")
            return False

        for inputField, inputValue in zip(self.inputFields, inputValues):
            if inputValue is not MISSING:
                bayesInput = self.model.bayesInput[inputField]
                inputPairUpdator = self.inputPairUpdators[inputField]
                inputDenomUpdator = self.inputDenomUpdators[inputField]

                ### if we have not seen this input value, make a new element in all representations
                ### this happens relatively rarely
                if inputValue not in inputPairUpdator:
                    # updator
                    inputPairUpdator[inputValue] = {}

                    # PMML
                    tv = pmml.newInstance("TargetValueCounts")
                    tv.tvcMap = {}

                    pc = pmml.newInstance("PairCounts", attrib={"value": inputValue}, children=[tv])
                    pc.targetValueCounts = tv

                    bayesInput.pcMap[inputValue] = pc
                    bayesInput.tvcMap[inputValue] = {}
                    bayesInput.children.append(pc)

                    # consumer cache
                    bayesInput.pairCounts[inputValue] = {}

                ### advance local pointers one level deeper
                # from here on these locals refer to the per-input-value
                # structures; tvcMap no longer refers to the output-level map
                inputPairUpdator = inputPairUpdator[inputValue]
                pcMap = bayesInput.pcMap[inputValue]
                tvcMap = bayesInput.tvcMap[inputValue]
                pairCounts = bayesInput.pairCounts[inputValue]

                ### if we have not seen this input value/output value combination, make a new element
                if outputValue not in inputPairUpdator:
                    # updator
                    inputPairUpdator[outputValue] = self.engine.producerUpdateScheme.updator(SUMX)

                    # PMML
                    tvc = pmml.newInstance("TargetValueCount", attrib={"value": outputValue, "count": 0.})
                    tvcMap[outputValue] = tvc
                    pcMap.targetValueCounts.children.append(tvc)

                    # consumer cache
                    pairCounts[outputValue] = 0.

                ### update the output values histogram for this input value
                ### this happens very frequently
                for value, updator in inputPairUpdator.items():
                    # updator
                    if value == outputValue:
                        updator.increment(syncNumber, 1.)
                    else:
                        updator.increment(syncNumber, 0.)
                    newcount = updator.sum()

                    # PMML
                    tvcMap[value].attrib["count"] = newcount

                    # consumer cache
                    pairCounts[value] = newcount

                ### if this inputField has not seen this outputValue, make new elements (there is no corresponding PMML)
                denominator = bayesInput.denominators
                if outputValue not in inputDenomUpdator:
                    # updator
                    inputDenomUpdator[outputValue] = self.engine.producerUpdateScheme.updator(SUMX)

                    # consumer cache
                    denominator[outputValue] = 0.

                ### update the denominator histogram for this inputField (there is no corresponding PMML)
                ### this happens very frequently
                for value, updator in inputDenomUpdator.items():
                    # updator
                    if value == outputValue:
                        updator.increment(syncNumber, 1.)
                    else:
                        updator.increment(syncNumber, 0.)

                    # consumer cache
                    denominator[value] = updator.sum()

        # Disabled debugging output (Python 2 print statements), kept for reference:
        # print "outputValue", outputValue
        # print "inputValues", dict(zip(self.inputFields, inputValues))
        # print
        # for bi in self.model.bayesInputs:
        #     print bi.xml()
        #     print "pairCounts", bi.pairCounts
        #     print "denominators", bi.denominators
        #     print
        # print self.model.bayesOutput.xml()
        # raw_input()

        return True
Beispiel #13
0
    def initialize(self, **params):
        """Initialize a baseline producer.

        Unlike other producers, this creates the update function
        dynamically, depending on the testStatistic: ``self.update`` is
        bound to ``updateDistribution`` (CUSUM, zValue, GLR),
        ``updateHistogram`` (chiSquareDistribution, scalarProduct) or
        ``updateChiSquareIndependence`` (chiSquareIndependence).

        Recognized keyword parameters:
            updateExisting: passed through ``pmml.boolCheck``; falls back to
                ``self.defaultParams["updateExisting"]`` when absent.
            alternateField, alternateValue: accepted for CUSUM only.

        Raises:
            NotImplementedError: ``alternateField`` or ``alternateValue``
                was given for a testStatistic other than CUSUM.
            TypeError: any other parameter is left over.
        """

        testDistributions = self.model.child(pmml.TestDistributions)
        self.field = testDistributions.attrib["field"]

        # Recognized parameters are removed from params as they are consumed
        # so that whatever remains at the end can be reported as unknown.
        if "updateExisting" in params:
            self.updateExisting = pmml.boolCheck(params.pop("updateExisting"))
        else:
            self.updateExisting = pmml.boolCheck(self.defaultParams["updateExisting"])

        self.first = True

        # NOTE(review): a testStatistic matching none of the branches below
        # falls through without binding self.update.
        testStatistic = testDistributions.attrib["testStatistic"]
        if testStatistic in ("CUSUM", "zValue", "GLR"):
            self.baseline = testDistributions.child(pmml.Baseline).child()
            self.update = self.updateDistribution

            if testStatistic == "CUSUM":
                self.alternateField = params.pop("alternateField", None)
                self.alternateValue = params.pop("alternateValue", None)

                # Find or create the Extension holding the CUSUM state.
                extension = testDistributions.child(pmml.Extension, exception=False)
                if extension is None:
                    extension = pmml.newInstance("Extension")
                    testDistributions.children.append(extension)
                extension.extender("ODG")

                self.cusumInitialization = extension.child(pmml.X_ODG_CUSUMInitialization, exception=False)
                if self.cusumInitialization is None:
                    self.cusumInitialization = pmml.newInstance("X-ODG-CUSUMInitialization", attrib={"value": 0.}, base=pmml.X_ODG_PMML)
                    extension.children.append(self.cusumInitialization)
                elif not self.updateExisting:
                    # Start over from zero rather than from the stored value.
                    self.cusumInitialization.attrib["value"] = 0.

            else:
                self.alternateField = None
                self.alternateValue = None
                self.cusumInitialization = None

        elif testStatistic in ("chiSquareDistribution", "scalarProduct"):
            self.weightField = testDistributions.attrib.get("weightField", None)
            self.countTable = testDistributions.child(pmml.Baseline).child(lambda x: isinstance(x, (pmml.CountTable, pmml.NormalizedCountTable)))
            self.update = self.updateHistogram

        elif testStatistic == "chiSquareIndependence":
            self.baseline = testDistributions.child(pmml.Baseline)
            self.fields = None
            self.countTable = None

            self.updators = {}
            self.total_updator = self.engine.producerUpdateScheme.updator(SUMX)
            self.update = self.updateChiSquareIndependence

        # Call-style raise is used below: the comma form is Python-2-only
        # syntax, while this form is accepted by both Python 2 and Python 3.
        if "alternateField" in params:
            raise NotImplementedError("The 'alternateField' producerParameter is only used by CUSUM")
        if "alternateValue" in params:
            raise NotImplementedError("The 'alternateValue' producerParameter is only used by CUSUM")

        if len(params) > 0:
            raise TypeError("Unrecognized parameters %s" % params)
Beispiel #14
0
    def initialize(self, **params):
        """Set up a baseline producer from its model and keyword parameters.

        The update function is chosen at run time from the model's
        testStatistic: ``self.update`` is bound to ``updateDistribution``
        (CUSUM, zValue, GLR), ``updateHistogram`` (chiSquareDistribution,
        scalarProduct) or ``updateChiSquareIndependence``
        (chiSquareIndependence).

        Recognized keyword parameters:
            updateExisting: passed through ``pmml.boolCheck``; falls back to
                ``self.defaultParams["updateExisting"]`` when absent.
            alternateField, alternateValue: accepted for CUSUM only.

        Raises:
            NotImplementedError: ``alternateField`` or ``alternateValue``
                was given for a testStatistic other than CUSUM.
            TypeError: any other parameter is left over.
        """

        testDistributions = self.model.child(pmml.TestDistributions)
        self.field = testDistributions.attrib["field"]

        # Consumed parameters are removed from params; leftovers are
        # reported as unrecognized at the end.
        if "updateExisting" in params:
            self.updateExisting = pmml.boolCheck(params.pop("updateExisting"))
        else:
            self.updateExisting = pmml.boolCheck(self.defaultParams["updateExisting"])

        self.first = True

        testStatistic = self.model.child(pmml.TestDistributions).attrib["testStatistic"]
        if testStatistic in ("CUSUM", "zValue", "GLR"):
            self.baseline = testDistributions.child(pmml.Baseline).child()
            self.update = self.updateDistribution

            if testStatistic != "CUSUM":
                self.alternateField = None
                self.alternateValue = None
                self.cusumInitialization = None
            else:
                self.alternateField = params.pop("alternateField", None)
                self.alternateValue = params.pop("alternateValue", None)

                # Find or create the Extension holding the CUSUM state.
                ext = testDistributions.child(pmml.Extension, exception=False)
                if ext is None:
                    ext = pmml.newInstance("Extension")
                    testDistributions.children.append(ext)

                cusumInit = ext.child(pmml.X_ODG_CUSUMInitialization, exception=False)
                if cusumInit is None:
                    cusumInit = pmml.newInstance(
                        "X-ODG-CUSUMInitialization",
                        attrib={"value": 0.},
                        base=pmml.X_ODG_PMML)
                    ext.children.append(cusumInit)
                elif not self.updateExisting:
                    # Start over from zero rather than from the stored value.
                    cusumInit.attrib["value"] = 0.
                self.cusumInitialization = cusumInit

        elif testStatistic in ("chiSquareDistribution", "scalarProduct"):
            self.weightField = testDistributions.attrib.get("weightField", None)
            isCountTable = lambda x: isinstance(x, (pmml.CountTable, pmml.NormalizedCountTable))
            self.countTable = testDistributions.child(pmml.Baseline).child(isCountTable)
            self.update = self.updateHistogram

        elif testStatistic == "chiSquareIndependence":
            self.baseline = testDistributions.child(pmml.Baseline)
            self.fields = None
            self.countTable = None

            self.updators = {}
            self.total_updator = self.engine.producerUpdateScheme.updator(SUMX)
            self.update = self.updateChiSquareIndependence

        # The CUSUM-only parameters are still present here only if the
        # testStatistic was not CUSUM.
        for cusumOnly in ("alternateField", "alternateValue"):
            if cusumOnly in params:
                raise NotImplementedError("The '%s' producerParameter is only used by CUSUM" % cusumOnly)

        if params:
            raise TypeError("Unrecognized parameters %s" % params)