def initialize(self, **params):
    """Configure the regression producer from its parameters and the segment's model.

    Recognized parameters are ``updateExisting`` and ``dependentField``; each
    falls back to ``self.defaultParams`` when it is not passed explicitly.

    Raises:
        NotImplementedError: if ``updateExisting`` evaluates to true.
        RuntimeError: if the requested dependent field is not among the
            'predicted' fields of the model's MiningSchema.
        TypeError: if any parameter is left over after the known ones are consumed.
    """
    if "updateExisting" in params:
        rawUpdate = params.pop("updateExisting")
    else:
        rawUpdate = self.defaultParams["updateExisting"]
    self.updateExisting = pmml.boolCheck(rawUpdate)
    if self.updateExisting:
        raise NotImplementedError(
            "Updating from existing RegressionModels not implemented; use mode='replaceExisting'")

    if "dependentField" in params:
        requested = params.pop("dependentField")
    else:
        requested = self.defaultParams["dependentField"]
    # An empty string means "no explicit choice".
    self.dependentField = None if requested == "" else requested

    self.model = self.segmentRecord.pmmlModel

    # Collect the names of every MiningField whose usageType is 'predicted'.
    self.predicted = []
    schema = self.model.child(pmml.MiningSchema)
    for entry in schema.matches(pmml.MiningField):
        fieldName = entry.attrib["name"]
        if entry.attrib.get("usageType", "active") == "predicted":
            self.predicted.append(fieldName)

    if not self.predicted:
        self.dependentField = INVALID
    elif self.dependentField is None:
        # by default, take the first 'predicted' feature
        self.dependentField = self.predicted[0]
    elif self.dependentField not in self.predicted:
        raise RuntimeError(
            "DependentField feature not found among the 'predicted' features in the RegressionModel's MiningSchema%s"
            % self.model.child(pmml.MiningSchema).fileAndLine())

    # One slot per term of the first regression table, plus one extra.
    self.regressionTable = self.model.regressionTables[0]
    table = self.regressionTable
    p = 1 + len(table.numericTerms) + len(table.categoricalTerms) + len(table.predictorTerms)
    self.updator = self.engine.producerUpdateScheme.updator(OLS(p))

    if params:
        raise TypeError("Unrecognized parameters %s" % params)
def initialize(self, **params):
    """Set up regression-producer state from ``params`` and the segment's PMML model.

    ``updateExisting`` and ``dependentField`` are read from ``params`` when
    present and from ``self.defaultParams`` otherwise.

    Raises:
        NotImplementedError: if ``updateExisting`` evaluates to true.
        RuntimeError: if ``dependentField`` is given but is not one of the
            MiningSchema's 'predicted' fields.
        TypeError: if unknown parameters remain.
    """
    def takeParam(key):
        # Consume an explicit parameter, or fall back to the producer default.
        if key in params:
            return params.pop(key)
        return self.defaultParams[key]

    self.updateExisting = pmml.boolCheck(takeParam("updateExisting"))
    if self.updateExisting:
        raise NotImplementedError(
            "Updating from existing RegressionModels not implemented; use mode='replaceExisting'")

    self.dependentField = takeParam("dependentField")
    if self.dependentField == "":
        self.dependentField = None

    self.model = self.segmentRecord.pmmlModel

    self.predicted = []
    for miningField in self.model.child(pmml.MiningSchema).matches(pmml.MiningField):
        name = miningField.attrib["name"]
        usage = miningField.attrib.get("usageType", "active")
        if usage == "predicted":
            self.predicted.append(name)

    if not self.predicted:
        self.dependentField = INVALID
    elif self.dependentField is None:
        # by default, take the first 'predicted' feature
        self.dependentField = self.predicted[0]
    elif self.dependentField not in self.predicted:
        raise RuntimeError(
            "DependentField feature not found among the 'predicted' features in the RegressionModel's MiningSchema%s"
            % self.model.child(pmml.MiningSchema).fileAndLine())

    self.regressionTable = self.model.regressionTables[0]
    termCount = (len(self.regressionTable.numericTerms)
                 + len(self.regressionTable.categoricalTerms)
                 + len(self.regressionTable.predictorTerms))
    p = 1 + termCount
    self.updator = self.engine.producerUpdateScheme.updator(OLS(p))

    if params:
        raise TypeError("Unrecognized parameters %s" % params)
def initialize(self, **params):
    """Configure the k-means producer from its parameters and the segment's model.

    Recognized parameters: ``updateExisting``, ``numberOfTrials`` and
    ``numberToConverge``; each falls back to ``self.defaultParams``.

    Raises:
        NotImplementedError: if ``updateExisting`` is passed explicitly and
            evaluates to true.
        TypeError: if any unrecognized parameter is left over.
    """
    if "updateExisting" in params:
        self.updateExisting = pmml.boolCheck(params.pop("updateExisting"))
        if self.updateExisting:
            raise NotImplementedError(
                "Updating from existing ClusterModels using 'kmeans' not implemented; use mode='replaceExisting'")
    else:
        # NOTE(review): a true default is not rejected here, only an explicit
        # parameter is -- confirm that this asymmetry is intended.
        self.updateExisting = pmml.boolCheck(self.defaultParams["updateExisting"])

    for key in ("numberOfTrials", "numberToConverge"):
        value = params.pop(key) if key in params else self.defaultParams[key]
        setattr(self, key, int(value))

    self.model = self.segmentRecord.pmmlModel
    self.dataDistribution = self.engine.producerUpdateScheme.updator(
        COVARIANCE(self.model.numberOfFields))
    self.distanceMeasure = (
        self.model.child(pmml.ComparisonMeasure).attrib["kind"] == "distance")

    # get rid of any PartialSums objects, since they would be misleading
    # (this algorithm doesn't use them)
    extension = self.model.child(pmml.Extension, exception=False)
    if extension is not None:
        extension.children = [child for child in extension.children
                              if not isinstance(child, pmml.X_ODG_PartialSums)]

    # One empty buffer for the sync number and one per model field.
    self.buffer = {self.SYNCNUMBER: []}
    for field in self.model.fields:
        self.buffer[field] = []

    if params:
        raise TypeError("Unrecognized parameters %s" % params)
def initialize(self, **params):
    """Configure the naive Bayes producer from ``params`` and the segment's model.

    Only ``updateExisting`` is recognized; it falls back to
    ``self.defaultParams`` when not supplied.

    Raises:
        TypeError: if any other parameter is passed.
    """
    if "updateExisting" in params:
        rawUpdate = params.pop("updateExisting")
    else:
        rawUpdate = self.defaultParams["updateExisting"]
    self.updateExisting = pmml.boolCheck(rawUpdate)

    self.model = self.segmentRecord.pmmlModel

    ### field names for get()
    self.inputFields = [entry.attrib["fieldName"] for entry in self.model.bayesInputs]
    self.outputField = self.model.bayesOutput.attrib["fieldName"]

    self.first = True

    if params:
        raise TypeError("Unrecognized parameters %s" % params)
def initialize(self, **params):
    """Prepare naive Bayes producer state from ``params`` and the segment's PMML model.

    ``updateExisting`` is taken from ``params`` if present, otherwise from
    ``self.defaultParams``; no other parameter is accepted.

    Raises:
        TypeError: if unrecognized parameters remain.
    """
    source = params if "updateExisting" in params else self.defaultParams
    self.updateExisting = pmml.boolCheck(source["updateExisting"])
    # Drop the consumed key so the leftover check below only sees unknowns.
    params.pop("updateExisting", None)

    self.model = self.segmentRecord.pmmlModel

    ### field names for get()
    inputNames = []
    for bayesInput in self.model.bayesInputs:
        inputNames.append(bayesInput.attrib["fieldName"])
    self.inputFields = inputNames
    self.outputField = self.model.bayesOutput.attrib["fieldName"]

    self.first = True

    if len(params) != 0:
        raise TypeError("Unrecognized parameters %s" % params)
def initialize(self, **params):
    """Configure the k-means producer from its parameters and the segment's model.

    Recognized parameters: ``updateExisting``, ``numberOfTrials`` and
    ``numberToConverge``; each falls back to ``self.defaultParams``.

    Raises:
        NotImplementedError: if ``updateExisting`` is passed explicitly and
            evaluates to true.
        TypeError: if any unrecognized parameter is left over.
    """
    if "updateExisting" in params:
        self.updateExisting = pmml.boolCheck(params.pop("updateExisting"))
        if self.updateExisting:
            raise NotImplementedError(
                "Updating from existing ClusterModels using 'kmeans' not implemented; use mode='replaceExisting'")
    else:
        # NOTE(review): only an explicit parameter is rejected; a true default
        # passes through -- confirm that this is intended.
        self.updateExisting = pmml.boolCheck(self.defaultParams["updateExisting"])

    if "numberOfTrials" in params:
        self.numberOfTrials = int(params.pop("numberOfTrials"))
    else:
        self.numberOfTrials = int(self.defaultParams["numberOfTrials"])

    if "numberToConverge" in params:
        self.numberToConverge = int(params.pop("numberToConverge"))
    else:
        self.numberToConverge = int(self.defaultParams["numberToConverge"])

    self.model = self.segmentRecord.pmmlModel
    self.dataDistribution = self.engine.producerUpdateScheme.updator(
        COVARIANCE(self.model.numberOfFields))
    self.distanceMeasure = (
        self.model.child(pmml.ComparisonMeasure).attrib["kind"] == "distance")

    # get rid of any PartialSums objects, since they would be misleading
    # (this algorithm doesn't use them)
    extension = self.model.child(pmml.Extension, exception=False)
    if extension is not None:
        extension.children = [child for child in extension.children
                              if not isinstance(child, pmml.X_ODG_PartialSums)]

    self.buffer = {self.SYNCNUMBER: []}
    for field in self.model.fields:
        self.buffer[field] = []

    if params:
        raise TypeError("Unrecognized parameters %s" % params)
def initialize(self, **params):
    """Configure the k-means producer, including quick-converge steps and seed source.

    Recognized parameters: ``updateExisting``, ``quickConvergeSteps`` (a string
    that evaluates to a tuple of integers), ``seedSource`` (one of
    'dataPoints', 'dataCovariance', 'unitRect'), ``numberOfTrials`` and
    ``numberToConverge``.  Each falls back to ``self.defaultParams``.

    Raises:
        NotImplementedError: if ``updateExisting`` is passed explicitly and is
            true, or if ``seedSource`` is not one of the known names.
        RuntimeError: if ``quickConvergeSteps`` cannot be read as a tuple of
            integers.
        TypeError: if any unrecognized parameter is left over.
    """
    if "updateExisting" in params:
        self.updateExisting = pmml.boolCheck(params.pop("updateExisting"))
        if self.updateExisting:
            raise NotImplementedError(
                "Updating from existing ClusterModels using 'kmeans' not implemented; use mode='replaceExisting'")
    else:
        self.updateExisting = pmml.boolCheck(self.defaultParams["updateExisting"])

    if "quickConvergeSteps" in params:
        try:
            # NOTE(review): eval() executes arbitrary expressions from the
            # parameter string.  If producer parameters can come from
            # untrusted configuration, ast.literal_eval is the safer parser.
            steps = eval(params["quickConvergeSteps"])
            if not isinstance(steps, tuple):
                raise RuntimeError
            # Build the list eagerly so that a non-integer entry fails here,
            # inside the try, rather than later when the steps are consumed.
            self.quickConvergeSteps = [int(step) for step in steps]
        except Exception:
            # The previous "except err:" referenced an undefined name, so any
            # failure surfaced as a NameError instead of this message.
            raise RuntimeError("quickConvergeSteps must be a tuple of numbers of events")
        del params["quickConvergeSteps"]
    else:
        self.quickConvergeSteps = eval(self.defaultParams["quickConvergeSteps"])

    if "seedSource" in params:
        self.seedSource = params.pop("seedSource")
    else:
        self.seedSource = self.defaultParams["seedSource"]

    # Translate the textual seed source into the producer's own constant.
    if self.seedSource == "dataPoints":
        self.seedSource = self.RANDOM_DATAPOINTS
    elif self.seedSource == "dataCovariance":
        self.seedSource = self.RANDOM_DATACOVARIANCE
    elif self.seedSource == "unitRect":
        self.seedSource = self.RANDOM_UNITRECT
    else:
        raise NotImplementedError(
            "The seedSource must be one of 'dataPoints', 'dataCovariance', 'unitRect'")

    for key in ("numberOfTrials", "numberToConverge"):
        value = params.pop(key) if key in params else self.defaultParams[key]
        setattr(self, key, int(value))

    self.model = self.segmentRecord.pmmlModel
    self.dataDistribution = self.engine.producerUpdateScheme.updator(
        COVARIANCE(self.model.numberOfFields))
    self.distanceMeasure = (
        self.model.child(pmml.ComparisonMeasure).attrib["kind"] == "distance")

    # get rid of any PartialSums objects, since they would be misleading
    # (this algorithm doesn't use them)
    extension = self.model.child(pmml.Extension, exception=False)
    if extension is not None:
        extension.children = [child for child in extension.children
                              if not isinstance(child, pmml.X_ODG_PartialSums)]

    self.buffer = {self.SYNCNUMBER: []}
    for field in self.model.fields:
        self.buffer[field] = []

    if params:
        raise TypeError("Unrecognized parameters %s" % params)
def initialize(self, **params):
    """Initialize a clustering model producer.

    Recognized parameters: ``updateExisting``, ``numberOfTrials``,
    ``numberToKeep``, ``maturityThreshold``, ``initialStability`` and
    ``overrideSignificance``; each falls back to ``self.defaultParams``.
    An explicitly passed ``overrideSignificance`` of 0 is stored as ``None``.

    The model's Extension element is created if absent, and X-ODG-PartialSums
    entries are either reused (``updateExisting`` true) or replaced by fresh
    ones.  The first trial in ``self.trials`` is built from those entries.

    Raises:
        TypeError: if any unrecognized parameter is left over.
    """
    if "updateExisting" in params:
        self.updateExisting = pmml.boolCheck(params["updateExisting"])
        del params["updateExisting"]
    else:
        self.updateExisting = pmml.boolCheck(self.defaultParams["updateExisting"])

    for key in ("numberOfTrials", "numberToKeep", "maturityThreshold", "initialStability"):
        value = params.pop(key) if key in params else self.defaultParams[key]
        setattr(self, key, int(value))

    if "overrideSignificance" in params:
        self.overrideSignificance = float(params["overrideSignificance"])
        del params["overrideSignificance"]
        if self.overrideSignificance == 0.:
            self.overrideSignificance = None
    else:
        self.overrideSignificance = float(self.defaultParams["overrideSignificance"])

    self.model = self.segmentRecord.pmmlModel
    self.dataDistribution = self.engine.producerUpdateScheme.updator(
        COVARIANCE(self.model.numberOfFields))
    self.distanceMeasure = (
        self.model.child(pmml.ComparisonMeasure).attrib["kind"] == "distance")

    # put PartialSums in the model if they're not already there; pick up old
    # values if you're resuming
    extension = self.model.child(pmml.Extension, exception=False)
    if extension is None:
        extension = pmml.Extension()
        self.model.children.append(extension)

    if self.updateExisting:
        self.sumOfDistances = extension.child(
            lambda x: isinstance(x, pmml.X_ODG_PartialSums)
            and x.attrib.get("name", None) == "SumOfDistances",
            exception=False)
    else:
        # Not resuming: discard any stored entry so a fresh one is created below.
        index = extension.index(
            lambda x: isinstance(x, pmml.X_ODG_PartialSums)
            and x.attrib.get("name", None) == "SumOfDistances",
            exception=False)
        if index is not None:
            del extension[index]
        self.sumOfDistances = None

    if self.sumOfDistances is None:
        self.sumOfDistances = pmml.X_ODG_PartialSums(
            name="SumOfDistances", COUNT=0, SUM1=0., SUMX=0., SUMXX=0.)
        extension.children.append(self.sumOfDistances)

    # One PartialSums entry per (cluster id, field) pair, keyed "<id>.<field>".
    self.partialSums = {}
    for theid, cluster in zip(self.model.ids, self.model.cluster):
        for i, field in enumerate(self.model.fields):
            fullname = "%s.%s" % (theid, field)
            if self.updateExisting:
                partialSum = extension.child(
                    lambda x: isinstance(x, pmml.X_ODG_PartialSums)
                    and x.attrib.get("name", None) == fullname,
                    exception=False)
            else:
                index = extension.index(
                    lambda x: isinstance(x, pmml.X_ODG_PartialSums)
                    and x.attrib.get("name", None) == fullname,
                    exception=False)
                if index is not None:
                    del extension[index]
                partialSum = None

            if partialSum is None:
                partialSum = pmml.X_ODG_PartialSums(
                    name=fullname, SUM1=1., SUMX=cluster.value[i])
                extension.children.append(partialSum)

            self.partialSums[fullname] = partialSum

    # create the first trial using the values constructed above (they come
    # from the PMML file if updateExisting is True)
    trialFromPmml = new.instance(TrialClusterSet)
    trialFromPmml.updator = self.engine.producerUpdateScheme.updator(COUNT, SUM1, SUMX, SUMXX)
    trialFromPmml.updator.initialize({
        COUNT: self.sumOfDistances.attrib["COUNT"],
        SUM1: self.sumOfDistances.attrib["SUM1"],
        SUMX: self.sumOfDistances.attrib["SUMX"],
        SUMXX: self.sumOfDistances.attrib["SUMXX"],
    })

    trialFromPmml.clusters = []
    for theid, cluster in zip(self.model.ids, self.model.cluster):
        trialCluster = new.instance(TrialCluster)
        trialCluster.fields = []
        trialCluster.initialPosition = []
        for field in self.model.fields:
            partialSum = self.partialSums["%s.%s" % (theid, field)]
            u = self.engine.producerUpdateScheme.updator(SUM1, SUMX)
            u.initialize({SUM1: partialSum.attrib["SUM1"],
                          SUMX: partialSum.attrib["SUMX"]})
            trialCluster.fields.append(u)
        trialCluster.initialPosition = list(cluster.value)
        trialFromPmml.clusters.append(trialCluster)

    self.trials = [trialFromPmml]

    if len(params) > 0:
        raise TypeError("Unrecognized parameters %s" % params)
def initialize(self, **params):
    """Initialize a baseline producer.

    Unlike other producers, this creates the update function dynamically,
    depending on the testStatistic.

    Recognized parameters: ``updateExisting`` (falls back to
    ``self.defaultParams``) and, for the CUSUM test statistic only,
    ``alternateField`` and ``alternateValue`` (both default to ``None``).

    Raises:
        NotImplementedError: if ``alternateField`` or ``alternateValue`` is
            passed for a test statistic other than CUSUM.
        TypeError: if any unrecognized parameter is left over.
    """
    testDistributions = self.model.child(pmml.TestDistributions)
    self.field = testDistributions.attrib["field"]

    if "updateExisting" in params:
        self.updateExisting = pmml.boolCheck(params.pop("updateExisting"))
    else:
        self.updateExisting = pmml.boolCheck(self.defaultParams["updateExisting"])

    self.first = True

    # Reuse the element looked up above instead of searching for it again.
    testStatistic = testDistributions.attrib["testStatistic"]

    if testStatistic in ("CUSUM", "zValue", "GLR"):
        self.baseline = testDistributions.child(pmml.Baseline).child()
        self.update = self.updateDistribution

        if testStatistic == "CUSUM":
            self.alternateField = params.pop("alternateField", None)
            self.alternateValue = params.pop("alternateValue", None)

            extension = testDistributions.child(pmml.Extension, exception=False)
            if extension is None:
                extension = pmml.newInstance("Extension")
                testDistributions.children.append(extension)
                # NOTE(review): the flattened original does not show whether
                # this call applied only to a newly created Extension or to
                # any Extension; it is applied to new ones here -- confirm.
                extension.extender("ODG")

            self.cusumInitialization = extension.child(
                pmml.X_ODG_CUSUMInitialization, exception=False)
            if self.cusumInitialization is None:
                self.cusumInitialization = pmml.newInstance(
                    "X-ODG-CUSUMInitialization", attrib={"value": 0.}, base=pmml.X_ODG_PMML)
                extension.children.append(self.cusumInitialization)
            elif not self.updateExisting:
                # Starting over: reset the stored value.
                self.cusumInitialization.attrib["value"] = 0.

        else:
            self.alternateField = None
            self.alternateValue = None
            self.cusumInitialization = None

    elif testStatistic in ("chiSquareDistribution", "scalarProduct"):
        self.weightField = testDistributions.attrib.get("weightField", None)
        self.countTable = testDistributions.child(pmml.Baseline).child(
            lambda x: isinstance(x, (pmml.CountTable, pmml.NormalizedCountTable)))
        self.update = self.updateHistogram

    elif testStatistic == "chiSquareIndependence":
        self.baseline = testDistributions.child(pmml.Baseline)
        self.fields = None
        self.countTable = None
        self.updators = {}
        self.total_updator = self.engine.producerUpdateScheme.updator(SUMX)
        self.update = self.updateChiSquareIndependence

    # The CUSUM branch consumes these keys, so finding one here means it was
    # supplied for a different test statistic.
    if "alternateField" in params:
        raise NotImplementedError("The 'alternateField' producerParameter is only used by CUSUM")
    if "alternateValue" in params:
        raise NotImplementedError("The 'alternateValue' producerParameter is only used by CUSUM")

    if params:
        raise TypeError("Unrecognized parameters %s" % params)
def initialize(self, **params):
    """Configure the k-means producer, including quick-converge steps and seed source.

    Recognized parameters: ``updateExisting``, ``quickConvergeSteps`` (a string
    that evaluates to a tuple of integers), ``seedSource`` (one of
    'dataPoints', 'dataCovariance', 'unitRect'), ``numberOfTrials`` and
    ``numberToConverge``.  Each falls back to ``self.defaultParams``.

    Raises:
        NotImplementedError: if ``updateExisting`` is passed explicitly and is
            true, or if ``seedSource`` is not one of the known names.
        RuntimeError: if ``quickConvergeSteps`` cannot be read as a tuple of
            integers.
        TypeError: if any unrecognized parameter is left over.
    """
    def takeParam(key):
        # Consume an explicit parameter, or fall back to the producer default.
        if key in params:
            return params.pop(key)
        return self.defaultParams[key]

    if "updateExisting" in params:
        self.updateExisting = pmml.boolCheck(params.pop("updateExisting"))
        if self.updateExisting:
            raise NotImplementedError(
                "Updating from existing ClusterModels using 'kmeans' not implemented; use mode='replaceExisting'")
    else:
        self.updateExisting = pmml.boolCheck(self.defaultParams["updateExisting"])

    if "quickConvergeSteps" in params:
        try:
            # NOTE(review): eval() runs arbitrary expressions from the
            # parameter string; prefer ast.literal_eval if this value can
            # come from untrusted configuration.
            parsed = eval(params["quickConvergeSteps"])
            if not isinstance(parsed, tuple):
                raise RuntimeError
            # Converted eagerly so that bad entries are reported by the
            # handler below instead of failing lazily later on.
            self.quickConvergeSteps = [int(count) for count in parsed]
        except Exception:
            # "except err:" named an undefined identifier, which turned every
            # failure into a NameError rather than this message.
            raise RuntimeError("quickConvergeSteps must be a tuple of numbers of events")
        del params["quickConvergeSteps"]
    else:
        self.quickConvergeSteps = eval(self.defaultParams["quickConvergeSteps"])

    self.seedSource = takeParam("seedSource")
    if self.seedSource == "dataPoints":
        self.seedSource = self.RANDOM_DATAPOINTS
    elif self.seedSource == "dataCovariance":
        self.seedSource = self.RANDOM_DATACOVARIANCE
    elif self.seedSource == "unitRect":
        self.seedSource = self.RANDOM_UNITRECT
    else:
        raise NotImplementedError(
            "The seedSource must be one of 'dataPoints', 'dataCovariance', 'unitRect'")

    self.numberOfTrials = int(takeParam("numberOfTrials"))
    self.numberToConverge = int(takeParam("numberToConverge"))

    self.model = self.segmentRecord.pmmlModel
    self.dataDistribution = self.engine.producerUpdateScheme.updator(
        COVARIANCE(self.model.numberOfFields))
    self.distanceMeasure = (
        self.model.child(pmml.ComparisonMeasure).attrib["kind"] == "distance")

    # get rid of any PartialSums objects, since they would be misleading
    # (this algorithm doesn't use them)
    extension = self.model.child(pmml.Extension, exception=False)
    if extension is not None:
        extension.children = [child for child in extension.children
                              if not isinstance(child, pmml.X_ODG_PartialSums)]

    self.buffer = {self.SYNCNUMBER: []}
    for field in self.model.fields:
        self.buffer[field] = []

    if params:
        raise TypeError("Unrecognized parameters %s" % params)
def initialize(self, **params):
    """Initialize a baseline producer.

    Unlike other producers, this creates the update function dynamically,
    depending on the testStatistic.

    Recognized parameters: ``resume`` (defaults to ``False``) and, for the
    CUSUM test statistic only, ``alternateField`` and ``alternateValue``
    (both default to ``None``).

    Raises:
        NotImplementedError: if ``alternateField`` or ``alternateValue`` is
            passed for a test statistic other than CUSUM.
        TypeError: if any unrecognized parameter is left over.
    """
    testDistributions = self.segmentRecord.pmmlModel.child(pmml.TestDistributions)
    self.field = testDistributions.attrib["field"]

    if "resume" in params:
        self.resume = pmml.boolCheck(params.pop("resume"))
    else:
        self.resume = False

    self.first = True

    # Reuse the element looked up above instead of searching for it again.
    testStatistic = testDistributions.attrib["testStatistic"]

    if testStatistic in ("CUSUM", "zValue", "GLR"):
        self.baseline = testDistributions.child(pmml.Baseline).child()
        self.update = self.updateDistribution

        if testStatistic == "CUSUM":
            self.alternateField = params.pop("alternateField", None)
            self.alternateValue = params.pop("alternateValue", None)
        else:
            self.alternateField = None
            self.alternateValue = None

    elif testStatistic in ("chiSquareDistribution", "scalarProduct"):
        self.weightField = testDistributions.attrib.get("weightField", None)
        self.countTable = testDistributions.child(pmml.Baseline).child(
            lambda x: isinstance(x, (pmml.CountTable, pmml.NormalizedCountTable)))
        self.update = self.updateHistogram

    elif testStatistic == "chiSquareIndependence":
        self.baseline = testDistributions.child(pmml.Baseline)
        self.fields = None
        self.countTable = None
        self.updators = {}
        self.total_updator = self.engine.producerUpdateScheme.updator(SUMX)
        self.update = self.updateChiSquareIndependence

    # The CUSUM branch consumes these keys, so finding one here means it was
    # supplied for a different test statistic.
    if "alternateField" in params:
        raise NotImplementedError("The 'alternateField' producerParameter is only used by CUSUM")
    if "alternateValue" in params:
        raise NotImplementedError("The 'alternateValue' producerParameter is only used by CUSUM")

    if params:
        raise TypeError("Unrecognized parameters %s" % params)
def initialize(self, **params):
    """Initialize a baseline producer.

    Unlike other producers, this creates the update function dynamically,
    depending on the testStatistic.

    Recognized parameters: ``updateExisting`` (falls back to
    ``self.defaultParams``) and, for the CUSUM test statistic only,
    ``alternateField`` and ``alternateValue`` (both default to ``None``).

    Raises:
        NotImplementedError: if ``alternateField`` or ``alternateValue`` is
            passed for a test statistic other than CUSUM.
        TypeError: if any unrecognized parameter is left over.
    """
    testDistributions = self.model.child(pmml.TestDistributions)
    self.field = testDistributions.attrib["field"]

    if "updateExisting" in params:
        rawUpdate = params.pop("updateExisting")
    else:
        rawUpdate = self.defaultParams["updateExisting"]
    self.updateExisting = pmml.boolCheck(rawUpdate)

    self.first = True

    testStatistic = self.model.child(pmml.TestDistributions).attrib["testStatistic"]

    if testStatistic in ("CUSUM", "zValue", "GLR"):
        self.baseline = testDistributions.child(pmml.Baseline).child()
        self.update = self.updateDistribution

        if testStatistic != "CUSUM":
            self.alternateField = None
            self.alternateValue = None
            self.cusumInitialization = None
        else:
            self.alternateField = params.pop("alternateField", None)
            self.alternateValue = params.pop("alternateValue", None)

            # Find the Extension holding the CUSUM state, creating it if needed.
            extension = testDistributions.child(pmml.Extension, exception=False)
            if extension is None:
                extension = pmml.newInstance("Extension")
                testDistributions.children.append(extension)

            stored = extension.child(pmml.X_ODG_CUSUMInitialization, exception=False)
            self.cusumInitialization = stored
            if stored is None:
                self.cusumInitialization = pmml.newInstance(
                    "X-ODG-CUSUMInitialization", attrib={"value": 0.}, base=pmml.X_ODG_PMML)
                extension.children.append(self.cusumInitialization)
            elif not self.updateExisting:
                # Starting over: reset the stored value.
                self.cusumInitialization.attrib["value"] = 0.

    elif testStatistic in ("chiSquareDistribution", "scalarProduct"):
        self.weightField = testDistributions.attrib.get("weightField", None)

        def isCountTable(x):
            return isinstance(x, (pmml.CountTable, pmml.NormalizedCountTable))

        self.countTable = testDistributions.child(pmml.Baseline).child(isCountTable)
        self.update = self.updateHistogram

    elif testStatistic == "chiSquareIndependence":
        self.baseline = testDistributions.child(pmml.Baseline)
        self.fields = None
        self.countTable = None
        self.updators = {}
        self.total_updator = self.engine.producerUpdateScheme.updator(SUMX)
        self.update = self.updateChiSquareIndependence

    # The CUSUM branch consumes these keys, so any that remain were supplied
    # for a different test statistic.
    for key in ("alternateField", "alternateValue"):
        if key in params:
            raise NotImplementedError(
                "The '%s' producerParameter is only used by CUSUM" % key)

    if params:
        raise TypeError("Unrecognized parameters %s" % params)
def initialize(self, **params):
    """Configure the tree producer from its parameters and the segment's model.

    Recognized parameters: ``updateExisting``, ``treeMaxDepth`` and
    ``classification``; each falls back to ``self.defaultParams``.  An empty
    ``classification`` means "use the first 'predicted' field".

    Active MiningFields get a data buffer in ``self.data``: an ``array.array``
    for boolean, integer, float and double data types, a plain list otherwise.

    Raises:
        NotImplementedError: if ``updateExisting`` evaluates to true.
        RuntimeError: if the requested classification field is not among the
            MiningSchema's 'predicted' fields.
        TypeError: if any unrecognized parameter is left over.
    """
    def takeParam(key):
        # Consume an explicit parameter, or fall back to the producer default.
        if key in params:
            return params.pop(key)
        return self.defaultParams[key]

    self.updateExisting = pmml.boolCheck(takeParam("updateExisting"))
    if self.updateExisting:
        raise NotImplementedError(
            "Updating from existing TreeModels/RuleSetModels not implemented; use mode='replaceExisting'")

    self.treeMaxDepth = int(takeParam("treeMaxDepth"))

    self.classification = takeParam("classification")
    if self.classification == "":
        self.classification = None

    self.model = self.segmentRecord.pmmlModel

    if isinstance(self.model, pmml.TreeModel):
        self.nodeIndex = self.model.index(pmml.Node)
        self.model.attrib["splitCharacteristic"] = "binarySplit"

    elif isinstance(self.model, pmml.RuleSetModel):
        self.ruleSet = self.model.child(pmml.RuleSet)
        self.nodeIndex = self.ruleSet.index(
            lambda x: isinstance(x, (pmml.SimpleRule, pmml.CompoundRule)),
            exception=False)
        if self.nodeIndex is None:
            # No rule yet: reserve a placeholder slot at the end.
            self.nodeIndex = len(self.ruleSet.children)
            self.ruleSet.children.append(None)

    self.features = []
    self.categorical = {}
    self.predicted = []
    self.data = {}
    for miningField in self.model.child(pmml.MiningSchema).matches(pmml.MiningField):
        name = miningField.attrib["name"]
        usageType = miningField.attrib.get("usageType", "active")

        if usageType == "active":
            dataType = self.model.dataContext.dataType[name]
            optype = self.model.dataContext.optype[name]

            self.features.append(name)
            self.categorical[name] = (optype == "categorical")

            if dataType == "boolean":
                self.data[name] = array.array("b")
            elif dataType == "integer":
                self.data[name] = array.array("l")
            elif dataType in ("float", "double"):
                self.data[name] = array.array("d")
            else:
                self.data[name] = []

        if usageType == "predicted":
            self.predicted.append(name)

    if not self.predicted:
        self.classification = INVALID
    elif self.classification is None:
        # by default, take the first 'predicted' feature
        self.classification = self.predicted[0]
    elif self.classification not in self.predicted:
        raise RuntimeError(
            "Classification feature not found among the 'predicted' features in the decision tree's MiningSchema%s"
            % self.model.child(pmml.MiningSchema).fileAndLine())

    # NOTE(review): placed at function level, so the classification column
    # always gets a buffer; the flattened original is ambiguous -- confirm.
    self.data[self.classification] = []

    if params:
        raise TypeError("Unrecognized parameters %s" % params)
def initialize(self, **params):
    """An event-based tree-producing algorithm.

    Although it does not iterate over the data as the standard CART
    algorithm does, it converges to an approximate tree by keeping
    alternate hypotheses in mind and collecting data for all active
    hypotheses.

    Recognized parameters: ``updateExisting``, ``featureMaturityThreshold``,
    ``splitMaturityThreshold``, ``trialsToKeep``, ``worldsToSplit``,
    ``treeDepth`` and ``classification``; each falls back to
    ``self.defaultParams``.  An empty ``classification`` means "use the first
    'predicted' field".

    Raises:
        NotImplementedError: if ``updateExisting`` evaluates to true.
        RuntimeError: if the requested classification field is not among the
            MiningSchema's 'predicted' fields.
        TypeError: if any unrecognized parameter is left over.
    """
    def takeParam(key):
        # Consume an explicit parameter, or fall back to the producer default.
        if key in params:
            return params.pop(key)
        return self.defaultParams[key]

    self.updateExisting = pmml.boolCheck(takeParam("updateExisting"))
    if self.updateExisting:
        raise NotImplementedError(
            "Updating from existing TreeModels/RuleSetModels not implemented; use mode='replaceExisting'")

    for key in ("featureMaturityThreshold", "splitMaturityThreshold",
                "trialsToKeep", "worldsToSplit", "treeDepth"):
        setattr(self, key, int(takeParam(key)))

    self.classification = takeParam("classification")
    if self.classification == "":
        self.classification = None

    self.model = self.segmentRecord.pmmlModel

    if isinstance(self.model, pmml.TreeModel):
        self.modelType = self.TREEMODEL
        self.nodeIndex = self.model.index(pmml.Node)

    elif isinstance(self.model, pmml.RuleSetModel):
        self.ruleSet = self.model.child(pmml.RuleSet)
        self.modelType = self.RULESETMODEL
        self.nodeIndex = self.ruleSet.index(
            lambda x: isinstance(x, (pmml.SimpleRule, pmml.CompoundRule)),
            exception=False)
        if self.nodeIndex is None:
            # No rule yet: reserve a placeholder slot at the end.
            self.nodeIndex = len(self.ruleSet.children)
            self.ruleSet.children.append(None)

    self.features = []
    self.predicted = []
    for miningField in self.model.child(pmml.MiningSchema).matches(pmml.MiningField):
        name = miningField.attrib["name"]
        usageType = miningField.attrib.get("usageType", "active")

        if usageType == "active":
            dataType = self.model.dataContext.dataType[name]
            optype = self.model.dataContext.optype[name]
            if optype == "ordinal" and dataType == "string":
                # Ordinal strings use the data context's cast entry as optype.
                optype = self.model.dataContext.cast[name]

            feature = Feature(name, optype, dataType, self.engine.producerUpdateScheme)
            feature.maturityThreshold = self.featureMaturityThreshold
            self.features.append(feature)

        if usageType == "predicted":
            self.predicted.append(name)

    if not self.predicted:
        self.classification = INVALID
    elif self.classification is None:
        # by default, take the first 'predicted' feature
        self.classification = self.predicted[0]
    elif self.classification not in self.predicted:
        raise RuntimeError(
            "Classification feature not found among the 'predicted' features in the decision tree's MiningSchema%s"
            % self.model.child(pmml.MiningSchema).fileAndLine())

    self.topWorld = World(0, None)
    self.counts = {}

    if params:
        raise TypeError("Unrecognized parameters %s" % params)
def initialize(self, **params):
    """An event-based tree-producing algorithm.

    Although it does not iterate over the data as the standard CART
    algorithm does, it converges to an approximate tree by keeping
    alternate hypotheses in mind and collecting data for all active
    hypotheses.

    Recognized parameters: ``updateExisting``, ``featureMaturityThreshold``,
    ``splitMaturityThreshold``, ``trialsToKeep``, ``worldsToSplit``,
    ``treeDepth`` and ``classification``; each falls back to
    ``self.defaultParams``.  An empty ``classification`` means "use the first
    'predicted' field".

    Raises:
        NotImplementedError: if ``updateExisting`` evaluates to true.
        RuntimeError: if the requested classification field is not among the
            MiningSchema's 'predicted' fields.
        TypeError: if any unrecognized parameter is left over.
    """
    if "updateExisting" in params:
        self.updateExisting = pmml.boolCheck(params.pop("updateExisting"))
    else:
        self.updateExisting = pmml.boolCheck(self.defaultParams["updateExisting"])
    if self.updateExisting:
        raise NotImplementedError(
            "Updating from existing TreeModels/RuleSetModels not implemented; use mode='replaceExisting'")

    # Integer-valued settings, each taken from params or from the defaults.
    integerKeys = ("featureMaturityThreshold", "splitMaturityThreshold",
                   "trialsToKeep", "worldsToSplit", "treeDepth")
    for key in integerKeys:
        if key in params:
            setattr(self, key, int(params.pop(key)))
        else:
            setattr(self, key, int(self.defaultParams[key]))

    if "classification" in params:
        self.classification = params.pop("classification")
    else:
        self.classification = self.defaultParams["classification"]
    if self.classification == "":
        self.classification = None

    self.model = self.segmentRecord.pmmlModel

    if isinstance(self.model, pmml.TreeModel):
        self.modelType = self.TREEMODEL
        self.nodeIndex = self.model.index(pmml.Node)

    elif isinstance(self.model, pmml.RuleSetModel):
        self.ruleSet = self.model.child(pmml.RuleSet)
        self.modelType = self.RULESETMODEL
        self.nodeIndex = self.ruleSet.index(
            lambda x: isinstance(x, (pmml.SimpleRule, pmml.CompoundRule)),
            exception=False)
        if self.nodeIndex is None:
            # No rule yet: reserve a placeholder slot at the end.
            self.nodeIndex = len(self.ruleSet.children)
            self.ruleSet.children.append(None)

    self.features = []
    self.predicted = []
    for miningField in self.model.child(pmml.MiningSchema).matches(pmml.MiningField):
        name = miningField.attrib["name"]
        usageType = miningField.attrib.get("usageType", "active")

        if usageType == "active":
            dataType = self.model.dataContext.dataType[name]
            optype = self.model.dataContext.optype[name]
            if optype == "ordinal" and dataType == "string":
                # Ordinal strings use the data context's cast entry as optype.
                optype = self.model.dataContext.cast[name]

            feature = Feature(name, optype, dataType, self.engine.producerUpdateScheme)
            feature.maturityThreshold = self.featureMaturityThreshold
            self.features.append(feature)

        if usageType == "predicted":
            self.predicted.append(name)

    if not self.predicted:
        self.classification = INVALID
    elif self.classification is None:
        # by default, take the first 'predicted' feature
        self.classification = self.predicted[0]
    elif self.classification not in self.predicted:
        raise RuntimeError(
            "Classification feature not found among the 'predicted' features in the decision tree's MiningSchema%s"
            % self.model.child(pmml.MiningSchema).fileAndLine())

    self.topWorld = World(0, None)
    self.counts = {}

    if params:
        raise TypeError("Unrecognized parameters %s" % params)
def initialize(self, **params):
    """Configure the k-means producer from ``params`` and the PMML model.

    Recognized keys in ``params`` (each one falls back to the entry of
    the same name in ``self.defaultParams``): ``updateExisting``,
    ``quickConvergeSteps``, ``numberOfClusters``, ``seedSource``,
    ``numberOfTrials``, ``numberToConverge``, ``maxIterations`` and
    ``closeEnough``.

    Raises:
        NotImplementedError: if ``updateExisting`` is explicitly passed
            as true, or ``seedSource`` is not a recognized name.
        RuntimeError: if ``quickConvergeSteps``, ``numberOfClusters``,
            ``maxIterations`` or ``numberToConverge`` is invalid.
        TypeError: if any unrecognized parameter is left over.
    """
    if "updateExisting" in params:
        self.updateExisting = pmml.boolCheck(params.pop("updateExisting"))
        # Only an explicitly supplied value is rejected; the default in
        # the else-branch is stored without this check.
        if self.updateExisting:
            raise NotImplementedError("Updating from existing ClusterModels using 'kmeans' not implemented; use mode='replaceExisting'")
    else:
        self.updateExisting = pmml.boolCheck(self.defaultParams["updateExisting"])

    # NOTE(review): eval() executes the parameter string as Python code;
    # ast.literal_eval would be safer if this value can come from an
    # untrusted configuration.
    if "quickConvergeSteps" in params:
        try:
            self.quickConvergeSteps = eval(params["quickConvergeSteps"])
            if not isinstance(self.quickConvergeSteps, tuple):
                raise RuntimeError
            # list(...) forces the int conversion to happen inside the
            # try block on Python 3 as well, where map() is lazy.
            self.quickConvergeSteps = list(map(int, self.quickConvergeSteps))
        except Exception:
            raise RuntimeError("quickConvergeSteps must be a tuple of numbers of events")
        del params["quickConvergeSteps"]
    else:
        self.quickConvergeSteps = eval(self.defaultParams["quickConvergeSteps"])

    if "numberOfClusters" in params:
        self.numberOfClusters = params.pop("numberOfClusters")
    else:
        self.numberOfClusters = self.defaultParams["numberOfClusters"]
    try:
        self.numberOfClusters = int(self.numberOfClusters)
        if self.numberOfClusters <= 0:
            raise ValueError
    except ValueError:
        # Reached both for non-numeric strings and for non-positive ints.
        if self.numberOfClusters == "unset":
            self.numberOfClusters = None
        else:
            raise RuntimeError("numberOfClusters must be a positive integer or \"unset\", not \"%s\"" % self.numberOfClusters)

    if "seedSource" in params:
        self.seedSource = params.pop("seedSource")
    else:
        self.seedSource = self.defaultParams["seedSource"]
    # Translate the textual name into the corresponding class constant.
    if self.seedSource == "dataPoints":
        self.seedSource = self.RANDOM_DATAPOINTS
    elif self.seedSource == "dataWeighted":
        self.seedSource = self.RANDOM_DATAWEIGHTED
    elif self.seedSource == "dataCovariance":
        self.seedSource = self.RANDOM_DATACOVARIANCE
    elif self.seedSource == "unitRect":
        self.seedSource = self.RANDOM_UNITRECT
    else:
        raise NotImplementedError("The seedSource must be one of 'dataPoints', 'dataWeighted', 'dataCovariance', 'unitRect'")

    if "numberOfTrials" in params:
        self.numberOfTrials = int(params.pop("numberOfTrials"))
    else:
        self.numberOfTrials = int(self.defaultParams["numberOfTrials"])

    if "numberToConverge" in params:
        self.numberToConverge = int(params.pop("numberToConverge"))
    else:
        self.numberToConverge = int(self.defaultParams["numberToConverge"])

    if self.numberToConverge > self.numberOfTrials:
        raise RuntimeError("numberToConverge (%d) must not be greater than numberOfTrials (%d)" % (self.numberToConverge, self.numberOfTrials))

    if "maxIterations" in params:
        self.maxIterations = params.pop("maxIterations")
    else:
        self.maxIterations = self.defaultParams["maxIterations"]
    try:
        self.maxIterations = int(self.maxIterations)
        if self.maxIterations <= 0:
            raise ValueError
    except ValueError:
        if self.maxIterations == "unset":
            self.maxIterations = None
        else:
            raise RuntimeError("maxIterations must be a positive integer or \"unset\", not \"%s\"" % self.maxIterations)

    if "closeEnough" in params:
        self.closeEnough = float(params.pop("closeEnough"))
    else:
        self.closeEnough = float(self.defaultParams["closeEnough"])

    self.model = self.segmentRecord.pmmlModel
    self.dataDistribution = self.engine.producerUpdateScheme.updator(COVARIANCE(self.model.numberOfFields))
    self.distanceMeasure = (self.model.child(pmml.ComparisonMeasure).attrib["kind"] == "distance")

    # Weighted seeding needs a weight field; without one, fall back to
    # plain data-point seeding.
    if self.seedSource == self.RANDOM_DATAWEIGHTED and self.model.weightField is None:
        self.seedSource = self.RANDOM_DATAPOINTS

    # get rid of any PartialSums objects, since they would be misleading (this algorithm doesn't use them)
    extension = self.model.child(pmml.Extension, exception=False)
    if extension is not None:
        extension.children = [child for child in extension.children
                              if not isinstance(child, pmml.X_ODG_PartialSums)]

    # One list per buffered quantity: the sync number, every model
    # field, and the weight field when the model has one.
    self.buffer = {self.SYNCNUMBER: []}
    for field in self.model.fields:
        self.buffer[field] = []
    if self.model.weightField is not None:
        self.buffer[self.model.weightField] = []

    # Every recognized key was removed above; anything left is an error.
    if len(params) > 0:
        raise TypeError("Unrecognized parameters %s" % params)
def initialize(self, **params):
    """Configure a tree/rule-set producer that stores its input per field.

    Recognized keys in ``params`` (each one falls back to the entry of
    the same name in ``self.defaultParams``): ``updateExisting``,
    ``treeMaxDepth`` and ``classification``.

    For every 'active' MiningField a container is created in
    ``self.data``: a typed ``array.array`` for boolean, integer and
    float/double fields, and a plain list for anything else.

    Raises:
        NotImplementedError: if ``updateExisting`` evaluates to true.
        RuntimeError: if ``classification`` is given but is not one of
            the 'predicted' fields of the model's MiningSchema.
        TypeError: if any unrecognized parameter is left over.
    """
    if "updateExisting" in params:
        self.updateExisting = pmml.boolCheck(params.pop("updateExisting"))
    else:
        self.updateExisting = pmml.boolCheck(self.defaultParams["updateExisting"])
    if self.updateExisting:
        raise NotImplementedError("Updating from existing TreeModels/RuleSetModels not implemented; use mode='replaceExisting'")

    if "treeMaxDepth" in params:
        self.treeMaxDepth = int(params.pop("treeMaxDepth"))
    else:
        self.treeMaxDepth = int(self.defaultParams["treeMaxDepth"])

    if "classification" in params:
        self.classification = params.pop("classification")
    else:
        self.classification = self.defaultParams["classification"]
    # An empty string means "not specified".
    if self.classification == "":
        self.classification = None

    self.model = self.segmentRecord.pmmlModel
    if isinstance(self.model, pmml.TreeModel):
        self.nodeIndex = self.model.index(pmml.Node)
        self.model.attrib["splitCharacteristic"] = "binarySplit"
    elif isinstance(self.model, pmml.RuleSetModel):
        self.ruleSet = self.model.child(pmml.RuleSet)
        self.nodeIndex = self.ruleSet.index(lambda x: isinstance(x, (pmml.SimpleRule, pmml.CompoundRule)), exception=False)
        if self.nodeIndex is None:
            # No rule present yet: reserve a placeholder slot at the end.
            self.nodeIndex = len(self.ruleSet.children)
            self.ruleSet.children.append(None)

    self.features = []
    self.categorical = {}
    self.predicted = []
    self.data = {}
    for miningField in self.model.child(pmml.MiningSchema).matches(pmml.MiningField):
        name = miningField.attrib["name"]
        usageType = miningField.attrib.get("usageType", "active")

        if usageType == "active":
            dataType = self.model.dataContext.dataType[name]
            optype = self.model.dataContext.optype[name]

            self.features.append(name)
            self.categorical[name] = (optype == "categorical")

            # Typed arrays for the primitive data types, a list otherwise.
            if dataType == "boolean":
                self.data[name] = array.array("b")
            elif dataType == "integer":
                self.data[name] = array.array("l")
            elif dataType in ("float", "double"):
                self.data[name] = array.array("d")
            else:
                self.data[name] = []

        if usageType == "predicted":
            self.predicted.append(name)

    if len(self.predicted) == 0:
        self.classification = INVALID
    elif self.classification is None:
        # by default, take the first 'predicted' feature
        self.classification = self.predicted[0]
    elif self.classification not in self.predicted:
        raise RuntimeError("Classification feature not found among the 'predicted' features in the decision tree's MiningSchema%s" % self.model.child(pmml.MiningSchema).fileAndLine())

    # NOTE(review): placed at method level, so the container is created
    # even when the classification is INVALID -- confirm against callers.
    self.data[self.classification] = []

    # Every recognized key was removed above; anything left is an error.
    if len(params) > 0:
        raise TypeError("Unrecognized parameters %s" % params)
def initialize(self, **params):
    """Initialize a clustering model producer.

    Recognized keys in ``params`` and their defaults: ``resume``
    (False), ``numberOfTrials`` (10), ``numberToKeep`` (3),
    ``maturityThreshold`` (100), ``initialStability`` (100) and
    ``overrideSignificance`` (5.; an explicit value of 0 is stored as
    None).

    X-ODG-PartialSums elements are kept in the model's Extension (which
    is created when missing). With ``resume`` the existing elements are
    reused; otherwise matching ones are deleted and fresh ones are
    created. The first trial is then built from those partial sums.

    Raises:
        TypeError: if any unrecognized parameter is left over.
    """
    if "resume" in params:
        self.resume = pmml.boolCheck(params.pop("resume"))
    else:
        self.resume = False

    self.numberOfTrials = int(params.pop("numberOfTrials", 10))
    self.numberToKeep = int(params.pop("numberToKeep", 3))
    self.maturityThreshold = int(params.pop("maturityThreshold", 100))
    self.initialStability = int(params.pop("initialStability", 100))

    if "overrideSignificance" in params:
        self.overrideSignificance = float(params.pop("overrideSignificance"))
        if self.overrideSignificance == 0.:
            self.overrideSignificance = None
    else:
        self.overrideSignificance = 5.

    self.model = self.segmentRecord.pmmlModel
    self.dataDistribution = self.engine.producerUpdateScheme.updator(COVARIANCE(self.model.numberOfFields))
    self.distanceMeasure = (self.model.child(pmml.ComparisonMeasure).attrib["kind"] == "distance")

    # put PartialSums in the model if they're not already there; pick up old values if you're resuming
    extension = self.model.child(pmml.Extension, exception=False)
    if extension is None:
        extension = pmml.Extension()
        self.model.children.append(extension)

    def previousPartialSums(name):
        # Return the stored PartialSums called `name` when resuming (None
        # if absent); otherwise delete a stored one and return None.
        matches = lambda x: isinstance(x, pmml.X_ODG_PartialSums) and x.attrib.get("name", None) == name
        if self.resume:
            return extension.child(matches, exception=False)
        index = extension.index(matches, exception=False)
        if index is not None:
            del extension[index]
        return None

    self.sumOfDistances = previousPartialSums("SumOfDistances")
    if self.sumOfDistances is None:
        self.sumOfDistances = pmml.X_ODG_PartialSums(name="SumOfDistances", COUNT=0, SUM1=0., SUMX=0., SUMXX=0.)
        extension.children.append(self.sumOfDistances)

    # One PartialSums element per (cluster id, field) pair, keyed "id.field".
    self.partialSums = {}
    for theid, cluster in zip(self.model.ids, self.model.cluster):
        for i, field in enumerate(self.model.fields):
            fullname = "%s.%s" % (theid, field)
            partialSum = previousPartialSums(fullname)
            if partialSum is None:
                # Seed the new element from the cluster's current coordinate.
                partialSum = pmml.X_ODG_PartialSums(name=fullname, SUM1=1., SUMX=cluster.value[i])
                extension.children.append(partialSum)
            self.partialSums[fullname] = partialSum

    # create the first trial using the values constructed above (they come from the PMML file if resume is True)
    # NOTE(review): the `new` module exists only in Python 2; new.instance
    # creates the objects without running their __init__.
    trialFromPmml = new.instance(TrialClusterSet)
    trialFromPmml.updator = self.engine.producerUpdateScheme.updator(COUNT, SUM1, SUMX, SUMXX)
    trialFromPmml.updator.initialize({COUNT: self.sumOfDistances.attrib["COUNT"],
                                      SUM1: self.sumOfDistances.attrib["SUM1"],
                                      SUMX: self.sumOfDistances.attrib["SUMX"],
                                      SUMXX: self.sumOfDistances.attrib["SUMXX"]})

    trialFromPmml.clusters = []
    for theid, cluster in zip(self.model.ids, self.model.cluster):
        trialCluster = new.instance(TrialCluster)
        trialCluster.fields = []
        trialCluster.initialPosition = []
        for field in self.model.fields:
            partialSum = self.partialSums["%s.%s" % (theid, field)]
            u = self.engine.producerUpdateScheme.updator(SUM1, SUMX)
            u.initialize({SUM1: partialSum.attrib["SUM1"], SUMX: partialSum.attrib["SUMX"]})
            trialCluster.fields.append(u)
        trialCluster.initialPosition = list(cluster.value)
        trialFromPmml.clusters.append(trialCluster)

    self.trials = [trialFromPmml]

    # Every recognized key was removed above; anything left is an error.
    if len(params) > 0:
        raise TypeError("Unrecognized parameters %s" % params)
def initialize(self, **params):
    """Initialize a baseline producer.

    Unlike other producers, this creates the update function
    dynamically, depending on the testStatistic.

    Recognized keys in ``params``: ``updateExisting`` (falls back to
    ``self.defaultParams``) and, for the CUSUM test statistic only,
    ``alternateField`` and ``alternateValue`` (both default to None).

    Raises:
        NotImplementedError: if ``alternateField`` or ``alternateValue``
            is supplied for a test statistic other than CUSUM.
        TypeError: if any unrecognized parameter is left over.
    """
    testDistributions = self.segmentRecord.pmmlModel.child(pmml.TestDistributions)
    self.field = testDistributions.attrib["field"]

    if "updateExisting" in params:
        self.updateExisting = pmml.boolCheck(params.pop("updateExisting"))
    else:
        self.updateExisting = pmml.boolCheck(self.defaultParams["updateExisting"])

    self.first = True

    # Reuse the element looked up above rather than searching for it again.
    testStatistic = testDistributions.attrib["testStatistic"]

    if testStatistic in ("CUSUM", "zValue", "GLR"):
        self.baseline = testDistributions.child(pmml.Baseline).child()
        self.update = self.updateDistribution

        # Only CUSUM consumes the alternate-distribution parameters; for
        # the others they stay in `params` and are rejected below.
        if testStatistic == "CUSUM":
            self.alternateField = params.pop("alternateField", None)
            self.alternateValue = params.pop("alternateValue", None)
        else:
            self.alternateField = None
            self.alternateValue = None

    elif testStatistic in ("chiSquareDistribution", "scalarProduct"):
        self.weightField = testDistributions.attrib.get("weightField", None)
        self.countTable = testDistributions.child(pmml.Baseline).child(
            lambda x: isinstance(x, (pmml.CountTable, pmml.NormalizedCountTable)))
        self.update = self.updateHistogram

    elif testStatistic == "chiSquareIndependence":
        self.baseline = testDistributions.child(pmml.Baseline)
        self.fields = None
        self.countTable = None
        self.updators = {}
        self.total_updator = self.engine.producerUpdateScheme.updator(SUMX)
        self.update = self.updateChiSquareIndependence

    if "alternateField" in params:
        raise NotImplementedError("The 'alternateField' producerParameter is only used by CUSUM")
    if "alternateValue" in params:
        raise NotImplementedError("The 'alternateValue' producerParameter is only used by CUSUM")

    # Every recognized key was removed above; anything left is an error.
    if len(params) > 0:
        raise TypeError("Unrecognized parameters %s" % params)
def initialize(self, **params):
    """Configure the k-means producer from ``params`` and the PMML model.

    Recognized keys in ``params`` (each one falls back to the entry of
    the same name in ``self.defaultParams``): ``updateExisting``,
    ``quickConvergeSteps``, ``numberOfClusters``, ``seedSource``,
    ``numberOfTrials``, ``numberToConverge``, ``maxIterations`` and
    ``closeEnough``.

    Raises:
        NotImplementedError: if ``updateExisting`` is explicitly passed
            as true, or ``seedSource`` is not a recognized name.
        RuntimeError: if ``quickConvergeSteps``, ``numberOfClusters``,
            ``maxIterations`` or ``numberToConverge`` is invalid.
        TypeError: if any unrecognized parameter is left over.
    """
    if "updateExisting" in params:
        self.updateExisting = pmml.boolCheck(params.pop("updateExisting"))
        # Only an explicitly supplied value is rejected; the default in
        # the else-branch is stored without this check.
        if self.updateExisting:
            raise NotImplementedError("Updating from existing ClusterModels using 'kmeans' not implemented; use mode='replaceExisting'")
    else:
        self.updateExisting = pmml.boolCheck(self.defaultParams["updateExisting"])

    # NOTE(review): eval() executes the parameter string as Python code;
    # ast.literal_eval would be safer if this value can come from an
    # untrusted configuration.
    if "quickConvergeSteps" in params:
        try:
            self.quickConvergeSteps = eval(params["quickConvergeSteps"])
            if not isinstance(self.quickConvergeSteps, tuple):
                raise RuntimeError
            # list(...) forces the int conversion to happen inside the
            # try block on Python 3 as well, where map() is lazy.
            self.quickConvergeSteps = list(map(int, self.quickConvergeSteps))
        except Exception:
            raise RuntimeError("quickConvergeSteps must be a tuple of numbers of events")
        del params["quickConvergeSteps"]
    else:
        self.quickConvergeSteps = eval(self.defaultParams["quickConvergeSteps"])

    if "numberOfClusters" in params:
        self.numberOfClusters = params.pop("numberOfClusters")
    else:
        self.numberOfClusters = self.defaultParams["numberOfClusters"]
    try:
        self.numberOfClusters = int(self.numberOfClusters)
        if self.numberOfClusters <= 0:
            raise ValueError
    except ValueError:
        # Reached both for non-numeric strings and for non-positive ints.
        if self.numberOfClusters == "unset":
            self.numberOfClusters = None
        else:
            raise RuntimeError("numberOfClusters must be a positive integer or \"unset\", not \"%s\"" % self.numberOfClusters)

    if "seedSource" in params:
        self.seedSource = params.pop("seedSource")
    else:
        self.seedSource = self.defaultParams["seedSource"]
    # Translate the textual name into the corresponding class constant.
    if self.seedSource == "dataPoints":
        self.seedSource = self.RANDOM_DATAPOINTS
    elif self.seedSource == "dataWeighted":
        self.seedSource = self.RANDOM_DATAWEIGHTED
    elif self.seedSource == "dataCovariance":
        self.seedSource = self.RANDOM_DATACOVARIANCE
    elif self.seedSource == "unitRect":
        self.seedSource = self.RANDOM_UNITRECT
    else:
        raise NotImplementedError("The seedSource must be one of 'dataPoints', 'dataWeighted', 'dataCovariance', 'unitRect'")

    if "numberOfTrials" in params:
        self.numberOfTrials = int(params.pop("numberOfTrials"))
    else:
        self.numberOfTrials = int(self.defaultParams["numberOfTrials"])

    if "numberToConverge" in params:
        self.numberToConverge = int(params.pop("numberToConverge"))
    else:
        self.numberToConverge = int(self.defaultParams["numberToConverge"])

    if self.numberToConverge > self.numberOfTrials:
        raise RuntimeError("numberToConverge (%d) must not be greater than numberOfTrials (%d)" % (self.numberToConverge, self.numberOfTrials))

    if "maxIterations" in params:
        self.maxIterations = params.pop("maxIterations")
    else:
        self.maxIterations = self.defaultParams["maxIterations"]
    try:
        self.maxIterations = int(self.maxIterations)
        if self.maxIterations <= 0:
            raise ValueError
    except ValueError:
        if self.maxIterations == "unset":
            self.maxIterations = None
        else:
            raise RuntimeError("maxIterations must be a positive integer or \"unset\", not \"%s\"" % self.maxIterations)

    if "closeEnough" in params:
        self.closeEnough = float(params.pop("closeEnough"))
    else:
        self.closeEnough = float(self.defaultParams["closeEnough"])

    self.model = self.segmentRecord.pmmlModel
    self.dataDistribution = self.engine.producerUpdateScheme.updator(COVARIANCE(self.model.numberOfFields))
    self.distanceMeasure = (self.model.child(pmml.ComparisonMeasure).attrib["kind"] == "distance")

    # Weighted seeding needs a weight field; without one, fall back to
    # plain data-point seeding.
    if self.seedSource == self.RANDOM_DATAWEIGHTED and self.model.weightField is None:
        self.seedSource = self.RANDOM_DATAPOINTS

    # get rid of any PartialSums objects, since they would be misleading (this algorithm doesn't use them)
    extension = self.model.child(pmml.Extension, exception=False)
    if extension is not None:
        extension.children = [child for child in extension.children
                              if not isinstance(child, pmml.X_ODG_PartialSums)]

    # One list per buffered quantity: the sync number, every model
    # field, and the weight field when the model has one.
    self.buffer = {self.SYNCNUMBER: []}
    for field in self.model.fields:
        self.buffer[field] = []
    if self.model.weightField is not None:
        self.buffer[self.model.weightField] = []

    # Every recognized key was removed above; anything left is an error.
    if len(params) > 0:
        raise TypeError("Unrecognized parameters %s" % params)