def calc(self, inputData, inputMask=None, inputState=None, functionTable=None, performanceTable=None):
    """Build a DataTable from raw input and immediately draw the plot.

    This is a convenience entry point for interactive sessions, where
    assembling a DataTable by hand would be tedious.  The FunctionTable
    is handed on to C{makePlot}, which may modify it.

    Note that PmmlCalculables return a DataTable from C{calc}, whereas
    a PlotCanvas returns an SvgBinding.

    @type inputData: dict
    @param inputData: Dictionary from field names to data, as required by the DataTable constructor.
    @type inputMask: dict or None
    @param inputMask: Dictionary from field names to missing value masks, as required by the DataTable constructor.
    @type inputState: DataTableState or None
    @param inputState: Calculation state, used to continue a calculation over many C{calc} calls.
    @type functionTable: FunctionTable or None
    @param functionTable: A table of functions.  If None, a fresh FunctionTable is created.
    @type performanceTable: PerformanceTable or None
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.  If None, a FakePerformanceTable is used.
    @rtype: SvgBinding
    @return: Whatever C{makePlot} produces: a complete SVG image of the drawn plot.
    """

    functionTable = FunctionTable() if functionTable is None else functionTable
    performanceTable = FakePerformanceTable() if performanceTable is None else performanceTable

    # Only the DataTable construction is timed under this label.
    timerLabel = "make DataTable"
    performanceTable.begin(timerLabel)
    builtTable = DataTable(self, inputData, inputMask, inputState)
    performanceTable.end(timerLabel)

    svgImage = self.makePlot(builtTable, functionTable, performanceTable)
    return svgImage
def calc(self, inputData, inputMask=None, performanceTable=None):
    """Build a DataTable from raw input and run k-means clustering on it.

    This is a convenience entry point for interactive sessions, where
    assembling a DataTable by hand would be tedious.  The DataTable is
    built against C{self.clusteringModel} with no input state, then
    C{smallTrials} and C{optimize} are run on it in that order.
    Modifies and returns C{self.clusteringModel}.

    @type inputData: dict
    @param inputData: Dictionary from field names to data, as required by the DataTable constructor.
    @type inputMask: dict or None
    @param inputMask: Dictionary from field names to missing value masks, as required by the DataTable constructor.
    @type performanceTable: PerformanceTable or None
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.  If None, a FakePerformanceTable is used.
    @rtype: PmmlBinding
    @return: The PMML model representing the result of the k-means clustering.
    """

    performanceTable = FakePerformanceTable() if performanceTable is None else performanceTable

    # Only the DataTable construction is timed under this label.
    timerLabel = "make DataTable"
    performanceTable.begin(timerLabel)
    trainingTable = DataTable(self.clusteringModel, inputData, inputMask, None)
    performanceTable.end(timerLabel)

    # Seed selection first, then the full optimization over the same table.
    self.smallTrials(trainingTable, performanceTable=performanceTable)
    self.optimize([trainingTable], performanceTable=performanceTable)

    # Re-read the attribute rather than caching it before the calls above.
    return self.clusteringModel
def calc(self, inputData, inputMask=None, inputState=None, functionTable=None, performanceTable=None):
    """Build a DataTable from raw input and run this object's calculation on it.

    This is a convenience entry point for interactive sessions, where
    assembling a DataTable by hand would be tedious.  The FunctionTable
    is handed on to C{calculate}, which may modify it.

    @type inputData: dict
    @param inputData: Dictionary from field names to data, as required by the DataTable constructor.
    @type inputMask: dict or None
    @param inputMask: Dictionary from field names to missing value masks, as required by the DataTable constructor.
    @type inputState: DataTableState or None
    @param inputState: Calculation state, used to continue a calculation over many C{calc} calls.
    @type functionTable: FunctionTable or None
    @param functionTable: A table of functions.  If None, a fresh FunctionTable is created.
    @type performanceTable: PerformanceTable or None
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.  If None, a FakePerformanceTable is used.
    @rtype: DataTable
    @return: The newly built DataTable after C{calculate} has been applied to it.
    """

    functionTable = FunctionTable() if functionTable is None else functionTable
    performanceTable = FakePerformanceTable() if performanceTable is None else performanceTable

    # Only the DataTable construction is timed under this label.
    timerLabel = "make DataTable"
    performanceTable.begin(timerLabel)
    resultTable = DataTable(self, inputData, inputMask, inputState)
    performanceTable.end(timerLabel)

    # The return value of calculate is ignored; the table itself is returned.
    self.calculate(resultTable, functionTable, performanceTable)
    return resultTable
def makePlot(self, dataTable, functionTable=None, performanceTable=None):
    """Draw every plot frame of this canvas and wrap the result in an SVG root.

    The root element's attributes start from a copy of
    C{self.globalAttrib} and are then overridden with this element's
    id (if "svgId" is set), viewBox, style, font-family and
    font-weight.  Each child of class PmmlPlotFrame is asked to draw
    itself into a content box covering the whole canvas; a C{defs}
    element holding the collected plot definitions is placed before
    the frames.

    @type dataTable: DataTable
    @param dataTable: Contains the data to plot.
    @type functionTable: FunctionTable or None
    @param functionTable: Defines functions that may be used to transform data for plotting.  If None, a fresh FunctionTable is created.
    @type performanceTable: PerformanceTable or None
    @param performanceTable: Measures and records performance of the drawing process.  If None, a FakePerformanceTable is used.
    @rtype: SvgBinding
    @return: A complete SVG image representing the fully drawn plot.
    """

    functionTable = FunctionTable() if functionTable is None else functionTable
    performanceTable = FakePerformanceTable() if performanceTable is None else performanceTable

    elementMaker = SvgBinding.elementMaker
    timerLabel = "PlotCanvas"

    performanceTable.begin(timerLabel)

    canvasWidth = self.get("width", defaultFromXsd=True, convertType=True)
    canvasHeight = self.get("height", defaultFromXsd=True, convertType=True)
    canvasStyle = self.get("style", defaultFromXsd=True)

    # Work on a copy so the shared global attributes are left untouched.
    svgAttributes = self.globalAttrib.copy()
    canvasId = self.get("svgId")
    if canvasId is not None:
        svgAttributes["id"] = canvasId
    svgAttributes["viewBox"] = "0 0 %d %d" % (canvasWidth, canvasHeight)
    svgAttributes["style"] = canvasStyle
    svgAttributes["font-family"] = self.get("font-family", defaultFromXsd=True)
    svgAttributes["font-weight"] = self.get("font-weight", defaultFromXsd=True)

    coordinates = PlotCoordinates()
    contentBox = PlotContentBox(0, 0, canvasWidth, canvasHeight)
    definitions = PlotDefinitions()

    # The canvas timer is paused while the frames draw themselves, since
    # each frame receives the same performanceTable.
    performanceTable.pause(timerLabel)
    drawnFrames = []
    for plotFrame in self.childrenOfClass(PmmlPlotFrame):
        drawnFrames.append(plotFrame.frame(dataTable, functionTable, performanceTable, coordinates, contentBox, definitions))
    performanceTable.unpause(timerLabel)

    # Definitions are collected by the frames, so the defs element can only
    # be built after drawing; it still goes first in the output.
    svgChildren = [elementMaker.defs(*definitions.values())]
    svgChildren.extend(drawnFrames)

    performanceTable.end(timerLabel)

    return elementMaker.svg(*svgChildren, **svgAttributes)
def makePlot(self, dataTable, functionTable=None, performanceTable=None):
    """Draw every plot frame of this canvas and wrap the result in an SVG root.

    The root element's attributes start from a copy of
    C{self.globalAttrib} and are then overridden with this element's
    id (if "svgId" is set), viewBox, style, font-family and
    font-weight.  Each child of class PmmlPlotFrame is asked to draw
    itself into a content box covering the whole canvas; a C{defs}
    element holding the collected plot definitions is placed before
    the frames.

    @type dataTable: DataTable
    @param dataTable: Contains the data to plot.
    @type functionTable: FunctionTable or None
    @param functionTable: Defines functions that may be used to transform data for plotting.  If None, a fresh FunctionTable is created.
    @type performanceTable: PerformanceTable or None
    @param performanceTable: Measures and records performance of the drawing process.  If None, a FakePerformanceTable is used.
    @rtype: SvgBinding
    @return: A complete SVG image representing the fully drawn plot.
    """

    functionTable = FunctionTable() if functionTable is None else functionTable
    performanceTable = FakePerformanceTable() if performanceTable is None else performanceTable

    elementMaker = SvgBinding.elementMaker
    timerLabel = "PlotCanvas"

    performanceTable.begin(timerLabel)

    canvasWidth = self.get("width", defaultFromXsd=True, convertType=True)
    canvasHeight = self.get("height", defaultFromXsd=True, convertType=True)
    canvasStyle = self.get("style", defaultFromXsd=True)

    # Work on a copy so the shared global attributes are left untouched.
    svgAttributes = self.globalAttrib.copy()
    canvasId = self.get("svgId")
    if canvasId is not None:
        svgAttributes["id"] = canvasId
    svgAttributes["viewBox"] = "0 0 %d %d" % (canvasWidth, canvasHeight)
    svgAttributes["style"] = canvasStyle
    svgAttributes["font-family"] = self.get("font-family", defaultFromXsd=True)
    svgAttributes["font-weight"] = self.get("font-weight", defaultFromXsd=True)

    coordinates = PlotCoordinates()
    contentBox = PlotContentBox(0, 0, canvasWidth, canvasHeight)
    definitions = PlotDefinitions()

    # The canvas timer is paused while the frames draw themselves, since
    # each frame receives the same performanceTable.
    performanceTable.pause(timerLabel)
    drawnFrames = []
    for plotFrame in self.childrenOfClass(PmmlPlotFrame):
        drawnFrames.append(plotFrame.frame(dataTable, functionTable, performanceTable, coordinates, contentBox, definitions))
    performanceTable.unpause(timerLabel)

    # Definitions are collected by the frames, so the defs element can only
    # be built after drawing; it still goes first in the output.
    svgChildren = [elementMaker.defs(*definitions.values())]
    svgChildren.extend(drawnFrames)

    performanceTable.end(timerLabel)

    return elementMaker.svg(*svgChildren, **svgAttributes)
def calculate(self, dataTable, functionTable=None, performanceTable=None):
    """Evaluate this DerivedField's expression and store it in the DataTable.

    This method modifies the input DataTable: the result is written to
    C{dataTable.fields} under this field's name.

    The evaluated column is stored as-is when the DerivedField's
    declared "dataType" and "optype" (each defaulting to the evaluated
    column's own) agree with the column and the DerivedField has no
    "Value" children.  Otherwise the column is cast to this field's
    type.  Casting is potentially expensive and often unwanted, so it
    is reported in the PerformanceTable under a label containing the
    DerivedField name, to help the user debug their PMML.

    @type dataTable: DataTable
    @param dataTable: The pre-built DataTable.
    @type functionTable: FunctionTable or None
    @param functionTable: A table of functions.  If None, a fresh FunctionTable is created.
    @type performanceTable: PerformanceTable or None
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.  If None, a FakePerformanceTable is used.
    @rtype: DataColumn (presumably; it is whatever the expression evaluates to or the cast returns)
    @return: The entry stored in C{dataTable.fields} under this field's name.
    """

    functionTable = FunctionTable() if functionTable is None else functionTable
    performanceTable = FakePerformanceTable() if performanceTable is None else performanceTable

    # The expression is evaluated before the "DerivedField" timer starts.
    expression = self.childOfClass(PmmlExpression)
    evaluated = expression.evaluate(dataTable, functionTable, performanceTable)

    performanceTable.begin("DerivedField")

    evaluatedType = evaluated.fieldType.dataType
    evaluatedOptype = evaluated.fieldType.optype

    alreadyMatches = (self.get("dataType", evaluatedType) == evaluatedType
                      and self.get("optype", evaluatedOptype) == evaluatedOptype
                      and len(self.childrenOfTag("Value")) == 0)

    if not alreadyMatches:
        castLabel = "cast (\"%s\")" % self.name
        performanceTable.begin(castLabel)
        dataTable.fields[self.name] = FieldCastMethods.cast(FieldType(self), evaluated)
        performanceTable.end(castLabel)
    else:
        dataTable.fields[self.name] = evaluated

    performanceTable.end("DerivedField")
    return dataTable.fields[self.name]
def smallTrials(self, dataTable, numberOfTrials=5, recordsPerTrial=100, performanceTable=None):
    """Improve the initial seed with a few small trials on random subsets of the data.

    Each trial draws a random subset of rows, re-seeds the clusters
    with C{self.randomSeeds}, runs the map-reduce job on the subset
    and scores the outcome by the mean squared "smallTrials.affinity"
    over the valid rows of the subset.  The cluster vectors of the
    trial with the smallest score are installed with
    C{self.explicitSeeds}.  Modifies C{self.clusteringModel}.

    If the DataTable has fewer than C{recordsPerTrial} rows, each trial
    uses all of the rows instead of failing.

    @type dataTable: DataTable
    @param dataTable: The input data.
    @type numberOfTrials: int
    @param numberOfTrials: The number of independent trials with the same number of C{recordsPerTrial}.  The trial with the smallest sum of in-cluster variances wins.
    @type recordsPerTrial: int
    @param recordsPerTrial: The number of rows to randomly select from the DataTable in each trial (capped at the number of rows available).
    @type performanceTable: PerformanceTable or None
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.  If None, a FakePerformanceTable is used.
    """

    if performanceTable is None:
        performanceTable = FakePerformanceTable()

    performanceTable.begin("smallTrials")

    mapReduce = self.mapReduce()

    # Replace the application-level model metadata with a deep copy of itself.
    # NOTE(review): presumably this keeps the trials from mutating a shared model -- confirm.
    self.KMeansMapReduceApplication.metadata["ClusteringModel"] = copy.deepcopy(self.KMeansMapReduceApplication.metadata["ClusteringModel"])

    # random.sample raises ValueError when asked for more items than the
    # population holds, so never request more rows than the table has.
    numberOfRows = len(dataTable)
    sampleSize = min(recordsPerTrial, numberOfRows)

    bestVariance = None
    bestSeed = None
    for _trialNumber in xrange(numberOfTrials):
        indexes = random.sample(xrange(numberOfRows), sampleSize)
        subTable = dataTable.subTable(NP("array", indexes, dtype=NP.dtype(int)))

        self.randomSeeds(dataTable)
        mapReduce.metadata["ClusteringModel"] = self.clusteringModel

        # Only the iteration count is needed from the run's results.
        _outputRecords, _outputKeyValues, numberOfIterations = mapReduce.run([subTable], parallel=False, frozenClass=False, numberOfMappers=1, numberOfReducers=1, iterationLimit=self.iterationLimit)

        # Accumulate the iteration count in the model's bookkeeping Extension(s).
        for extension in self.clusteringModel.xpath("pmml:Extension[@name='iterations.smallTrials']"):
            extension["value"] = repr(int(extension["value"]) + numberOfIterations)

        trialModel = mapReduce.metadata["ClusteringModel"]
        trialModel["modelName"] = "smallTrials"
        # Copy subFields before updating so the previous dict object is not mutated.
        trialModel.subFields = dict(trialModel.subFields)
        trialModel.subFields.update({"affinity": True})
        trialModel.calculate(subTable)

        affinity = subTable.fields["smallTrials.affinity"]
        data = affinity.data
        mask = affinity.mask
        if mask is None:
            # No mask: every row of the subset contributes.
            variance = NP(data**2).sum() / float(len(subTable))
        else:
            selection = NP(mask == defs.VALID)
            denom = NP("count_nonzero", selection)
            if denom > 0:
                variance = NP(data[selection]**2).sum() / float(denom)
            else:
                # No valid rows: this trial cannot be scored.
                variance = None

        if variance is not None and (bestVariance is None or variance < bestVariance):
            bestVariance = variance
            bestSeed = mapReduce.metadata["clusterVectors"]

    if bestSeed is not None:
        self.explicitSeeds(bestSeed)

    performanceTable.end("smallTrials")
def smallTrials(self, dataTable, numberOfTrials=5, recordsPerTrial=100, performanceTable=None):
    """Improve the initial seed with a few small trials on random subsets of the data.

    Each trial draws a random subset of rows, re-seeds the clusters
    with C{self.randomSeeds}, runs the map-reduce job on the subset
    and scores the outcome by the mean squared "smallTrials.affinity"
    over the valid rows of the subset.  The cluster vectors of the
    trial with the smallest score are installed with
    C{self.explicitSeeds}.  Modifies C{self.clusteringModel}.

    If the DataTable has fewer than C{recordsPerTrial} rows, each trial
    uses all of the rows instead of failing.

    @type dataTable: DataTable
    @param dataTable: The input data.
    @type numberOfTrials: int
    @param numberOfTrials: The number of independent trials with the same number of C{recordsPerTrial}.  The trial with the smallest sum of in-cluster variances wins.
    @type recordsPerTrial: int
    @param recordsPerTrial: The number of rows to randomly select from the DataTable in each trial (capped at the number of rows available).
    @type performanceTable: PerformanceTable or None
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.  If None, a FakePerformanceTable is used.
    """

    if performanceTable is None:
        performanceTable = FakePerformanceTable()

    performanceTable.begin("smallTrials")

    mapReduce = self.mapReduce()

    # Replace the application-level model metadata with a deep copy of itself.
    # NOTE(review): presumably this keeps the trials from mutating a shared model -- confirm.
    self.KMeansMapReduceApplication.metadata["ClusteringModel"] = copy.deepcopy(self.KMeansMapReduceApplication.metadata["ClusteringModel"])

    # random.sample raises ValueError when asked for more items than the
    # population holds, so never request more rows than the table has.
    numberOfRows = len(dataTable)
    sampleSize = min(recordsPerTrial, numberOfRows)

    bestVariance = None
    bestSeed = None
    for _trialNumber in xrange(numberOfTrials):
        indexes = random.sample(xrange(numberOfRows), sampleSize)
        subTable = dataTable.subTable(NP("array", indexes, dtype=NP.dtype(int)))

        self.randomSeeds(dataTable)
        mapReduce.metadata["ClusteringModel"] = self.clusteringModel

        # Only the iteration count is needed from the run's results.
        _outputRecords, _outputKeyValues, numberOfIterations = mapReduce.run([subTable], parallel=False, frozenClass=False, numberOfMappers=1, numberOfReducers=1, iterationLimit=self.iterationLimit)

        # Accumulate the iteration count in the model's bookkeeping Extension(s).
        for extension in self.clusteringModel.xpath("pmml:Extension[@name='iterations.smallTrials']"):
            extension["value"] = repr(int(extension["value"]) + numberOfIterations)

        trialModel = mapReduce.metadata["ClusteringModel"]
        trialModel["modelName"] = "smallTrials"
        # Copy subFields before updating so the previous dict object is not mutated.
        trialModel.subFields = dict(trialModel.subFields)
        trialModel.subFields.update({"affinity": True})
        trialModel.calculate(subTable)

        affinity = subTable.fields["smallTrials.affinity"]
        data = affinity.data
        mask = affinity.mask
        if mask is None:
            # No mask: every row of the subset contributes.
            variance = NP(data**2).sum() / float(len(subTable))
        else:
            selection = NP(mask == defs.VALID)
            denom = NP("count_nonzero", selection)
            if denom > 0:
                variance = NP(data[selection]**2).sum() / float(denom)
            else:
                # No valid rows: this trial cannot be scored.
                variance = None

        if variance is not None and (bestVariance is None or variance < bestVariance):
            bestVariance = variance
            bestSeed = mapReduce.metadata["clusterVectors"]

    if bestSeed is not None:
        self.explicitSeeds(bestSeed)

    performanceTable.end("smallTrials")
def verify(self, showSuccess=False, performanceTable=None):
    """Run the model verification tests defined by this element.

    The rows of this element's table are split into expected outputs
    (columns named by a VerificationField) and model inputs (all other
    columns).  The parent model is evaluated on a DataTable built from
    these columns, and each VerificationField is compared row by row
    with the field the model produced.

    The output is a list of results (all results or only failures,
    depending on C{showSuccess}), each of which is a dictionary of
    field names to values.  Fields are:

      - "field", "index": the verified field name and the row number.
      - "success": was the comparison successful?
      - "expectedMissing", "observedMissing": is the expected/observed
        value missing?
      - "expectedValue", "observedValue": result as an internal value.
      - "expectedPythonValue", "observedPythonValue": result as a Python
        value.
      - "expectedDisplayValue", "observedDisplayValue": result as a
        string displayValue.

    The value fields do not appear if the "is missing?" comparison was
    unsuccessful.  Rows where both the expected and the observed value
    are missing produce no record at all.

    Continuous fields succeed if both values are within
    C{zeroThreshold} of zero, or if the observed value lies within the
    relative C{precision} window around the expected value; all other
    fields must match exactly.

    @type showSuccess: bool
    @param showSuccess: If True, emit output even if the tests are successful.
    @type performanceTable: PerformanceTable or None
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.  If None, a FakePerformanceTable is used.
    @rtype: JSON-like list of dicts
    @return: As described above.
    @raise PmmlValidationError: If a VerificationField names a field that the model did not produce.
    """

    def padColumn(data, mask, length):
        # Extend a column to the given length with PADDING entries flagged as missing.
        while len(data) < length:
            data.append(defs.PADDING)
            mask.append(True)

    # Index the VerificationFields by table column and attach working storage to each.
    verificationFields = {}
    for verificationField in self.xpath("pmml:VerificationFields/pmml:VerificationField"):
        verificationField.column = verificationField.get("column", verificationField["field"])
        verificationField.precision = verificationField.get("precision", defaultFromXsd=True, convertType=True)
        verificationField.zeroThreshold = verificationField.get("zeroThreshold", defaultFromXsd=True, convertType=True)
        verificationField.data = []
        verificationField.mask = []
        verificationFields[verificationField.column] = verificationField

    # Read the table row by row.  Rows need not contain every column, so each
    # column is lazily padded up to the current row before a value is appended.
    inputData = {}
    inputMask = {}
    numberOfRows = 0
    for index, row in enumerate(self.childOfClass(TableInterface).iterate()):
        numberOfRows = index + 1
        for columnName, columnValue in row.items():
            verificationField = verificationFields.get(columnName)

            if verificationField is not None:
                padColumn(verificationField.data, verificationField.mask, index)
                verificationField.data.append(columnValue)
                verificationField.mask.append(False)

            else:
                if columnName not in inputData:
                    inputData[columnName] = []
                    inputMask[columnName] = []
                inputDataField = inputData[columnName]
                inputMaskField = inputMask[columnName]

                padColumn(inputDataField, inputMaskField, index)
                inputDataField.append(columnValue)
                inputMaskField.append(False)

    # Columns absent from the trailing rows are still short; pad them to the
    # full row count (not the last row index) so that all columns line up.
    for verificationField in verificationFields.values():
        padColumn(verificationField.data, verificationField.mask, numberOfRows)

    for columnName in inputData:
        padColumn(inputData[columnName], inputMask[columnName], numberOfRows)

    # The expected outputs are also put into the DataTable, under their column names.
    for columnName, verificationField in verificationFields.items():
        inputData[columnName] = verificationField.data
        inputMask[columnName] = verificationField.mask

    model = self.getparent()

    if performanceTable is None:
        performanceTable = FakePerformanceTable()

    performanceTable.begin("make DataTable")
    dataTable = DataTable(model, inputData, inputMask, inputState=None)
    performanceTable.end("make DataTable")

    # Evaluate the model: mining fields, transformations, score, then output fields.
    functionTable = FunctionTable()

    for miningField in model.xpath("pmml:MiningSchema/pmml:MiningField"):
        miningField.replaceField(dataTable, functionTable, performanceTable)

    for calculable in model.calculableTrans():
        calculable.calculate(dataTable, functionTable, performanceTable)

    score = model.calculateScore(dataTable, functionTable, performanceTable)
    dataTable.score = score[None]
    if model.name is not None:
        for key, value in score.items():
            if key is None:
                dataTable.fields[model.name] = value
            else:
                dataTable.fields["%s.%s" % (model.name, key)] = value

    for outputField in self.xpath("../pmml:Output/pmml:OutputField"):
        outputField.format(dataTable, functionTable, performanceTable, score)

    # Compare each expected column with what the model produced.
    output = []
    for verificationField in verificationFields.values():
        fieldName = verificationField["field"]

        observedOutput = dataTable.fields.get(fieldName)
        if observedOutput is None:
            raise defs.PmmlValidationError("VerificationField references field \"%s\" but it was not produced by the model" % fieldName)

        fieldType = observedOutput.fieldType
        if fieldType.dataType == "object":
            # Object columns whose values all convert to float are compared numerically.
            try:
                newArray = [float(x) for x in observedOutput.data]
            except ValueError:
                pass
            else:
                fieldType = FakeFieldType("double", "continuous")
                observedOutput._data = newArray

        for index in xrange(len(dataTable)):
            record = {"field": fieldName, "index": index}

            record["expectedMissing"] = verificationField.mask[index]
            record["observedMissing"] = (observedOutput.mask is not None and observedOutput.mask[index] != defs.VALID)

            if record["expectedMissing"] != record["observedMissing"]:
                record["success"] = False
                output.append(record)

            elif not record["expectedMissing"]:
                record["expectedValue"] = fieldType.stringToValue(verificationField.data[index])
                record["observedValue"] = observedOutput.data[index]
                record["expectedPythonValue"] = fieldType.valueToPython(record["expectedValue"])
                record["observedPythonValue"] = fieldType.valueToPython(record["observedValue"])
                record["expectedDisplayValue"] = fieldType.valueToString(record["expectedValue"])
                record["observedDisplayValue"] = fieldType.valueToString(record["observedValue"])

                if fieldType.optype == "continuous":
                    if (abs(record["expectedValue"]) <= verificationField.zeroThreshold) and (abs(record["observedValue"]) <= verificationField.zeroThreshold):
                        record["success"] = True
                    else:
                        lowerBound = record["expectedValue"] * (1.0 - verificationField.precision)
                        upperBound = record["expectedValue"] * (1.0 + verificationField.precision)
                        # For a negative expected value the two products come out in
                        # reverse order; swap them so the window is never empty.
                        if lowerBound > upperBound:
                            lowerBound, upperBound = upperBound, lowerBound
                        record["success"] = (lowerBound <= record["observedValue"] <= upperBound)

                    if not record["success"] or showSuccess:
                        output.append(record)

                else:
                    if record["expectedValue"] != record["observedValue"]:
                        record["success"] = False
                        output.append(record)
                    else:
                        record["success"] = True
                        if showSuccess:
                            output.append(record)

    return output
def verify(self, showSuccess=False, performanceTable=None):
    """Run the model verification tests defined by this element.

    The rows of this element's table are split into expected outputs
    (columns named by a VerificationField) and model inputs (all other
    columns).  The parent model is evaluated on a DataTable built from
    these columns, and each VerificationField is compared row by row
    with the field the model produced.

    The output is a list of results (all results or only failures,
    depending on C{showSuccess}), each of which is a dictionary of
    field names to values.  Fields are:

      - "field", "index": the verified field name and the row number.
      - "success": was the comparison successful?
      - "expectedMissing", "observedMissing": is the expected/observed
        value missing?
      - "expectedValue", "observedValue": result as an internal value.
      - "expectedPythonValue", "observedPythonValue": result as a Python
        value.
      - "expectedDisplayValue", "observedDisplayValue": result as a
        string displayValue.

    The value fields do not appear if the "is missing?" comparison was
    unsuccessful.  Rows where both the expected and the observed value
    are missing produce no record at all.

    Continuous fields succeed if both values are within
    C{zeroThreshold} of zero, or if the observed value lies within the
    relative C{precision} window around the expected value; all other
    fields must match exactly.

    @type showSuccess: bool
    @param showSuccess: If True, emit output even if the tests are successful.
    @type performanceTable: PerformanceTable or None
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.  If None, a FakePerformanceTable is used.
    @rtype: JSON-like list of dicts
    @return: As described above.
    @raise PmmlValidationError: If a VerificationField names a field that the model did not produce.
    """

    def padColumn(data, mask, length):
        # Extend a column to the given length with PADDING entries flagged as missing.
        while len(data) < length:
            data.append(defs.PADDING)
            mask.append(True)

    # Index the VerificationFields by table column and attach working storage to each.
    verificationFields = {}
    for verificationField in self.xpath("pmml:VerificationFields/pmml:VerificationField"):
        verificationField.column = verificationField.get("column", verificationField["field"])
        verificationField.precision = verificationField.get("precision", defaultFromXsd=True, convertType=True)
        verificationField.zeroThreshold = verificationField.get("zeroThreshold", defaultFromXsd=True, convertType=True)
        verificationField.data = []
        verificationField.mask = []
        verificationFields[verificationField.column] = verificationField

    # Read the table row by row.  Rows need not contain every column, so each
    # column is lazily padded up to the current row before a value is appended.
    inputData = {}
    inputMask = {}
    numberOfRows = 0
    for index, row in enumerate(self.childOfClass(TableInterface).iterate()):
        numberOfRows = index + 1
        for columnName, columnValue in row.items():
            verificationField = verificationFields.get(columnName)

            if verificationField is not None:
                padColumn(verificationField.data, verificationField.mask, index)
                verificationField.data.append(columnValue)
                verificationField.mask.append(False)

            else:
                if columnName not in inputData:
                    inputData[columnName] = []
                    inputMask[columnName] = []
                inputDataField = inputData[columnName]
                inputMaskField = inputMask[columnName]

                padColumn(inputDataField, inputMaskField, index)
                inputDataField.append(columnValue)
                inputMaskField.append(False)

    # Columns absent from the trailing rows are still short; pad them to the
    # full row count (not the last row index) so that all columns line up.
    for verificationField in verificationFields.values():
        padColumn(verificationField.data, verificationField.mask, numberOfRows)

    for columnName in inputData:
        padColumn(inputData[columnName], inputMask[columnName], numberOfRows)

    # The expected outputs are also put into the DataTable, under their column names.
    for columnName, verificationField in verificationFields.items():
        inputData[columnName] = verificationField.data
        inputMask[columnName] = verificationField.mask

    model = self.getparent()

    if performanceTable is None:
        performanceTable = FakePerformanceTable()

    performanceTable.begin("make DataTable")
    dataTable = DataTable(model, inputData, inputMask, inputState=None)
    performanceTable.end("make DataTable")

    # Evaluate the model: mining fields, transformations, score, then output fields.
    functionTable = FunctionTable()

    for miningField in model.xpath("pmml:MiningSchema/pmml:MiningField"):
        miningField.replaceField(dataTable, functionTable, performanceTable)

    for calculable in model.calculableTrans():
        calculable.calculate(dataTable, functionTable, performanceTable)

    score = model.calculateScore(dataTable, functionTable, performanceTable)
    dataTable.score = score[None]
    if model.name is not None:
        for key, value in score.items():
            if key is None:
                dataTable.fields[model.name] = value
            else:
                dataTable.fields["%s.%s" % (model.name, key)] = value

    for outputField in self.xpath("../pmml:Output/pmml:OutputField"):
        outputField.format(dataTable, functionTable, performanceTable, score)

    # Compare each expected column with what the model produced.
    output = []
    for verificationField in verificationFields.values():
        fieldName = verificationField["field"]

        observedOutput = dataTable.fields.get(fieldName)
        if observedOutput is None:
            raise defs.PmmlValidationError("VerificationField references field \"%s\" but it was not produced by the model" % fieldName)

        fieldType = observedOutput.fieldType
        if fieldType.dataType == "object":
            # Object columns whose values all convert to float are compared numerically.
            try:
                newArray = [float(x) for x in observedOutput.data]
            except ValueError:
                pass
            else:
                fieldType = FakeFieldType("double", "continuous")
                observedOutput._data = newArray

        for index in xrange(len(dataTable)):
            record = {"field": fieldName, "index": index}

            record["expectedMissing"] = verificationField.mask[index]
            record["observedMissing"] = (observedOutput.mask is not None and observedOutput.mask[index] != defs.VALID)

            if record["expectedMissing"] != record["observedMissing"]:
                record["success"] = False
                output.append(record)

            elif not record["expectedMissing"]:
                record["expectedValue"] = fieldType.stringToValue(verificationField.data[index])
                record["observedValue"] = observedOutput.data[index]
                record["expectedPythonValue"] = fieldType.valueToPython(record["expectedValue"])
                record["observedPythonValue"] = fieldType.valueToPython(record["observedValue"])
                record["expectedDisplayValue"] = fieldType.valueToString(record["expectedValue"])
                record["observedDisplayValue"] = fieldType.valueToString(record["observedValue"])

                if fieldType.optype == "continuous":
                    if (abs(record["expectedValue"]) <= verificationField.zeroThreshold) and (abs(record["observedValue"]) <= verificationField.zeroThreshold):
                        record["success"] = True
                    else:
                        lowerBound = record["expectedValue"] * (1.0 - verificationField.precision)
                        upperBound = record["expectedValue"] * (1.0 + verificationField.precision)
                        # For a negative expected value the two products come out in
                        # reverse order; swap them so the window is never empty.
                        if lowerBound > upperBound:
                            lowerBound, upperBound = upperBound, lowerBound
                        record["success"] = (lowerBound <= record["observedValue"] <= upperBound)

                    if not record["success"] or showSuccess:
                        output.append(record)

                else:
                    if record["expectedValue"] != record["observedValue"]:
                        record["success"] = False
                        output.append(record)
                    else:
                        record["success"] = True
                        if showSuccess:
                            output.append(record)

    return output