def calc(self, inputData, inputMask=None, inputState=None, functionTable=None, performanceTable=None):
    """Wrap raw input in a DataTable, run the calculation on it, and return the table.

    This is a convenience entry point for interactive use, where
    constructing a DataTable by hand would be laborious.  The
    FunctionTable (supplied or created here) is passed on to
    C{calculate} and is modified by the calculation.

    @type inputData: dict
    @param inputData: Dictionary from field names to data, as required by the DataTable constructor.
    @type inputMask: dict or None
    @param inputMask: Dictionary from field names to missing value masks, as required by the DataTable constructor.
    @type inputState: DataTableState or None
    @param inputState: Calculation state, used to continue a calculation over many C{calc} calls.
    @type functionTable: FunctionTable or None
    @param functionTable: A table of functions.  If None, a fresh FunctionTable (built-in functions only) is created; user functions defined in PMML would be added to it.
    @type performanceTable: PerformanceTable or None
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.  If None, a FakePerformanceTable is used.
    @rtype: DataTable
    @return: A DataTable containing the result.
    """
    functionTable = FunctionTable() if functionTable is None else functionTable
    performanceTable = FakePerformanceTable() if performanceTable is None else performanceTable

    # Only the construction of the table is timed under this label.
    timerLabel = "make DataTable"
    performanceTable.begin(timerLabel)
    table = DataTable(self, inputData, inputMask, inputState)
    performanceTable.end(timerLabel)

    self.calculate(table, functionTable, performanceTable)
    return table
def __iter__(self):
    """Yield one DataTable per chunk read from each file in C{self.fileNames}.

    Every entry of C{self.namesToFieldTypes} is first mapped to one of
    the extraction types "string", "category", "integer" or "double";
    these are handed to C{InputStream.start} together with the chunk
    size and the field-name-to-Avro-path mapping.  If
    C{self.inputState} is None, it is replaced by a new DataTableState
    that is then shared by all yielded tables.

    @rtype: generator of DataTable
    @return: DataTables built with C{DataTable.buildManually}, one per chunk.
    @raise TypeError: If a field cannot be mapped to an extraction type, or if no fields are selected.
    """
    integerDataTypes = (
        "integer",
        "dateDaysSince[0]", "dateDaysSince[1960]", "dateDaysSince[1970]", "dateDaysSince[1980]",
        "timeSeconds",
        "dateTimeSecondsSince[0]", "dateTimeSecondsSince[1960]",
        "dateTimeSecondsSince[1970]", "dateTimeSecondsSince[1980]",
    )

    types = {}
    for rawName, fieldType in self.namesToFieldTypes.items():
        name = self._removeUnicode(rawName)

        extraction = None
        if fieldType.dataType == "string":
            if fieldType.optype == "continuous":
                extraction = "string"
            elif fieldType.optype in ("categorical", "ordinal"):
                extraction = "category"
        elif fieldType.dataType in integerDataTypes:
            extraction = "integer"
        elif fieldType.dataType == "double":
            extraction = "double"

        if extraction is not None:
            types[name] = extraction
        if name not in types:
            raise TypeError("Cannot match %r to an extraction type" % fieldType)

    if not types:
        raise TypeError("At least one field must be selected")

    namesToAvroPaths = self._removeUnicode(self.namesToAvroPaths)

    if self.inputState is None:
        self.inputState = DataTableState()

    for fileName in self.fileNames:
        inputStream = InputStream()
        inputStream.start(fileName, self.chunkSize, namesToAvroPaths, types)
        try:
            # A chunk shorter than chunkSize is treated as the last
            # chunk of the current file.
            lastChunk = False
            while not lastChunk:
                arrays = inputStream.next()
                yield DataTable.buildManually(self.namesToFieldTypes, arrays, inputState=self.inputState)
                lastChunk = len(arrays.values()[0]) < self.chunkSize
        finally:
            # Runs on normal exhaustion, on errors, and when the
            # generator is closed early by the consumer.
            inputStream.close()
def emptyDataTable(self):
    """Build a zero-row DataTable from the serialized DataTableFields and DataTableState.

    The unserialized state is scanned for entries whose key ends in
    ".context"; each such entry maps field names to
    C{(dataType, optype)} pairs, which become FakeFieldTypes with empty
    data columns.

    @rtype: DataTable
    @return: An empty DataTable, suitable for PmmlPlotContent.prepare.
    """
    restoredState = self.unserializeState()

    fieldTypes = {}
    emptyColumns = {}
    for key, entry in restoredState.iteritems():
        if not key.endswith(".context"):
            continue
        for fieldName, (dataType, optype) in entry.iteritems():
            fieldTypes[fieldName] = FakeFieldType(dataType, optype)
            emptyColumns[fieldName] = []

    return DataTable(fieldTypes, emptyColumns, inputState=restoredState)
def calc(self, inputData, inputMask=None, inputState=None, functionTable=None, performanceTable=None):
    """Wrap raw input in a DataTable and return the plot drawn from it.

    This is a convenience entry point for interactive use, where
    constructing a DataTable by hand would be laborious.  The
    FunctionTable (supplied or created here) is passed on to
    C{makePlot} and is modified by the calculation.

    Note that PmmlCalculables return a DataTable from C{calc}, whereas
    PlotCanvas returns an SvgBinding.

    @type inputData: dict
    @param inputData: Dictionary from field names to data, as required by the DataTable constructor.
    @type inputMask: dict or None
    @param inputMask: Dictionary from field names to missing value masks, as required by the DataTable constructor.
    @type inputState: DataTableState or None
    @param inputState: Calculation state, used to continue a calculation over many C{calc} calls.
    @type functionTable: FunctionTable or None
    @param functionTable: A table of functions.  If None, a fresh FunctionTable (built-in functions only) is created; user functions defined in PMML would be added to it.
    @type performanceTable: PerformanceTable or None
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.  If None, a FakePerformanceTable is used.
    @rtype: SvgBinding
    @return: A complete SVG image representing the fully drawn plot.
    """
    functionTable = FunctionTable() if functionTable is None else functionTable
    performanceTable = FakePerformanceTable() if performanceTable is None else performanceTable

    # Only the construction of the table is timed under this label.
    timerLabel = "make DataTable"
    performanceTable.begin(timerLabel)
    table = DataTable(self, inputData, inputMask, inputState)
    performanceTable.end(timerLabel)

    return self.makePlot(table, functionTable, performanceTable)
def expressionsToPoints(cls, expression, derivative, samples, loop, functionTable, performanceTable):
    """Evaluate a set of given string-based formulae to generate numeric points.

    This is used to plot mathematical curves.  Points at which any of
    the evaluated formulae is masked as not valid are dropped from all
    four output arrays.  When no derivative is supplied, the steps
    C{dxlist}, C{dylist} are central differences of the retained points;
    when a derivative is supplied, they are the derivative multiplied by
    the central-difference step of the independent variable (x or t).

    @type expression: 1- or 2-tuple of strings
    @param expression: If a 1-tuple, the string is passed to Formula and interpreted as y(x); if a 2-tuple, the strings are passed to Formula and interpreted as x(t), y(t).
    @type derivative: 1- or 2-tuple of strings (same length as C{expression})
    @param derivative: Strings are passed to Formula and interpreted as dy/dx (if a 1-tuple) or dx/dt, dy/dt (if a 2-tuple).  If the first element is None, derivatives are estimated numerically.
    @type samples: 1d Numpy array
    @param samples: Values of x or t at which to evaluate the expression or expressions.
    @type loop: bool
    @param loop: If False, disconnect the end of the set of points from the beginning (the first and last steps are set to zero).
    @type functionTable: FunctionTable
    @param functionTable: Functions that may be used to perform the calculation.
    @type performanceTable: PerformanceTable
    @param performanceTable: Measures and records performance (time and memory consumption) of the process.
    @rtype: 6-tuple
    @return: C{xlist}, C{ylist}, C{dxlist}, C{dylist} (1d Numpy arrays), xfieldType, yfieldType (FieldTypes).
    @raise defs.PmmlValidationError: If a formula does not evaluate to a numeric or temporal type.
    @raise ValueError: If C{expression} is neither a 1-tuple nor a 2-tuple.
    """

    def evaluateNumeric(formulaText, sampleTable, label):
        # Parse and evaluate one formula, insisting on a numeric or temporal result.
        column = Formula.parse(formulaText).evaluate(sampleTable, functionTable, performanceTable)
        if not column.fieldType.isnumeric() and not column.fieldType.istemporal():
            raise defs.PmmlValidationError("PlotFormula %s must return a numeric expression, not %r" % (label, column.fieldType))
        return column

    def restrictToValid(selection, column):
        # AND the column's validity into the running selection (None means "everything selected").
        if column.mask is None:
            return selection
        valid = NP(column.mask == defs.VALID)
        if selection is None:
            return valid
        NP("logical_and", selection, valid, selection)
        return selection

    def pick(values, selection):
        # Apply the selection, if there is one.
        if selection is None:
            return values
        return values[selection]

    def centralDifference(values):
        # Half the difference between each point's two neighbors, wrapping around at the ends.
        return NP((NP("roll", values, -1) - NP("roll", values, 1)) / 2.0)

    if len(expression) == 1:
        sampleTable = DataTable({"x": "double"}, {"x": samples})
        ydataColumn = evaluateNumeric(expression[0], sampleTable, "y(x)")

        xfieldType = cls.xfieldType
        yfieldType = ydataColumn.fieldType

        selection = restrictToValid(None, ydataColumn)

        if derivative[0] is None:
            xlist = pick(samples, selection)
            ylist = pick(ydataColumn.data, selection)
            dxlist = centralDifference(xlist)
            dylist = centralDifference(ylist)

        else:
            dydataColumn = evaluateNumeric(derivative[0], sampleTable, "dy/dx")
            selection = restrictToValid(selection, dydataColumn)

            xlist = pick(samples, selection)
            ylist = pick(ydataColumn.data, selection)
            dxlist = centralDifference(xlist)
            # dy = (dy/dx) * dx.  Previously the unmasked path returned the
            # bare slope, unlike the masked path and the parametric case.
            dylist = NP(pick(dydataColumn.data, selection) * dxlist)

    elif len(expression) == 2:
        sampleTable = DataTable({"t": "double"}, {"t": samples})
        xdataColumn = evaluateNumeric(expression[0], sampleTable, "x(t)")
        ydataColumn = evaluateNumeric(expression[1], sampleTable, "y(t)")

        xfieldType = xdataColumn.fieldType
        yfieldType = ydataColumn.fieldType

        selection = restrictToValid(None, xdataColumn)
        selection = restrictToValid(selection, ydataColumn)

        if derivative[0] is None:
            xlist = pick(xdataColumn.data, selection)
            ylist = pick(ydataColumn.data, selection)
            dxlist = centralDifference(xlist)
            dylist = centralDifference(ylist)

        else:
            dxdataColumn = evaluateNumeric(derivative[0], sampleTable, "dx/dt")
            dydataColumn = evaluateNumeric(derivative[1], sampleTable, "dy/dt")
            selection = restrictToValid(selection, dxdataColumn)
            selection = restrictToValid(selection, dydataColumn)

            dt = centralDifference(pick(samples, selection))
            xlist = pick(xdataColumn.data, selection)
            ylist = pick(ydataColumn.data, selection)
            dxlist = NP(pick(dxdataColumn.data, selection) * dt)
            dylist = NP(pick(dydataColumn.data, selection) * dt)

    else:
        raise ValueError("expression must be a 1-tuple (y(x)) or a 2-tuple (x(t), y(t)), not length %d" % len(expression))

    # For an open curve the wrapped-around differences at the two ends are
    # meaningless, so zero them.  Skipped when every point was masked out.
    if not loop and len(dxlist) > 0:
        dxlist[0] = 0.0
        dxlist[-1] = 0.0
        dylist[0] = 0.0
        dylist[-1] = 0.0

    return xlist, ylist, dxlist, dylist, xfieldType, yfieldType
def prepare(self, state, dataTable, functionTable, performanceTable, plotRange):
    """Prepare a plot element for drawing.

    This stage consists of calculating all quantities and determining
    the bounds of the data.  These bounds may be unioned with bounds
    from other plot elements that overlay this plot element, so the
    drawing (which requires a finalized coordinate system) cannot
    begin yet.

    Two modes are supported, chosen by which child roles are present:
      - "z(x,y)" alone: a formula sampled on an xbins-by-ybins grid;
      - "x" and "y", optionally with one of "zmean" or "zweight": a
        two-dimensional histogram of the data (plain, weighted, or
        mean of z per bin), optionally accumulated across calls via
        the "stateId" attribute.

    This method modifies C{plotRange}, C{state}, and (when a stateId is
    set) C{dataTable.state}.

    @type state: ad-hoc Python object
    @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
    @type dataTable: DataTable
    @param dataTable: Contains the data to plot.
    @type functionTable: FunctionTable
    @param functionTable: Defines functions that may be used to transform data for plotting.
    @type performanceTable: PerformanceTable
    @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process.
    @type plotRange: PlotRange
    @param plotRange: The bounding box of plot coordinates that this function will expand.
    @raise defs.PmmlValidationError: If the combination of roles is not allowed, required binning attributes are missing, the ranges are empty or inverted, the x or y axis is not linear, or the z(x,y) formula is not numeric.
    """

    combinationError = "The only allowed combinations of PlotFormula/PlotNumericExpressions are: \"z(x,y)\" (function), \"x y\" (histogram), \"x y zmean\" (mean of z in x y bins), \"x y zweight\" (weighted x y histogram)"

    def automaticBins(data, low, high):
        # Freedman-Diaconis bin width (2 * IQR / n^(1/3)), never fewer than 10 bins.
        # With no data there is nothing to estimate from (and n^(1/3) would be
        # zero), so fall back to the same default of 10.
        if len(data) == 0:
            return 10
        q1, q3 = NP("percentile", data, [25.0, 75.0])
        binWidth = 2.0 * (q3 - q1) / math.pow(len(data), 1.0 / 3.0)
        if binWidth > 0.0:
            return max(10, int(math.ceil((high - low) / binWidth)))
        return 10

    self.checkRoles(["z(x,y)", "x", "y", "zmean", "zweight"])

    performanceTable.begin("PlotHeatMap prepare")
    self._saveContext(dataTable)

    zofxy = self.xpath("pmml:PlotFormula[@role='z(x,y)']")
    xexpr = self.xpath("pmml:PlotNumericExpression[@role='x']")
    yexpr = self.xpath("pmml:PlotNumericExpression[@role='y']")
    zmean = self.xpath("pmml:PlotNumericExpression[@role='zmean']")
    zweight = self.xpath("pmml:PlotNumericExpression[@role='zweight']")
    cutExpression = self.xpath("pmml:PlotSelection")

    if len(zofxy) == 1 and len(xexpr) == 0 and len(yexpr) == 0 and len(zmean) == 0 and len(zweight) == 0:
        # Mode 1: sample a formula z(x,y) on a regular grid.
        xbins = self.get("xbins", convertType=True)
        xlow = self.get("xlow", convertType=True)
        xhigh = self.get("xhigh", convertType=True)
        ybins = self.get("ybins", convertType=True)
        ylow = self.get("ylow", convertType=True)
        yhigh = self.get("yhigh", convertType=True)

        if xbins is None or xlow is None or xhigh is None or ybins is None or ylow is None or yhigh is None:
            raise defs.PmmlValidationError("xbins, xlow, xhigh, ybins, ylow, and yhigh are required for HeatMaps of a mathematical formula")

        if xlow >= xhigh or ylow >= yhigh:
            raise defs.PmmlValidationError("xlow must be less than xhigh and ylow must be less than yhigh")

        if plotRange.xStrictlyPositive or plotRange.yStrictlyPositive:
            raise defs.PmmlValidationError("PlotHeatMap can only be properly displayed in linear x, y coordinates")

        # Flattened grid with x varying fastest: ybins rows of xbins samples each.
        xarray = NP("tile", NP("linspace", xlow, xhigh, xbins, endpoint=True), ybins)
        yarray = NP("repeat", NP("linspace", ylow, yhigh, ybins, endpoint=True), xbins)

        sampleTable = DataTable({"x": "double", "y": "double"}, {"x": xarray, "y": yarray})
        parsed = Formula.parse(zofxy[0].text)

        # The formula evaluation is excluded from this element's own timer.
        performanceTable.pause("PlotHeatMap prepare")
        zdataColumn = parsed.evaluate(sampleTable, functionTable, performanceTable)
        performanceTable.unpause("PlotHeatMap prepare")

        if not zdataColumn.fieldType.isnumeric():
            raise defs.PmmlValidationError("PlotFormula z(x,y) must return a numeric expression, not %r" % zdataColumn.fieldType)

        selection = NP("isfinite", zdataColumn.data)
        if zdataColumn.mask is not None:
            NP("logical_and", selection, NP(zdataColumn.mask == defs.VALID), selection)
        if plotRange.zStrictlyPositive:
            NP("logical_and", selection, NP(zdataColumn.data > 0.0), selection)

        # min()/max() of an empty array raise, so only push a z range when
        # at least one grid point is usable (same guard as the zmean case).
        gooddata = zdataColumn.data[selection]
        if len(gooddata) > 0:
            plotRange.zminPush(gooddata.min(), zdataColumn.fieldType, sticky=False)
            plotRange.zmaxPush(gooddata.max(), zdataColumn.fieldType, sticky=False)

        state.zdata = zdataColumn.data
        state.zmask = NP("logical_not", selection) * defs.INVALID

    elif len(zofxy) == 0 and len(xexpr) == 1 and len(yexpr) == 1:
        # Mode 2: histogram the data in x and y.
        performanceTable.pause("PlotHeatMap prepare")
        xdataColumn = xexpr[0].evaluate(dataTable, functionTable, performanceTable)
        ydataColumn = yexpr[0].evaluate(dataTable, functionTable, performanceTable)
        performanceTable.unpause("PlotHeatMap prepare")

        xbins = self.get("xbins", convertType=True)
        xlow = self.get("xlow", convertType=True)
        xhigh = self.get("xhigh", convertType=True)
        ybins = self.get("ybins", convertType=True)
        ylow = self.get("ylow", convertType=True)
        yhigh = self.get("yhigh", convertType=True)

        # Missing range limits default to the extent of the data, or to
        # the unit interval when there is no data.
        if len(xdataColumn) > 0:
            if xlow is None:
                xlow = NP("nanmin", xdataColumn.data)
            if xhigh is None:
                xhigh = NP("nanmax", xdataColumn.data)
            if ylow is None:
                ylow = NP("nanmin", ydataColumn.data)
            if yhigh is None:
                yhigh = NP("nanmax", ydataColumn.data)
        else:
            if xlow is None:
                xlow = 0.0
            if xhigh is None:
                xhigh = 1.0
            if ylow is None:
                ylow = 0.0
            if yhigh is None:
                yhigh = 1.0

        if xbins is None:
            xbins = automaticBins(xdataColumn.data, xlow, xhigh)
        if ybins is None:
            ybins = automaticBins(ydataColumn.data, ylow, yhigh)

        if xlow >= xhigh or ylow >= yhigh:
            raise defs.PmmlValidationError("xlow must be less than xhigh and ylow must be less than yhigh")

        if plotRange.xStrictlyPositive or plotRange.yStrictlyPositive:
            raise defs.PmmlValidationError("PlotHeatMap can only be properly displayed in linear x, y coordinates")

        # With a stateId, the accumulators live in dataTable.state and
        # survive across calls; otherwise they are local to this call.
        persistentState = {}
        stateId = self.get("stateId")
        if stateId is not None:
            if stateId in dataTable.state:
                persistentState = dataTable.state[stateId]
            else:
                dataTable.state[stateId] = persistentState

        # Binning remembered from an earlier call overrides this call's binning.
        if len(zmean) == 0:
            if "xbins" in persistentState:
                xbins = persistentState["xbins"]
            if "xlow" in persistentState:
                xlow = persistentState["xlow"]
            if "xhigh" in persistentState:
                xhigh = persistentState["xhigh"]
            if "ybins" in persistentState:
                ybins = persistentState["ybins"]
            if "ylow" in persistentState:
                ylow = persistentState["ylow"]
            if "yhigh" in persistentState:
                yhigh = persistentState["yhigh"]

        # NOTE(review): the nesting of these six saves was reconstructed from
        # flattened source; they are taken to be unconditional -- confirm.
        persistentState["xbins"] = xbins
        persistentState["xlow"] = xlow
        persistentState["xhigh"] = xhigh
        persistentState["ybins"] = ybins
        persistentState["ylow"] = ylow
        persistentState["yhigh"] = yhigh

        # 1.0 for rows that count, 0.0 for rows that are masked or cut.
        mask = NP("ones", len(dataTable), dtype=NP.dtype(float))
        if xdataColumn.mask is not None:
            NP("multiply", mask, (xdataColumn.mask == defs.VALID), mask)
        if ydataColumn.mask is not None:
            NP("multiply", mask, (ydataColumn.mask == defs.VALID), mask)

        if len(cutExpression) == 1:
            performanceTable.pause("PlotHeatMap prepare")
            NP("multiply", mask, cutExpression[0].select(dataTable, functionTable, performanceTable), mask)
            performanceTable.unpause("PlotHeatMap prepare")

        # histogram2d is called with (y, x), so the arrays below have
        # shape (ybins, xbins).
        if len(zmean) == 0 and len(zweight) == 0:
            histogram, xedges, yedges = NP("histogram2d", ydataColumn.data, xdataColumn.data, bins=(ybins, xbins), range=[[ylow, yhigh], [xlow, xhigh]], weights=mask)
            if len(dataTable) == 0:
                # work around Numpy <= 1.6.1 bug
                histogram = NP("zeros", (ybins, xbins), dtype=NP.dtype(float))

            if "histogram" in persistentState:
                persistentState["histogram"] = NP(persistentState["histogram"] + histogram)
            else:
                persistentState["histogram"] = histogram

            histogram = persistentState["histogram"]

            if plotRange.zStrictlyPositive:
                zmin = 0.1
            else:
                zmin = 0.0
            zmax = NP("nanmax", histogram)

            plotRange.zminPush(zmin, self.zfieldType, sticky=True)
            if zmax > zmin:
                plotRange.zmaxPush(zmax, self.zfieldType, sticky=False)

        elif len(zmean) == 0 and len(zweight) == 1:
            performanceTable.pause("PlotHeatMap prepare")
            weightsDataColumn = zweight[0].evaluate(dataTable, functionTable, performanceTable)
            performanceTable.unpause("PlotHeatMap prepare")

            if weightsDataColumn.mask is not None:
                NP("multiply", mask, (weightsDataColumn.mask == defs.VALID), mask)
            weights = NP(weightsDataColumn.data * mask)

            histogram, xedges, yedges = NP("histogram2d", ydataColumn.data, xdataColumn.data, bins=(ybins, xbins), range=[[ylow, yhigh], [xlow, xhigh]], weights=weights)

            if "histogram" in persistentState:
                persistentState["histogram"] = NP(persistentState["histogram"] + histogram)
            else:
                persistentState["histogram"] = histogram

            histogram = persistentState["histogram"]

            if plotRange.zStrictlyPositive:
                w = weights[NP(weights > 0.0)]
                if len(w) > 0:
                    zmin = 0.1 * NP("nanmin", w)
                else:
                    zmin = 0.1
            else:
                zmin = 0.0
            zmax = NP("nanmax", histogram)

            plotRange.zminPush(zmin, self.zfieldType, sticky=True)
            if zmax > zmin:
                plotRange.zmaxPush(zmax, self.zfieldType, sticky=False)

        elif len(zmean) == 1 and len(zweight) == 0:
            performanceTable.pause("PlotHeatMap prepare")
            zdataColumn = zmean[0].evaluate(dataTable, functionTable, performanceTable)
            performanceTable.unpause("PlotHeatMap prepare")

            if zdataColumn.mask is not None:
                NP("multiply", mask, (zdataColumn.mask == defs.VALID), mask)
            weights = NP(zdataColumn.data * mask)

            # Mean of z per bin = (sum of z per bin) / (count per bin).
            numer, xedges, yedges = NP("histogram2d", ydataColumn.data, xdataColumn.data, bins=(ybins, xbins), range=[[ylow, yhigh], [xlow, xhigh]], weights=weights)
            denom, xedges, yedges = NP("histogram2d", ydataColumn.data, xdataColumn.data, bins=(ybins, xbins), range=[[ylow, yhigh], [xlow, xhigh]], weights=mask)

            if "numer" in persistentState:
                persistentState["numer"] = NP(persistentState["numer"] + numer)
                persistentState["denom"] = NP(persistentState["denom"] + denom)
            else:
                persistentState["numer"] = numer
                persistentState["denom"] = denom

            numer = persistentState["numer"]
            denom = persistentState["denom"]
            histogram = numer / denom

            selection = NP("isfinite", histogram)
            if plotRange.zStrictlyPositive:
                NP("logical_and", selection, NP(histogram > 0.0), selection)

            if NP("count_nonzero", selection) > 0:
                gooddata = histogram[selection]
                plotRange.zminPush(gooddata.min(), self.zfieldType, sticky=False)
                plotRange.zmaxPush(gooddata.max(), self.zfieldType, sticky=False)

        else:
            raise defs.PmmlValidationError(combinationError)

        state.zdata = NP("reshape", histogram, xbins * ybins)
        state.zmask = None

    else:
        raise defs.PmmlValidationError(combinationError)

    plotRange.xminPush(xlow, self.xyfieldType, sticky=True)
    plotRange.yminPush(ylow, self.xyfieldType, sticky=True)
    plotRange.xmaxPush(xhigh, self.xyfieldType, sticky=True)
    plotRange.ymaxPush(yhigh, self.xyfieldType, sticky=True)

    state.xbins = xbins
    state.xlow = xlow
    state.xhigh = xhigh
    state.ybins = ybins
    state.ylow = ylow
    state.yhigh = yhigh

    performanceTable.end("PlotHeatMap prepare")
def verify(self, showSuccess=False, performanceTable=None):
    """Run the model verification tests defined by this element.

    The rows of this element's table are split into model inputs and
    expected outputs (the columns named by VerificationFields), the
    parent model is evaluated on the inputs, and each expected output
    is compared with what the model produced.

    The output is a list of results (all results or only failures,
    depending on C{showSuccess}), each of which is a dictionary of
    field names to values.  Fields are:

      - "field", "index": the verified field's name and the row number.
      - "success": was the comparison successful?
      - "expectedMissing", "observedMissing": is the expected/observed
        value missing?
      - "expectedValue", "observedValue": result as an internal value.
      - "expectedPythonValue", "observedPythonValue": result as a Python
        value.
      - "expectedDisplayValue", "observedDisplayValue": result as a
        string displayValue.

    Only "field", "index", "success", "expectedMissing", and
    "observedMissing" appear if the "is missing?" comparison was
    unsuccessful.  Rows in which both the expected and observed values
    are missing produce no record.

    Continuous values succeed if both are within C{zeroThreshold} of
    zero, or if the observed value lies within the relative
    C{precision} window around the expected value; other values must
    be equal.

    @type showSuccess: bool
    @param showSuccess: If True, emit output even if the tests are successful.
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.  If None, a FakePerformanceTable is used.
    @rtype: JSON-like list of dicts
    @return: As described above.
    @raise defs.PmmlValidationError: If a VerificationField names a field that the model did not produce.
    """

    def padTo(data, mask, length):
        # Extend a column (and its missing-value mask) with padding up to the given length.
        while len(data) < length:
            data.append(defs.PADDING)
            mask.append(True)

    verificationFields = {}
    for verificationField in self.xpath("pmml:VerificationFields/pmml:VerificationField"):
        verificationField.column = verificationField.get("column", verificationField["field"])
        verificationField.precision = verificationField.get("precision", defaultFromXsd=True, convertType=True)
        verificationField.zeroThreshold = verificationField.get("zeroThreshold", defaultFromXsd=True, convertType=True)
        verificationField.data = []
        verificationField.mask = []
        verificationFields[verificationField.column] = verificationField

    inputData = {}
    inputMask = {}
    numRows = 0
    for index, row in enumerate(self.childOfClass(TableInterface).iterate()):
        numRows = index + 1
        for columnName, columnValue in row.items():
            verificationField = verificationFields.get(columnName)

            if verificationField is not None:
                # A column may be absent from earlier rows; fill the gap first.
                padTo(verificationField.data, verificationField.mask, index)
                verificationField.data.append(columnValue)
                verificationField.mask.append(False)

            else:
                inputDataField = inputData.get(columnName)
                if inputDataField is None:
                    inputDataField = []
                    inputData[columnName] = inputDataField
                    inputMask[columnName] = []
                inputMaskField = inputMask[columnName]

                padTo(inputDataField, inputMaskField, index)
                inputDataField.append(columnValue)
                inputMaskField.append(False)

    # Bring every column up to the full row count.  (Padding only up to the
    # last row *index* left columns missing from the final row one short,
    # and failed outright when the table had no rows.)
    for verificationField in verificationFields.values():
        padTo(verificationField.data, verificationField.mask, numRows)

    for columnName in inputData:
        padTo(inputData[columnName], inputMask[columnName], numRows)

    # The expected-output columns are also passed to the DataTable.
    for columnName, verificationField in verificationFields.items():
        inputData[columnName] = verificationField.data
        inputMask[columnName] = verificationField.mask

    model = self.getparent()

    if performanceTable is None:
        performanceTable = FakePerformanceTable()

    performanceTable.begin("make DataTable")
    dataTable = DataTable(model, inputData, inputMask, inputState=None)
    performanceTable.end("make DataTable")

    functionTable = FunctionTable()

    for miningField in model.xpath("pmml:MiningSchema/pmml:MiningField"):
        miningField.replaceField(dataTable, functionTable, performanceTable)

    for calculable in model.calculableTrans():
        calculable.calculate(dataTable, functionTable, performanceTable)

    score = model.calculateScore(dataTable, functionTable, performanceTable)
    dataTable.score = score[None]
    if model.name is not None:
        for key, value in score.items():
            if key is None:
                dataTable.fields[model.name] = value
            else:
                dataTable.fields["%s.%s" % (model.name, key)] = value

    for outputField in self.xpath("../pmml:Output/pmml:OutputField"):
        outputField.format(dataTable, functionTable, performanceTable, score)

    output = []
    for verificationField in verificationFields.values():
        observedOutput = dataTable.fields.get(verificationField["field"])

        if observedOutput is None:
            raise defs.PmmlValidationError("VerificationField references field \"%s\" but it was not produced by the model" % verificationField["field"])
        fieldType = observedOutput.fieldType

        # Object-typed output is compared numerically when every value converts to float.
        if fieldType.dataType == "object":
            try:
                newArray = [float(x) for x in observedOutput.data]
            except ValueError:
                pass
            else:
                fieldType = FakeFieldType("double", "continuous")
                observedOutput._data = newArray

        for index in xrange(len(dataTable)):
            record = {"field": verificationField["field"], "index": index}

            record["expectedMissing"] = verificationField.mask[index]
            record["observedMissing"] = (observedOutput.mask is not None and observedOutput.mask[index] != defs.VALID)

            if record["expectedMissing"] != record["observedMissing"]:
                record["success"] = False
                output.append(record)

            elif not record["expectedMissing"]:
                record["expectedValue"] = fieldType.stringToValue(verificationField.data[index])
                record["observedValue"] = observedOutput.data[index]
                record["expectedPythonValue"] = fieldType.valueToPython(record["expectedValue"])
                record["observedPythonValue"] = fieldType.valueToPython(record["observedValue"])
                record["expectedDisplayValue"] = fieldType.valueToString(record["expectedValue"])
                record["observedDisplayValue"] = fieldType.valueToString(record["observedValue"])

                if fieldType.optype == "continuous":
                    if (abs(record["expectedValue"]) <= verificationField.zeroThreshold) and (abs(record["observedValue"]) <= verificationField.zeroThreshold):
                        record["success"] = True
                    else:
                        # For a negative expected value the two products swap
                        # order, so sort them before testing the window.
                        bound1 = record["expectedValue"] * (1.0 - verificationField.precision)
                        bound2 = record["expectedValue"] * (1.0 + verificationField.precision)
                        record["success"] = (min(bound1, bound2) <= record["observedValue"] <= max(bound1, bound2))

                    if not record["success"] or showSuccess:
                        output.append(record)

                else:
                    if record["expectedValue"] != record["observedValue"]:
                        record["success"] = False
                        output.append(record)
                    else:
                        record["success"] = True
                        if showSuccess:
                            output.append(record)

    return output
def verify(self, showSuccess=False, performanceTable=None):
    """Run the model verification tests defined by this element.

    The rows of this element's table are split into model inputs and
    expected outputs (the columns named by VerificationFields), the
    parent model is evaluated on the inputs, and each expected output
    is compared with what the model produced.

    The output is a list of results (all results or only failures,
    depending on C{showSuccess}), each of which is a dictionary of
    field names to values.  Fields are:

      - "field", "index": the verified field's name and the row number.
      - "success": was the comparison successful?
      - "expectedMissing", "observedMissing": is the expected/observed
        value missing?
      - "expectedValue", "observedValue": result as an internal value.
      - "expectedPythonValue", "observedPythonValue": result as a Python
        value.
      - "expectedDisplayValue", "observedDisplayValue": result as a
        string displayValue.

    Only "field", "index", "success", "expectedMissing", and
    "observedMissing" appear if the "is missing?" comparison was
    unsuccessful.  Rows in which both the expected and observed values
    are missing produce no record.

    Continuous values succeed if both are within C{zeroThreshold} of
    zero, or if the observed value lies within the relative
    C{precision} window around the expected value; other values must
    be equal.

    @type showSuccess: bool
    @param showSuccess: If True, emit output even if the tests are successful.
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.  If None, a FakePerformanceTable is used.
    @rtype: JSON-like list of dicts
    @return: As described above.
    @raise defs.PmmlValidationError: If a VerificationField names a field that the model did not produce.
    """

    def padTo(data, mask, length):
        # Extend a column (and its missing-value mask) with padding up to the given length.
        while len(data) < length:
            data.append(defs.PADDING)
            mask.append(True)

    verificationFields = {}
    for verificationField in self.xpath("pmml:VerificationFields/pmml:VerificationField"):
        verificationField.column = verificationField.get("column", verificationField["field"])
        verificationField.precision = verificationField.get("precision", defaultFromXsd=True, convertType=True)
        verificationField.zeroThreshold = verificationField.get("zeroThreshold", defaultFromXsd=True, convertType=True)
        verificationField.data = []
        verificationField.mask = []
        verificationFields[verificationField.column] = verificationField

    inputData = {}
    inputMask = {}
    numRows = 0
    for index, row in enumerate(self.childOfClass(TableInterface).iterate()):
        numRows = index + 1
        for columnName, columnValue in row.items():
            verificationField = verificationFields.get(columnName)

            if verificationField is not None:
                # A column may be absent from earlier rows; fill the gap first.
                padTo(verificationField.data, verificationField.mask, index)
                verificationField.data.append(columnValue)
                verificationField.mask.append(False)

            else:
                inputDataField = inputData.get(columnName)
                if inputDataField is None:
                    inputDataField = []
                    inputData[columnName] = inputDataField
                    inputMask[columnName] = []
                inputMaskField = inputMask[columnName]

                padTo(inputDataField, inputMaskField, index)
                inputDataField.append(columnValue)
                inputMaskField.append(False)

    # Bring every column up to the full row count.  (Padding only up to the
    # last row *index* left columns missing from the final row one short,
    # and failed outright when the table had no rows.)
    for verificationField in verificationFields.values():
        padTo(verificationField.data, verificationField.mask, numRows)

    for columnName in inputData:
        padTo(inputData[columnName], inputMask[columnName], numRows)

    # The expected-output columns are also passed to the DataTable.
    for columnName, verificationField in verificationFields.items():
        inputData[columnName] = verificationField.data
        inputMask[columnName] = verificationField.mask

    model = self.getparent()

    if performanceTable is None:
        performanceTable = FakePerformanceTable()

    performanceTable.begin("make DataTable")
    dataTable = DataTable(model, inputData, inputMask, inputState=None)
    performanceTable.end("make DataTable")

    functionTable = FunctionTable()

    for miningField in model.xpath("pmml:MiningSchema/pmml:MiningField"):
        miningField.replaceField(dataTable, functionTable, performanceTable)

    for calculable in model.calculableTrans():
        calculable.calculate(dataTable, functionTable, performanceTable)

    score = model.calculateScore(dataTable, functionTable, performanceTable)
    dataTable.score = score[None]
    if model.name is not None:
        for key, value in score.items():
            if key is None:
                dataTable.fields[model.name] = value
            else:
                dataTable.fields["%s.%s" % (model.name, key)] = value

    for outputField in self.xpath("../pmml:Output/pmml:OutputField"):
        outputField.format(dataTable, functionTable, performanceTable, score)

    output = []
    for verificationField in verificationFields.values():
        observedOutput = dataTable.fields.get(verificationField["field"])

        if observedOutput is None:
            raise defs.PmmlValidationError("VerificationField references field \"%s\" but it was not produced by the model" % verificationField["field"])
        fieldType = observedOutput.fieldType

        # Object-typed output is compared numerically when every value converts to float.
        if fieldType.dataType == "object":
            try:
                newArray = [float(x) for x in observedOutput.data]
            except ValueError:
                pass
            else:
                fieldType = FakeFieldType("double", "continuous")
                observedOutput._data = newArray

        for index in xrange(len(dataTable)):
            record = {"field": verificationField["field"], "index": index}

            record["expectedMissing"] = verificationField.mask[index]
            record["observedMissing"] = (observedOutput.mask is not None and observedOutput.mask[index] != defs.VALID)

            if record["expectedMissing"] != record["observedMissing"]:
                record["success"] = False
                output.append(record)

            elif not record["expectedMissing"]:
                record["expectedValue"] = fieldType.stringToValue(verificationField.data[index])
                record["observedValue"] = observedOutput.data[index]
                record["expectedPythonValue"] = fieldType.valueToPython(record["expectedValue"])
                record["observedPythonValue"] = fieldType.valueToPython(record["observedValue"])
                record["expectedDisplayValue"] = fieldType.valueToString(record["expectedValue"])
                record["observedDisplayValue"] = fieldType.valueToString(record["observedValue"])

                if fieldType.optype == "continuous":
                    if (abs(record["expectedValue"]) <= verificationField.zeroThreshold) and (abs(record["observedValue"]) <= verificationField.zeroThreshold):
                        record["success"] = True
                    else:
                        # For a negative expected value the two products swap
                        # order, so sort them before testing the window.
                        bound1 = record["expectedValue"] * (1.0 - verificationField.precision)
                        bound2 = record["expectedValue"] * (1.0 + verificationField.precision)
                        record["success"] = (min(bound1, bound2) <= record["observedValue"] <= max(bound1, bound2))

                    if not record["success"] or showSuccess:
                        output.append(record)

                else:
                    if record["expectedValue"] != record["observedValue"]:
                        record["success"] = False
                        output.append(record)
                    else:
                        record["success"] = True
                        if showSuccess:
                            output.append(record)

    return output