Example #1
0
    def calc(self, inputData, inputMask=None, inputState=None, functionTable=None, performanceTable=None):
        """Wrap the input data in a DataTable and run the calculation
        on it.

        This is a convenience for interactive sessions, where building
        a DataTable by hand would be tedious.

        The FunctionTable passed in is modified by this method.

        @type inputData: dict
        @param inputData: Maps field names to data, in the form expected by the DataTable constructor.
        @type inputMask: dict or None
        @param inputMask: Maps field names to missing value masks, in the form expected by the DataTable constructor.
        @type inputState: DataTableState or None
        @param inputState: Calculation state that lets a calculation continue across several C{calc} calls.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions.  It starts with only the built-in functions; user functions defined in PMML would be added to it.  If None, a new FunctionTable is created.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.  If None, a FakePerformanceTable is used.
        @rtype: DataTable
        @return: The DataTable that was built and passed to C{calculate}.
        """

        # Fall back to freshly made tables when the caller supplies none.
        functionTable = FunctionTable() if functionTable is None else functionTable
        performanceTable = FakePerformanceTable() if performanceTable is None else performanceTable

        # Only the DataTable construction is recorded under this label.
        performanceTable.begin("make DataTable")
        table = DataTable(self, inputData, inputMask, inputState)
        performanceTable.end("make DataTable")

        self.calculate(table, functionTable, performanceTable)
        return table
Example #2
0
    def __iter__(self):
        """Iterate over the input files, yielding one DataTable per chunk.

        Every field in C{namesToFieldTypes} is first mapped to an
        extraction type ("string", "category", "integer", or
        "double").  Each file in C{fileNames} is then read through an
        InputStream, and reading of a file stops once the first array
        of a chunk holds fewer than C{chunkSize} entries.

        If C{inputState} is None, it is replaced with a new
        DataTableState, which is passed to every DataTable produced.

        @rtype: generator of DataTable
        @return: DataTables built with C{DataTable.buildManually}.
        @raise TypeError: If a field cannot be matched to an extraction type or if there are no fields at all.
        """

        integerDataTypes = (
            "integer",
            "dateDaysSince[0]",
            "dateDaysSince[1960]",
            "dateDaysSince[1970]",
            "dateDaysSince[1980]",
            "timeSeconds",
            "dateTimeSecondsSince[0]",
            "dateTimeSecondsSince[1960]",
            "dateTimeSecondsSince[1970]",
            "dateTimeSecondsSince[1980]",
        )

        extractionTypes = {}
        for rawName, fieldType in self.namesToFieldTypes.items():
            fieldName = self._removeUnicode(rawName)
            dataType = fieldType.dataType

            if dataType == "double":
                extractionTypes[fieldName] = "double"
            elif dataType in integerDataTypes:
                extractionTypes[fieldName] = "integer"
            elif dataType == "string":
                optype = fieldType.optype
                if optype in ("categorical", "ordinal"):
                    extractionTypes[fieldName] = "category"
                elif optype == "continuous":
                    extractionTypes[fieldName] = "string"

            # Anything that fell through every case above is unsupported.
            if fieldName not in extractionTypes:
                raise TypeError("Cannot match %r to an extraction type" %
                                fieldType)

        if not extractionTypes:
            raise TypeError("At least one field must be selected")

        avroPaths = self._removeUnicode(self.namesToAvroPaths)

        if self.inputState is None:
            self.inputState = DataTableState()

        for fileName in self.fileNames:
            stream = InputStream()
            stream.start(fileName, self.chunkSize, avroPaths, extractionTypes)
            try:
                while True:
                    arrays = stream.next()
                    yield DataTable.buildManually(self.namesToFieldTypes,
                                                  arrays,
                                                  inputState=self.inputState)
                    # A short chunk is taken to mean this file is exhausted.
                    if len(arrays.values()[0]) < self.chunkSize:
                        break
            finally:
                # Runs on normal exhaustion, on errors, and when the
                # generator is closed early.
                stream.close()
Example #3
0
    def emptyDataTable(self):
        """Build a DataTable with no rows from the serialized DataTableFields and DataTableState.

        Every entry of the unserialized state whose key ends in
        ".context" is read as a mapping from field names to
        (dataType, optype) pairs.  Each such field gets a FakeFieldType
        and an empty list of data.

        @rtype: DataTable
        @return: An empty DataTable, suitable for PmmlPlotContent.prepare.
        """

        restoredState = self.unserializeState()
        fieldTypes = {}
        emptyColumns = {}

        for key, entry in restoredState.iteritems():
            # Only the ".context" entries describe fields.
            if not key.endswith(".context"):
                continue

            for fieldName, (dataType, optype) in entry.iteritems():
                fieldTypes[fieldName] = FakeFieldType(dataType, optype)
                emptyColumns[fieldName] = []

        return DataTable(fieldTypes, emptyColumns, inputState=restoredState)
    def __iter__(self):
        """Iterate over the input files, yielding one DataTable per chunk.

        Every field in C{namesToFieldTypes} is first mapped to an
        extraction type ("string", "category", "integer", or
        "double").  Each file in C{fileNames} is then read through an
        InputStream, and reading of a file stops once the first array
        of a chunk holds fewer than C{chunkSize} entries.

        If C{inputState} is None, it is replaced with a new
        DataTableState, which is passed to every DataTable produced.

        @rtype: generator of DataTable
        @return: DataTables built with C{DataTable.buildManually}.
        @raise TypeError: If a field cannot be matched to an extraction type or if there are no fields at all.
        """

        integerDataTypes = (
            "integer",
            "dateDaysSince[0]",
            "dateDaysSince[1960]",
            "dateDaysSince[1970]",
            "dateDaysSince[1980]",
            "timeSeconds",
            "dateTimeSecondsSince[0]",
            "dateTimeSecondsSince[1960]",
            "dateTimeSecondsSince[1970]",
            "dateTimeSecondsSince[1980]",
        )

        extractionTypes = {}
        for rawName, fieldType in self.namesToFieldTypes.items():
            fieldName = self._removeUnicode(rawName)
            dataType = fieldType.dataType

            if dataType == "double":
                extractionTypes[fieldName] = "double"
            elif dataType in integerDataTypes:
                extractionTypes[fieldName] = "integer"
            elif dataType == "string":
                optype = fieldType.optype
                if optype in ("categorical", "ordinal"):
                    extractionTypes[fieldName] = "category"
                elif optype == "continuous":
                    extractionTypes[fieldName] = "string"

            # Anything that fell through every case above is unsupported.
            if fieldName not in extractionTypes:
                raise TypeError("Cannot match %r to an extraction type" % fieldType)

        if not extractionTypes:
            raise TypeError("At least one field must be selected")

        avroPaths = self._removeUnicode(self.namesToAvroPaths)

        if self.inputState is None:
            self.inputState = DataTableState()

        for fileName in self.fileNames:
            stream = InputStream()
            stream.start(fileName, self.chunkSize, avroPaths, extractionTypes)
            try:
                while True:
                    arrays = stream.next()
                    yield DataTable.buildManually(self.namesToFieldTypes, arrays, inputState=self.inputState)
                    # A short chunk is taken to mean this file is exhausted.
                    if len(arrays.values()[0]) < self.chunkSize:
                        break
            finally:
                # Runs on normal exhaustion, on errors, and when the
                # generator is closed early.
                stream.close()
Example #5
0
    def calc(self,
             inputData,
             inputMask=None,
             inputState=None,
             functionTable=None,
             performanceTable=None):
        """Build a DataTable from the input data and return the plot
        drawn from it.

        This is a convenience for interactive sessions, where building
        a DataTable by hand would be tedious.

        The FunctionTable passed in is modified by this method.

        Note that PmmlCalculables return a DataTable from C{calc},
        whereas PlotCanvas returns an SvgBinding.

        @type inputData: dict
        @param inputData: Maps field names to data, in the form expected by the DataTable constructor.
        @type inputMask: dict or None
        @param inputMask: Maps field names to missing value masks, in the form expected by the DataTable constructor.
        @type inputState: DataTableState or None
        @param inputState: Calculation state that lets a calculation continue across several C{calc} calls.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions.  It starts with only the built-in functions; user functions defined in PMML would be added to it.  If None, a new FunctionTable is created.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.  If None, a FakePerformanceTable is used.
        @rtype: SvgBinding
        @return: A complete SVG image representing the fully drawn plot.
        """

        # Fall back to freshly made tables when the caller supplies none.
        functionTable = FunctionTable() if functionTable is None else functionTable
        performanceTable = FakePerformanceTable() if performanceTable is None else performanceTable

        # Only the DataTable construction is recorded under this label.
        performanceTable.begin("make DataTable")
        table = DataTable(self, inputData, inputMask, inputState)
        performanceTable.end("make DataTable")

        return self.makePlot(table, functionTable, performanceTable)
Example #6
0
    def expressionsToPoints(cls, expression, derivative, samples, loop,
                            functionTable, performanceTable):
        """Evaluate a set of given string-based formulae to generate
        numeric points.

        This is used to plot mathematical curves.

        The returned C{dxlist} and C{dylist} are displacements, not
        slopes: when no derivative formula is given they are central
        differences of the points themselves, and when a derivative
        formula is given its values are multiplied by the central
        difference of the independent variable (x or t).  Samples that
        are masked as not valid in any evaluated column are dropped
        from all four arrays.

        @type expression: 1- or 2-tuple of strings
        @param expression: If a 1-tuple, the string is passed to Formula and interpreted as y(x); if a 2-tuple, the strings are passed to Formula and interpreted as x(t), y(t).
        @type derivative: 1- or 2-tuple of strings (same length as C{expression})
        @param derivative: Strings are passed to Formula and interpreted as dy/dx (if a 1-tuple) or dx/dt, dy/dt (if a 2-tuple).  If the first item is None, derivatives are estimated by finite differences.
        @type samples: 1d Numpy array
        @param samples: Values of x or t at which to evaluate the expression or expressions.
        @type loop: bool
        @param loop: If False, disconnect the end of the set of points from the beginning.
        @type functionTable: FunctionTable
        @param functionTable: Functions that may be used to perform the calculation.
        @type performanceTable: PerformanceTable
        @param performanceTable: Measures and records performance (time and memory consumption) of the process.
        @rtype: 6-tuple
        @return: C{xlist}, C{ylist}, C{dxlist}, C{dylist} (1d Numpy arrays), xfieldType, yfieldType (FieldTypes).
        @raise PmmlValidationError: If a formula evaluates to something that is neither numeric nor temporal.
        @raise ValueError: If C{expression} is not a 1-tuple or a 2-tuple.
        """

        def evaluateNumeric(formula, sampleTable, label):
            # Parse and evaluate one formula, insisting on a numeric or temporal result.
            column = Formula.parse(formula).evaluate(sampleTable,
                                                     functionTable,
                                                     performanceTable)
            fieldType = column.fieldType
            if not fieldType.isnumeric() and not fieldType.istemporal():
                raise defs.PmmlValidationError(
                    "PlotFormula %s must return a numeric expression, not %r"
                    % (label, fieldType))
            return column

        def restrictToValid(selection, column):
            # Narrow the running selection to the entries that are valid in this column.
            if column.mask is None:
                return selection
            valid = NP(column.mask == defs.VALID)
            if selection is None:
                return valid
            # The selection array is always one created here, so it is safe to update in place.
            NP("logical_and", selection, valid, selection)
            return selection

        def pick(values, selection):
            # Apply the selection, or pass everything through when nothing is masked.
            if selection is None:
                return values
            return values[selection]

        def centralDifference(values):
            # Half the difference between each entry's two neighbors, wrapping around at the ends.
            return NP((NP("roll", values, -1) - NP("roll", values, 1)) / 2.0)

        if len(expression) == 1:
            sampleTable = DataTable({"x": "double"}, {"x": samples})

            ydataColumn = evaluateNumeric(expression[0], sampleTable, "y(x)")

            xfieldType = cls.xfieldType
            yfieldType = ydataColumn.fieldType

            selection = restrictToValid(None, ydataColumn)

            if derivative[0] is None:
                xlist = pick(samples, selection)
                ylist = pick(ydataColumn.data, selection)
                dxlist = centralDifference(xlist)
                dylist = centralDifference(ylist)

            else:
                dydataColumn = evaluateNumeric(derivative[0], sampleTable,
                                               "dy/dx")
                selection = restrictToValid(selection, dydataColumn)

                xlist = pick(samples, selection)
                ylist = pick(ydataColumn.data, selection)
                dxlist = centralDifference(xlist)
                # Scale the slope by the x step whether or not anything is
                # masked, so that dylist is a displacement like dxlist.  The
                # product is also a new array, so zeroing its endpoints below
                # never writes into the evaluated column's data.
                dylist = NP(pick(dydataColumn.data, selection) * dxlist)

        elif len(expression) == 2:
            sampleTable = DataTable({"t": "double"}, {"t": samples})

            xdataColumn = evaluateNumeric(expression[0], sampleTable, "x(t)")
            ydataColumn = evaluateNumeric(expression[1], sampleTable, "y(t)")

            xfieldType = xdataColumn.fieldType
            yfieldType = ydataColumn.fieldType

            selection = restrictToValid(None, xdataColumn)
            selection = restrictToValid(selection, ydataColumn)

            if derivative[0] is None:
                xlist = pick(xdataColumn.data, selection)
                ylist = pick(ydataColumn.data, selection)
                dxlist = centralDifference(xlist)
                dylist = centralDifference(ylist)

            else:
                dxdataColumn = evaluateNumeric(derivative[0], sampleTable,
                                               "dx/dt")
                dydataColumn = evaluateNumeric(derivative[1], sampleTable,
                                               "dy/dt")

                selection = restrictToValid(selection, dxdataColumn)
                selection = restrictToValid(selection, dydataColumn)

                dt = centralDifference(pick(samples, selection))

                xlist = pick(xdataColumn.data, selection)
                ylist = pick(ydataColumn.data, selection)
                dxlist = NP(pick(dxdataColumn.data, selection) * dt)
                dylist = NP(pick(dydataColumn.data, selection) * dt)

        else:
            # Fail with a clear message instead of an UnboundLocalError at the return.
            raise ValueError(
                "expression must be a 1-tuple (y(x)) or a 2-tuple (x(t), y(t)), not a sequence of length %d"
                % len(expression))

        if not loop:
            # The wrapped-around differences at the two ends are meaningless for an open curve.
            dxlist[0] = 0.0
            dxlist[-1] = 0.0
            dylist[0] = 0.0
            dylist[-1] = 0.0

        return xlist, ylist, dxlist, dylist, xfieldType, yfieldType
Example #7
0
    def prepare(self, state, dataTable, functionTable, performanceTable,
                plotRange):
        """Prepare a plot element for drawing.

        This stage consists of calculating all quantities and
        determing the bounds of the data.  These bounds may be unioned
        with bounds from other plot elements that overlay this plot
        element, so the drawing (which requires a finalized coordinate
        system) cannot begin yet.

        This method modifies C{plotRange}.

        @type state: ad-hoc Python object
        @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
        @type dataTable: DataTable
        @param dataTable: Contains the data to plot.
        @type functionTable: FunctionTable
        @param functionTable: Defines functions that may be used to transform data for plotting.
        @type performanceTable: PerformanceTable
        @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process.
        @type plotRange: PlotRange
        @param plotRange: The bounding box of plot coordinates that this function will expand.
        """

        self.checkRoles(["z(x,y)", "x", "y", "zmean", "zweight"])

        performanceTable.begin("PlotHeatMap prepare")
        self._saveContext(dataTable)

        zofxy = self.xpath("pmml:PlotFormula[@role='z(x,y)']")
        xexpr = self.xpath("pmml:PlotNumericExpression[@role='x']")
        yexpr = self.xpath("pmml:PlotNumericExpression[@role='y']")
        zmean = self.xpath("pmml:PlotNumericExpression[@role='zmean']")
        zweight = self.xpath("pmml:PlotNumericExpression[@role='zweight']")
        cutExpression = self.xpath("pmml:PlotSelection")

        if len(zofxy) == 1 and len(xexpr) == 0 and len(yexpr) == 0 and len(
                zmean) == 0 and len(zweight) == 0:
            xbins = self.get("xbins", convertType=True)
            xlow = self.get("xlow", convertType=True)
            xhigh = self.get("xhigh", convertType=True)
            ybins = self.get("ybins", convertType=True)
            ylow = self.get("ylow", convertType=True)
            yhigh = self.get("yhigh", convertType=True)

            if xbins is None or xlow is None or xhigh is None or ybins is None or ylow is None or yhigh is None:
                raise defs.PmmlValidationError(
                    "xbins, xlow, xhigh, ybins, ylow, and yhigh are required for HeatMaps of a mathematical formula"
                )

            if xlow >= xhigh or ylow >= yhigh:
                raise defs.PmmlValidationError(
                    "xlow must be less than xhigh and ylow must be less than yhigh"
                )

            if plotRange.xStrictlyPositive or plotRange.yStrictlyPositive:
                raise defs.PmmlValidationError(
                    "PlotHeatMap can only be properly displayed in linear x, y coordinates"
                )

            xbinWidth = (xhigh - xlow) / float(xbins)
            ybinWidth = (yhigh - ylow) / float(ybins)

            xarray = NP("tile",
                        NP("linspace", xlow, xhigh, xbins, endpoint=True),
                        ybins)
            yarray = NP("repeat",
                        NP("linspace", ylow, yhigh, ybins, endpoint=True),
                        xbins)

            sampleTable = DataTable({
                "x": "double",
                "y": "double"
            }, {
                "x": xarray,
                "y": yarray
            })
            parsed = Formula.parse(zofxy[0].text)

            performanceTable.pause("PlotHeatMap prepare")
            zdataColumn = parsed.evaluate(sampleTable, functionTable,
                                          performanceTable)
            performanceTable.unpause("PlotHeatMap prepare")
            if not zdataColumn.fieldType.isnumeric():
                raise defs.PmmlValidationError(
                    "PlotFormula z(x,y) must return a numeric expression, not %r"
                    % zdataColumn.fieldType)

            selection = NP("isfinite", zdataColumn.data)
            if zdataColumn.mask is not None:
                NP("logical_and", selection,
                   NP(zdataColumn.mask == defs.VALID), selection)
            if plotRange.zStrictlyPositive:
                NP("logical_and", selection, NP(zdataColumn.data > 0.0),
                   selection)

            gooddata = zdataColumn.data[selection]
            plotRange.zminPush(gooddata.min(),
                               zdataColumn.fieldType,
                               sticky=False)
            plotRange.zmaxPush(gooddata.max(),
                               zdataColumn.fieldType,
                               sticky=False)

            state.zdata = zdataColumn.data
            state.zmask = NP("logical_not", selection) * defs.INVALID

        elif len(zofxy) == 0 and len(xexpr) == 1 and len(yexpr) == 1:
            performanceTable.pause("PlotHeatMap prepare")
            xdataColumn = xexpr[0].evaluate(dataTable, functionTable,
                                            performanceTable)
            ydataColumn = yexpr[0].evaluate(dataTable, functionTable,
                                            performanceTable)
            performanceTable.unpause("PlotHeatMap prepare")

            xbins = self.get("xbins", convertType=True)
            xlow = self.get("xlow", convertType=True)
            xhigh = self.get("xhigh", convertType=True)
            ybins = self.get("ybins", convertType=True)
            ylow = self.get("ylow", convertType=True)
            yhigh = self.get("yhigh", convertType=True)

            if len(xdataColumn) > 0:
                if xlow is None: xlow = NP("nanmin", xdataColumn.data)
                if xhigh is None: xhigh = NP("nanmax", xdataColumn.data)
                if ylow is None: ylow = NP("nanmin", ydataColumn.data)
                if yhigh is None: yhigh = NP("nanmax", ydataColumn.data)
            else:
                if xlow is None: xlow = 0.0
                if xhigh is None: xhigh = 1.0
                if ylow is None: ylow = 0.0
                if yhigh is None: yhigh = 1.0

            if xbins is None:
                q1, q3 = NP("percentile", xdataColumn.data, [25.0, 75.0])
                binWidth = 2.0 * (q3 - q1) / math.pow(len(xdataColumn.data),
                                                      1.0 / 3.0)
                if binWidth > 0.0:
                    xbins = max(10, int(math.ceil((xhigh - xlow) / binWidth)))
                else:
                    xbins = 10

            if ybins is None:
                q1, q3 = NP("percentile", ydataColumn.data, [25.0, 75.0])
                binWidth = 2.0 * (q3 - q1) / math.pow(len(ydataColumn.data),
                                                      1.0 / 3.0)
                if binWidth > 0.0:
                    ybins = max(10, int(math.ceil((yhigh - ylow) / binWidth)))
                else:
                    ybins = 10

            if xlow >= xhigh or ylow >= yhigh:
                raise defs.PmmlValidationError(
                    "xlow must be less than xhigh and ylow must be less than yhigh"
                )

            if plotRange.xStrictlyPositive or plotRange.yStrictlyPositive:
                raise defs.PmmlValidationError(
                    "PlotHeatMap can only be properly displayed in linear x, y coordinates"
                )

            persistentState = {}
            stateId = self.get("stateId")
            if stateId is not None:
                if stateId in dataTable.state:
                    persistentState = dataTable.state[stateId]
                else:
                    dataTable.state[stateId] = persistentState

            if len(zmean) == 0:
                if "xbins" in persistentState: xbins = persistentState["xbins"]
                if "xlow" in persistentState: xlow = persistentState["xlow"]
                if "xhigh" in persistentState: xhigh = persistentState["xhigh"]
                if "ybins" in persistentState: ybins = persistentState["ybins"]
                if "ylow" in persistentState: ylow = persistentState["ylow"]
                if "yhigh" in persistentState: yhigh = persistentState["yhigh"]

                persistentState["xbins"] = xbins
                persistentState["xlow"] = xlow
                persistentState["xhigh"] = xhigh
                persistentState["ybins"] = ybins
                persistentState["ylow"] = ylow
                persistentState["yhigh"] = yhigh

            xbinWidth = (xhigh - xlow) / float(xbins)
            ybinWidth = (yhigh - ylow) / float(ybins)

            mask = NP("ones", len(dataTable), dtype=NP.dtype(float))
            if xdataColumn.mask is not None:
                NP("multiply", mask, (xdataColumn.mask == defs.VALID), mask)
            if ydataColumn.mask is not None:
                NP("multiply", mask, (ydataColumn.mask == defs.VALID), mask)

            if len(cutExpression) == 1:
                performanceTable.pause("PlotHeatMap prepare")
                NP(
                    "multiply", mask,
                    cutExpression[0].select(dataTable, functionTable,
                                            performanceTable), mask)
                performanceTable.unpause("PlotHeatMap prepare")

            if len(zmean) == 0 and len(zweight) == 0:
                histogram, xedges, yedges = NP("histogram2d",
                                               ydataColumn.data,
                                               xdataColumn.data,
                                               bins=(ybins, xbins),
                                               range=[[ylow, yhigh],
                                                      [xlow, xhigh]],
                                               weights=mask)
                if len(dataTable) == 0:
                    # work around Numpy <= 1.6.1 bug
                    histogram = NP("zeros", (ybins, xbins),
                                   dtype=NP.dtype(float))

                if "histogram" in persistentState:
                    persistentState["histogram"] = NP(
                        persistentState["histogram"] + histogram)
                else:
                    persistentState["histogram"] = histogram

                histogram = persistentState["histogram"]

                if plotRange.zStrictlyPositive:
                    zmin = 0.1
                else:
                    zmin = 0.0
                zmax = NP("nanmax", histogram)

                plotRange.zminPush(zmin, self.zfieldType, sticky=True)
                if zmax > zmin:
                    plotRange.zmaxPush(zmax, self.zfieldType, sticky=False)

            elif len(zmean) == 0 and len(zweight) == 1:
                performanceTable.pause("PlotHeatMap prepare")
                weightsDataColumn = zweight[0].evaluate(
                    dataTable, functionTable, performanceTable)
                performanceTable.unpause("PlotHeatMap prepare")

                if weightsDataColumn.mask is not None:
                    NP("multiply", mask,
                       (weightsDataColumn.mask == defs.VALID), mask)
                weights = NP(weightsDataColumn.data * mask)

                histogram, xedges, yedges = NP("histogram2d",
                                               ydataColumn.data,
                                               xdataColumn.data,
                                               bins=(ybins, xbins),
                                               range=[[ylow, yhigh],
                                                      [xlow, xhigh]],
                                               weights=weights)

                if "histogram" in persistentState:
                    persistentState["histogram"] = NP(
                        persistentState["histogram"] + histogram)
                else:
                    persistentState["histogram"] = histogram

                histogram = persistentState["histogram"]

                if plotRange.zStrictlyPositive:
                    w = weights[NP(weights > 0.0)]
                    if len(w) > 0:
                        zmin = 0.1 * NP("nanmin", w)
                    else:
                        zmin = 0.1
                else:
                    zmin = 0.0
                zmax = NP("nanmax", histogram)

                plotRange.zminPush(zmin, self.zfieldType, sticky=True)
                if zmax > zmin:
                    plotRange.zmaxPush(zmax, self.zfieldType, sticky=False)

            elif len(zmean) == 1 and len(zweight) == 0:
                performanceTable.pause("PlotHeatMap prepare")
                zdataColumn = zmean[0].evaluate(dataTable, functionTable,
                                                performanceTable)
                performanceTable.unpause("PlotHeatMap prepare")

                if zdataColumn.mask is not None:
                    NP("multiply", mask, (zdataColumn.mask == defs.VALID),
                       mask)
                weights = NP(zdataColumn.data * mask)

                numer, xedges, yedges = NP("histogram2d",
                                           ydataColumn.data,
                                           xdataColumn.data,
                                           bins=(ybins, xbins),
                                           range=[[ylow, yhigh], [xlow,
                                                                  xhigh]],
                                           weights=weights)
                denom, xedges, yedges = NP("histogram2d",
                                           ydataColumn.data,
                                           xdataColumn.data,
                                           bins=(ybins, xbins),
                                           range=[[ylow, yhigh], [xlow,
                                                                  xhigh]],
                                           weights=mask)

                if "numer" in persistentState:
                    persistentState["numer"] = NP(persistentState["numer"] +
                                                  numer)
                    persistentState["denom"] = NP(persistentState["denom"] +
                                                  denom)
                else:
                    persistentState["numer"] = numer
                    persistentState["denom"] = denom

                numer = persistentState["numer"]
                denom = persistentState["denom"]
                histogram = numer / denom

                selection = NP("isfinite", histogram)
                if plotRange.zStrictlyPositive:
                    NP("logical_and", selection, NP(histogram > 0.0),
                       selection)

                if NP("count_nonzero", selection) > 0:
                    gooddata = histogram[selection]
                    plotRange.zminPush(gooddata.min(),
                                       self.zfieldType,
                                       sticky=False)
                    plotRange.zmaxPush(gooddata.max(),
                                       self.zfieldType,
                                       sticky=False)

            else:
                raise defs.PmmlValidationError(
                    "The only allowed combinations of PlotFormula/PlotNumericExpressions are: \"z(x,y)\" (function), \"x y\" (histogram), \"x y zmean\" (mean of z in x y bins), \"x y zweight\" (weighted x y histogram)"
                )

            state.zdata = NP("reshape", histogram, xbins * ybins)
            state.zmask = None

        else:
            raise defs.PmmlValidationError(
                "The only allowed combinations of PlotFormula/PlotNumericExpressions are: \"z(x,y)\" (function), \"x y\" (histogram), \"x y zmean\" (mean of z in x y bins), \"x y zweight\" (weighted x y histogram)"
            )

        plotRange.xminPush(xlow, self.xyfieldType, sticky=True)
        plotRange.yminPush(ylow, self.xyfieldType, sticky=True)
        plotRange.xmaxPush(xhigh, self.xyfieldType, sticky=True)
        plotRange.ymaxPush(yhigh, self.xyfieldType, sticky=True)

        state.xbins = xbins
        state.xlow = xlow
        state.xhigh = xhigh
        state.ybins = ybins
        state.ylow = ylow
        state.yhigh = yhigh

        performanceTable.end("PlotHeatMap prepare")
Example #8
0
    def verify(self, showSuccess=False, performanceTable=None):
        """Run the model verification tests defined by this element.

        The verification table is split into model inputs and expected
        outputs (the columns named by VerificationFields), the parent
        model is evaluated on the inputs, and each expected output is
        compared with what the model produced.

        The output is a list of results (all results or only failures,
        depending on C{showSuccess}), each of which is a dictionary of
        field names to values.  Fields are:

          - "field", "index": the verified field name and the row
             number within the verification table.
          - "success": was the comparison successful?
          - "expectedMissing", "observedMissing": is the
             expected/observed value missing?
          - "expectedValue", "observedValue": result as an internal
             value.
          - "expectedPythonValue", "observedPythonValue": result as a
             Python value.
          - "expectedDisplayValue", "observedDisplayValue": result as
             a string displayValue.

        Only "field", "index", "success", "expectedMissing", and
        "observedMissing" appear if the "is missing?" comparison was
        unsuccessful.  Rows in which the value is both expected and
        observed to be missing produce no record.

        @type showSuccess: bool
        @param showSuccess: If True, emit output even if the tests are successful.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: JSON-like list of dicts
        @return: As described above.
        @raise PmmlValidationError: If a VerificationField refers to a field that is not in the DataTable after the model has been evaluated.
        """

        verificationFields = {}
        for verificationField in self.xpath("pmml:VerificationFields/pmml:VerificationField"):
            verificationField.column = verificationField.get("column", verificationField["field"])
            verificationField.precision = verificationField.get("precision", defaultFromXsd=True, convertType=True)
            verificationField.zeroThreshold = verificationField.get("zeroThreshold", defaultFromXsd=True, convertType=True)

            verificationField.data = []
            verificationField.mask = []
            verificationFields[verificationField.column] = verificationField

        def padTo(data, mask, length):
            # Grow a column and its mask to the given length; padded cells are flagged True (missing).
            shortfall = length - len(data)
            if shortfall > 0:
                data.extend([defs.PADDING] * shortfall)
                mask.extend([True] * shortfall)

        inputData = {}
        inputMask = {}
        # Counted explicitly so that an empty table is handled and the final padding
        # reaches the full number of rows rather than the last row index.
        numRows = 0
        for index, row in enumerate(self.childOfClass(TableInterface).iterate()):
            numRows = index + 1
            for columnName, columnValue in row.items():
                verificationField = verificationFields.get(columnName)
                if verificationField is not None:
                    data, mask = verificationField.data, verificationField.mask
                else:
                    data = inputData.setdefault(columnName, [])
                    mask = inputMask.setdefault(columnName, [])

                # Rows that omitted this column so far are filled in as missing.
                padTo(data, mask, index)
                data.append(columnValue)
                mask.append(False)

        for verificationField in verificationFields.values():
            padTo(verificationField.data, verificationField.mask, numRows)

        for columnName, data in inputData.items():
            padTo(data, inputMask[columnName], numRows)

        # The expected outputs travel through the DataTable alongside the inputs,
        # keyed by their table column name.
        for columnName, verificationField in verificationFields.items():
            inputData[columnName] = verificationField.data
            inputMask[columnName] = verificationField.mask

        model = self.getparent()

        if performanceTable is None:
            performanceTable = FakePerformanceTable()

        performanceTable.begin("make DataTable")
        dataTable = DataTable(model, inputData, inputMask, inputState=None)
        performanceTable.end("make DataTable")

        functionTable = FunctionTable()

        for miningField in model.xpath("pmml:MiningSchema/pmml:MiningField"):
            miningField.replaceField(dataTable, functionTable, performanceTable)

        for calculable in model.calculableTrans():
            calculable.calculate(dataTable, functionTable, performanceTable)

        score = model.calculateScore(dataTable, functionTable, performanceTable)
        dataTable.score = score[None]
        if model.name is not None:
            for key, value in score.items():
                if key is None:
                    dataTable.fields[model.name] = value
                else:
                    dataTable.fields["%s.%s" % (model.name, key)] = value

        for outputField in self.xpath("../pmml:Output/pmml:OutputField"):
            outputField.format(dataTable, functionTable, performanceTable, score)

        output = []
        for verificationField in verificationFields.values():
            fieldName = verificationField["field"]
            observedOutput = dataTable.fields.get(fieldName)

            if observedOutput is None:
                raise defs.PmmlValidationError("VerificationField references field \"%s\" but it was not produced by the model" % fieldName)
            fieldType = observedOutput.fieldType

            if fieldType.dataType == "object":
                # Best effort: compare object-typed output numerically when every value converts to float.
                try:
                    newArray = [float(x) for x in observedOutput.data]
                except (ValueError, TypeError):
                    pass
                else:
                    fieldType = FakeFieldType("double", "continuous")
                    observedOutput._data = newArray

            for index in xrange(len(dataTable)):
                record = {"field": fieldName, "index": index}

                record["expectedMissing"] = verificationField.mask[index]
                record["observedMissing"] = (observedOutput.mask is not None and observedOutput.mask[index] != defs.VALID)

                if record["expectedMissing"] != record["observedMissing"]:
                    record["success"] = False
                    output.append(record)

                elif not record["expectedMissing"]:
                    expected = fieldType.stringToValue(verificationField.data[index])
                    observed = observedOutput.data[index]

                    record["expectedValue"] = expected
                    record["observedValue"] = observed
                    record["expectedPythonValue"] = fieldType.valueToPython(expected)
                    record["observedPythonValue"] = fieldType.valueToPython(observed)
                    record["expectedDisplayValue"] = fieldType.valueToString(expected)
                    record["observedDisplayValue"] = fieldType.valueToString(observed)

                    if fieldType.optype == "continuous":
                        zeroThreshold = verificationField.zeroThreshold
                        if abs(expected) <= zeroThreshold and abs(observed) <= zeroThreshold:
                            record["success"] = True
                        else:
                            # The relative window must be ordered explicitly: for a negative
                            # expected value, expected*(1 - precision) is the larger bound.
                            boundA = expected * (1.0 - verificationField.precision)
                            boundB = expected * (1.0 + verificationField.precision)
                            record["success"] = (min(boundA, boundB) <= observed <= max(boundA, boundB))
                    else:
                        record["success"] = not (expected != observed)

                    if not record["success"] or showSuccess:
                        output.append(record)

        return output
Example #9
0
    def verify(self, showSuccess=False, performanceTable=None):
        """Run the model verification tests defined by this element.

        Columns of the verification table that are named by a
        VerificationField are treated as expected outputs; all other
        columns are model inputs.  The parent model is evaluated on
        the inputs and its results are compared with the expectations.

        The output is a list of results (all results or only failures,
        depending on C{showSuccess}), each of which is a dictionary of
        field names to values.  Fields are:

          - "field", "index": the verified field name and the row
             number within the verification table.
          - "success": was the comparison successful?
          - "expectedMissing", "observedMissing": is the
             expected/observed value missing?
          - "expectedValue", "observedValue": result as an internal
             value.
          - "expectedPythonValue", "observedPythonValue": result as a
             Python value.
          - "expectedDisplayValue", "observedDisplayValue": result as
             a string displayValue.

        Only "field", "index", "success", "expectedMissing", and
        "observedMissing" appear if the "is missing?" comparison was
        unsuccessful.  Rows in which the value is both expected and
        observed to be missing produce no record.

        @type showSuccess: bool
        @param showSuccess: If True, emit output even if the tests are successful.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: JSON-like list of dicts
        @return: As described above.
        @raise PmmlValidationError: If a VerificationField refers to a field that is not in the DataTable after the model has been evaluated.
        """

        verificationFields = {}
        for verificationField in self.xpath(
                "pmml:VerificationFields/pmml:VerificationField"):
            verificationField.column = verificationField.get(
                "column", verificationField["field"])
            verificationField.precision = verificationField.get(
                "precision", defaultFromXsd=True, convertType=True)
            verificationField.zeroThreshold = verificationField.get(
                "zeroThreshold", defaultFromXsd=True, convertType=True)

            verificationField.data = []
            verificationField.mask = []
            verificationFields[verificationField.column] = verificationField

        def fillMissing(values, flags, targetLength):
            # Append padding cells (flagged True, i.e. missing) until the
            # column holds targetLength entries.
            gap = targetLength - len(values)
            if gap > 0:
                values.extend([defs.PADDING] * gap)
                flags.extend([True] * gap)

        inputData = {}
        inputMask = {}
        # Tracked separately from the loop variable: it stays defined for an
        # empty table and equals the row count, not the last row index.
        rowCount = 0
        for index, row in enumerate(
                self.childOfClass(TableInterface).iterate()):
            rowCount = index + 1
            for columnName, columnValue in row.items():
                verificationField = verificationFields.get(columnName)
                if verificationField is not None:
                    values = verificationField.data
                    flags = verificationField.mask
                else:
                    values = inputData.setdefault(columnName, [])
                    flags = inputMask.setdefault(columnName, [])

                # Earlier rows that lacked this column become missing values.
                fillMissing(values, flags, index)
                values.append(columnValue)
                flags.append(False)

        for verificationField in verificationFields.values():
            fillMissing(verificationField.data, verificationField.mask,
                        rowCount)

        for columnName, values in inputData.items():
            fillMissing(values, inputMask[columnName], rowCount)

        # Expected outputs are loaded into the DataTable under their table
        # column names, next to the model inputs.
        for columnName, verificationField in verificationFields.items():
            inputData[columnName] = verificationField.data
            inputMask[columnName] = verificationField.mask

        model = self.getparent()

        if performanceTable is None:
            performanceTable = FakePerformanceTable()

        performanceTable.begin("make DataTable")
        dataTable = DataTable(model, inputData, inputMask, inputState=None)
        performanceTable.end("make DataTable")

        functionTable = FunctionTable()

        for miningField in model.xpath("pmml:MiningSchema/pmml:MiningField"):
            miningField.replaceField(dataTable, functionTable,
                                     performanceTable)

        for calculable in model.calculableTrans():
            calculable.calculate(dataTable, functionTable, performanceTable)

        score = model.calculateScore(dataTable, functionTable,
                                     performanceTable)
        dataTable.score = score[None]
        if model.name is not None:
            for key, value in score.items():
                if key is None:
                    dataTable.fields[model.name] = value
                else:
                    dataTable.fields["%s.%s" % (model.name, key)] = value

        for outputField in self.xpath("../pmml:Output/pmml:OutputField"):
            outputField.format(dataTable, functionTable, performanceTable,
                               score)

        output = []
        for verificationField in verificationFields.values():
            fieldName = verificationField["field"]
            observedOutput = dataTable.fields.get(fieldName)

            if observedOutput is None:
                raise defs.PmmlValidationError(
                    "VerificationField references field \"%s\" but it was not produced by the model"
                    % fieldName)
            fieldType = observedOutput.fieldType

            if fieldType.dataType == "object":
                # Best effort: if every object converts to float, compare the
                # column numerically; otherwise leave it untouched.
                try:
                    newArray = [float(x) for x in observedOutput.data]
                except (ValueError, TypeError):
                    pass
                else:
                    fieldType = FakeFieldType("double", "continuous")
                    observedOutput._data = newArray

            precision = verificationField.precision
            zeroThreshold = verificationField.zeroThreshold

            for index in xrange(len(dataTable)):
                record = {"field": fieldName, "index": index}

                record["expectedMissing"] = verificationField.mask[index]
                record["observedMissing"] = (
                    observedOutput.mask is not None
                    and observedOutput.mask[index] != defs.VALID)

                if record["expectedMissing"] != record["observedMissing"]:
                    record["success"] = False
                    output.append(record)
                    continue

                if record["expectedMissing"]:
                    # Missing on both sides: nothing to compare or report.
                    continue

                expected = fieldType.stringToValue(
                    verificationField.data[index])
                observed = observedOutput.data[index]

                record["expectedValue"] = expected
                record["observedValue"] = observed
                record["expectedPythonValue"] = fieldType.valueToPython(
                    expected)
                record["observedPythonValue"] = fieldType.valueToPython(
                    observed)
                record["expectedDisplayValue"] = fieldType.valueToString(
                    expected)
                record["observedDisplayValue"] = fieldType.valueToString(
                    observed)

                if fieldType.optype == "continuous":
                    if (abs(expected) <= zeroThreshold
                            and abs(observed) <= zeroThreshold):
                        record["success"] = True
                    else:
                        # Order the relative window explicitly: when expected
                        # is negative, expected*(1 - precision) is the upper
                        # bound, not the lower one.
                        edgeA = expected * (1.0 - precision)
                        edgeB = expected * (1.0 + precision)
                        record["success"] = (
                            min(edgeA, edgeB) <= observed <= max(edgeA, edgeB))
                else:
                    record["success"] = not (expected != observed)

                if not record["success"] or showSuccess:
                    output.append(record)

        return output