Example #1
0
    def singleton(self, inputData, inputMask=None, inputState=None):
        """Create a single-row DataTable for event-based processes.

        This method is an alternative to the DataTable constructor: it
        creates a DataTable with only one row and it uses the Python
        data type of each C{inputData} value to define a field type,
        rather than an explicit C{context}.

        @type inputData: dict-like mapping from strings to single values (not lists)
        @param inputData: A single data record.
        @type inputMask: dict-like mapping from strings to single C{defs.maskType} values (not lists), or None
        @param inputMask: A single mask.
        @type inputState: DataTableState or None
        @param inputState: Initial state of the DataTable.  To continue a previous calculation, use the C{dataTable.state} from the previous calculation.
        @rtype: DataTable
        @return: A one-row DataTable built from C{inputData}.
        """

        dataColumns = OrderedDict()
        for fieldName in sorted(inputData.keys()):
            value = inputData[fieldName]

            if isinstance(value, basestring):
                fieldType = FakeFieldType("string", "continuous")

            # bool must be tested before int: bool is a subclass of int in
            # Python, so the int branch would otherwise shadow this one and
            # True/False would be typed "integer" instead of "boolean".
            elif isinstance(value, bool):
                fieldType = FakeFieldType("boolean", "continuous")

            elif isinstance(value, float):
                fieldType = FakeFieldType("double", "continuous")
            elif isinstance(value, int):
                fieldType = FakeFieldType("integer", "continuous")

            # TODO: PMML date types (when passed a datetype.datetype object)
            # NOTE(review): Python 2 long values fall through to the generic
            # object case below — confirm whether they should be "integer".

            else:
                # fall-back: store the raw Python object
                fieldType = FakeFieldType("object", "any")

            # one-element array holding this field's single value
            data = NP("empty", 1, dtype=fieldType.dtype)
            data[0] = value

            if inputMask is None or inputMask.get(fieldName) is None:
                mask = None
            else:
                mask = NP("empty", 1, dtype=defs.maskType)
                mask[0] = inputMask.get(fieldName)

            dataColumns[fieldName] = DataColumn(fieldType, data, mask)

        # bypass __init__ and configure the instance directly, since the
        # columns have already been built here
        dataTable = DataTable.__new__(DataTable)
        dataTable._configure(dataColumns, inputState)
        return dataTable
Example #2
0
    def functionAverageFake(self, value, howmany, fieldType):
        """Averages rows in a DataColumn when it is known that there are no matches.

        @type value: 2-sequence of numbers
        @param value: Pair of (numerator, denominator); every output row is C{value[0] / value[1]}.
        @type howmany: int
        @param howmany: Number of rows.
        @type fieldType: FieldType
        @param fieldType: The type of field to emulate.  NOTE(review): this argument is currently ignored; the output is always a "double"/"continuous" type.
        @rtype: DataColumn
        @return: The faked results.
        """

        # The output type is fixed to double/continuous, overriding the
        # fieldType argument.
        fieldType = FakeFieldType("double", "continuous")
        numerator = NP("empty", howmany, dtype=fieldType.dtype)
        denominator = NP("empty", howmany, dtype=fieldType.dtype)
        numerator[:] = value[0]
        denominator[:] = value[1]
        data = NP(numerator / denominator)
        # A zero denominator makes every row meaningless (division yields
        # inf/nan), so mark the whole column INVALID.
        if value[1] == 0:
            mask = NP("empty", howmany, dtype=defs.maskType)
            mask[:] = defs.INVALID
        else:
            mask = None
        return DataColumn(fieldType, data, mask)
Example #3
0
    def functionAverage(self, dataColumn, whereMask, groupSelection, getstate,
                        setstate):
        """Averages rows in a DataColumn, possibly with an SQL where mask and groupField.

        @type dataColumn: DataColumn
        @param dataColumn: The input data column.
        @type whereMask: 1d Numpy array of bool, or None
        @param whereMask: The result of the SQL where selection.
        @type groupSelection: 1d Numpy array of bool, or None.
        @param groupSelection: Rows corresponding to a particular value of the groupField.
        @type getstate: callable function
        @param getstate: Retrieve starting values from the DataTableState.
        @type setstate: callable function
        @param setstate: Store ending values to the DataTableState.
        @rtype: DataColumn
        @return: A column of averaged rows.
        """

        fieldType = FakeFieldType("double", "continuous")

        if dataColumn.fieldType.dataType not in ("integer", "float", "double"):
            raise defs.PmmlValidationError(
                "Aggregate function \"average\" requires a numeric input field: \"integer\", \"float\", \"double\""
            )

        # denominator holds 0.0/1.0 selection flags: a row counts toward the
        # average only if it is valid, passes the where mask, and belongs to
        # the selected group.  (logical_and writes its boolean result into
        # this float array in place.)
        denominator = NP("ones", len(dataColumn), dtype=fieldType.dtype)
        if dataColumn.mask is not None:
            NP("logical_and", denominator, NP(dataColumn.mask == defs.VALID),
               denominator)

        if whereMask is not None:
            NP("logical_and", denominator, whereMask, denominator)

        if groupSelection is not None:
            NP("logical_and", denominator, groupSelection, denominator)

        # numerator is the data with unselected rows zeroed out
        numerator = NP("multiply", denominator, dataColumn.data)

        # continue a previous calculation by folding the stored
        # (numerator, denominator) totals into the first row
        if getstate is not None and len(dataColumn) > 0:
            startingState = getstate()
            if startingState is not None:
                startingNumerator, startingDenominator = startingState
                numerator[0] += startingNumerator
                denominator[0] += startingDenominator

        # running average: cumulative sum of values over cumulative count
        numerator = NP("cumsum", numerator)
        denominator = NP("cumsum", denominator)

        data = NP(numerator / denominator)
        # rows before the first selected row divide by zero (inf/nan);
        # mark those rows INVALID
        mask = NP(NP("logical_not", NP("isfinite", data)) * defs.INVALID)
        if not mask.any():
            mask = None

        if setstate is not None and len(dataColumn) > 0:
            setstate((numerator[-1], denominator[-1]))

        return DataColumn(fieldType, data, mask)
Example #4
0
    def functionMultiset(self, dataColumn, whereMask, groupSelection, getstate,
                         setstate):
        """Derives a multiset of rows in a DataColumn, possibly with an SQL where mask and groupField.

        @type dataColumn: DataColumn
        @param dataColumn: The input data column.
        @type whereMask: 1d Numpy array of bool, or None
        @param whereMask: The result of the SQL where selection.
        @type groupSelection: 1d Numpy array of bool, or None.
        @param groupSelection: Rows corresponding to a particular value of the groupField.
        @type getstate: callable function
        @param getstate: Retrieve starting values from the DataTableState.
        @type setstate: callable function
        @param setstate: Store ending values to the DataTableState.
        @rtype: DataColumn of dict objects
        @return: A column of multisetted rows.
        """

        fieldType = FakeFieldType("object", "any")

        # selection flags the rows that contribute to the multiset:
        # valid, passing the where mask, and in the selected group
        selection = NP("ones", len(dataColumn), dtype=NP.dtype(bool))
        if dataColumn.mask is not None:
            selection = NP("logical_and", selection,
                           NP(dataColumn.mask == defs.VALID))

        if whereMask is not None:
            NP("logical_and", selection, whereMask, selection)

        if groupSelection is not None:
            NP("logical_and", selection, groupSelection, selection)

        # resume from a previously stored multiset, if any
        multiset = {}
        if getstate is not None:
            startingState = getstate()
            if startingState is not None:
                multiset = startingState
        current = dict(multiset)

        data = NP("empty", len(dataColumn), dtype=NP.dtype(object))

        # each output row holds a snapshot of the multiset after that row;
        # unselected rows repeat the previous snapshot.  A fresh dict copy is
        # taken after every update so rows do not alias each other.
        toPython = dataColumn.fieldType.valueToPython
        for i, x in enumerate(dataColumn.data):
            if selection[i]:
                value = toPython(x)
                if value not in multiset:
                    multiset[value] = 0
                multiset[value] += 1
                current = dict(multiset)
            data[i] = current

        if setstate is not None:
            setstate(multiset)

        return DataColumn(fieldType, data, None)
Example #5
0
    def functionSum(self, dataColumn, whereMask, groupSelection, getstate,
                    setstate):
        """Adds up rows in a DataColumn, possibly with an SQL where mask and groupField.

        @type dataColumn: DataColumn
        @param dataColumn: The input data column.
        @type whereMask: 1d Numpy array of bool, or None
        @param whereMask: The result of the SQL where selection.
        @type groupSelection: 1d Numpy array of bool, or None.
        @param groupSelection: Rows corresponding to a particular value of the groupField.
        @type getstate: callable function
        @param getstate: Retrieve starting values from the DataTableState.
        @type setstate: callable function
        @param setstate: Store ending values to the DataTableState.
        @rtype: DataColumn
        @return: A column of added rows.
        """

        fieldType = FakeFieldType("double", "continuous")

        if dataColumn.fieldType.dataType not in ("integer", "float", "double"):
            raise defs.PmmlValidationError(
                "Aggregate function \"sum\" requires a numeric input field: \"integer\", \"float\", \"double\""
            )

        # ones holds 0.0/1.0 selection flags: valid rows that pass the where
        # mask and the group selection.  (logical_and writes its boolean
        # result into this float array in place.)
        ones = NP("ones", len(dataColumn), dtype=fieldType.dtype)
        if dataColumn.mask is not None:
            NP("logical_and", ones, NP(dataColumn.mask == defs.VALID), ones)

        if whereMask is not None:
            NP("logical_and", ones, whereMask, ones)

        if groupSelection is not None:
            NP("logical_and", ones, groupSelection, ones)

        # zero out unselected rows, in place
        NP("multiply", ones, dataColumn.data, ones)

        # continue a previous calculation by folding the stored running
        # total into the first row
        if getstate is not None and len(dataColumn) > 0:
            startingState = getstate()
            if startingState is not None:
                ones[0] += startingState

        # running total over all rows
        data = NP("cumsum", ones)

        if setstate is not None and len(dataColumn) > 0:
            setstate(data[-1])

        return DataColumn(fieldType, data, None)
Example #6
0
    def functionMultisetFake(self, value, howmany, fieldType):
        """Derives a multiset of rows in a DataColumn when it is known that there are no matches.

        @type value: object
        @param value: Initial and final value; every output row is filled with it.
        @type howmany: int
        @param howmany: Number of rows.
        @type fieldType: FieldType
        @param fieldType: The type of field to emulate.
        @rtype: DataColumn
        @return: The faked results.
        """

        # The output is always an unmasked object-typed column whose rows
        # all hold the same value.
        outputType = FakeFieldType("object", "any")
        filled = NP("empty", howmany, dtype=outputType.dtype)
        filled[:] = value
        return DataColumn(outputType, filled, None)
Example #7
0
    class Constant(object):
        """Equivalent of a PMML <Constant> element: evaluates to the
        same value on every row."""

        def __init__(self, dataType, value):
            # a constant is always treated as a continuous field of the
            # given dataType
            self.fieldType = FakeFieldType(dataType, "continuous")
            self.value = value

        def evaluate(self, dataTable, functionTable, performanceTable):
            # broadcast the constant: one entry per DataTable row
            column = NP("empty", len(dataTable), dtype=self.fieldType.dtype)
            column[:] = self.value
            return self.fieldType.toDataColumn(column, None)

        def __repr__(self):
            return repr(self.value)

        def asPmml(self, E):
            # serialize back to a PMML <Constant> element
            return E.Constant(str(self.value), dataType=self.fieldType.dataType)
Example #8
0
    def emptyDataTable(self):
        """Construct an empty DataTable from the serialized DataTableFields and DataTableState.

        @rtype: DataTable
        @return: An empty DataTable, suitable for PmmlPlotContent.prepare.
        """

        inputState = self.unserializeState()

        # rebuild the field context (and empty per-field data lists) from
        # every "*.context" entry stored in the state
        context = {}
        inputData = {}
        for name, value in inputState.iteritems():
            if not name.endswith(".context"):
                continue
            for fieldName, (dataType, optype) in value.iteritems():
                context[fieldName] = FakeFieldType(dataType, optype)
                inputData[fieldName] = []

        return DataTable(context, inputData, inputState=inputState)
Example #9
0
    def functionCount(self, dataColumn, whereMask, groupSelection, getstate,
                      setstate):
        """Counts rows in a DataColumn, possibly with an SQL where mask and groupField.

        @type dataColumn: DataColumn
        @param dataColumn: The input data column.
        @type whereMask: 1d Numpy array of bool, or None
        @param whereMask: The result of the SQL where selection.
        @type groupSelection: 1d Numpy array of bool, or None.
        @param groupSelection: Rows corresponding to a particular value of the groupField.
        @type getstate: callable function
        @param getstate: Retrieve starting values from the DataTableState.
        @type setstate: callable function
        @param setstate: Store ending values to the DataTableState.
        @rtype: DataColumn
        @return: A column of counted rows.
        """

        fieldType = FakeFieldType("integer", "continuous")

        # ones holds 0/1 selection flags: valid rows that pass the where
        # mask and the group selection.  (logical_and writes its boolean
        # result into this integer array in place.)
        ones = NP("ones", len(dataColumn), dtype=fieldType.dtype)
        if dataColumn.mask is not None:
            NP("logical_and", ones, NP(dataColumn.mask == defs.VALID), ones)

        if whereMask is not None:
            NP("logical_and", ones, whereMask, ones)

        if groupSelection is not None:
            NP("logical_and", ones, groupSelection, ones)

        # continue a previous calculation by folding the stored count
        # into the first row
        if getstate is not None and len(dataColumn) > 0:
            startingState = getstate()
            if startingState is not None:
                ones[0] += startingState

        # running count of selected rows
        data = NP("cumsum", ones)

        if setstate is not None and len(dataColumn) > 0:
            setstate(data[-1])

        return DataColumn(fieldType, data, None)
Example #10
0
    def prepare(self, state, dataTable, functionTable, performanceTable,
                plotRange):
        """Prepare a plot element for drawing.

        All quantities are calculated and the data bounds are
        determined here.  Those bounds may later be unioned with the
        bounds of other plot elements overlaying this one, so drawing
        (which needs a finalized coordinate system) cannot start yet.

        This method modifies C{plotRange}.

        @type state: ad-hoc Python object
        @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
        @type dataTable: DataTable
        @param dataTable: Contains the data to plot.
        @type functionTable: FunctionTable
        @param functionTable: Defines functions that may be used to transform data for plotting.
        @type performanceTable: PerformanceTable
        @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process.
        @type plotRange: PlotRange
        @param plotRange: The bounding box of plot coordinates that this function will expand.
        """

        self._saveContext(dataTable)

        for directive in self.xpath("pmml:PlotLine"):
            # a PlotLine with any non-numeric endpoint is silently skipped
            try:
                x1, y1, x2, y2 = [float(directive[key])
                                  for key in ("x1", "y1", "x2", "y2")]
            except ValueError:
                continue

            # push both endpoints into the plot range (non-sticky: they can
            # be overridden by explicit range settings)
            fieldType = FakeFieldType("double", "continuous")
            plotRange.xminPush(x1, fieldType, sticky=False)
            plotRange.yminPush(y1, fieldType, sticky=False)
            plotRange.xmaxPush(x2, fieldType, sticky=False)
            plotRange.ymaxPush(y2, fieldType, sticky=False)
Example #11
0
class PlotCurve(PmmlPlotContent):
    """Represents a curve defined by mathematical formulae or a jagged
    line/smooth curve through a set of data points.

    PMML subelements for a 1d formula:

      - PlotFormula role="y(x)"
      - PlotFormula role="dy/dx" (optional)

    PMML subelements for a parametric formula:

      - PlotFormula role="x(t)"
      - PlotFormula role="y(t)"
      - PlotFormula role="dx/dt" (optional)
      - PlotFormula role="dy/dt" (optional)

    PMML subelements for a fit to data points:

      - PlotNumericExpression role="x"
      - PlotNumericExpression role="y"
      - PlotNumericExpression role="dx" (optional)
      - PlotNumericExpression role="dy" (optional)
      - PlotSelection (optional)

    PMML attributes:

      - svgId: id for the resulting SVG element.
      - stateId: key for persistent storage in a DataTableState.
      - low: low edge of domain (in x or t) for mathematical
        formulae.
      - high: high edge of domain (in x or t) for mathematical
        formulae.
      - numSamples: number of locations to sample for mathematical
        formulae.
      - samplingMethod: "uniform", "random", or "adaptive".
      - loop: if "true", draw a closed loop that connects the first
        and last points.
      - smooth: if "false", draw a jagged line between each data
        point; if "true", fit a smooth curve.
      - smoothingScale: size of the smoothing scale in units of the
        domain (in x or t).
      - style: CSS style properties.

    CSS properties:
      - fill, fill-opacity: color under the curve.
      - stroke, stroke-dasharray, stroke-dashoffset, stroke-linecap,
        stroke-linejoin, stroke-miterlimit, stroke-opacity,
        stroke-width: properties of the line drawing.

    See the source code for the full XSD.
    """

    # CSS style properties that this plot element recognizes in its
    # "style" attribute.
    styleProperties = [
        "fill",
        "fill-opacity",
        "stroke",
        "stroke-dasharray",
        "stroke-dashoffset",
        "stroke-linecap",
        "stroke-linejoin",
        "stroke-miterlimit",
        "stroke-opacity",
        "stroke-width",
    ]

    # defaults applied when the "style" attribute omits these properties
    styleDefaults = {"fill": "none", "stroke": "black"}

    # XSD for validating <PlotCurve> elements; the default style string is
    # interpolated into the "style" attribute's default below.
    xsd = """<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">
    <xs:element name="PlotCurve">
        <xs:complexType>
            <xs:sequence>
                <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded" />
                <xs:choice minOccurs="1" maxOccurs="1">
                    <xs:element ref="PlotFormula" minOccurs="1" maxOccurs="4" />
                    <xs:sequence>
                        <xs:element ref="PlotNumericExpression" minOccurs="1" maxOccurs="4" />
                        <xs:element ref="PlotSelection" minOccurs="0" maxOccurs="1" />
                    </xs:sequence>
                </xs:choice>
            </xs:sequence>
            <xs:attribute name="svgId" type="xs:string" use="optional" />
            <xs:attribute name="stateId" type="xs:string" use="optional" />
            <xs:attribute name="low" type="xs:double" use="optional" />
            <xs:attribute name="high" type="xs:double" use="optional" />
            <xs:attribute name="numSamples" type="xs:positiveInteger" use="optional" default="100" />
            <xs:attribute name="samplingMethod" use="optional" default="uniform">
                <xs:simpleType>
                    <xs:restriction base="xs:string">
                        <xs:enumeration value="uniform" />
                        <xs:enumeration value="random" />
                        <xs:enumeration value="adaptive" />
                    </xs:restriction>
                </xs:simpleType>
            </xs:attribute>
            <xs:attribute name="loop" type="xs:boolean" use="optional" default="false" />
            <xs:attribute name="smooth" type="xs:boolean" use="optional" default="true" />
            <xs:attribute name="smoothingScale" type="xs:double" use="optional" default="1.0" />
            <xs:attribute name="style" type="xs:string" use="optional" default="%s" />
        </xs:complexType>
    </xs:element>
</xs:schema>
""" % PlotStyle.toString(styleDefaults)

    # FieldType used for the sampled x axis of 1d formulae y(x)
    # (see expressionsToPoints)
    xfieldType = FakeFieldType("double", "continuous")

    @classmethod
    def expressionsToPoints(cls, expression, derivative, samples, loop,
                            functionTable, performanceTable):
        """Evaluate a set of given string-based formulae to generate
        numeric points.

        This is used to plot mathematical curves.

        @type expression: 1- or 2-tuple of strings
        @param expression: If a 1-tuple, the string is passed to Formula and interpreted as y(x); if a 2-tuple, the strings are passed to Formula and interpreted as x(t), y(t).
        @type derivative: 1- or 2-tuple of strings (same length as C{expression})
        @param derivative: Strings are passed to Formula and interpreted as dy/dx (if a 1-tuple) or dx/dt, dy/dt (if a 2-tuple).
        @type samples: 1d Numpy array
        @param samples: Values of x or t at which to evaluate the expression or expressions.
        @type loop: bool
        @param loop: If False, disconnect the end of the set of points from the beginning.
        @type functionTable: FunctionTable
        @param functionTable: Functions that may be used to perform the calculation.
        @type performanceTable: PerformanceTable
        @param performanceTable: Measures and records performance (time and memory consumption) of the process.
        @rtype: 6-tuple
        @return: C{xlist}, C{ylist}, C{dxlist}, C{dylist} (1d Numpy arrays), xfieldType, yfieldType (FieldTypes).

        NOTE(review): if C{len(expression)} is neither 1 nor 2, none of the
        returned names are bound and the final return raises NameError;
        presumably callers guarantee a 1- or 2-tuple — confirm.
        """

        if len(expression) == 1:
            # 1d case: interpret expression[0] as y(x) over the sample grid
            sampleTable = DataTable({"x": "double"}, {"x": samples})

            parsed = Formula.parse(expression[0])
            ydataColumn = parsed.evaluate(sampleTable, functionTable,
                                          performanceTable)
            if not ydataColumn.fieldType.isnumeric(
            ) and not ydataColumn.fieldType.istemporal():
                raise defs.PmmlValidationError(
                    "PlotFormula y(x) must return a numeric expression, not %r"
                    % ydataColumn.fieldType)

            xfieldType = cls.xfieldType
            yfieldType = ydataColumn.fieldType

            # selection keeps only the rows marked VALID
            selection = None
            if ydataColumn.mask is not None:
                selection = NP(ydataColumn.mask == defs.VALID)

            if derivative[0] is None:
                # no analytic derivative: estimate dx and dy with central
                # differences of neighboring points (roll wraps at the ends)
                if selection is None:
                    xlist = samples
                    ylist = ydataColumn.data
                else:
                    xlist = samples[selection]
                    ylist = ydataColumn.data[selection]

                dxlist = NP(
                    (NP("roll", xlist, -1) - NP("roll", xlist, 1)) / 2.0)
                dylist = NP(
                    (NP("roll", ylist, -1) - NP("roll", ylist, 1)) / 2.0)
                if not loop:
                    # open curve: the wrap-around differences at the
                    # endpoints are meaningless, so zero them
                    dxlist[0] = 0.0
                    dxlist[-1] = 0.0
                    dylist[0] = 0.0
                    dylist[-1] = 0.0

            else:
                # analytic derivative supplied: evaluate dy/dx directly
                parsed = Formula.parse(derivative[0])
                dydataColumn = parsed.evaluate(sampleTable, functionTable,
                                               performanceTable)
                if not dydataColumn.fieldType.isnumeric(
                ) and not dydataColumn.fieldType.istemporal():
                    raise defs.PmmlValidationError(
                        "PlotFormula dy/dx must return a numeric expression, not %r"
                        % dydataColumn.fieldType)

                if dydataColumn.mask is not None:
                    if selection is None:
                        selection = NP(dydataColumn.mask == defs.VALID)
                    else:
                        NP("logical_and", selection,
                           NP(dydataColumn.mask == defs.VALID), selection)

                if selection is None:
                    xlist = samples
                    ylist = ydataColumn.data
                    dxlist = NP(
                        (NP("roll", xlist, -1) - NP("roll", xlist, 1)) / 2.0)
                    dylist = dydataColumn.data
                else:
                    xlist = samples[selection]
                    ylist = ydataColumn.data[selection]
                    dxlist = NP(
                        (NP("roll", xlist, -1) - NP("roll", xlist, 1)) / 2.0)
                    # scale dy/dx by the step dx to get a per-step dy
                    dylist = NP(dydataColumn.data[selection] * dxlist)

                if not loop:
                    dxlist[0] = 0.0
                    dxlist[-1] = 0.0
                    dylist[0] = 0.0
                    dylist[-1] = 0.0

        elif len(expression) == 2:
            # parametric case: interpret the pair as x(t), y(t) over the
            # sample grid
            sampleTable = DataTable({"t": "double"}, {"t": samples})

            parsed = Formula.parse(expression[0])
            xdataColumn = parsed.evaluate(sampleTable, functionTable,
                                          performanceTable)
            if not xdataColumn.fieldType.isnumeric(
            ) and not xdataColumn.fieldType.istemporal():
                raise defs.PmmlValidationError(
                    "PlotFormula x(t) must return a numeric expression, not %r"
                    % xdataColumn.fieldType)

            parsed = Formula.parse(expression[1])
            ydataColumn = parsed.evaluate(sampleTable, functionTable,
                                          performanceTable)
            if not ydataColumn.fieldType.isnumeric(
            ) and not ydataColumn.fieldType.istemporal():
                raise defs.PmmlValidationError(
                    "PlotFormula y(t) must return a numeric expression, not %r"
                    % ydataColumn.fieldType)

            xfieldType = xdataColumn.fieldType
            yfieldType = ydataColumn.fieldType

            # selection keeps only the rows valid in BOTH x(t) and y(t)
            selection = None
            if xdataColumn.mask is not None:
                selection = NP(xdataColumn.mask == defs.VALID)
            if ydataColumn.mask is not None:
                if selection is None:
                    selection = NP(ydataColumn.mask == defs.VALID)
                else:
                    NP("logical_and", selection,
                       NP(ydataColumn.mask == defs.VALID), selection)

            # NOTE(review): only derivative[0] is tested for None here;
            # derivative[1] is assumed to be given whenever derivative[0]
            # is — confirm with callers.
            if derivative[0] is None:
                # no analytic derivatives: central differences of neighbors
                if selection is None:
                    xlist = xdataColumn.data
                    ylist = ydataColumn.data
                else:
                    xlist = xdataColumn.data[selection]
                    ylist = ydataColumn.data[selection]

                dxlist = NP(
                    (NP("roll", xlist, -1) - NP("roll", xlist, 1)) / 2.0)
                dylist = NP(
                    (NP("roll", ylist, -1) - NP("roll", ylist, 1)) / 2.0)
                if not loop:
                    dxlist[0] = 0.0
                    dxlist[-1] = 0.0
                    dylist[0] = 0.0
                    dylist[-1] = 0.0

            else:
                # analytic derivatives supplied: evaluate dx/dt and dy/dt
                parsed = Formula.parse(derivative[0])
                dxdataColumn = parsed.evaluate(sampleTable, functionTable,
                                               performanceTable)
                if not dxdataColumn.fieldType.isnumeric(
                ) and not dxdataColumn.fieldType.istemporal():
                    raise defs.PmmlValidationError(
                        "PlotFormula dx/dt must return a numeric expression, not %r"
                        % dxdataColumn.fieldType)

                parsed = Formula.parse(derivative[1])
                dydataColumn = parsed.evaluate(sampleTable, functionTable,
                                               performanceTable)
                if not dydataColumn.fieldType.isnumeric(
                ) and not dydataColumn.fieldType.istemporal():
                    raise defs.PmmlValidationError(
                        "PlotFormula dy/dt must return a numeric expression, not %r"
                        % dydataColumn.fieldType)

                if dxdataColumn.mask is not None:
                    if selection is None:
                        selection = NP(dxdataColumn.mask == defs.VALID)
                    else:
                        NP("logical_and", selection,
                           NP(dxdataColumn.mask == defs.VALID), selection)

                if dydataColumn.mask is not None:
                    if selection is None:
                        selection = NP(dydataColumn.mask == defs.VALID)
                    else:
                        NP("logical_and", selection,
                           NP(dydataColumn.mask == defs.VALID), selection)

                if selection is None:
                    # dt from central differences of the sample grid
                    dt = NP(
                        (NP("roll", samples, -1) - NP("roll", samples, 1)) /
                        2.0)

                    xlist = xdataColumn.data
                    ylist = ydataColumn.data
                    # scale the rates by dt to get per-step displacements
                    dxlist = NP(dxdataColumn.data * dt)
                    dylist = NP(dydataColumn.data * dt)
                else:
                    dt = NP((NP("roll", samples[selection], -1) -
                             NP("roll", samples[selection], 1)) / 2.0)

                    xlist = xdataColumn.data[selection]
                    ylist = ydataColumn.data[selection]
                    dxlist = NP(dxdataColumn.data[selection] * dt)
                    dylist = NP(dydataColumn.data[selection] * dt)

                if not loop:
                    dxlist[0] = 0.0
                    dxlist[-1] = 0.0
                    dylist[0] = 0.0
                    dylist[-1] = 0.0

        return xlist, ylist, dxlist, dylist, xfieldType, yfieldType

    @staticmethod
    def pointsToSmoothCurve(xarray, yarray, samples, smoothingScale, loop):
        """Fit a smooth line through a set of given numeric points
        with a characteristic smoothing scale.

        This is a non-parametric locally linear fit, used to plot data
        as a smooth line.

        @type xarray: 1d Numpy array of numbers
        @param xarray: Array of x values.
        @type yarray: 1d Numpy array of numbers
        @param yarray: Array of y values.
        @type samples: 1d Numpy array of numbers
        @param samples: Locations at which to fit the C{xarray} and C{yarray} with best-fit positions and derivatives.
        @type smoothingScale: number
        @param smoothingScale: Standard deviation of the Gaussian kernel used to smooth the locally linear fit.
        @type loop: bool
        @param loop: If False, disconnect the end of the fitted curve from the beginning.
        @rtype: 4-tuple of 1d Numpy arrays
        @return: C{xlist}, C{ylist}, C{dxlist}, C{dylist} appropriate for C{formatPathdata}.
        """

        ylist = []
        dylist = []

        for sample in samples:
            # Gaussian kernel weights centered on this sample:
            # exp(-0.5*(x - sample)^2 / scale^2) / (scale * sqrt(2*pi))
            weights = NP(
                NP(
                    NP(
                        "exp",
                        NP(
                            NP(-0.5 * NP("power", NP(xarray - sample), 2)) /
                            NP(smoothingScale * smoothingScale))) /
                    smoothingScale) / (math.sqrt(2.0 * math.pi)))
            # weighted sums for a linear least-squares fit y = a + b*x
            sum1 = weights.sum()
            sumx = NP(weights * xarray).sum()
            sumxx = NP(weights * NP(xarray * xarray)).sum()
            sumy = NP(weights * yarray).sum()
            sumxy = NP(weights * NP(xarray * yarray)).sum()

            # standard weighted linear regression solution for intercept
            # and slope.  NOTE(review): delta == 0 (e.g. all weight at a
            # single x) would divide by zero here — confirm inputs exclude
            # that case.
            delta = (sum1 * sumxx) - (sumx * sumx)
            intercept = ((sumxx * sumy) - (sumx * sumxy)) / delta
            slope = ((sum1 * sumxy) - (sumx * sumy)) / delta

            # best-fit position and derivative at this sample point
            ylist.append(intercept + (sample * slope))
            dylist.append(slope)

        xlist = samples
        ylist = NP("array", ylist, dtype=NP.dtype(float))
        # dx from central differences of the sample grid; dy scaled by dx
        dxlist = NP((NP("roll", xlist, -1) - NP("roll", xlist, 1)) / 2.0)
        dylist = NP("array", dylist, dtype=NP.dtype(float)) * dxlist
        if not loop:
            # open curve: zero the wrap-around endpoint derivatives
            dxlist[0] = 0.0
            dxlist[-1] = 0.0
            dylist[0] = 0.0
            dylist[-1] = 0.0

        return xlist, ylist, dxlist, dylist

    @staticmethod
    def formatPathdata(xlist, ylist, dxlist, dylist, plotCoordinates, loop,
                       smooth):
        """Compute SVG path data from position and derivatives lists.

        @type xlist: 1d Numpy array of numbers
        @param xlist: Array of x values at each point t.
        @type ylist: 1d Numpy array of numbers
        @param ylist: Array of y values at each point t.
        @type dxlist: 1d Numpy array of numbers
        @param dxlist: Array of dx/dt derivatives at each point t.
        @type dylist: 1d Numpy array of numbers
        @param dylist: Array of dy/dt derivatives at each point t.
        @type plotCoordinates: PlotCoordinates
        @param plotCoordinates: Coordinate system to convert the points.
        @type loop: bool
        @param loop: If True, the last point should be connected to the first point.
        @type smooth: bool
        @param smooth: If True, use the derivatives (C{dxlist} and C{dylist}) to define Bezier curves between the points; otherwise, draw straight lines.
        @rtype: list of strings
        @return: When concatenated with spaces, the return type is appropriate for an SVG path's C{d} attribute.
        """

        pathdata = []

        if smooth:
            # Each cubic Bezier segment runs from the previous point to the
            # current one, so the first control point comes from the rolled
            # (previous) positions/derivatives; derivatives are divided by 3
            # per the standard Hermite-to-Bezier conversion.
            c1x = NP("roll", xlist, 1) + NP("roll", dxlist, 1) / 3.0
            c1y = NP("roll", ylist, 1) + NP("roll", dylist, 1) / 3.0
            c2x = xlist - dxlist / 3.0
            c2y = ylist - dylist / 3.0

            X, Y = plotCoordinates(xlist, ylist)
            C1X, C1Y = plotCoordinates(c1x, c1y)
            C2X, C2Y = plotCoordinates(c2x, c2y)

            first = True
            for x, y, p1x, p1y, p2x, p2y in itertools.izip(
                    X, Y, C1X, C1Y, C2X, C2Y):
                if first:
                    pathdata.append("M %r %r" % (x, y))
                    first = False
                else:
                    pathdata.append("C %r %r %r %r %r %r" %
                                    (p1x, p1y, p2x, p2y, x, y))

        else:
            # jagged line: moveto the first point, lineto each of the rest
            X, Y = plotCoordinates(xlist, ylist)

            first = True
            for x, y in itertools.izip(X, Y):
                if first:
                    pathdata.append("M %r %r" % (x, y))
                    first = False
                else:
                    pathdata.append("L %r %r" % (x, y))

        # close the path back to the first point when looping
        if loop:
            pathdata.append("Z")

        return pathdata

    def generateSamples(self, low, high):
        """Used by C{prepare} to generate an array of samples.

        @type low: number
        @param low: Minimum value to sample.
        @type high: number
        @param high: Maximum value to sample.
        @rtype: 1d Numpy array
        @return: An array of uniform, random, or adaptive samples of an interval.
        """

        howMany = self.get("numSamples", defaultFromXsd=True, convertType=True)
        method = self.get("samplingMethod", defaultFromXsd=True)

        if method == "random":
            # Uniformly random draws rescaled onto [low, high), returned
            # in ascending order.
            drawn = NP(NP(NP(NP.random.rand(howMany)) * (high - low)) + low)
            drawn.sort()
            return drawn

        if method == "uniform":
            # Evenly spaced samples, including both endpoints.
            return NP("linspace", low, high, howMany, endpoint=True)

        raise NotImplementedError("TODO: add 'adaptive'")

    def prepare(self, state, dataTable, functionTable, performanceTable,
                plotRange):
        """Prepare a plot element for drawing.

        This stage consists of calculating all quantities and
        determining the bounds of the data.  These bounds may be unioned
        with bounds from other plot elements that overlay this plot
        element, so the drawing (which requires a finalized coordinate
        system) cannot begin yet.

        This method modifies C{plotRange}.

        @type state: ad-hoc Python object
        @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
        @type dataTable: DataTable
        @param dataTable: Contains the data to plot.
        @type functionTable: FunctionTable
        @param functionTable: Defines functions that may be used to transform data for plotting.
        @type performanceTable: PerformanceTable
        @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process.
        @type plotRange: PlotRange
        @param plotRange: The bounding box of plot coordinates that this function will expand.
        """

        self.checkRoles([
            "y(x)", "dy/dx", "x(t)", "y(t)", "dx/dt", "dy/dt", "x", "y", "dx",
            "dy"
        ])

        performanceTable.begin("PlotCurve prepare")
        self._saveContext(dataTable)

        # A curve may be defined by mathematical formulae (PlotFormula) or
        # by data expressions (PlotNumericExpression); gather both kinds
        # and decide below which branch applies.
        yofx = self.xpath("pmml:PlotFormula[@role='y(x)']")
        dydx = self.xpath("pmml:PlotFormula[@role='dy/dx']")

        xoft = self.xpath("pmml:PlotFormula[@role='x(t)']")
        yoft = self.xpath("pmml:PlotFormula[@role='y(t)']")
        dxdt = self.xpath("pmml:PlotFormula[@role='dx/dt']")
        dydt = self.xpath("pmml:PlotFormula[@role='dy/dt']")

        nx = self.xpath("pmml:PlotNumericExpression[@role='x']")
        ny = self.xpath("pmml:PlotNumericExpression[@role='y']")
        ndx = self.xpath("pmml:PlotNumericExpression[@role='dx']")
        ndy = self.xpath("pmml:PlotNumericExpression[@role='dy']")
        cutExpression = self.xpath("pmml:PlotSelection")

        # Formula-based curve: any PlotFormula at all selects this branch.
        if len(yofx) + len(dydx) + len(xoft) + len(yoft) + len(dxdt) + len(
                dydt) > 0:
            # Only four role combinations are legal; "expression" and
            # "derivative" are 1-tuples for y(x) and 2-tuples for
            # parametric x(t), y(t).
            if len(yofx) == 1 and len(dydx) == 0 and len(xoft) == 0 and len(
                    yoft) == 0 and len(dxdt) == 0 and len(dydt) == 0:
                expression = (yofx[0].text, )
                derivative = (None, )

            elif len(yofx) == 1 and len(dydx) == 1 and len(xoft) == 0 and len(
                    yoft) == 0 and len(dxdt) == 0 and len(dydt) == 0:
                expression = (yofx[0].text, )
                derivative = (dydx[0].text, )

            elif len(yofx) == 0 and len(dydx) == 0 and len(xoft) == 1 and len(
                    yoft) == 1 and len(dxdt) == 0 and len(dydt) == 0:
                expression = xoft[0].text, yoft[0].text
                derivative = None, None

            elif len(yofx) == 0 and len(dydx) == 0 and len(xoft) == 1 and len(
                    yoft) == 1 and len(dxdt) == 1 and len(dydt) == 1:
                expression = xoft[0].text, yoft[0].text
                derivative = dxdt[0].text, dydt[0].text

            else:
                raise defs.PmmlValidationError(
                    "The only allowed combinations of PlotFormulae are: \"y(x)\", \"y(x) dy/dx\", \"x(t) y(t)\", and \"x(t) y(t) dx/dt dy/dt\""
                )

            # Formulae have no data to infer a range from, so the sampling
            # interval must be given explicitly.
            low = self.get("low", convertType=True)
            high = self.get("high", convertType=True)
            if low is None or high is None:
                raise defs.PmmlValidationError(
                    "The \"low\" and \"high\" attributes are required for PlotCurves defined by formulae"
                )

            samples = self.generateSamples(low, high)

            loop = self.get("loop", defaultFromXsd=True, convertType=True)
            state.x, state.y, state.dx, state.dy, xfieldType, yfieldType = self.expressionsToPoints(
                expression, derivative, samples, loop, functionTable,
                performanceTable)

        else:
            # Data-based curve: evaluate expressions against the DataTable.
            # The pause/unpause pairs keep time spent inside the evaluated
            # expressions from being charged to "PlotCurve prepare".
            performanceTable.pause("PlotCurve prepare")
            if len(ndx) == 1:
                dxdataColumn = ndx[0].evaluate(dataTable, functionTable,
                                               performanceTable)
            else:
                dxdataColumn = None
            if len(ndy) == 1:
                dydataColumn = ndy[0].evaluate(dataTable, functionTable,
                                               performanceTable)
            else:
                dydataColumn = None
            performanceTable.unpause("PlotCurve prepare")

            if len(nx) == 0 and len(ny) == 1:
                performanceTable.pause("PlotCurve prepare")
                ydataColumn = ny[0].evaluate(dataTable, functionTable,
                                             performanceTable)
                performanceTable.unpause("PlotCurve prepare")

                # Without a PlotSelection, all rows are selected.
                if len(cutExpression) == 1:
                    performanceTable.pause("PlotCurve prepare")
                    selection = cutExpression[0].select(
                        dataTable, functionTable, performanceTable)
                    performanceTable.unpause("PlotCurve prepare")
                else:
                    selection = NP("ones", len(ydataColumn.data),
                                   NP.dtype(bool))

                # Restrict the selection to rows whose masks are VALID.
                # NOTE(review): the trailing "selection" argument is
                # presumably NP's in-place output buffer -- confirm
                # against NP's calling convention.
                if ydataColumn.mask is not None:
                    selection = NP("logical_and", selection,
                                   NP(ydataColumn.mask == defs.VALID),
                                   selection)
                if dxdataColumn is not None and dxdataColumn.mask is not None:
                    selection = NP("logical_and", selection,
                                   NP(dxdataColumn.mask == defs.VALID),
                                   selection)
                if dydataColumn is not None and dydataColumn.mask is not None:
                    selection = NP("logical_and", selection,
                                   NP(dydataColumn.mask == defs.VALID),
                                   selection)

                yarray = ydataColumn.data[selection]

                # With only "y" given, x is implicitly the row index:
                # ones with the first element zeroed, cumulatively summed,
                # yields 0, 1, 2, ... over the selected rows.
                xarray = NP("ones", len(yarray), dtype=NP.dtype(float))
                xarray[0] = 0.0
                xarray = NP("cumsum", xarray)

                dxarray, dyarray = None, None
                if dxdataColumn is not None:
                    dxarray = dxdataColumn.data[selection]
                if dydataColumn is not None:
                    dyarray = dydataColumn.data[selection]

                xfieldType = self.xfieldType
                yfieldType = ydataColumn.fieldType

            elif len(nx) == 1 and len(ny) == 1:
                performanceTable.pause("PlotCurve prepare")
                xdataColumn = nx[0].evaluate(dataTable, functionTable,
                                             performanceTable)
                ydataColumn = ny[0].evaluate(dataTable, functionTable,
                                             performanceTable)
                performanceTable.unpause("PlotCurve prepare")

                if len(cutExpression) == 1:
                    performanceTable.pause("PlotCurve prepare")
                    selection = cutExpression[0].select(
                        dataTable, functionTable, performanceTable)
                    performanceTable.unpause("PlotCurve prepare")
                else:
                    selection = NP("ones", len(ydataColumn.data),
                                   NP.dtype(bool))

                # As above: keep only rows that are VALID in every column
                # that participates in the plot.
                if xdataColumn.mask is not None:
                    selection = NP("logical_and", selection,
                                   NP(xdataColumn.mask == defs.VALID),
                                   selection)
                if ydataColumn.mask is not None:
                    selection = NP("logical_and", selection,
                                   NP(ydataColumn.mask == defs.VALID),
                                   selection)
                if dxdataColumn is not None and dxdataColumn.mask is not None:
                    selection = NP("logical_and", selection,
                                   NP(dxdataColumn.mask == defs.VALID),
                                   selection)
                if dydataColumn is not None and dydataColumn.mask is not None:
                    selection = NP("logical_and", selection,
                                   NP(dydataColumn.mask == defs.VALID),
                                   selection)

                xarray = xdataColumn.data[selection]
                yarray = ydataColumn.data[selection]

                dxarray, dyarray = None, None
                if dxdataColumn is not None:
                    dxarray = dxdataColumn.data[selection]
                if dydataColumn is not None:
                    dyarray = dydataColumn.data[selection]

                xfieldType = xdataColumn.fieldType
                yfieldType = ydataColumn.fieldType

            else:
                raise defs.PmmlValidationError(
                    "The only allowed combinations of PlotNumericExpressions are: \"y(x)\" and \"x(t) y(t)\""
                )

            # With a stateId, points accumulate across successive
            # calculations (event-based plotting): previously stored points
            # are prepended and the combined arrays are stored back.
            persistentState = {}
            stateId = self.get("stateId")
            if stateId is not None:
                if stateId in dataTable.state:
                    persistentState = dataTable.state[stateId]
                    xarray = NP("concatenate", [xarray, persistentState["x"]])
                    yarray = NP("concatenate", [yarray, persistentState["y"]])
                    if dxarray is not None:
                        dxarray = NP("concatenate",
                                     [dxarray, persistentState["dx"]])
                    if dyarray is not None:
                        dyarray = NP("concatenate",
                                     [dyarray, persistentState["dy"]])
                else:
                    dataTable.state[stateId] = persistentState

            persistentState["x"] = xarray
            persistentState["y"] = yarray
            if dxarray is not None:
                persistentState["dx"] = dxarray
            if dyarray is not None:
                persistentState["dy"] = dyarray

            smooth = self.get("smooth", defaultFromXsd=True, convertType=True)
            if not smooth:
                # Without a given dx, convert per-point dy/dt derivatives
                # into per-segment differentials using centered differences
                # of x (half the distance between each point's neighbors).
                if dyarray is not None and dxarray is None:
                    dxarray = NP(
                        (NP("roll", xarray, -1) - NP("roll", xarray, 1)) / 2.0)
                    dyarray = dyarray * dxarray

                loop = self.get("loop", defaultFromXsd=True, convertType=True)
                # An open curve has no meaningful derivative beyond its
                # endpoints; zero them unless the curve closes on itself.
                if dxarray is not None and not loop:
                    dxarray[0] = 0.0
                    dxarray[-1] = 0.0
                if dyarray is not None and not loop:
                    dyarray[0] = 0.0
                    dyarray[-1] = 0.0

                state.x = xarray
                state.y = yarray
                state.dx = dxarray
                state.dy = dyarray

            else:
                # Smooth curve: replace the raw points with a smoothed
                # curve evaluated at generated sample positions.
                smoothingScale = self.get("smoothingScale",
                                          defaultFromXsd=True,
                                          convertType=True)
                loop = self.get("loop", defaultFromXsd=True, convertType=True)

                samples = self.generateSamples(xarray.min(), xarray.max())
                state.x, state.y, state.dx, state.dy = self.pointsToSmoothCurve(
                    xarray, yarray, samples, smoothingScale, loop)

        # Grow the shared bounding box to include this curve.
        if plotRange is not None:
            plotRange.expand(state.x, state.y, xfieldType, yfieldType)

        performanceTable.end("PlotCurve prepare")

    def draw(self, state, plotCoordinates, plotDefinitions, performanceTable):
        """Draw the plot element.

        This stage consists of creating an SVG image of the
        pre-computed data.

        @type state: ad-hoc Python object
        @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
        @type plotCoordinates: PlotCoordinates
        @param plotCoordinates: The coordinate system in which this plot element will be placed.
        @type plotDefinitions: PlotDefinitions
        @param plotDefinitions: The dictionary of key-value pairs that forms the <defs> section of the SVG document.
        @type performanceTable: PerformanceTable
        @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process.
        @rtype: SvgBinding
        @return: An SVG fragment representing the fully drawn plot element.
        """

        svg = SvgBinding.elementMaker
        performanceTable.begin("PlotCurve draw")

        loop = self.get("loop", defaultFromXsd=True, convertType=True)
        # Bezier smoothing is possible only when both derivative arrays
        # were computed in the prepare stage.
        smooth = state.dx is not None and state.dy is not None
        pathdata = self.formatPathdata(state.x, state.y, state.dx, state.dy,
                                       plotCoordinates, loop, smooth)

        output = svg.g()

        # Split the style state into the pen (stroke*) and fill (fill*)
        # halves; the fill pass must not also draw an outline.
        strokeStyle = {}
        fillStyle = {}
        style = self.getStyleState()
        for styleKey, styleValue in style.items():
            if styleKey.startswith("stroke"):
                strokeStyle[styleKey] = styleValue
            elif styleKey.startswith("fill"):
                fillStyle[styleKey] = styleValue
        fillStyle["stroke"] = "none"

        if style["fill"] != "none":
            isFunction = len(self.xpath("pmml:PlotFormula[@role='y(x)']")) > 0
            if isFunction and len(pathdata) > 1:
                # For a y(x) formula, close the filled region against the
                # x axis (y = 0) instead of against the curve itself.
                firstPoint = plotCoordinates(state.x[0], 0.0)
                lastPoint = plotCoordinates(state.x[-1], 0.0)
                X0, Y0 = plotCoordinates(state.x[0], state.y[0])

                filled = ["M %r %r" % firstPoint, "L %r %r" % (X0, Y0)]
                filled.extend(pathdata[1:])
                filled.append("L %r %r" % lastPoint)
            else:
                filled = pathdata

            output.append(
                svg.path(d=" ".join(filled),
                         style=PlotStyle.toString(fillStyle)))

        # The stroked curve is drawn on top of any fill.
        output.append(
            svg.path(d=" ".join(pathdata),
                     style=PlotStyle.toString(strokeStyle)))

        svgId = self.get("svgId")
        if svgId is not None:
            output["id"] = svgId

        performanceTable.end("PlotCurve draw")
        return output
Example #12
0
class PlotSvgContent(PmmlPlotContent):
    """PlotSvgContent represents an SVG image embedded in a coordinate
    system.

    PMML subelements:

      - SvgBinding for inline SVG.

    PMML attributes:

      - svgId: id for the resulting SVG element.
      - fileName: for external SVG.
      - x1: left edge.
      - y1: bottom edge.
      - x2: right edge.
      - y2: top edge.

    Inline and external SVG are mutually exclusive.

    See the source code for the full XSD.
    """

    xsd = """<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">
    <xs:element name="PlotSvgContent">
        <xs:complexType>
            <xs:complexContent>
                <xs:restriction base="xs:anyType">
                    <xs:sequence>
                        <xs:any minOccurs="0" maxOccurs="1" processContents="skip" />
                    </xs:sequence>
                    <xs:attribute name="svgId" type="xs:string" use="optional" />
                    <xs:attribute name="fileName" type="xs:string" use="optional" />
                    <xs:attribute name="x1" type="xs:double" use="required" />
                    <xs:attribute name="y1" type="xs:double" use="required" />
                    <xs:attribute name="x2" type="xs:double" use="required" />
                    <xs:attribute name="y2" type="xs:double" use="required" />
                </xs:restriction>
            </xs:complexContent>
        </xs:complexType>
    </xs:element>
</xs:schema>
"""

    # FieldType used when pushing the numeric image edges into a PlotRange.
    fieldTypeNumeric = FakeFieldType("double", "continuous")

    def prepare(self, state, dataTable, functionName, performanceTable,
                plotRange):
        """Prepare a plot element for drawing.

        This stage consists of calculating all quantities and
        determining the bounds of the data.  These bounds may be unioned
        with bounds from other plot elements that overlay this plot
        element, so the drawing (which requires a finalized coordinate
        system) cannot begin yet.

        This method modifies C{plotRange}.

        @type state: ad-hoc Python object
        @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
        @type dataTable: DataTable
        @param dataTable: Contains the data to plot.
        @type functionName: FunctionTable
        @param functionName: Defines functions that may be used to transform data for plotting.  NOTE(review): sibling C{prepare} methods name this parameter C{functionTable}; the name here is presumably a misnomer -- confirm callers pass positionally before renaming.
        @type performanceTable: PerformanceTable
        @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process.
        @type plotRange: PlotRange
        @param plotRange: The bounding box of plot coordinates that this function will expand.
        """

        self._saveContext(dataTable)

        # The image's target rectangle in plot coordinates.
        x1 = float(self["x1"])
        y1 = float(self["y1"])
        x2 = float(self["x2"])
        y2 = float(self["y2"])

        if x1 >= x2 or y1 >= y2:
            raise defs.PmmlValidationError(
                "x1 must be less than x2 and y1 must be less than y2")

        # The embedded image is placed with a single affine transform
        # (see draw), which is only correct on linear axes.
        if plotRange.xStrictlyPositive or plotRange.yStrictlyPositive:
            raise defs.PmmlValidationError(
                "PlotSvgContent can only be properly displayed in linear coordinates"
            )

        # Sticky pushes force the final range to include the image edges.
        plotRange.xminPush(x1, self.fieldTypeNumeric, sticky=True)
        plotRange.yminPush(y1, self.fieldTypeNumeric, sticky=True)
        plotRange.xmaxPush(x2, self.fieldTypeNumeric, sticky=True)
        plotRange.ymaxPush(y2, self.fieldTypeNumeric, sticky=True)

    def draw(self, state, plotCoordinates, plotDefinitions, performanceTable):
        """Draw the plot element.

        This stage consists of creating an SVG image of the
        pre-computed data.

        @type state: ad-hoc Python object
        @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
        @type plotCoordinates: PlotCoordinates
        @param plotCoordinates: The coordinate system in which this plot element will be placed.
        @type plotDefinitions: PlotDefinitions
        @param plotDefinitions: The dictionary of key-value pairs that forms the <defs> section of the SVG document.
        @type performanceTable: PerformanceTable
        @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process.
        @rtype: SvgBinding
        @return: An SVG fragment representing the fully drawn plot element.
        """

        svg = SvgBinding.elementMaker

        x1 = float(self["x1"])
        y1 = float(self["y1"])
        x2 = float(self["x2"])
        y2 = float(self["y2"])

        # Exactly one source is allowed: an inline SvgBinding child or an
        # external file named by the fileName attribute.
        inlineSvg = self.getchildren()
        fileName = self.get("fileName")
        if len(inlineSvg) == 1 and fileName is None:
            svgBinding = inlineSvg[0]
        elif len(inlineSvg) == 0 and fileName is not None:
            svgBinding = SvgBinding.loadXml(fileName)
        else:
            raise defs.PmmlValidationError(
                "PlotSvgContent should specify an inline SVG or a fileName but not both or neither"
            )

        # Map the image's own bounding box (sx1..sy2, presumably derived
        # from the SVG's size/viewBox -- see PlotSvgAnnotation.findSize)
        # onto the requested plot-coordinate rectangle (x1..y2).
        sx1, sy1, sx2, sy2 = PlotSvgAnnotation.findSize(svgBinding)
        subCoordinates = PlotCoordinatesWindow(plotCoordinates, sx1, sy1, sx2,
                                               sy2, x1, y1, x2 - x1, y2 - y1)

        # Express the window mapping as an SVG transform: the images of
        # (0,0) and (1,1) determine the translation and the scale factors.
        tx0, ty0 = subCoordinates(0.0, 0.0)
        tx1, ty1 = subCoordinates(1.0, 1.0)
        transform = "translate(%r, %r) scale(%r, %r)" % (tx0, ty0, tx1 - tx0,
                                                         ty1 - ty0)

        attribs = {"transform": transform}
        svgId = self.get("svgId")
        if svgId is not None:
            attribs["id"] = svgId
        if "style" in svgBinding.attrib:
            attribs["style"] = svgBinding.attrib["style"]

        # Deep-copy the embedded tree so that reparenting its children
        # into the output does not mutate the source document.
        return svg.g(*(copy.deepcopy(svgBinding).getchildren()), **attribs)
Example #13
0
    def calculateScore(self, dataTable, functionTable, performanceTable):
        """Calculate the score of this model.

        This method is called by C{calculate} to separate operations
        that are performed by all models (in C{calculate}) from
        operations that are performed by specific models (in
        C{calculateScore}).

        @type dataTable: DataTable
        @param dataTable: The DataTable representing this model's lexical scope.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: dict of DataColumns
        @return: A dictionary mapping score-field names (C{None} for the primary score) to DataColumns; the columns are built here and filled in by C{Node.applyScore}.
        """

        performanceTable.begin("TreeModel")

        performanceTable.begin("set up")

        # Map the PMML missingValueStrategy enumeration onto Node's
        # symbolic constants; unrecognized values pass through unchanged,
        # exactly as the original if/elif chain did.
        missingValueStrategy = self.get("missingValueStrategy",
                                        defaultFromXsd=True)
        missingValueStrategy = {
            "lastPrediction": Node.LAST_PREDICTION,
            "nullPrediction": Node.NULL_PREDICTION,
            "defaultChild": Node.DEFAULT_CHILD,
            "weightedConfidence": Node.WEIGHTED_CONFIDENCE,
            "aggregateNodes": Node.AGGREGATE_NODES,
            "none": Node.NONE,
        }.get(missingValueStrategy, missingValueStrategy)

        missingValuePenalty = self.get("missingValuePenalty",
                                       defaultFromXsd=True,
                                       convertType=True)

        noTrueChildStrategy = self.get("noTrueChildStrategy",
                                       defaultFromXsd=True)
        # Accept the PMML spelling "returnNullPrediction" in addition to
        # the misspelled "returnNullPredication" that this code originally
        # matched; with only the misspelling, a spec-compliant attribute
        # value was silently left unconverted.
        if noTrueChildStrategy in ("returnNullPrediction",
                                   "returnNullPredication"):
            noTrueChildStrategy = Node.RETURN_NULL_PREDICTION
        elif noTrueChildStrategy == "returnLastPrediction":
            noTrueChildStrategy = Node.RETURN_LAST_PREDICTION

        # The primary score's type follows the mining function.
        if self["functionName"] == "classification":
            fieldType = FakeFieldType("string", "categorical")
        elif self["functionName"] == "regression":
            fieldType = FakeFieldType("double", "continuous")
        else:
            raise defs.PmmlValidationError(
                "TreeModel functionName may only be \"classification\" or \"regression\", not \"%s\""
                % self["functionName"])

        performanceTable.end("set up")

        def emptyDataColumn(fieldType):
            # One uninitialized, fully-masked, unlocked DataColumn:
            # applyScore writes values and clears mask bits for every row
            # it reaches.
            column = DataColumn(
                fieldType, NP("empty", len(dataTable), dtype=fieldType.dtype),
                NP("ones", len(dataTable), dtype=defs.maskType))
            column._unlock()
            return column

        score = {None: emptyDataColumn(fieldType)}

        if self.subFields["entity"]:
            score["entity"] = emptyDataColumn(FakeFieldType("object", "any"))

        if self.subFields["entityId"]:
            score["entityId"] = emptyDataColumn(
                FakeFieldType("string", "categorical"))

        if self.subFields["confidence"]:
            score["confidence"] = emptyDataColumn(
                FakeFieldType("double", "continuous"))

            # penaltyProduct accumulates missingValuePenalty factors; it
            # starts at 1.0 everywhere (no mask) and is folded into the
            # confidence column below.
            fieldType = FakeFieldType("double", "continuous")
            score["penaltyProduct"] = DataColumn(
                fieldType, NP("ones", len(dataTable), dtype=fieldType.dtype),
                None)
            score["penaltyProduct"]._unlock()

        if self.subFields["probability"]:
            score["probability"] = emptyDataColumn(
                FakeFieldType("double", "continuous"))

        # Walk the tree: evaluate the root node's predicate, then let the
        # nodes fill in the score columns.
        node = self.childOfClass(Node)
        selection = node.evaluatePredicate(dataTable,
                                           functionTable,
                                           performanceTable,
                                           returnUnknowns=False)
        node.applyScore(dataTable, functionTable, performanceTable, selection,
                        score, missingValueStrategy, missingValuePenalty,
                        noTrueChildStrategy)

        if "confidence" in score:
            score["confidence"]._data *= score["penaltyProduct"].data
            del score["penaltyProduct"]

        for field in score.values():
            if not field.mask.any():
                # Every row was scored: drop the mask entirely.
                field._mask = None
            else:
                # Remaining set bits mark unscored rows; scale them to the
                # INVALID code (presumably defs.INVALID is the mask value
                # for invalid entries -- confirm against defs).
                field._mask *= defs.INVALID
            field._lock()

        performanceTable.end("TreeModel")
        return score
Example #14
0
class PlotHeatMap(PmmlPlotContent):
    """Represents a 2d heat map of a mathematical formula or a 2d
    histogram of data.

    PMML subelements for mathematical function plotting:

      - PlotFormula role="z(x,y)"

    PMML subelements for 2d histograms:

      - PlotNumericExpression role="x"
      - PlotNumericExpression role="y"
      - PlotNumericExpression role="zweight" (optional)
      - PlotSelection: expression or predicate to filter the data
        before plotting.

    PMML subelements for plotting the mean of a third coordinate z:

      - PlotNumericExpression role="x"
      - PlotNumericExpression role="y"
      - PlotNumericExpression role="zmean"
      - PlotSelection: expression or predicate to filter the data
        before plotting.

    PMML attribute:

      - svgId: id for the resulting SVG element.
      - stateId: key for persistent storage in a DataTableState.
      - xbins: number of histogram bins in the x direction.
      - ybins: number of histogram bins in the y direction.
      - xlow: low edge of the x range of the histogram.
      - ylow: low edge of the y range of the histogram.
      - xhigh: high edge of the x range of the histogram.
      - yhigh: high edge of the y range of the histogram.
      - imageRendering: "optimizeQuality", "optimizeSpeed"
      - onePixelBeyondBorder: if "true", extend the image beyond
        the border by one pixel.  This is to work around a feature
        of many SVG viewers that blend the borders of a raster
        image into the background.

    See the source code for the full XSD.
    """

    xsd = """<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">
    <xs:element name="PlotHeatMap">
        <xs:complexType>
            <xs:sequence>
                <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded" />
                <xs:choice minOccurs="1" maxOccurs="1">
                    <xs:element ref="PlotFormula" minOccurs="1" maxOccurs="1" />
                    <xs:sequence>
                        <xs:element ref="PlotNumericExpression" minOccurs="2" maxOccurs="3" />
                        <xs:element ref="PlotSelection" minOccurs="0" maxOccurs="1" />
                    </xs:sequence>
                </xs:choice>
            </xs:sequence>
            <xs:attribute name="svgId" type="xs:string" use="optional" />
            <xs:attribute name="stateId" type="xs:string" use="optional" />
            <xs:attribute name="xbins" type="xs:positiveInteger" use="optional" />
            <xs:attribute name="ybins" type="xs:positiveInteger" use="optional" />
            <xs:attribute name="xlow" type="xs:double" use="optional" />
            <xs:attribute name="ylow" type="xs:double" use="optional" />
            <xs:attribute name="xhigh" type="xs:double" use="optional" />
            <xs:attribute name="yhigh" type="xs:double" use="optional" />
            <xs:attribute name="imageRendering" use="optional" default="optimizeQuality">
                <xs:simpleType>
                    <xs:restriction base="xs:string">
                        <xs:enumeration value="optimizeQuality" />
                        <xs:enumeration value="optimizeSpeed" />
                    </xs:restriction>
                </xs:simpleType>
            </xs:attribute>
            <xs:attribute name="onePixelBeyondBorder" type="xs:boolean" use="optional" default="true" />
        </xs:complexType>
    </xs:element>
</xs:schema>
"""

    # FieldTypes used when pushing x/y and z bounds into the PlotRange.
    xyfieldType = FakeFieldType("double", "continuous")
    zfieldType = FakeFieldType("double", "continuous")

    def prepare(self, state, dataTable, functionTable, performanceTable,
                plotRange):
        """Prepare a plot element for drawing.

        This stage consists of calculating all quantities and
        determing the bounds of the data.  These bounds may be unioned
        with bounds from other plot elements that overlay this plot
        element, so the drawing (which requires a finalized coordinate
        system) cannot begin yet.

        This method modifies C{plotRange}.

        @type state: ad-hoc Python object
        @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
        @type dataTable: DataTable
        @param dataTable: Contains the data to plot.
        @type functionTable: FunctionTable
        @param functionTable: Defines functions that may be used to transform data for plotting.
        @type performanceTable: PerformanceTable
        @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process.
        @type plotRange: PlotRange
        @param plotRange: The bounding box of plot coordinates that this function will expand.
        """

        self.checkRoles(["z(x,y)", "x", "y", "zmean", "zweight"])

        performanceTable.begin("PlotHeatMap prepare")
        self._saveContext(dataTable)

        # Gather the subelements that determine which of the three plot
        # modes is requested: formula z(x,y), 2d histogram, or mean-of-z.
        zofxy = self.xpath("pmml:PlotFormula[@role='z(x,y)']")
        xexpr = self.xpath("pmml:PlotNumericExpression[@role='x']")
        yexpr = self.xpath("pmml:PlotNumericExpression[@role='y']")
        zmean = self.xpath("pmml:PlotNumericExpression[@role='zmean']")
        zweight = self.xpath("pmml:PlotNumericExpression[@role='zweight']")
        cutExpression = self.xpath("pmml:PlotSelection")

        # Mode 1: a mathematical formula z(x,y) sampled on a regular grid.
        if len(zofxy) == 1 and len(xexpr) == 0 and len(yexpr) == 0 and len(
                zmean) == 0 and len(zweight) == 0:
            xbins = self.get("xbins", convertType=True)
            xlow = self.get("xlow", convertType=True)
            xhigh = self.get("xhigh", convertType=True)
            ybins = self.get("ybins", convertType=True)
            ylow = self.get("ylow", convertType=True)
            yhigh = self.get("yhigh", convertType=True)

            # A formula has no data to auto-range from, so all six grid
            # parameters must be given explicitly.
            if xbins is None or xlow is None or xhigh is None or ybins is None or ylow is None or yhigh is None:
                raise defs.PmmlValidationError(
                    "xbins, xlow, xhigh, ybins, ylow, and yhigh are required for HeatMaps of a mathematical formula"
                )

            if xlow >= xhigh or ylow >= yhigh:
                raise defs.PmmlValidationError(
                    "xlow must be less than xhigh and ylow must be less than yhigh"
                )

            if plotRange.xStrictlyPositive or plotRange.yStrictlyPositive:
                raise defs.PmmlValidationError(
                    "PlotHeatMap can only be properly displayed in linear x, y coordinates"
                )

            xbinWidth = (xhigh - xlow) / float(xbins)
            ybinWidth = (yhigh - ylow) / float(ybins)

            # Build the full grid of sample points: x varies fastest
            # (tile), y varies slowest (repeat), so the flattened arrays
            # enumerate all (x, y) pairs.
            xarray = NP("tile",
                        NP("linspace", xlow, xhigh, xbins, endpoint=True),
                        ybins)
            yarray = NP("repeat",
                        NP("linspace", ylow, yhigh, ybins, endpoint=True),
                        xbins)

            # Evaluate the parsed formula once over the whole grid using a
            # throwaway DataTable as the evaluation context.
            sampleTable = DataTable({
                "x": "double",
                "y": "double"
            }, {
                "x": xarray,
                "y": yarray
            })
            parsed = Formula.parse(zofxy[0].text)

            performanceTable.pause("PlotHeatMap prepare")
            zdataColumn = parsed.evaluate(sampleTable, functionTable,
                                          performanceTable)
            performanceTable.unpause("PlotHeatMap prepare")
            if not zdataColumn.fieldType.isnumeric():
                raise defs.PmmlValidationError(
                    "PlotFormula z(x,y) must return a numeric expression, not %r"
                    % zdataColumn.fieldType)

            # Keep only finite, valid results (and strictly positive ones
            # if the z axis is logarithmic).
            selection = NP("isfinite", zdataColumn.data)
            if zdataColumn.mask is not None:
                NP("logical_and", selection,
                   NP(zdataColumn.mask == defs.VALID), selection)
            if plotRange.zStrictlyPositive:
                NP("logical_and", selection, NP(zdataColumn.data > 0.0),
                   selection)

            # NOTE(review): gooddata may be empty if every grid point is
            # masked out, in which case .min()/.max() would raise; the
            # zmean branch below guards the analogous case with
            # count_nonzero — confirm whether an empty result is possible
            # here.
            gooddata = zdataColumn.data[selection]
            plotRange.zminPush(gooddata.min(),
                               zdataColumn.fieldType,
                               sticky=False)
            plotRange.zmaxPush(gooddata.max(),
                               zdataColumn.fieldType,
                               sticky=False)

            state.zdata = zdataColumn.data
            state.zmask = NP("logical_not", selection) * defs.INVALID

        # Modes 2 and 3: histogram or mean-of-z computed from the data.
        elif len(zofxy) == 0 and len(xexpr) == 1 and len(yexpr) == 1:
            performanceTable.pause("PlotHeatMap prepare")
            xdataColumn = xexpr[0].evaluate(dataTable, functionTable,
                                            performanceTable)
            ydataColumn = yexpr[0].evaluate(dataTable, functionTable,
                                            performanceTable)
            performanceTable.unpause("PlotHeatMap prepare")

            xbins = self.get("xbins", convertType=True)
            xlow = self.get("xlow", convertType=True)
            xhigh = self.get("xhigh", convertType=True)
            ybins = self.get("ybins", convertType=True)
            ylow = self.get("ylow", convertType=True)
            yhigh = self.get("yhigh", convertType=True)

            # Auto-range any bounds not given explicitly: from the data if
            # there is any, otherwise fall back to the unit square.
            if len(xdataColumn) > 0:
                if xlow is None: xlow = NP("nanmin", xdataColumn.data)
                if xhigh is None: xhigh = NP("nanmax", xdataColumn.data)
                if ylow is None: ylow = NP("nanmin", ydataColumn.data)
                if yhigh is None: yhigh = NP("nanmax", ydataColumn.data)
            else:
                if xlow is None: xlow = 0.0
                if xhigh is None: xhigh = 1.0
                if ylow is None: ylow = 0.0
                if yhigh is None: yhigh = 1.0

            # Choose bin counts with the Freedman-Diaconis rule
            # (bin width = 2*IQR/n^(1/3)), never fewer than 10 bins.
            if xbins is None:
                q1, q3 = NP("percentile", xdataColumn.data, [25.0, 75.0])
                binWidth = 2.0 * (q3 - q1) / math.pow(len(xdataColumn.data),
                                                      1.0 / 3.0)
                if binWidth > 0.0:
                    xbins = max(10, int(math.ceil((xhigh - xlow) / binWidth)))
                else:
                    xbins = 10

            if ybins is None:
                q1, q3 = NP("percentile", ydataColumn.data, [25.0, 75.0])
                binWidth = 2.0 * (q3 - q1) / math.pow(len(ydataColumn.data),
                                                      1.0 / 3.0)
                if binWidth > 0.0:
                    ybins = max(10, int(math.ceil((yhigh - ylow) / binWidth)))
                else:
                    ybins = 10

            if xlow >= xhigh or ylow >= yhigh:
                raise defs.PmmlValidationError(
                    "xlow must be less than xhigh and ylow must be less than yhigh"
                )

            if plotRange.xStrictlyPositive or plotRange.yStrictlyPositive:
                raise defs.PmmlValidationError(
                    "PlotHeatMap can only be properly displayed in linear x, y coordinates"
                )

            # Persistent state (keyed by stateId) lets successive
            # DataTables accumulate into the same histogram.
            persistentState = {}
            stateId = self.get("stateId")
            if stateId is not None:
                if stateId in dataTable.state:
                    persistentState = dataTable.state[stateId]
                else:
                    dataTable.state[stateId] = persistentState

            # For plain/weighted histograms, reuse the binning from
            # previous calls so that accumulated counts stay compatible.
            if len(zmean) == 0:
                if "xbins" in persistentState: xbins = persistentState["xbins"]
                if "xlow" in persistentState: xlow = persistentState["xlow"]
                if "xhigh" in persistentState: xhigh = persistentState["xhigh"]
                if "ybins" in persistentState: ybins = persistentState["ybins"]
                if "ylow" in persistentState: ylow = persistentState["ylow"]
                if "yhigh" in persistentState: yhigh = persistentState["yhigh"]

                persistentState["xbins"] = xbins
                persistentState["xlow"] = xlow
                persistentState["xhigh"] = xhigh
                persistentState["ybins"] = ybins
                persistentState["ylow"] = ylow
                persistentState["yhigh"] = yhigh

            xbinWidth = (xhigh - xlow) / float(xbins)
            ybinWidth = (yhigh - ylow) / float(ybins)

            # Per-row weights: 1.0 for valid rows, 0.0 for masked or
            # deselected rows; used as histogram weights so invalid rows
            # contribute nothing.
            mask = NP("ones", len(dataTable), dtype=NP.dtype(float))
            if xdataColumn.mask is not None:
                NP("multiply", mask, (xdataColumn.mask == defs.VALID), mask)
            if ydataColumn.mask is not None:
                NP("multiply", mask, (ydataColumn.mask == defs.VALID), mask)

            if len(cutExpression) == 1:
                performanceTable.pause("PlotHeatMap prepare")
                NP(
                    "multiply", mask,
                    cutExpression[0].select(dataTable, functionTable,
                                            performanceTable), mask)
                performanceTable.unpause("PlotHeatMap prepare")

            # Mode 2a: plain (unweighted) 2d histogram of counts.
            if len(zmean) == 0 and len(zweight) == 0:
                histogram, xedges, yedges = NP("histogram2d",
                                               ydataColumn.data,
                                               xdataColumn.data,
                                               bins=(ybins, xbins),
                                               range=[[ylow, yhigh],
                                                      [xlow, xhigh]],
                                               weights=mask)
                if len(dataTable) == 0:
                    # work around Numpy <= 1.6.1 bug
                    histogram = NP("zeros", (ybins, xbins),
                                   dtype=NP.dtype(float))

                # Accumulate into any previously stored histogram.
                if "histogram" in persistentState:
                    persistentState["histogram"] = NP(
                        persistentState["histogram"] + histogram)
                else:
                    persistentState["histogram"] = histogram

                histogram = persistentState["histogram"]

                # On a log z axis, zero counts cannot be drawn; 0.1 keeps
                # the lower bound strictly positive.
                if plotRange.zStrictlyPositive:
                    zmin = 0.1
                else:
                    zmin = 0.0
                zmax = NP("nanmax", histogram)

                plotRange.zminPush(zmin, self.zfieldType, sticky=True)
                if zmax > zmin:
                    plotRange.zmaxPush(zmax, self.zfieldType, sticky=False)

            # Mode 2b: histogram weighted by the zweight expression.
            elif len(zmean) == 0 and len(zweight) == 1:
                performanceTable.pause("PlotHeatMap prepare")
                weightsDataColumn = zweight[0].evaluate(
                    dataTable, functionTable, performanceTable)
                performanceTable.unpause("PlotHeatMap prepare")

                # Combine the zweight values with the validity mask.
                if weightsDataColumn.mask is not None:
                    NP("multiply", mask,
                       (weightsDataColumn.mask == defs.VALID), mask)
                weights = NP(weightsDataColumn.data * mask)

                histogram, xedges, yedges = NP("histogram2d",
                                               ydataColumn.data,
                                               xdataColumn.data,
                                               bins=(ybins, xbins),
                                               range=[[ylow, yhigh],
                                                      [xlow, xhigh]],
                                               weights=weights)

                if "histogram" in persistentState:
                    persistentState["histogram"] = NP(
                        persistentState["histogram"] + histogram)
                else:
                    persistentState["histogram"] = histogram

                histogram = persistentState["histogram"]

                # On a log z axis, base zmin on the smallest positive
                # weight so the lower bound stays strictly positive.
                if plotRange.zStrictlyPositive:
                    w = weights[NP(weights > 0.0)]
                    if len(w) > 0:
                        zmin = 0.1 * NP("nanmin", w)
                    else:
                        zmin = 0.1
                else:
                    zmin = 0.0
                zmax = NP("nanmax", histogram)

                plotRange.zminPush(zmin, self.zfieldType, sticky=True)
                if zmax > zmin:
                    plotRange.zmaxPush(zmax, self.zfieldType, sticky=False)

            # Mode 3: mean of z per (x, y) bin, computed as
            # sum(z)/count using two histograms.
            elif len(zmean) == 1 and len(zweight) == 0:
                performanceTable.pause("PlotHeatMap prepare")
                zdataColumn = zmean[0].evaluate(dataTable, functionTable,
                                                performanceTable)
                performanceTable.unpause("PlotHeatMap prepare")

                if zdataColumn.mask is not None:
                    NP("multiply", mask, (zdataColumn.mask == defs.VALID),
                       mask)
                weights = NP(zdataColumn.data * mask)

                numer, xedges, yedges = NP("histogram2d",
                                           ydataColumn.data,
                                           xdataColumn.data,
                                           bins=(ybins, xbins),
                                           range=[[ylow, yhigh], [xlow,
                                                                  xhigh]],
                                           weights=weights)
                denom, xedges, yedges = NP("histogram2d",
                                           ydataColumn.data,
                                           xdataColumn.data,
                                           bins=(ybins, xbins),
                                           range=[[ylow, yhigh], [xlow,
                                                                  xhigh]],
                                           weights=mask)

                # Accumulate numerator and denominator separately so the
                # running mean stays exact across multiple DataTables.
                if "numer" in persistentState:
                    persistentState["numer"] = NP(persistentState["numer"] +
                                                  numer)
                    persistentState["denom"] = NP(persistentState["denom"] +
                                                  denom)
                else:
                    persistentState["numer"] = numer
                    persistentState["denom"] = denom

                numer = persistentState["numer"]
                denom = persistentState["denom"]
                # Empty bins divide 0/0 and yield NaN; those pixels are
                # excluded by the isfinite selection below and made
                # transparent in draw.
                histogram = numer / denom

                selection = NP("isfinite", histogram)
                if plotRange.zStrictlyPositive:
                    NP("logical_and", selection, NP(histogram > 0.0),
                       selection)

                if NP("count_nonzero", selection) > 0:
                    gooddata = histogram[selection]
                    plotRange.zminPush(gooddata.min(),
                                       self.zfieldType,
                                       sticky=False)
                    plotRange.zmaxPush(gooddata.max(),
                                       self.zfieldType,
                                       sticky=False)

            else:
                raise defs.PmmlValidationError(
                    "The only allowed combinations of PlotFormula/PlotNumericExpressions are: \"z(x,y)\" (function), \"x y\" (histogram), \"x y zmean\" (mean of z in x y bins), \"x y zweight\" (weighted x y histogram)"
                )

            # Flatten to the row-major layout expected by draw.
            state.zdata = NP("reshape", histogram, xbins * ybins)
            state.zmask = None

        else:
            raise defs.PmmlValidationError(
                "The only allowed combinations of PlotFormula/PlotNumericExpressions are: \"z(x,y)\" (function), \"x y\" (histogram), \"x y zmean\" (mean of z in x y bins), \"x y zweight\" (weighted x y histogram)"
            )

        # In every mode the heat map fills its full rectangle, so the x/y
        # bounds are pushed unconditionally.
        plotRange.xminPush(xlow, self.xyfieldType, sticky=True)
        plotRange.yminPush(ylow, self.xyfieldType, sticky=True)
        plotRange.xmaxPush(xhigh, self.xyfieldType, sticky=True)
        plotRange.ymaxPush(yhigh, self.xyfieldType, sticky=True)

        # Carry the grid geometry to the draw stage.
        state.xbins = xbins
        state.xlow = xlow
        state.xhigh = xhigh
        state.ybins = ybins
        state.ylow = ylow
        state.yhigh = yhigh

        performanceTable.end("PlotHeatMap prepare")

    def draw(self, state, plotCoordinates, plotDefinitions, performanceTable):
        """Draw the plot element.

        This stage consists of creating an SVG image of the
        pre-computed data.

        @type state: ad-hoc Python object
        @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
        @type plotCoordinates: PlotCoordinates
        @param plotCoordinates: The coordinate system in which this plot element will be placed.
        @type plotDefinitions: PlotDefinitions
        @type plotDefinitions: The dictionary of key-value pairs that forms the <defs> section of the SVG document.
        @type performanceTable: PerformanceTable
        @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process.
        @rtype: SvgBinding
        @return: An SVG fragment representing the fully drawn plot element.
        """

        svg = SvgBinding.elementMaker

        svgId = self.get("svgId")
        if svgId is None:
            output = svg.g()
        else:
            output = svg.g(id=svgId)

        # No z range was established in prepare; nothing to draw.
        if not hasattr(plotCoordinates, "zmin"):
            return output

        performanceTable.begin("PlotHeatMap draw")

        xbins = state.xbins
        xlow = state.xlow
        xhigh = state.xhigh
        ybins = state.ybins
        ylow = state.ylow
        yhigh = state.yhigh

        # Per-pixel RGBA channels, filled in from the gradient below.
        reddata = NP("empty", len(state.zdata), dtype=NP.uint8)
        greendata = NP("empty", len(state.zdata), dtype=NP.uint8)
        bluedata = NP("empty", len(state.zdata), dtype=NP.uint8)
        alphadata = NP("empty", len(state.zdata), dtype=NP.uint8)

        # Default gradient is white-to-blue; otherwise use the stops
        # supplied by the coordinate system's gradient.
        if len(plotCoordinates.gradient) == 0:
            offsets = [0.0, 1.0]
            reds = [255, 0]
            greens = [255, 0]
            blues = [255, 255]
            alphas = [255, 255]
        else:
            offsets = [float(g["offset"]) for g in plotCoordinates.gradient]
            reds = [
                min(int(math.floor(256 * float(g["red"]))), 255)
                for g in plotCoordinates.gradient
            ]
            greens = [
                min(int(math.floor(256 * float(g["green"]))), 255)
                for g in plotCoordinates.gradient
            ]
            blues = [
                min(int(math.floor(256 * float(g["blue"]))), 255)
                for g in plotCoordinates.gradient
            ]
            alphas = [
                min(int(math.floor(256 * float(g.get("opacity", 1.0)))), 255)
                for g in plotCoordinates.gradient
            ]

        # Normalize z into [0, 1], linearly or logarithmically depending
        # on the z axis.
        if not plotCoordinates.zlog:
            normalized = NP(
                NP(state.zdata - plotCoordinates.zmin) /
                (plotCoordinates.zmax - plotCoordinates.zmin))
        else:
            normalized = NP(
                NP(
                    NP("log10", state.zdata) -
                    NP("log10", plotCoordinates.zmin)) / NP(
                        NP("log10", plotCoordinates.zmax) -
                        NP("log10", plotCoordinates.zmin)))

        # Piecewise-linear interpolation between adjacent gradient stops;
        # values below the first or above the last offset are clamped to
        # the end colors.
        for index in xrange(len(offsets) - 1):
            if index == 0:
                under = NP(normalized < offsets[index])
                reddata[under] = reds[index]
                greendata[under] = greens[index]
                bluedata[under] = blues[index]
                alphadata[under] = alphas[index]

            if index == len(offsets) - 2:
                over = NP(normalized >= offsets[index + 1])
                reddata[over] = reds[index + 1]
                greendata[over] = greens[index + 1]
                bluedata[over] = blues[index + 1]
                alphadata[over] = alphas[index + 1]

            selection = NP(normalized >= offsets[index])
            NP("logical_and", selection, NP(normalized < offsets[index + 1]),
               selection)

            subset = NP(NP(normalized[selection]) - offsets[index])
            norm = 1. / (offsets[index + 1] - offsets[index])

            reddata[selection] = NP(
                "array",
                NP(
                    NP(subset * ((reds[index + 1] - reds[index]) * norm)) +
                    reds[index]),
                dtype=NP.uint8)
            greendata[selection] = NP(
                "array",
                NP(
                    NP(subset * ((greens[index + 1] - greens[index]) * norm)) +
                    greens[index]),
                dtype=NP.uint8)
            bluedata[selection] = NP(
                "array",
                NP(
                    NP(subset * ((blues[index + 1] - blues[index]) * norm)) +
                    blues[index]),
                dtype=NP.uint8)
            alphadata[selection] = NP(
                "array",
                NP(
                    NP(subset * ((alphas[index + 1] - alphas[index]) * norm)) +
                    alphas[index]),
                dtype=NP.uint8)

        # NaN/inf pixels (and explicitly masked ones) become fully
        # transparent.
        badpixels = NP("isnan", normalized)
        NP("logical_or", badpixels, NP("isinf", normalized), badpixels)
        if state.zmask is not None:
            NP("logical_or", badpixels, NP(state.zmask != defs.VALID),
               badpixels)

        alphadata[badpixels] = 0

        X1, Y1 = plotCoordinates(xlow, ylow)
        X2, Y2 = plotCoordinates(xhigh, yhigh)

        # Optionally pad the raster by one bin on each side: works around
        # SVG viewers that blend a raster image's border into the
        # background.
        onePixelBeyondBorder = self.get("onePixelBeyondBorder",
                                        defaultFromXsd=True,
                                        convertType=True)
        if onePixelBeyondBorder:
            Xwidth = (X2 - X1) / xbins
            Yheight = (Y1 - Y2) / ybins
            X1 -= Xwidth
            X2 += Xwidth
            Y1 += Yheight
            Y2 -= Yheight

        # Encode the RGBA channel arrays as a PNG and embed it as a
        # base64 data URI in an SVG <image> element.
        arrayToPng = ArrayToPng()
        arrayToPng.putdata(xbins,
                           ybins,
                           reddata,
                           greendata,
                           bluedata,
                           alphadata,
                           flipy=True,
                           onePixelBeyondBorder=onePixelBeyondBorder)

        output.append(
            svg.image(
                **{
                    defs.XLINK_HREF:
                    "data:image/png;base64," + arrayToPng.b64encode(),
                    "x":
                    repr(X1),
                    "y":
                    repr(Y2),
                    "width":
                    repr(X2 - X1),
                    "height":
                    repr(Y1 - Y2),
                    "image-rendering":
                    self.get("imageRendering", defaultFromXsd=True),
                    "preserveAspectRatio":
                    "none"
                }))

        performanceTable.end("PlotHeatMap draw")
        return output
Example #15
0
    def evaluate(self, dataTable, functionTable, performanceTable):
        """Evaluate the expression, using a DataTable as input.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.
        """

        function = self["function"]
        groupField = self.get("groupField")

        if groupField is None:
            performanceTable.begin("Aggregate %s" % function)
        else:
            performanceTable.begin("Aggregate %s groupField" % function)

        dataColumn = dataTable.fields[self["field"]]
        whereMask = self.where(dataTable, functionTable, performanceTable)
        stateId = self.get("stateId")

        if groupField is None:
            if stateId is None:
                getstate = None
                setstate = None
            else:

                def getstate():
                    return dataTable.state.get(stateId)

                def setstate(value):
                    dataTable.state[stateId] = value

            if function == "count":
                dataColumn = self.functionCount(dataColumn, whereMask, None,
                                                getstate, setstate)

            elif function == "sum":
                dataColumn = self.functionSum(dataColumn, whereMask, None,
                                              getstate, setstate)

            elif function == "average":
                dataColumn = self.functionAverage(dataColumn, whereMask, None,
                                                  getstate, setstate)

            elif function == "min":
                dataColumn = self.functionMin(dataColumn, whereMask, None,
                                              getstate, setstate)

            elif function == "max":
                dataColumn = self.functionMax(dataColumn, whereMask, None,
                                              getstate, setstate)

            elif function == "multiset":
                dataColumn = self.functionMultiset(dataColumn, whereMask, None,
                                                   getstate, setstate)

            performanceTable.end("Aggregate %s" % function)
            return dataColumn

        else:
            groupColumn = dataTable.fields[groupField]
            if groupColumn.mask is None:
                validGroup = groupColumn.data
            else:
                validGroup = groupColumn.data[NP(
                    groupColumn.mask == defs.VALID)]

            if stateId is not None:
                state = dataTable.state.get(stateId)
                if state is None:
                    record = {}
                else:
                    record = state

            valuesSeen = dict((stringValue, False) for stringValue in record)

            groupTables = {}
            groupColumnFieldType = None
            for groupValue in NP("unique", validGroup):
                groupSelection = NP(groupColumn.data == groupValue)
                if groupColumn.mask is not None:
                    NP("logical_and", groupSelection,
                       NP(groupColumn.mask == defs.VALID), groupSelection)

                groupColumnFieldType = groupColumn.fieldType
                stringValue = groupColumnFieldType.valueToString(groupValue)

                if stringValue in record:

                    def getstate():
                        return record[stringValue]
                else:
                    getstate = None

                def setstate(value):
                    record[stringValue] = value

                valuesSeen[stringValue] = True
                value = groupColumnFieldType.valueToPython(groupValue)

                if function == "count":
                    groupTables[value] = self.functionCount(
                        dataColumn, whereMask, groupSelection, getstate,
                        setstate)

                elif function == "sum":
                    groupTables[value] = self.functionSum(
                        dataColumn, whereMask, groupSelection, getstate,
                        setstate)

                elif function == "average":
                    groupTables[value] = self.functionAverage(
                        dataColumn, whereMask, groupSelection, getstate,
                        setstate)

                elif function == "min":
                    groupTables[value] = self.functionMin(
                        dataColumn, whereMask, groupSelection, getstate,
                        setstate)

                elif function == "max":
                    groupTables[value] = self.functionMax(
                        dataColumn, whereMask, groupSelection, getstate,
                        setstate)

                elif function == "multiset":
                    groupTables[value] = self.functionMultiset(
                        dataColumn, whereMask, groupSelection, getstate,
                        setstate)

            if stateId is not None:
                dataTable.state[stateId] = record

            for stringValue in valuesSeen:
                if not valuesSeen[stringValue]:
                    value = groupColumnFieldType.valueToPython(
                        groupColumnFieldType.stringToValue(stringValue))

                    if function == "count":
                        groupTables[value] = self.functionCountFake(
                            record[stringValue], len(dataTable),
                            dataColumn.fieldType)

                    elif function == "sum":
                        groupTables[value] = self.functionSumFake(
                            record[stringValue], len(dataTable),
                            dataColumn.fieldType)

                    elif function == "average":
                        groupTables[value] = self.functionAverageFake(
                            record[stringValue], len(dataTable),
                            dataColumn.fieldType)

                    elif function in ("min", "max"):
                        groupTables[value] = self.functionMinMaxFake(
                            record[stringValue], len(dataTable),
                            dataColumn.fieldType)

                    elif function == "multiset":
                        groupTables[value] = self.functionMultisetFake(
                            record[stringValue], len(dataTable),
                            dataColumn.fieldType)

            performanceTable.begin("Aggregate %s groupField collect" %
                                   function)

            fieldType = FakeFieldType("object", "any")
            data = NP("empty", len(dataTable), dtype=NP.dtype(object))

            if function == "count":
                for i in xrange(len(dataTable)):
                    data[i] = dict((value, table.data[i])
                                   for value, table in groupTables.items()
                                   if table.data[i] != 0)

            elif function == "sum":
                for i in xrange(len(dataTable)):
                    data[i] = dict((value, table.data[i])
                                   for value, table in groupTables.items()
                                   if table.data[i] != 0.0)

            elif function == "average":
                for i in xrange(len(dataTable)):
                    data[i] = dict(
                        (value, table.data[i])
                        for value, table in groupTables.items()
                        if table.data[i] > 0.0 or table.data[i] <= 0.0)

            elif function in ("min", "max"):
                for table in groupTables.values():
                    if table.mask is None:
                        table._mask = NP("zeros",
                                         len(table),
                                         dtype=defs.maskType)
                for i in xrange(len(dataTable)):
                    data[i] = dict((value, table.data[i])
                                   for value, table in groupTables.items()
                                   if table.mask[i] == defs.VALID)

            elif function == "multiset":
                for i in xrange(len(dataTable)):
                    data[i] = dict((value, table.data[i])
                                   for value, table in groupTables.items()
                                   if len(table.data[i]) > 0)

            performanceTable.end("Aggregate %s groupField collect" % function)
            performanceTable.end("Aggregate %s groupField" % function)
            return DataColumn(fieldType, data, None)
Example #16
0
    def evaluate(self, dataTable, functionTable, performanceTable):
        """Evaluate the expression, using a DataTable as input.

        Implements PMML MapValues: each row of the embedded table is an
        intersection of column-value conditions; matching data rows are
        assigned that table row's outputColumn value.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.
        """

        performanceTable.begin("MapValues")

        # Output type comes from this element's attributes; new values must be
        # allowed because table rows may introduce arbitrary output strings.
        fieldType = FakeFieldType(self.get("dataType", "string"), self.get("optype", self._optype))
        fieldType._newValuesAllowed = True

        defaultValue = self.get("defaultValue")
        if defaultValue is not None:
            defaultValue = fieldType.stringToValue(defaultValue)

        # Pre-fill the output with the default (if any); matched selections
        # below overwrite their rows.
        data = NP("empty", len(dataTable), dtype=fieldType.dtype)
        if defaultValue is not None:
            data[:] = defaultValue

        # Map each table column name to the input DataColumn it tests against.
        outputColumn = self["outputColumn"]
        columnNameToField = {}
        for fieldColumnPair in self.childrenOfTag("FieldColumnPair"):
            dataColumn = dataTable.fields[fieldColumnPair["field"]]
            columnNameToField[fieldColumnPair["column"]] = dataColumn

        # cache partial selections because they'll be used over and over in intersecting sets
        dataSelections = {}
        missingSelections = {}
        # coverage tracks which data rows matched at least one table row.
        coverage = NP("zeros", len(dataTable), dtype=NP.dtype(bool))

        for index, row in enumerate(self.childOfClass(TableInterface).iterate()):
            outputValue = row.get(outputColumn)
            if outputValue is None:
                raise defs.PmmlValidationError("MapValues has outputColumn \"%s\" but a column with that name does not appear in row %d of the table" % (outputColumn, index))
            del row[outputColumn]
            outputValue = fieldType.stringToValue(outputValue)

            # this is an intersection of all matching columns
            selection = NP("ones", len(dataTable), dtype=NP.dtype(bool))

            for columnName, columnValueString in row.items():
                dataColumn = columnNameToField.get(columnName)
                if dataColumn is not None:
                    columnValue = dataColumn.fieldType.stringToValue(columnValueString)

                    # one cached data array per column (name, value) pair
                    if (columnName, columnValueString) not in dataSelections:
                        selectData = NP(dataColumn.data == columnValue)
                        if dataColumn.mask is not None:
                            NP("logical_and", selectData, NP(dataColumn.mask == defs.VALID), selectData)
                        dataSelections[columnName, columnValueString] = selectData
                    NP("logical_and", selection, dataSelections[columnName, columnValueString], selection)

                    # one cached mask array per column name ("missing" has only one possible value, though I consider any non-VALID "missing")
                    if columnName not in missingSelections and dataColumn.mask is not None:
                        missingSelections[columnName] = NP(dataColumn.mask != defs.VALID)

            # set the intersection to the output value
            data[selection] = outputValue
            NP("logical_or", coverage, selection, coverage)

        # Any data row with a non-VALID value in any referenced column is
        # missing overall; such rows are also removed from the coverage.
        missing = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
        for missingSelection in missingSelections.values():
            NP("logical_or", missing, missingSelection, missing)
        coverage -= missing

        mask = missing * defs.MISSING

        data, mask = FieldCastMethods.applyMapMissingTo(fieldType, data, mask, self.get("mapMissingTo"))

        # Without a defaultValue, rows that matched no table row become MISSING.
        if defaultValue is None:
            NP("logical_not", coverage, coverage)
            if mask is None:
                mask = NP(coverage * defs.MISSING)
            else:
                mask[coverage] = defs.MISSING

        performanceTable.end("MapValues")
        return DataColumn(fieldType, data, mask)
Example #17
0
    def calculateScore(self, dataTable, functionTable, performanceTable):
        """Calculate the score of this model.

        This method is called by C{calculate} to separate operations
        that are performed by all models (in C{calculate}) from
        operations that are performed by specific models (in
        C{calculateScore}).

        @type dataTable: DataTable
        @param dataTable: The DataTable representing this model's lexical scope.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataColumn
        @return: A DataColumn containing the score.
        """

        performanceTable.begin("ClusteringModel")

        performanceTable.begin("set up")

        # Gather the clustering fields, their (non-negative) weights, the
        # clusters, and the comparison metric from the PMML.
        distributionBased = (self["modelClass"] == "distributionBased")
        clusteringFields = self.xpath("pmml:ClusteringField[not(@isCenterField='false')]")
        fieldWeights = [clusteringField.get("fieldWeight", defaultFromXsd=True, convertType=True) for clusteringField in clusteringFields]
        for fieldWeight in fieldWeights:
            if fieldWeight < 0.0:
                raise defs.PmmlValidationError("ClusteringField fieldWeights must all be non-negative (encountered %g)" % fieldWeight)
        clusters = self.xpath("pmml:Cluster")
        comparisonMeasure = self.childOfClass(ComparisonMeasure)
        defaultCompareFunction = comparisonMeasure.get("compareFunction", defaultFromXsd=True)
        metric = comparisonMeasure.childOfClass(PmmlClusteringMetric)
        # metric.t is used below as the performance-counter label for the
        # metric accumulation phase.
        metrictag = metric.t

        performanceTable.end("set up")

        for clusteringField in clusteringFields:
            dataType = dataTable.fields[clusteringField["field"]].fieldType.dataType
            if dataType == "string":
                raise defs.PmmlValidationError("ClusteringField \"%s\" has dataType \"%s\", which cannot be used for clustering" % (clusteringField["field"], dataType))

        missingValueWeights = self.childOfTag("MissingValueWeights")
        if missingValueWeights is None:
            adjustM = None

        else:
            performanceTable.begin("MissingValueWeights")

            # Per-row adjustment factor: sum of all weights divided by the sum
            # of weights for the fields that are present in each row.
            missingWeights = missingValueWeights.childOfClass(PmmlArray).values(convertType=True)

            sumNMqi = NP("zeros", len(dataTable), dtype=NP.dtype(float))
            for clusteringField, missingWeight in zip(clusteringFields, missingWeights):
                clusteringField.addToAdjustM(dataTable, functionTable, performanceTable, sumNMqi, missingWeight)

            adjustM = NP(sum(missingWeights) / sumNMqi)
            # Rows that accumulated no weight would divide by zero; give them
            # a neutral adjustment factor of 1.
            adjustM[NP(sumNMqi == 0.0)] = 1.0

            performanceTable.end("MissingValueWeights")

        # Rows with an INVALID value in any clustering field get an INVALID
        # score mask at the end.
        anyInvalid = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
        for clusteringField in clusteringFields:
            mask = dataTable.fields[clusteringField["field"]].mask
            if mask is not None:
                NP("logical_or", anyInvalid, NP(mask == defs.INVALID), anyInvalid)

        bestClusterId = None
        bestClusterAffinity = None
        allClusterAffinities = {}

        # Compute each row's distance to every cluster center, tracking the
        # best (smallest-distance) cluster per row.
        for index, cluster in enumerate(clusters):
            array = cluster.childOfClass(PmmlArray)
            if array is None:
                raise defs.PmmlValidationError("Cluster must have an array to designate its center")

            centerStrings = array.values(convertType=False)
            if len(centerStrings) != len(clusteringFields):
                raise defs.PmmlValidationError("Cluster array has %d components, but there are %d ClusteringFields with isCenterField=true" % (len(centerStrings), len(clusteringFields)))

            performanceTable.begin(metrictag)

            if distributionBased:
                matrix = cluster.xpath("pmml:Covariances/pmml:Matrix")
                if len(matrix) != 1:
                    raise defs.PmmlValidationError("In distribution-based clustering, all clusters must have a Covariances/Matrix")
                try:
                    covarianceMatrix = NP("array", matrix[0].values(), dtype=NP.dtype(float))
                except ValueError:
                    raise defs.PmmlValidationError("Covariances/Matrix must contain real numbers for distribution-based clustering")

            else:
                covarianceMatrix = None

            # Accumulate per-field comparisons into the metric's state, then
            # finalize to a per-row distance array.
            state = self._State()
            metric.initialize(state, len(dataTable), len(clusteringFields), distributionBased)

            for clusteringField, centerString, fieldWeight in zip(clusteringFields, centerStrings, fieldWeights):
                if isinstance(metric, PmmlClusteringMetricBinary):
                    metric.accumulateBinary(state, dataTable.fields[clusteringField["field"]], centerString, distributionBased)
                else:
                    # compare() does its own performance accounting; pause the
                    # metric counter so the time is not double-counted.
                    performanceTable.pause(metrictag)
                    cxy = clusteringField.compare(dataTable, functionTable, performanceTable, centerString, defaultCompareFunction, anyInvalid)
                    performanceTable.unpause(metrictag)
                    metric.accumulate(state, cxy, fieldWeight, distributionBased)

            distance = metric.finalizeDistance(state, adjustM, distributionBased, covarianceMatrix)
            del state

            performanceTable.end(metrictag)

            if index == 0:
                bestClusterId = NP("ones", len(dataTable), dtype=NP.dtype(int))   # 1-based index
                bestClusterAffinity = distance

            # For index 0, distance < bestClusterAffinity compares the array
            # with itself, so "better" is all-False and the initial assignment
            # above stands.
            better = NP(distance < bestClusterAffinity)
            bestClusterId[better] = index + 1   # 1-based index
            bestClusterAffinity[better] = distance[better]

            allClusterAffinities[cluster.get("id", "%d" % (index + 1))] = distance

        if not anyInvalid.any():
            scoreMask = None
        else:
            scoreMask = NP(anyInvalid * defs.INVALID)

        performanceTable.begin("set scores")
        score = {}

        # Main score: the identifier of the best cluster (attribute "id", or
        # the 1-based index when no id is given).
        performanceTable.begin("predictedValue")
        fieldType = FakeFieldType("string", "categorical")
        clusterIdentifiers = NP("empty", len(dataTable), dtype=fieldType.dtype)
        for index, cluster in enumerate(clusters):
            value = fieldType.stringToValue(cluster.get("id", "%d" % (index + 1)))
            clusterIdentifiers[NP(bestClusterId == (index + 1))] = value
        score[None] = DataColumn(fieldType, clusterIdentifiers, scoreMask)
        performanceTable.end("predictedValue")

        # The remaining sub-fields are computed only when requested.
        if self.subFields["predictedDisplayValue"]:
            performanceTable.begin("predictedDisplayValue")
            fieldType = FakeFieldType("string", "categorical")
            clusterNames = NP("empty", len(dataTable), dtype=fieldType.dtype)
            for index, cluster in enumerate(clusters):
                value = fieldType.stringToValue(cluster.get("name", ""))
                clusterNames[NP(bestClusterId == (index + 1))] = value
            score["predictedDisplayValue"] = DataColumn(fieldType, clusterNames, scoreMask)
            performanceTable.end("predictedDisplayValue")

        if self.subFields["entity"]:
            performanceTable.begin("entity")
            fieldType = FakeFieldType("object", "any")
            entities = NP("empty", len(dataTable), dtype=fieldType.dtype)
            for index, cluster in enumerate(clusters):
                value = fieldType.stringToValue(cluster.get("name", ""))
                indexPlusOne = index + 1
                # Element-by-element assignment because the value is the PMML
                # cluster object itself, not an array-compatible scalar.
                for i in xrange(len(entities)):
                    if bestClusterId[i] == indexPlusOne:
                        entities[i] = cluster
            score["entity"] = DataColumn(fieldType, entities, scoreMask)
            performanceTable.end("entity")

        if self.subFields["clusterId"]:
            performanceTable.begin("clusterId")
            fieldType = FakeFieldType("integer", "continuous")
            score["clusterId"] = DataColumn(fieldType, bestClusterId, scoreMask)
            performanceTable.end("clusterId")

        if self.subFields["entityId"]:
            performanceTable.begin("entityId")
            fieldType = FakeFieldType("integer", "continuous")
            score["entityId"] = DataColumn(fieldType, bestClusterId, scoreMask)
            performanceTable.end("entityId")

        if self.subFields["clusterAffinity"]:
            performanceTable.begin("clusterAffinity")
            fieldType = FakeFieldType("double", "continuous")
            score["clusterAffinity"] = DataColumn(fieldType, bestClusterAffinity, scoreMask)
            performanceTable.end("clusterAffinity")

        if self.subFields["affinity"]:
            performanceTable.begin("affinity")
            fieldType = FakeFieldType("double", "continuous")
            score["affinity"] = DataColumn(fieldType, bestClusterAffinity, scoreMask)
            performanceTable.end("affinity")

        if self.subFields["all"]:
            performanceTable.begin("all")
            fieldType = FakeFieldType("double", "continuous")
            for identifier, distance in allClusterAffinities.items():
                score["all.%s" % identifier] = DataColumn(fieldType, distance, scoreMask)
            performanceTable.end("all")

        performanceTable.end("set scores")
        performanceTable.end("ClusteringModel")
        return score
Example #18
0
 def __init__(self, dataType, value):
     """Store *value* along with a continuous FakeFieldType for *dataType*."""
     self.value = value
     self.fieldType = FakeFieldType(dataType, "continuous")
Example #19
0
    def __init__(self, context, inputData, inputMask=None, inputState=None):
        """Create a DataTable from a type-context, input data,
        possible input masks, and possible input states.

        For maximum flexibility, very few assumptions are made about
        the format of C{inputData}.  It need only have a structure
        that is equivalent to a dictionary mapping strings (field
        names) to lists of values (data columns).  Numpy
        U{record arrays<http://docs.scipy.org/doc/numpy/user/basics.rec.html>},
        U{NpzFiles <http://docs.scipy.org/doc/numpy/reference/generated/numpy.savez.html>},
        and U{Pandas data frames<http://pandas.pydata.org/>}
        effectively present their data in this format because::

            inputData[fieldName]

        yields a column of values.  Regardless of the input type,
        these values are then interpreted by the C{context} to set
        their PMML type.

        The length of the resulting DataTable is equal to the length
        of the shortest DataColumn.  Generally, one should use
        equal-length arrays to build a DataTable.

        @type context: PmmlBinding, FieldType, string, dict, or None
        @param context: If a rooted PmmlBinding, use the PMML's DataDictionary to interpret C{inputData}.  If a FieldType, use that FieldType to interpret all fields.  If a string, use that dataType (e.g. "integer", "dateDaysSince[1960]") to interpret all fields.  If a dictionary from field names to FieldTypes or dataType strings, use them on a per-field basis.  Otherwise, assume a FieldType from the Numpy C{dtype}.  The last option only works if all C{inputData} columns are Numpy arrays.
        @type inputData: any dict-like mapping from strings to lists
        @param inputData: Maps field names (strings) to columns of data (lists or Numpy arrays) that are interpreted by C{context}.
        @type inputMask: dict-like mapping from strings to lists of bool, or None
        @param inputMask: If None, missing data are identified by C{NaN} values in the C{inputData} (Pandas convention).  Otherwise, C{NaN} or a True value in the corresponding {inputMask} would label a data item as MISSING.
        @type inputState: DataTableState or None
        @param inputState: Initial state of the DataTable.  To continue a previous calculation, use the C{dataTable.state} from the previous calculation.
        @raise TypeError: If the C{inputData} columns are not Numpy arrays and a C{context} is not given, this method raises an error.
        """

        if isinstance(context, PmmlBinding) and len(
                context.xpath("ancestor-or-self::pmml:PMML")) != 0:
            # get types from PMML
            dataColumns = OrderedDict()
            for fieldName, fieldDefinition in context.fieldContext().items():
                fieldType = FieldType(fieldDefinition)

                try:
                    dataField = inputData[fieldName]
                except KeyError:
                    dataField = None
                else:
                    # TypeError covers inputMask being None (unsubscriptable).
                    try:
                        maskField = inputMask[fieldName]
                    except (KeyError, TypeError):
                        maskField = None

                # maskField is only bound when the inputData lookup succeeded,
                # which is exactly when dataField is not None, so this guard
                # also protects the maskField reference below.
                if dataField is not None:
                    dataColumns[fieldName] = fieldType.toDataColumn(
                        dataField, maskField)

        else:
            # Normalize a single FieldType/string context into a per-field dict.
            if not isinstance(context, dict):
                context = dict((x, context) for x in inputData)

            if all(isinstance(x, FieldType) for x in context.values()):
                # FieldTypes provided explicitly
                dataColumns = OrderedDict()
                for fieldName in sorted(context.keys()):
                    data = inputData[fieldName]
                    if inputMask is None:
                        mask = None
                    else:
                        mask = inputMask[fieldName]

                    dataColumns[fieldName] = context[fieldName].toDataColumn(
                        data, mask)

            elif all(isinstance(x, basestring) for x in context.values()):
                # FieldTypes provided by dataType name
                dataColumns = OrderedDict()
                for fieldName in sorted(context.keys()):
                    data = inputData[fieldName]
                    if inputMask is None:
                        mask = None
                    else:
                        mask = inputMask[fieldName]

                    # "string" fields are categorical; every other named
                    # dataType is treated as continuous.
                    if context[fieldName] == "string":
                        fieldType = FakeFieldType(context[fieldName],
                                                  "categorical")
                    else:
                        fieldType = FakeFieldType(context[fieldName],
                                                  "continuous")
                    dataColumns[fieldName] = fieldType.toDataColumn(data, mask)

            elif all(
                    isinstance(inputData[x], NP.ndarray)
                    for x in inputData.keys()):
                # FieldTypes provided by NumPy types
                dataColumns = OrderedDict()
                for fieldName in sorted(context.keys()):
                    data = inputData[fieldName]
                    if inputMask is None:
                        mask = None
                    else:
                        mask = inputMask[fieldName]

                    # The regex matches fixed-width byte-string dtypes such
                    # as "|S10".
                    if data.dtype in (NP.object, NP.object0, NP.object_,
                                      NP.str, NP.str_, NP.string0,
                                      NP.string_) or re.match(
                                          "\|S[0-9]+", str(
                                              data.dtype)) is not None:
                        fieldType = FakeFieldType("string", "categorical")
                    elif data.dtype in (NP.int, NP.int0, NP.int8, NP.int16,
                                        NP.int32, NP.int64, NP.int_,
                                        NP.integer):
                        fieldType = FakeFieldType("integer", "continuous")
                    # float16 may be absent in older NumPy builds; the NP
                    # wrapper returns None for it in that case.
                    elif data.dtype in (NP.float,
                                        NP.__getattr__("float16",
                                                       noneIfMissing=True),
                                        NP.float32):
                        fieldType = FakeFieldType("float", "continuous")
                    elif data.dtype in (NP.float64, NP.float128, NP.float_,
                                        NP.double):
                        fieldType = FakeFieldType("double", "continuous")
                    elif data.dtype in (NP.bool, NP.bool8, NP.bool_):
                        fieldType = FakeFieldType("boolean", "continuous")
                    else:
                        raise TypeError("Unrecognized NumPy dtype: %r" %
                                        data.dtype)

                    dataColumns[fieldName] = fieldType.toDataColumn(data, mask)

            else:
                raise TypeError(
                    "Context must be PMML (anchored by a <PMML> ancestor), a dictionary of FieldType objects, dataType strings, or inputData must consist entirely of NumPy arrays"
                )

        self._configure(dataColumns, inputState)
Example #20
0
class NormContinuous(PmmlExpression):
    """NormContinuous implements an expression that performs piecewise
    linear, everywhere continuous, transformations on a continuous
    field.

    U{PMML specification<http://www.dmg.org/v4-1/Transformations.html>}.
    """

    # The output of NormContinuous is always a continuous double.
    _fieldType = FakeFieldType("double", "continuous")

    def transformSelection(self, linearNorm1, linearNorm2, indata, outdata,
                           selection):
        """Linearly transform a subset of the dataset as part of an
        overall piecewise linear transformation.

        @type linearNorm1: PmmlBinding
        @param linearNorm1: The left-side <LinearNorm> object.
        @type linearNorm2: PmmlBinding
        @param linearNorm2: The right-side <LinearNorm> object.
        @type indata: 1d Numpy array
        @param indata: Unselected input data.
        @type outdata: 1d Numpy array
        @param outdata: Output data, modified by this function.
        @type selection: 1d Numpy array of bool
        @param selection: The Numpy selector for this piecewise region.
        """

        # Linear interpolation between (a1, b1) and (a2, b2):
        # out = b1 + (in - a1)/(a2 - a1) * (b2 - b1)
        a1 = linearNorm1.orig
        b1 = linearNorm1.norm
        a2 = linearNorm2.orig
        b2 = linearNorm2.norm
        outdata[selection] = NP(
            b1 +
            NP(NP(NP(indata[selection] - a1) / NP(a2 - a1)) * NP(b2 - b1)))

    def evaluate(self, dataTable, functionTable, performanceTable):
        """Evaluate the expression, using a DataTable as input.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.
        """

        performanceTable.begin("NormContinuous")

        dataColumn = dataTable.fields[self["field"]]
        if dataColumn.fieldType.dataType in ("object", "string", "boolean"):
            raise defs.PmmlValidationError(
                "NormContinuous requires a numeric input field, but \"%s\" is"
                % dataColumn.fieldType.dataType)

        outliers = self.get("outliers")

        # Cache the numeric anchor points on the LinearNorm bindings.
        linearNorms = self.childrenOfTag("LinearNorm")
        for linearNorm in linearNorms:
            linearNorm.orig = float(linearNorm["orig"])
            linearNorm.norm = float(linearNorm["norm"])

        # Python 2 cmp-style sort by the "orig" anchor.
        linearNorms.sort(lambda x, y: cmp(x.orig, y.orig)
                         )  # technically, it's invalid if not already sorted

        data = NP("empty", len(dataTable), self._fieldType.dtype)
        mask = dataColumn.mask

        # extrapolate before the first
        # (uses the first two anchor points; "asMissingValues" leaves
        # data[selection] uninitialized, which is safe because those rows are
        # marked MISSING in the mask)
        selection = NP(dataColumn.data <= linearNorms[0].orig)
        if outliers == "asMissingValues":
            mask = FieldCastMethods.outliersAsMissing(mask, dataColumn.mask,
                                                      selection)
        elif outliers == "asExtremeValues":
            data[selection] = linearNorms[0].norm
        else:
            self.transformSelection(linearNorms[0], linearNorms[1],
                                    dataColumn.data, data, selection)

        # interpolate within each half-open interval (orig_i, orig_{i+1}]
        for i in xrange(len(linearNorms) - 1):
            selection = NP(linearNorms[i].orig < dataColumn.data)
            NP("logical_and", selection,
               NP(dataColumn.data <= linearNorms[i + 1].orig), selection)

            self.transformSelection(linearNorms[i], linearNorms[i + 1],
                                    dataColumn.data, data, selection)

        # extrapolate after the last anchor (uses the last two anchor points)
        selection = NP(linearNorms[-1].orig < dataColumn.data)
        if outliers == "asMissingValues":
            mask = FieldCastMethods.outliersAsMissing(mask, dataColumn.mask,
                                                      selection)
        elif outliers == "asExtremeValues":
            data[selection] = linearNorms[-1].norm
        else:
            self.transformSelection(linearNorms[-2], linearNorms[-1],
                                    dataColumn.data, data, selection)

        data, mask = FieldCastMethods.applyMapMissingTo(
            self._fieldType, data, mask, self.get("mapMissingTo"))

        performanceTable.end("NormContinuous")
        return DataColumn(self._fieldType, data, mask)
Example #21
0
class PlotBoxAndWhisker(PmmlPlotContent):
    """Represents a "box-and-whiskers" plot or a "profile histogram."

    PMML subelements:

      - PlotExpression role="sliced": expression to be sliced like a
        histogram.
      - PlotNumericExpression role="profiled": expression to be
        profiled in each slice.
      - PlotSelection: expression or predicate to filter the data
        before plotting.
      - Intervals: non-uniform (numerical) histogram bins.
      - Values: explicit (categorical) histogram values.

    PMML attributes:

      - svgId: id for the resulting SVG element.
      - stateId: key for persistent storage in a DataTableState.
      - numBins: number of histogram bins.
      - low: histogram low edge.
      - high: histogram high edge.
      - levels: "percentage" for quartile-like box-and-whiskers,
        "standardDeviation" for mean and standard deviation, as in
        a profile histogram.
      - lowWhisker: bottom of the lower whisker, usually the 0th
        percentile (absolute minimum).
      - lowBox: bottom of the box, usually the 25th percentile.
      - midLine: middle line of the box, usually the median.
      - highBox: top of the box, usually the 75th percentile.
      - highWhisker: top of the upper whisker, usually the 100th
        percentile (absolute maximum).
      - vertical: if "true", plot the "sliced" expression on the
        x axis and the "profiled" expression on the y axis.
      - gap: size of the space between boxes in SVG coordinates.
      - style: CSS style properties.

    CSS properties:

      - fill, fill-opacity: color of the box.
      - stroke, stroke-dasharray, stroke-dashoffset, stroke-linecap,
        stroke-linejoin, stroke-miterlimit, stroke-opacity,
        stroke-width: properties of the line drawing the box and
        the whiskers.

    See the source code for the full XSD.
    """

    # CSS properties recognized by this plot element; anything else in the
    # "style" attribute is rejected by the style-handling machinery.
    styleProperties = ["fill", "fill-opacity", 
                       "stroke", "stroke-dasharray", "stroke-dashoffset", "stroke-linecap", "stroke-linejoin", "stroke-miterlimit", "stroke-opacity", "stroke-width",
                       ]

    # Defaults substituted into the XSD's "style" attribute default below.
    styleDefaults = {"fill": "none", "stroke": "black"}

    # XSD validation schema for the PlotBoxAndWhisker element; the "style"
    # default is filled in from styleDefaults via the % interpolation below.
    xsd = """<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">
    <xs:element name="PlotBoxAndWhisker">
        <xs:complexType>
            <xs:sequence>
                <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded" />
                <xs:element ref="PlotExpression" minOccurs="1" maxOccurs="1" />
                <xs:element ref="PlotNumericExpression" minOccurs="1" maxOccurs="1" />
                <xs:element ref="PlotSelection" minOccurs="0" maxOccurs="1" />
                <xs:choice minOccurs="0" maxOccurs="1">
                    <xs:element ref="Interval" minOccurs="1" maxOccurs="unbounded" />
                    <xs:element ref="Value" minOccurs="1" maxOccurs="unbounded" />
                </xs:choice>
            </xs:sequence>
            <xs:attribute name="svgId" type="xs:string" use="optional" />
            <xs:attribute name="stateId" type="xs:string" use="optional" />
            <xs:attribute name="numBins" type="xs:positiveInteger" use="optional" />
            <xs:attribute name="low" type="xs:double" use="optional" />
            <xs:attribute name="high" type="xs:double" use="optional" />
            <xs:attribute name="levels" use="optional" default="percentage">
                <xs:simpleType>
                    <xs:restriction base="xs:string">
                        <xs:enumeration value="percentage" />
                        <xs:enumeration value="standardDeviation" />
                    </xs:restriction>
                </xs:simpleType>
            </xs:attribute>
            <xs:attribute name="lowWhisker" type="xs:double" use="optional" default="0" />
            <xs:attribute name="lowBox" type="xs:double" use="optional" default="25" />
            <xs:attribute name="midLine" type="xs:double" use="optional" default="50" />
            <xs:attribute name="highBox" type="xs:double" use="optional" default="75" />
            <xs:attribute name="highWhisker" type="xs:double" use="optional" default="100" />
            <xs:attribute name="vertical" type="xs:boolean" use="optional" default="true" />
            <xs:attribute name="gap" type="xs:double" use="optional" default="10" />
            <xs:attribute name="style" type="xs:string" use="optional" default="%s" />
        </xs:complexType>
    </xs:element>
</xs:schema>
""" % PlotStyle.toString(styleDefaults)

    # Shared FieldType used for numeric (nonuniform/scale) slicing axes.
    fieldTypeNumeric = FakeFieldType("double", "continuous")

    def prepare(self, state, dataTable, functionTable, performanceTable, plotRange):
        """Prepare a plot element for drawing.

        This stage consists of calculating all quantities and
        determing the bounds of the data.  These bounds may be unioned
        with bounds from other plot elements that overlay this plot
        element, so the drawing (which requires a finalized coordinate
        system) cannot begin yet.

        This method modifies C{plotRange}.

        @type state: ad-hoc Python object
        @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
        @type dataTable: DataTable
        @param dataTable: Contains the data to plot.
        @type functionTable: FunctionTable
        @param functionTable: Defines functions that may be used to transform data for plotting.
        @type performanceTable: PerformanceTable
        @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process.
        @type plotRange: PlotRange
        @param plotRange: The bounding box of plot coordinates that this function will expand.
        """

        self.checkRoles(["sliced", "profiled"])

        # Locate the required "sliced" and "profiled" expressions and the
        # optional selection (cut) expression.
        slicedExpression = self.xpath("pmml:PlotExpression[@role='sliced']")
        profiledExpression = self.xpath("pmml:PlotNumericExpression[@role='profiled']")
        cutExpression = self.xpath("pmml:PlotSelection")
        # NOTE(review): these messages say "PlotHistogram" but are raised by
        # PlotBoxAndWhisker — looks like a copy-paste; confirm before relying
        # on the message text.
        if len(slicedExpression) != 1:
            raise defs.PmmlValidationError("PlotHistogram requires a PlotExpression with role \"sliced\"")
        if len(profiledExpression) != 1:
            raise defs.PmmlValidationError("PlotHistogram requires a PlotNumericExpression with role \"profiled\"")

        slicedDataColumn = slicedExpression[0].evaluate(dataTable, functionTable, performanceTable)
        profiledDataColumn = profiledExpression[0].evaluate(dataTable, functionTable, performanceTable)

        # Build the row-selection mask: either from the PlotSelection or
        # all-true, then restrict to rows that are VALID in both columns.
        if len(cutExpression) == 1:
            selection = cutExpression[0].select(dataTable, functionTable, performanceTable)
        else:
            selection = NP("ones", len(dataTable), NP.dtype(bool))

        performanceTable.begin("PlotBoxAndWhisker prepare")
        self._saveContext(dataTable)

        if slicedDataColumn.mask is not None:
            NP("logical_and", selection, NP(slicedDataColumn.mask == defs.VALID), selection)
        if profiledDataColumn.mask is not None:
            NP("logical_and", selection, NP(profiledDataColumn.mask == defs.VALID), selection)

        slicedArray = slicedDataColumn.data[selection]
        profiledArray = profiledDataColumn.data[selection]
        
        # Persistent state lets repeated calls (e.g. streaming data) accumulate
        # distributions across DataTables under the same stateId.
        persistentState = {}
        stateId = self.get("stateId")
        if stateId is not None:
            if stateId in dataTable.state:
                persistentState = dataTable.state[stateId]
            else:
                dataTable.state[stateId] = persistentState

        intervals = self.xpath("pmml:Interval")
        values = self.xpath("pmml:Value")

        # First call only: decide how to bin the sliced axis and allocate
        # empty per-bin accumulators of the profiled column's dtype.
        if "binType" not in persistentState:
            performanceTable.begin("establish binType")

            binType = PlotHistogram.establishBinType(slicedDataColumn.fieldType, intervals, values)
            persistentState["binType"] = binType

            if binType == "nonuniform":
                persistentState["distributions"] = [NP("empty", 0, dtype=profiledDataColumn.fieldType.dtype) for x in xrange(len(intervals))]

            elif binType == "explicit":
                persistentState["distributions"] = [NP("empty", 0, dtype=profiledDataColumn.fieldType.dtype) for x in xrange(len(values))]

            elif binType == "unique":
                persistentState["distributions"] = {}

            elif binType == "scale":
                numBins = self.get("numBins", convertType=True)
                low = self.get("low", convertType=True)
                high = self.get("high", convertType=True)

                numBins, low, high = PlotHistogram.determineScaleBins(numBins, low, high, slicedArray)

                persistentState["low"] = low
                persistentState["high"] = high
                persistentState["numBins"] = numBins
                persistentState["distributions"] = [NP("empty", 0, dtype=profiledDataColumn.fieldType.dtype) for x in xrange(numBins)]

            performanceTable.end("establish binType")

        # NOTE(review): lowEdge/highEdge are only bound in the "nonuniform"
        # and "scale" branches below; that matches their use later (guarded by
        # slicedFieldType is fieldTypeNumeric), but verify no other binType can
        # reach the numeric-axis code path.
        if persistentState["binType"] == "nonuniform":
            performanceTable.begin("binType nonuniform")

            # One distribution per explicit Interval; selectInterval also
            # accumulates the interval edges into state.edges.
            distributions = [None] * len(intervals)
            state.edges = []
            lastLimitPoint = None
            lastClosed = None
            lastInterval = None

            for index, interval in enumerate(intervals):
                selection, lastLimitPoint, lastClosed, lastInterval = PlotHistogram.selectInterval(slicedDataColumn.fieldType, slicedArray, index, len(intervals) - 1, interval, state.edges, lastLimitPoint, lastClosed, lastInterval)

                if selection is None:
                    distributions[index] = profiledArray
                else:
                    distributions[index] = profiledArray[selection]

            # Append this batch to the accumulated per-bin distributions.
            persistentState["distributions"] = [NP("concatenate", [x, y]) for x, y in itertools.izip(persistentState["distributions"], distributions)]
            distributions = persistentState["distributions"]
            lowEdge = min(low for low, high in state.edges if low is not None)
            highEdge = max(high for low, high in state.edges if high is not None)
            state.slicedFieldType = self.fieldTypeNumeric

            performanceTable.end("binType nonuniform")

        elif persistentState["binType"] == "explicit":
            performanceTable.begin("binType explicit")

            # One distribution per explicit categorical Value; edges are the
            # display strings for the category axis.
            distributions = [None] * len(values)
            displayValues = []

            for index, value in enumerate(values):
                internalValue = slicedDataColumn.fieldType.stringToValue(value["value"])
                displayValues.append(value.get("displayValue", slicedDataColumn.fieldType.valueToString(internalValue, displayValue=True)))

                selection = NP(slicedArray == internalValue)
                distributions[index] = profiledArray[selection]
                
            persistentState["distributions"] = [NP("concatenate", [x, y]) for x, y in itertools.izip(persistentState["distributions"], distributions)]
            distributions = persistentState["distributions"]
            state.edges = displayValues
            state.slicedFieldType = slicedDataColumn.fieldType

            performanceTable.end("binType explicit")

        elif persistentState["binType"] == "unique":
            performanceTable.begin("binType unique")

            # Discover categories from the data itself; accumulate keyed by
            # the category's string form.
            uniques, inverse = NP("unique", slicedArray, return_inverse=True)

            persistentDistributions = persistentState["distributions"]
            for i, u in enumerate(uniques):
                string = slicedDataColumn.fieldType.valueToString(u, displayValue=False)
                selection = NP(inverse == i)

                if string in persistentDistributions:
                    persistentDistributions[string] = NP("concatenate", [persistentDistributions[string], profiledArray[selection]])
                else:
                    persistentDistributions[string] = profiledArray[selection]

            # Keep the most populous categories first; optionally truncate to
            # numBins of them.
            tosort = [(len(distribution), string) for string, distribution in persistentDistributions.items()]
            tosort.sort(reverse=True)

            numBins = self.get("numBins", convertType=True)
            if numBins is not None:
                tosort = tosort[:numBins]

            distributions = [persistentDistributions[string] for count, string in tosort]
            state.edges = [slicedDataColumn.fieldType.valueToString(slicedDataColumn.fieldType.stringToValue(string), displayValue=True) for count, string in tosort]
            state.slicedFieldType = slicedDataColumn.fieldType
            
            performanceTable.end("binType unique")

        elif persistentState["binType"] == "scale":
            performanceTable.begin("binType scale")

            # Uniform binning between the low/high fixed on the first call.
            numBins = persistentState["numBins"]
            low = persistentState["low"]
            high = persistentState["high"]
            binWidth = (high - low) / float(numBins)

            binAssignments = NP("array", NP("floor", NP(NP(slicedArray - low)/binWidth)), dtype=NP.dtype(int))
            distributions = [None] * numBins

            for index in xrange(numBins):
                selection = NP(binAssignments == index)
                distributions[index] = profiledArray[selection]
                
            persistentState["distributions"] = [NP("concatenate", [x, y]) for x, y in itertools.izip(persistentState["distributions"], distributions)]
            distributions = persistentState["distributions"]
            state.edges = [(low + i*binWidth, low + (i + 1)*binWidth) for i in xrange(numBins)]
            lowEdge = low
            highEdge = high
            state.slicedFieldType = self.fieldTypeNumeric
        
            performanceTable.end("binType scale")

        # Convert each bin's distribution into the five box-and-whisker levels
        # [lowWhisker, lowBox, midLine, highBox, highWhisker].
        levels = self.get("levels", defaultFromXsd=True)
        lowWhisker = self.get("lowWhisker", defaultFromXsd=True, convertType=True)
        lowBox = self.get("lowBox", defaultFromXsd=True, convertType=True)
        midLine = self.get("midLine", defaultFromXsd=True, convertType=True)
        highBox = self.get("highBox", defaultFromXsd=True, convertType=True)
        highWhisker = self.get("highWhisker", defaultFromXsd=True, convertType=True)

        state.ranges = []
        minProfiled = None
        maxProfiled = None
        for distribution in distributions:
            if levels == "percentage":
                # Percentile-based levels; empty bins get None (not drawn).
                if len(distribution) > 0:
                    state.ranges.append(NP("percentile", distribution, [lowWhisker, lowBox, midLine, highBox, highWhisker]))
                else:
                    state.ranges.append(None)

            elif levels == "standardDeviation":
                mu = NP("mean", distribution)
                sigma = NP("std", distribution, ddof=1)

                # NOTE(review): (level - mu)/sigma standardizes the configured
                # level values against the distribution; for a profile
                # histogram one might expect mu + level*sigma instead — TODO
                # confirm the intended semantics.
                if NP("isfinite", sigma) and sigma > 0.0:
                    state.ranges.append([(lowWhisker - mu)/sigma, (lowBox - mu)/sigma, (midLine - mu)/sigma, (highBox - mu)/sigma, (highWhisker - mu)/sigma])
                else:
                    state.ranges.append(None)

            # Track the overall profiled-axis extent over all drawable bins.
            if state.ranges[-1] is not None:
                if minProfiled is None:
                    minProfiled = min(state.ranges[-1])
                    maxProfiled = max(state.ranges[-1])
                else:
                    minProfiled = min(minProfiled, min(state.ranges[-1]))
                    maxProfiled = max(maxProfiled, max(state.ranges[-1]))

        state.profiledFieldType = profiledDataColumn.fieldType

        # Push the computed bounds into the shared plotRange.  "vertical"
        # puts the sliced axis on x and the profiled axis on y; otherwise
        # the axes are swapped.
        if self.get("vertical", defaultFromXsd=True, convertType=True):
            if state.slicedFieldType is self.fieldTypeNumeric:
                plotRange.xminPush(lowEdge, state.slicedFieldType, sticky=False)
                plotRange.xmaxPush(highEdge, state.slicedFieldType, sticky=False)
                if minProfiled is not None:
                    plotRange.yminPush(minProfiled, state.profiledFieldType, sticky=False)
                    plotRange.ymaxPush(maxProfiled, state.profiledFieldType, sticky=False)

            else:
                # Categorical sliced axis: expand with the category strings;
                # values array carries min/max so both extremes register.
                strings = NP("array", state.edges, dtype=NP.dtype(object))
                if minProfiled is not None:
                    values = NP("ones", len(state.edges), dtype=state.profiledFieldType.dtype) * maxProfiled
                    values[0] = minProfiled
                else:
                    values = NP("zeros", len(state.edges), dtype=state.profiledFieldType.dtype)

                plotRange.expand(strings, values, state.slicedFieldType, state.profiledFieldType)

        else:
            if state.slicedFieldType is self.fieldTypeNumeric:
                plotRange.yminPush(lowEdge, state.slicedFieldType, sticky=False)
                plotRange.ymaxPush(highEdge, state.slicedFieldType, sticky=False)
                if minProfiled is not None:
                    plotRange.xminPush(minProfiled, state.profiledFieldType, sticky=False)
                    plotRange.xmaxPush(maxProfiled, state.profiledFieldType, sticky=False)

            else:
                strings = NP("array", state.edges, dtype=NP.dtype(object))
                if minProfiled is not None:
                    values = NP("ones", len(state.edges), dtype=state.profiledFieldType.dtype) * maxProfiled
                    values[0] = minProfiled
                else:
                    values = NP("zeros", len(state.edges), dtype=state.profiledFieldType.dtype)
                
                plotRange.expand(values, strings, state.profiledFieldType, state.slicedFieldType)

        performanceTable.end("PlotBoxAndWhisker prepare")

    def draw(self, state, plotCoordinates, plotDefinitions, performanceTable):
        """Draw the plot element.

        This stage consists of creating an SVG image of the
        pre-computed data.

        @type state: ad-hoc Python object
        @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
        @type plotCoordinates: PlotCoordinates
        @param plotCoordinates: The coordinate system in which this plot element will be placed.
        @type plotDefinitions: PlotDefinitions
        @type plotDefinitions: The dictionary of key-value pairs that forms the <defs> section of the SVG document.
        @type performanceTable: PerformanceTable
        @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process.
        @rtype: SvgBinding
        @return: An SVG fragment representing the fully drawn plot element.
        """

        svg = SvgBinding.elementMaker
        performanceTable.begin("PlotBoxAndWhisker draw")

        vertical = self.get("vertical", defaultFromXsd=True, convertType=True)
        gap = self.get("gap", defaultFromXsd=True, convertType=True)

        # Categorical axis: re-order state.ranges to match the axis's final
        # string order and synthesize unit-width numeric edges around each
        # category position.
        if state.slicedFieldType is not self.fieldTypeNumeric:
            if vertical:
                strings = plotCoordinates.xstrings
            else:
                strings = plotCoordinates.ystrings

            newRanges = []
            for string in strings:
                try:
                    index = state.edges.index(string)
                except ValueError:
                    newRanges.append(None)
                else:
                    newRanges.append(state.ranges[index])

            state.ranges = newRanges
            state.edges = [(i - 0.5, i + 0.5) for i in xrange(len(strings))]

        # Open-ended intervals (None edges) become +/- infinity for drawing.
        lowEdge = NP("array", [low if low is not None else float("-inf") for low, high in state.edges], dtype=NP.dtype(float))
        highEdge = NP("array", [high if high is not None else float("inf") for low, high in state.edges], dtype=NP.dtype(float))

        # Drop bins with no drawable levels (None entries from prepare).
        selection = NP("array", [levels is not None for levels in state.ranges], dtype=NP.dtype(bool))
        lowEdge = lowEdge[selection]
        highEdge = highEdge[selection]

        lowWhisker  = NP("array", [levels[0] for levels in state.ranges if levels is not None], dtype=state.profiledFieldType.dtype)
        lowBox      = NP("array", [levels[1] for levels in state.ranges if levels is not None], dtype=state.profiledFieldType.dtype)
        midLine     = NP("array", [levels[2] for levels in state.ranges if levels is not None], dtype=state.profiledFieldType.dtype)
        highBox     = NP("array", [levels[3] for levels in state.ranges if levels is not None], dtype=state.profiledFieldType.dtype)
        highWhisker = NP("array", [levels[4] for levels in state.ranges if levels is not None], dtype=state.profiledFieldType.dtype)
        
        output = svg.g()
        if len(lowEdge) > 0:
            # Named anchor points per box (vertical case; the horizontal case
            # swaps the roles of x and y):
            #   A/B/C: low-edge side of the box at lowBox/midLine/highBox,
            #   D/E/F: high-edge side at the same three levels,
            #   G/J:   whisker ends at the bin's center,
            #   H/I:   box bottom/top at the bin's center (whisker joints).
            if vertical:
                Ax = lowEdge
                Bx = lowEdge
                Cx = lowEdge
                Dx = highEdge
                Ex = highEdge
                Fx = highEdge
                Gx = NP(NP(lowEdge + highEdge) / 2.0)
                Hx = Gx
                Ix = Gx
                Jx = Gx

                Ay = lowBox
                By = midLine
                Cy = highBox
                Dy = lowBox
                Ey = midLine
                Fy = highBox
                Gy = lowWhisker
                Hy = lowBox
                Iy = highBox
                Jy = highWhisker

            else:
                Ax = lowBox
                Bx = midLine
                Cx = highBox
                Dx = lowBox
                Ex = midLine
                Fx = highBox
                Gx = lowWhisker
                Hx = lowBox
                Ix = highBox
                Jx = highWhisker

                Ay = lowEdge
                By = lowEdge
                Cy = lowEdge
                Dy = highEdge
                Ey = highEdge
                Fy = highEdge
                Gy = NP(NP(lowEdge + highEdge) / 2.0)
                Hy = Gy
                Iy = Gy
                Jy = Gy

            # Map all anchor points from plot coordinates to SVG coordinates.
            AX, AY = plotCoordinates(Ax, Ay)
            BX, BY = plotCoordinates(Bx, By)
            CX, CY = plotCoordinates(Cx, Cy)
            DX, DY = plotCoordinates(Dx, Dy)
            EX, EY = plotCoordinates(Ex, Ey)
            FX, FY = plotCoordinates(Fx, Fy)
            GX, GY = plotCoordinates(Gx, Gy)
            HX, HY = plotCoordinates(Hx, Hy)
            IX, IY = plotCoordinates(Ix, Iy)
            JX, JY = plotCoordinates(Jx, Jy)

            # Shrink each box by half the gap on both sides, but only if every
            # box would remain wider than zero after shrinking.
            if vertical:
                if gap > 0.0 and NP(NP(DX - gap/2.0) - NP(AX + gap/2.0)).min() > 0.0:
                    AX += gap/2.0
                    BX += gap/2.0
                    CX += gap/2.0
                    DX -= gap/2.0
                    EX -= gap/2.0
                    FX -= gap/2.0
            else:
                if gap > 0.0 and NP(NP(DY - gap/2.0) - NP(AY + gap/2.0)).min() > 0.0:
                    AY += gap/2.0
                    BY += gap/2.0
                    CY += gap/2.0
                    DY -= gap/2.0
                    EY -= gap/2.0
                    FY -= gap/2.0

            # Full style for the filled box outline; stroke-only style for the
            # interior midline and the whisker lines/caps.
            style = self.getStyleState()
            strokeStyle = dict((x, style[x]) for x in style if x.startswith("stroke"))
            strokeStyle["fill"] = "none"
            style = PlotStyle.toString(style)
            strokeStyle = PlotStyle.toString(strokeStyle)

            for i in xrange(len(lowEdge)):
                # Box outline: H -> A -> B -> C -> I -> F -> E -> D -> H,
                # passing through the center joints H and I so the whiskers
                # connect cleanly.
                pathdata = ["M %r %r" % (HX[i], HY[i]),
                            "L %r %r" % (AX[i], AY[i]),
                            "L %r %r" % (BX[i], BY[i]),
                            "L %r %r" % (CX[i], CY[i]),
                            "L %r %r" % (IX[i], IY[i]),
                            "L %r %r" % (FX[i], FY[i]),
                            "L %r %r" % (EX[i], EY[i]),
                            "L %r %r" % (DX[i], DY[i]),
                            "L %r %r" % (HX[i], HY[i]),
                            "Z"]
                output.append(svg.path(d=" ".join(pathdata), style=style))
                # Mid line across the box, then the two whisker stems.
                output.append(svg.path(d="M %r %r L %r %r" % (BX[i], BY[i], EX[i], EY[i]), style=strokeStyle))
                output.append(svg.path(d="M %r %r L %r %r" % (HX[i], HY[i], GX[i], GY[i]), style=strokeStyle))
                output.append(svg.path(d="M %r %r L %r %r" % (IX[i], IY[i], JX[i], JY[i]), style=strokeStyle))

                # Whisker caps: perpendicular ticks at quarter box width.
                if vertical:
                    width = (DX[i] - AX[i]) / 4.0
                    output.append(svg.path(d="M %r %r L %r %r" % (GX[i] - width, GY[i], GX[i] + width, GY[i]), style=strokeStyle))
                    output.append(svg.path(d="M %r %r L %r %r" % (JX[i] - width, JY[i], JX[i] + width, JY[i]), style=strokeStyle))
                else:
                    width = (DY[i] - AY[i]) / 4.0
                    output.append(svg.path(d="M %r %r L %r %r" % (GX[i], GY[i] - width, GX[i], GY[i] + width), style=strokeStyle))
                    output.append(svg.path(d="M %r %r L %r %r" % (JX[i], JY[i] - width, JX[i], JY[i] + width), style=strokeStyle))

        performanceTable.end("PlotBoxAndWhisker draw")

        svgId = self.get("svgId")
        if svgId is not None:
            output["id"] = svgId

        return output
Example #22
0
    def __init__(self,
                 fileNames,
                 namesToFieldTypes=None,
                 namesToAvroPaths=None,
                 inputState=None,
                 chunkSize=1000000):
        if InputStream is None:
            raise RuntimeError(
                "The optional augustus.avrostream module is required for \"AvroDataTableStream\" but it hasn't been installed or the Avro C++ library is not accessible;%sRecommendations: re-build Augustus with \"python setup.py install --with-avrostream\" or correct your LD_LIBRARY_PATH"
                % os.linesep)

        if isinstance(fileNames, basestring):
            self.fileNames = glob.glob(fileNames)
            if len(self.fileNames) == 0:
                raise IOError("No files matched the fileName pattern \"%s\"" %
                              fileNames)
        else:
            self.fileNames = fileNames

        self.schema = None
        for fileName in self.fileNames:
            inputStream = InputStream()
            inputStream.start(fileName, 0, {}, {})
            try:
                schema = json.loads(inputStream.schema())
                if self.schema is not None and schema != self.schema:
                    raise ValueError(
                        "these files do not all have the same schema")
                self.schema = schema
            except Exception:
                raise
            finally:
                inputStream.close()

        if self.schema["type"] != "record":
            raise TypeError(
                "Top level of schema must describe a record, not %r" %
                self.schema)

        if namesToFieldTypes is None:
            if namesToAvroPaths is None:
                namesToFieldTypes = dict(
                    (x["name"], None) for x in self.schema["fields"])

                # If no parameters are given and this is a map-reduce result, drill down and get the values.
                if set(namesToFieldTypes.keys()) == set(["key", "value"]) and [
                        x["type"]
                        for x in self.schema["fields"] if x["name"] == "key"
                ][0] == "string" and [
                        x["type"]
                        for x in self.schema["fields"] if x["name"] == "value"
                ][0]["type"] == "record":
                    del namesToFieldTypes["value"]
                    namesToAvroPaths = {"key": ("key", )}
                    for x in [
                            x["type"] for x in self.schema["fields"]
                            if x["name"] == "value"
                    ][0]["fields"]:
                        name = x["name"]
                        if name != "key":
                            namesToFieldTypes[name] = None
                            namesToAvroPaths[name] = ("value", name)

            else:
                namesToFieldTypes = dict((x, None) for x in namesToAvroPaths)

        if isinstance(namesToFieldTypes, (list, tuple)):
            namesToFieldTypes = dict((x, None) for x in namesToFieldTypes)

        if namesToAvroPaths is None:
            self.namesToAvroPaths = {}
            for name in namesToFieldTypes:
                self.namesToAvroPaths[name] = (name, )
        else:
            self.namesToAvroPaths = dict(namesToAvroPaths)
            for name, path in self.namesToAvroPaths.items():
                if isinstance(path, basestring):
                    self.namesToAvroPaths[name] = (path, )

        self.namesToFieldTypes = dict(namesToFieldTypes)
        for name, fieldType in namesToFieldTypes.items():
            schemaObject = self.schema
            path = self.namesToAvroPaths[name]

            for pathname in path:
                if schemaObject["type"] == "record":
                    pass
                elif isinstance(
                        schemaObject["type"],
                        dict) and schemaObject["type"].get("type") == "record":
                    schemaObject = schemaObject["type"]
                else:
                    raise LookupError("path %r not found in the schema" %
                                      (path, ))

                fieldNames = [x["name"] for x in schemaObject["fields"]]
                if pathname not in fieldNames:
                    raise LookupError("path %r not found in the schema" %
                                      (path, ))

                schemaObject, = (x for x in schemaObject["fields"]
                                 if x["name"] == pathname)

            avroType = schemaObject["type"]
            if isinstance(avroType, dict):
                avroType = avroType["type"]

            if avroType == "enum":
                values = [
                    FakeFieldValue(x) for x in schemaObject["type"]["symbols"]
                ]
            else:
                values = []

            if fieldType == "string":
                self.namesToFieldTypes[name] = FakeFieldType(
                    "string", "continuous")
            elif fieldType == "categorical":
                self.namesToFieldTypes[name] = FakeFieldType("string",
                                                             "categorical",
                                                             values=values)
                self._setupMaps(self.namesToFieldTypes[name])
            elif fieldType == "ordinal":
                self.namesToFieldTypes[name] = FakeFieldType("string",
                                                             "ordinal",
                                                             values=values)
                self._setupMaps(self.namesToFieldTypes[name])
            elif isinstance(fieldType, basestring):
                self.namesToFieldTypes[name] = FakeFieldType(
                    fieldType, "continuous")
            elif fieldType is None:
                if avroType in ("null", "record", "array", "map", "fixed"):
                    del self.namesToFieldTypes[name]
                    del self.namesToAvroPaths[name]
                elif avroType in ("boolean", "int", "long"):
                    self.namesToFieldTypes[name] = FakeFieldType(
                        "integer", "continuous")
                elif avroType in ("float", "double"):
                    self.namesToFieldTypes[name] = FakeFieldType(
                        "double", "continuous")
                elif avroType in ("bytes", "string"):
                    self.namesToFieldTypes[name] = FakeFieldType(
                        "string", "continuous")
                elif avroType == "enum":
                    self.namesToFieldTypes[name] = FakeFieldType("string",
                                                                 "categorical",
                                                                 values=values)
                    self._setupMaps(self.namesToFieldTypes[name])
                else:
                    raise TypeError("Unrecognized Avro type: %s" % avroType)

            if name in self.namesToFieldTypes:
                fieldType = self.namesToFieldTypes[name]
                if not isinstance(fieldType, FieldType):
                    raise TypeError("namesToFieldTypes must map to FieldTypes")

                # TODO: make this more sensible

                if fieldType.dataType in ("date", "time", "dateTime",
                                          "dateDaysSince[0]",
                                          "dateDaysSince[1960]",
                                          "dateDaysSince[1970]",
                                          "dateDaysSince[1980]", "timeSeconds",
                                          "dateTimeSecondsSince[0]",
                                          "dateTimeSecondsSince[1960]",
                                          "dateTimeSecondsSince[1970]",
                                          "dateTimeSecondsSince[1980]"):
                    raise NotImplementedError

                if fieldType.dataType == "object":
                    raise TypeError(
                        "PMML type %r and Avro type \"%s\" are incompatible" %
                        (fieldType, avroType))

                elif fieldType.dataType == "string":
                    if fieldType.optype == "continuous":
                        if avroType not in ("boolean", "int", "long", "float",
                                            "double", "string", "bytes"):
                            raise TypeError(
                                "PMML type %r and Avro type \"%s\" are incompatible"
                                % (fieldType, avroType))
                    elif fieldType.optype == "categorical":
                        if avroType != "enum":
                            raise TypeError(
                                "PMML type %r and Avro type \"%s\" are incompatible"
                                % (fieldType, avroType))
                    elif fieldType.optype == "ordinal":
                        if avroType != "enum":
                            raise TypeError(
                                "PMML type %r and Avro type \"%s\" are incompatible"
                                % (fieldType, avroType))

                elif fieldType.dataType in ("boolean", "integer",
                                            "dateDaysSince[0]",
                                            "dateDaysSince[1960]",
                                            "dateDaysSince[1970]",
                                            "dateDaysSince[1980]",
                                            "timeSeconds",
                                            "dateTimeSecondsSince[0]",
                                            "dateTimeSecondsSince[1960]",
                                            "dateTimeSecondsSince[1970]",
                                            "dateTimeSecondsSince[1980]"):
                    if avroType not in ("boolean", "int", "long"):
                        raise TypeError(
                            "PMML type %r and Avro type \"%s\" are incompatible"
                            % (fieldType, avroType))

                elif fieldType.dataType in ("float", "double"):
                    if avroType not in ("boolean", "int", "long", "float",
                                        "double"):
                        raise TypeError(
                            "PMML type %r and Avro type \"%s\" are incompatible"
                            % (fieldType, avroType))

                elif fieldType.dataType == "boolean":
                    raise TypeError(
                        "PMML type %r and Avro type \"%s\" are incompatible" %
                        (fieldType, avroType))

                elif fieldType.dataType in ("date", "time", "dateTime"):
                    if avroType != "string":
                        raise TypeError(
                            "PMML type %r and Avro type \"%s\" are incompatible"
                            % (fieldType, avroType))

        self.inputState = inputState
        self.chunkSize = chunkSize
Example #23
0
    # One year in the same units as _DAY (defined earlier in this class, not
    # visible here).  NOTE(review): 365 * _DAY ignores leap years — presumably
    # acceptable for tick-mark spacing; confirm against the tick renderer.
    _YEAR = 365 * _DAY
    # Month number (1-12) to three-letter English abbreviation; presumably
    # used for month-granularity tick labels — usage not visible in this view.
    _monthName = {
        1: "Jan",
        2: "Feb",
        3: "Mar",
        4: "Apr",
        5: "May",
        6: "Jun",
        7: "Jul",
        8: "Aug",
        9: "Sep",
        10: "Oct",
        11: "Nov",
        12: "Dec"
    }
    # Field type used by _explicitTimeTicks to convert the low/high plot
    # bounds into Python datetime objects via valueToPython.
    _fieldType = FakeFieldType("dateTime", "continuous")

    @staticmethod
    def _explicitTimeTicks(low, high, initialize, skip, bigTick,
                           contextGranularity, firstIsContext, anyContext,
                           renderContext, renderOther):
        lowDateTime = PlotTickMarks._fieldType.valueToPython(low)
        highDateTime = PlotTickMarks._fieldType.valueToPython(high)

        ticks = {}
        miniticks = []

        runner = PlotTickMarks._fieldType.valueToPython(low).replace(
            **initialize)
        while runner <= highDateTime:
            td = runner - FakeFieldType._dateTimeOrigin
Example #24
0
class MiningModel(PmmlModel):
    """MiningModel implements segmentation, the application of a large
    pool of models to a dataset, with models selected for individual
    data records by the data's features.

    U{PMML specification<http://www.dmg.org/v4-1/MultipleModels.html>}.
    """

    # Combined scores may come from sub-models of any type, so the aggregate
    # score columns are typed as generic Python objects.
    scoreType = FakeFieldType("object", "any")
    # Type of the "segment" output column (tuples/ids of matching segments).
    scoreTypeSegment = FakeFieldType("object", "any")
    # Type of the "cardinality" output column (vote counts for majorityVote).
    scoreTypeCardinality = FakeFieldType("integer", "continuous")

    # Unique sentinel objects identifying the multipleModelMethod; the helper
    # methods below compare them with "is".
    SELECT_ALL = object()
    MEDIAN = object()

    SUM = object()
    AVERAGE = object()
    WEIGHTED_AVERAGE = object()

    MAJORITY_VOTE = object()
    WEIGHTED_MAJORITY_VOTE = object()

    def calculateScore(self, dataTable, functionTable, performanceTable):
        """Calculate the score of this model.

        This method is called by C{calculate} to separate operations
        that are performed by all models (in C{calculate}) from
        operations that are performed by specific models (in
        C{calculateScore}).

        @type dataTable: DataTable
        @param dataTable: The DataTable representing this model's lexical scope.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: dict
        @return: A dictionary mapping score names (None for the main score) to DataColumns; if there is no Segmentation element, the DataTable is returned unchanged.
        """

        segmentation = self.childOfTag("Segmentation")
        if segmentation is None:
            # No Segmentation element: nothing to combine.
            return dataTable

        multipleModelMethod = segmentation.get("multipleModelMethod")

        # Map each multipleModelMethod to its handler and the sentinel (if
        # any) that the handler uses to distinguish its sub-cases.
        dispatch = {
            "selectAll": (self._selectAllMedianMajority, (self.SELECT_ALL,)),
            "median": (self._selectAllMedianMajority, (self.MEDIAN,)),
            "majorityVote": (self._selectAllMedianMajority,
                             (self.MAJORITY_VOTE,)),
            "weightedMajorityVote": (self._selectAllMedianMajority,
                                     (self.WEIGHTED_MAJORITY_VOTE,)),
            "selectFirst": (self._selectFirst, ()),
            "sum": (self._sumAverageWeighted, (self.SUM,)),
            "average": (self._sumAverageWeighted, (self.AVERAGE,)),
            "weightedAverage": (self._sumAverageWeighted,
                                (self.WEIGHTED_AVERAGE,)),
            "max": (self._selectMax, ()),
        }

        try:
            handler, extraArgs = dispatch[multipleModelMethod]
        except KeyError:
            # Covers both unrecognized methods and a missing attribute (None).
            raise NotImplementedError(
                "multipleModelMethod \"%s\" has not been implemented" %
                multipleModelMethod)

        return handler(dataTable, functionTable, performanceTable,
                       segmentation, *extraArgs)

    def _selectAllMedianMajority(self, dataTable, functionTable,
                                 performanceTable, segmentation, which):
        """Used by C{calculateScore}.

        Implements multipleModelMethod "selectAll", "median",
        "majorityVote", and "weightedMajorityVote": every segment whose
        predicate matches a record contributes to that record's score.

        @type dataTable: DataTable
        @param dataTable: The DataTable representing this model's lexical scope.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @type segmentation: PmmlBinding
        @param segmentation: The Segmentation element being evaluated.
        @type which: sentinel object
        @param which: One of C{self.SELECT_ALL}, C{self.MEDIAN}, C{self.MAJORITY_VOTE}, C{self.WEIGHTED_MAJORITY_VOTE}.
        @rtype: dict
        @return: Maps score names (None for the main score; "segment" or "cardinality" where applicable) to DataColumns.
        @raise PmmlValidationError: If "median" is applied to sub-models producing non-numeric scores.
        """

        if which is self.SELECT_ALL:
            performanceLabel = "Segmentation selectAll"
        elif which is self.MEDIAN:
            performanceLabel = "Segmentation median"
        elif which is self.MAJORITY_VOTE:
            performanceLabel = "Segmentation majorityVote"
        elif which is self.WEIGHTED_MAJORITY_VOTE:
            performanceLabel = "Segmentation weightedMajorityVote"
        performanceTable.begin(performanceLabel)

        # One accumulator list per data record: collected scores (selectAll,
        # median) or [value, weight] vote pairs (majorityVote variants).
        scores = [[] for x in xrange(len(dataTable))]
        if which is self.SELECT_ALL:
            segments = [[] for x in xrange(len(dataTable))]

        newOutputData = {}
        for segment in segmentation.childrenOfTag("Segment", iterator=True):
            # Pause this timer while the predicate accounts for its own time.
            performanceTable.pause(performanceLabel)
            selection = segment.childOfClass(PmmlPredicate).evaluate(
                dataTable, functionTable, performanceTable)
            performanceTable.unpause(performanceLabel)
            if not selection.any():
                continue

            segmentName = segment.get("id")
            # Row indexes (into dataTable) of the records this segment matched.
            indexes = NP("nonzero", selection)[0]

            subTable = dataTable.subTable(selection)
            subModel = segment.childOfClass(PmmlModel)

            performanceTable.pause(performanceLabel)
            subModel.calculate(subTable, functionTable, performanceTable)
            performanceTable.unpause(performanceLabel)

            if which is self.MEDIAN and subTable.score.fieldType.dataType in (
                    "string", "boolean", "object"):
                raise defs.PmmlValidationError(
                    "Segmentation with multipleModelMethod=\"median\" cannot be applied to models that produce dataType \"%s\""
                    % subTable.score.fieldType.dataType)

            scoreData = subTable.score.data
            scoreMask = subTable.score.mask
            if which is self.SELECT_ALL:
                # Collect every VALID sub-score and the id of its segment.
                for subIndex, index in enumerate(indexes):
                    if scoreMask is None or scoreMask[subIndex] == defs.VALID:
                        scores[index].append(scoreData[subIndex])
                        segments[index].append(segmentName)

            elif which is self.MEDIAN:
                for subIndex, index in enumerate(indexes):
                    if scoreMask is None or scoreMask[subIndex] == defs.VALID:
                        scores[index].append(scoreData[subIndex])

            elif which in (self.MAJORITY_VOTE, self.WEIGHTED_MAJORITY_VOTE):
                if which is self.MAJORITY_VOTE:
                    weight = 1.0
                else:
                    weight = float(segment.get("weight", 1.0))
                # Tally weighted votes as mutable [value, weight] pairs.
                for subIndex, index in enumerate(indexes):
                    if scoreMask is None or scoreMask[subIndex] == defs.VALID:
                        newValue = scoreData[subIndex]
                        score = scores[index]
                        found = False
                        for pair in score:
                            if pair[0] == newValue:
                                pair[1] += weight
                                found = True
                                break
                        if not found:
                            score.append([newValue, weight])

            if which is self.SELECT_ALL:
                # selectAll also collects each sub-model's OutputField values.
                for fieldName, dataColumn in subTable.output.items():
                    newData = newOutputData.get(fieldName)
                    if newData is None:
                        newData = [[] for x in xrange(len(dataTable))]
                        newOutputData[fieldName] = newData

                    dataColumnData = dataColumn.data
                    dataColumnMask = dataColumn.mask
                    for subIndex, index in enumerate(indexes):
                        if scoreMask is None or scoreMask[
                                subIndex] == defs.VALID:
                            if dataColumnMask is None or dataColumnMask[
                                    subIndex] == defs.VALID:
                                newData[index].append(dataColumnData[subIndex])
                            else:
                                newData[index].append(None)

        if which is self.SELECT_ALL:
            # Freeze each record's collected values as tuples in object arrays.
            for fieldName, newData in newOutputData.items():
                finalNewData = NP("empty",
                                  len(dataTable),
                                  dtype=NP.dtype(object))
                for index, newDatum in enumerate(newData):
                    finalNewData[index] = tuple(newDatum)
                dataTable.output[fieldName] = DataColumn(
                    self.scoreType, finalNewData, None)

            finalScoresData = NP("empty",
                                 len(dataTable),
                                 dtype=NP.dtype(object))
            for index, score in enumerate(scores):
                finalScoresData[index] = tuple(score)
            finalScores = DataColumn(self.scoreType, finalScoresData, None)

            if self.name is None:
                performanceTable.end(performanceLabel)
                return {None: finalScores}
            else:
                finalSegmentsData = NP("empty",
                                       len(dataTable),
                                       dtype=NP.dtype(object))
                for index, segment in enumerate(segments):
                    finalSegmentsData[index] = tuple(segment)

                performanceTable.end(performanceLabel)
                return {
                    None:
                    finalScores,
                    "segment":
                    DataColumn(self.scoreTypeSegment, finalSegmentsData, None)
                }

        elif which is self.MEDIAN:
            finalScoresData = NP("empty",
                                 len(dataTable),
                                 dtype=NP.dtype(object))
            finalScoresMask = NP("empty", len(dataTable), dtype=defs.maskType)
            for index, score in enumerate(scores):
                if len(score) > 0:
                    finalScoresData[index] = NP("median", score)
                    finalScoresMask[index] = defs.VALID
                else:
                    # No segment produced a VALID score for this record.
                    finalScoresMask[index] = defs.INVALID

            if not finalScoresMask.any():
                finalScoresMask = None
            finalScores = DataColumn(self.scoreType, finalScoresData,
                                     finalScoresMask)

            performanceTable.end(performanceLabel)
            return {None: finalScores}

        elif which in (self.MAJORITY_VOTE, self.WEIGHTED_MAJORITY_VOTE):
            finalScoresData = NP("empty",
                                 len(dataTable),
                                 dtype=NP.dtype(object))
            finalScoresMask = NP("empty", len(dataTable), dtype=defs.maskType)
            cardinality = NP("empty",
                             len(dataTable),
                             dtype=self.scoreTypeCardinality.dtype)

            # Pick the value with the largest accumulated weight; ties are
            # broken by first occurrence (strict > keeps the earlier winner).
            for index, score in enumerate(scores):
                bestN, bestValue = None, None
                for value, N in score:
                    if bestN is None or N > bestN:
                        bestN = N
                        bestValue = value
                if bestN is not None:
                    finalScoresData[index] = bestValue
                    finalScoresMask[index] = defs.VALID
                    cardinality[index] = bestN
                else:
                    finalScoresMask[index] = defs.INVALID
                    cardinality[index] = 0

            if not finalScoresMask.any():
                finalScoresMask = None
            finalScores = DataColumn(self.scoreType, finalScoresData,
                                     finalScoresMask)

            if self.name is None:
                performanceTable.end(performanceLabel)
                return {None: finalScores}
            else:
                finalCardinality = DataColumn(self.scoreTypeCardinality,
                                              cardinality, None)

                performanceTable.end(performanceLabel)
                return {None: finalScores, "cardinality": finalCardinality}

    def _selectFirst(self, dataTable, functionTable, performanceTable,
                     segmentation):
        """Used by C{calculateScore}.

        Implements multipleModelMethod="selectFirst": each record is
        scored by the first segment (in document order) whose predicate
        matches it; later matching segments are ignored for that record.

        @type dataTable: DataTable
        @param dataTable: The DataTable representing this model's lexical scope.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @type segmentation: PmmlBinding
        @param segmentation: The Segmentation element being evaluated.
        @rtype: dict
        @return: Maps score names (None for the main score, "segment" for the chosen segment's id) to DataColumns.
        """

        performanceTable.begin("Segmentation selectFirst")

        scoresData = NP("empty", len(dataTable), dtype=NP.dtype(object))
        scoresMask = NP("zeros", len(dataTable), dtype=defs.maskType)
        # True for records not yet claimed by an earlier segment.
        unfilled = NP("ones", len(dataTable), dtype=NP.dtype(bool))
        segments = NP("empty", len(dataTable), dtype=NP.dtype(object))

        newOutputData = []
        for segment in segmentation.childrenOfTag("Segment", iterator=True):
            performanceTable.pause("Segmentation selectFirst")
            selection = segment.childOfClass(PmmlPredicate).evaluate(
                dataTable, functionTable, performanceTable)
            performanceTable.unpause("Segmentation selectFirst")
            # In-place AND: restrict this segment's matches to records that
            # no earlier segment has already claimed.
            NP("logical_and", selection, unfilled, selection)
            if not selection.any():
                continue

            subTable = dataTable.subTable(selection)
            subModel = segment.childOfClass(PmmlModel)
            performanceTable.pause("Segmentation selectFirst")

            subModel.calculate(subTable, functionTable, performanceTable)
            performanceTable.unpause("Segmentation selectFirst")

            scoresData[selection] = subTable.score.data
            if subTable.score.mask is not None:
                scoresMask[selection] = subTable.score.mask
            else:
                scoresMask[selection] = defs.VALID

            segmentName = segment.get("id")
            if segmentName is not None:
                segments[selection] = segmentName

            # Merge the sub-model's OutputFields into dataTable.output,
            # creating a full-length column on first encounter.
            for fieldName, dataColumn in subTable.output.items():
                if fieldName not in dataTable.output:
                    data = NP("empty",
                              len(dataTable),
                              dtype=dataColumn.fieldType.dtype)
                    data[selection] = dataColumn.data

                    # Rows untouched by any segment stay MISSING.
                    mask = NP(
                        NP("ones", len(dataTable), dtype=defs.maskType) *
                        defs.MISSING)
                    if dataColumn.mask is None:
                        mask[selection] = defs.VALID
                    else:
                        mask[selection] = dataColumn.mask

                    newDataColumn = DataColumn(dataColumn.fieldType, data,
                                               mask)
                    # Unlock so later segments can write into this column.
                    newDataColumn._unlock()
                    dataTable.output[fieldName] = newDataColumn
                    newOutputData.append(newDataColumn)

                else:
                    newDataColumn = dataTable.output[fieldName]

                    newDataColumn.data[selection] = dataColumn.data
                    if dataColumn.mask is None:
                        newDataColumn.mask[selection] = defs.VALID
                    else:
                        newDataColumn.mask[selection] = dataColumn.mask

            # Remove the newly claimed records from the unfilled pool and
            # stop early once every record has been scored.
            unfilled -= selection
            if not unfilled.any():
                break

        for newDataColumn in newOutputData:
            # An all-zero mask means every entry is unmasked (assumes
            # defs.VALID == 0, consistent with the scoresMask check below
            # -- TODO confirm against defs); represent it as None.
            if not newDataColumn.mask.any():
                newDataColumn._mask = None
            newDataColumn._lock()

        if not scoresMask.any():
            scoresMask = None

        scores = DataColumn(self.scoreType, scoresData, scoresMask)

        if self.name is None:
            performanceTable.end("Segmentation selectFirst")
            return {None: scores}
        else:
            performanceTable.end("Segmentation selectFirst")
            return {
                None: scores,
                "segment": DataColumn(self.scoreTypeSegment, segments, None)
            }

    def _sumAverageWeighted(self, dataTable, functionTable, performanceTable,
                            segmentation, which):
        """Used by C{calculateScore}.

        Implements multipleModelMethod "sum", "average", and
        "weightedAverage": numeric scores from all matching segments are
        accumulated per record and (for the average variants) divided by
        the total weight.

        @type dataTable: DataTable
        @param dataTable: The DataTable representing this model's lexical scope.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @type segmentation: PmmlBinding
        @param segmentation: The Segmentation element being evaluated.
        @type which: sentinel object
        @param which: One of C{self.SUM}, C{self.AVERAGE}, C{self.WEIGHTED_AVERAGE}.
        @rtype: dict
        @return: Maps None to the combined score DataColumn.
        @raise PmmlValidationError: If a sub-model produces a non-numeric score.
        """

        if which is self.SUM:
            performanceLabel = "Segmentation sum"
        elif which is self.AVERAGE:
            performanceLabel = "Segmentation average"
        elif which is self.WEIGHTED_AVERAGE:
            performanceLabel = "Segmentation weightedAverage"
        performanceTable.begin(performanceLabel)

        scoresData = NP("zeros", len(dataTable), dtype=NP.dtype(object))
        if which is not self.SUM:
            # Accumulated weight per record (count 1.0 per segment for plain
            # average); used as the divisor at the end.
            denominator = NP("zeros", len(dataTable), dtype=NP.dtype(float))
        invalid = NP("zeros", len(dataTable), dtype=NP.dtype(bool))

        for segment in segmentation.childrenOfTag("Segment", iterator=True):
            performanceTable.pause(performanceLabel)
            selection = segment.childOfClass(PmmlPredicate).evaluate(
                dataTable, functionTable, performanceTable)
            performanceTable.unpause(performanceLabel)
            if not selection.any():
                continue

            subTable = dataTable.subTable(selection)
            subModel = segment.childOfClass(PmmlModel)
            performanceTable.pause(performanceLabel)
            subModel.calculate(subTable, functionTable, performanceTable)
            performanceTable.unpause(performanceLabel)

            if subTable.score.fieldType.dataType in ("string", "boolean",
                                                     "object"):
                raise defs.PmmlValidationError(
                    "Segmentation with multipleModelMethod=\"%s\" cannot be applied to models that produce dataType \"%s\""
                    % (self.childOfTag("Segmentation").get(
                        "multipleModelMethod"),
                       subTable.score.fieldType.dataType))

            # ignore invalid in matches (like the built-in "+" and "avg" Apply functions)
            if subTable.score.mask is not None:
                NP("logical_and", selection,
                   NP(subTable.score.mask == defs.VALID), selection)

            if which is self.SUM:
                scoresData[selection] += subTable.score.data
            elif which is self.AVERAGE:
                scoresData[selection] += subTable.score.data
                denominator[selection] += 1.0
            elif which is self.WEIGHTED_AVERAGE:
                weight = float(segment.get("weight", 1.0))
                scoresData[selection] += (subTable.score.data * weight)
                denominator[selection] += weight

            if subTable.score.mask is not None:
                invalid[selection] = NP("logical_or", invalid[selection],
                                        NP(subTable.score.mask != defs.VALID))

        if which is not self.SUM:
            # Records no segment contributed to (denominator == 0) are invalid.
            NP("logical_or", invalid, NP(denominator == 0.0), invalid)
            valid = NP("logical_not", invalid)
            scoresData[valid] /= denominator[valid]

        if invalid.any():
            scoresMask = NP(
                NP("array", invalid, dtype=defs.maskType) * defs.INVALID)
        else:
            scoresMask = None

        scores = DataColumn(self.scoreType, scoresData, scoresMask)

        performanceTable.end(performanceLabel)
        return {None: scores}

    def _selectMax(self, dataTable, functionTable, performanceTable,
                   segmentation):
        """Used by C{calculateScore}.

        Implements multipleModelMethod="max": each record's score is the
        maximum over all matching segments, and the OutputFields follow
        the winning segment.

        @type dataTable: DataTable
        @param dataTable: The DataTable representing this model's lexical scope.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @type segmentation: PmmlBinding
        @param segmentation: The Segmentation element being evaluated.
        @rtype: dict
        @return: Maps None to the combined score DataColumn.
        @raise PmmlValidationError: If a sub-model produces a non-numeric score.
        """

        performanceTable.begin("Segmentation max")

        scoresData = NP("empty", len(dataTable), dtype=NP.dtype(object))
        # filled/unfilled track which records already have a candidate score.
        filled = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
        unfilled = NP("ones", len(dataTable), dtype=NP.dtype(bool))

        newOutputData = []
        for segment in segmentation.childrenOfTag("Segment", iterator=True):
            performanceTable.pause("Segmentation max")
            selection = segment.childOfClass(PmmlPredicate).evaluate(
                dataTable, functionTable, performanceTable)
            performanceTable.unpause("Segmentation max")
            if not selection.any():
                continue

            subTable = dataTable.subTable(selection)
            subModel = segment.childOfClass(PmmlModel)
            performanceTable.pause("Segmentation max")
            subModel.calculate(subTable, functionTable, performanceTable)
            performanceTable.unpause("Segmentation max")

            if subTable.score.fieldType.dataType in ("string", "boolean",
                                                     "object"):
                raise defs.PmmlValidationError(
                    "Segmentation with multipleModelMethod=\"max\" cannot be applied to models that produce dataType \"%s\""
                    % subTable.score.fieldType.dataType)

            # ignore invalid in matches (like the built-in "min" Apply function)
            if subTable.score.mask is not None:
                NP("logical_and", selection,
                   NP(subTable.score.mask == defs.VALID), selection)

            # Split this segment's matches into records that already hold a
            # candidate score (compare-and-keep-max) and first-time records
            # (take the new score unconditionally).
            selectionFilled = NP("logical_and", selection, filled)
            selectionUnfilled = NP("logical_and", selection, unfilled)
            filled_selection = filled[selection]
            unfilled_selection = unfilled[selection]

            left, right = subTable.score.data[filled_selection], scoresData[
                selectionFilled]
            condition = NP(left > right)
            scoresData[selectionFilled] = NP("where", condition, left, right)
            scoresData[selectionUnfilled] = subTable.score.data[
                unfilled_selection]

            for fieldName, dataColumn in subTable.output.items():
                if fieldName not in dataTable.output:
                    data = NP("empty",
                              len(dataTable),
                              dtype=dataColumn.fieldType.dtype)
                    data[selectionUnfilled] = dataColumn.data

                    # Rows untouched by any segment stay MISSING.
                    mask = NP(
                        NP("ones", len(dataTable), dtype=defs.maskType) *
                        defs.MISSING)
                    if dataColumn.mask is None:
                        mask[selectionUnfilled] = defs.VALID
                    else:
                        mask[selectionUnfilled] = dataColumn.mask

                    newDataColumn = DataColumn(dataColumn.fieldType, data,
                                               mask)
                    # Unlock so later segments can write into this column.
                    newDataColumn._unlock()
                    dataTable.output[fieldName] = newDataColumn
                    newOutputData.append(newDataColumn)

                else:
                    newDataColumn = dataTable.output[fieldName]

                    # Output values follow the winning (max) score per record.
                    newDataColumn.data[selectionFilled] = NP(
                        "where", condition, dataColumn.data[filled_selection],
                        newDataColumn.data[selectionFilled])
                    newDataColumn.data[selectionUnfilled] = dataColumn.data[
                        unfilled_selection]

                    if dataColumn.mask is None:
                        newDataColumn.mask[selectionUnfilled] = defs.VALID
                    else:
                        newDataColumn.mask[selectionUnfilled] = dataColumn.mask

            # Promote first-time records from unfilled to filled.
            filled += selectionUnfilled
            unfilled -= selectionUnfilled

        for newDataColumn in newOutputData:
            # An all-zero mask means every entry is unmasked (assumes
            # defs.VALID == 0 -- TODO confirm against defs); store as None.
            if not newDataColumn.mask.any():
                newDataColumn._mask = None
            newDataColumn._lock()

        if filled.all():
            scoresMask = None
        else:
            # Records no segment matched are MISSING.
            scoresMask = NP(NP("logical_not", filled) * defs.MISSING)

        scores = DataColumn(self.scoreType, scoresData, scoresMask)

        performanceTable.end("Segmentation max")
        return {None: scores}
Example #25
0
 def fieldType(self):
     """Return this element's FieldType: the "dataType" attribute
     interpreted as a continuous type, defaulting to "string" when the
     attribute is absent."""
     dataType = self.get("dataType")
     if dataType is not None:
         return FakeFieldType(dataType, "continuous")
     return FakeFieldType("string", "continuous")
Example #26
0
    def verify(self, showSuccess=False, performanceTable=None):
        """Run the model verification tests defined by this element.

        The output is a list of results (all results or only failures,
        depending on C{showSuccess}), each of which is a dictionary of
        field names to values.  Fields are:

          - "success": was the comparison successful?
          - "expectedMissing", "observedMissing": is the
             expected/observed value missing?
          - "expectedValue", "observedValue": result as an internal
             value.
          - "expectedPythonValue", "observedPythonValue": result as a
             Python value.
          - "expectedDisplayValue", "observedDisplayValue": result as
             a string displayValue.

        Only "success", "expectedMissing", and "observedMissing" appear
        if the "is missing?" comparison was unsuccessful.

        @type showSuccess: bool
        @param showSuccess: If True, emit output even if the tests are successful.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: JSON-like list of dicts
        @return: As described above.
        """

        verificationFields = {}
        for verificationField in self.xpath("pmml:VerificationFields/pmml:VerificationField"):
            verificationField.column = verificationField.get("column", verificationField["field"])
            verificationField.precision = verificationField.get("precision", defaultFromXsd=True, convertType=True)
            verificationField.zeroThreshold = verificationField.get("zeroThreshold", defaultFromXsd=True, convertType=True)

            verificationField.data = []
            verificationField.mask = []
            verificationFields[verificationField.column] = verificationField

        inputData = {}
        inputMask = {}
        for index, row in enumerate(self.childOfClass(TableInterface).iterate()):
            for columnName, columnValue in row.items():
                verificationField = verificationFields.get(columnName)

                if verificationField is not None:
                    while len(verificationField.data) < index:
                        verificationField.data.append(defs.PADDING)
                        verificationField.mask.append(True)
                    
                    verificationField.data.append(columnValue)
                    verificationField.mask.append(False)

                else:
                    inputDataField = inputData.get(columnName)
                    if inputDataField is None:
                        inputDataField = []
                        inputData[columnName] = inputDataField
                        inputMask[columnName] = []
                    inputMaskField = inputMask[columnName]

                    while len(inputDataField) < index:
                        inputDataField.append(defs.PADDING)
                        inputMaskField.append(True)

                    inputDataField.append(columnValue)
                    inputMaskField.append(False)

        for verificationField in verificationFields.values():
            while len(verificationField.data) < index:
                verificationField.data.append(defs.PADDING)
                verificationField.mask.append(True)

        for columnName in inputData:
            inputDataField = inputData[columnName]
            inputMaskField = inputMask[columnName]
            while len(inputDataField) < index:
                inputDataField.append(defs.PADDING)
                inputMaskField.append(True)

        for columnName, verificationField in verificationFields.items():
            inputData[columnName] = verificationField.data
            inputMask[columnName] = verificationField.mask

        model = self.getparent()

        if performanceTable is None:
            performanceTable = FakePerformanceTable()

        performanceTable.begin("make DataTable")
        dataTable = DataTable(model, inputData, inputMask, inputState=None)
        performanceTable.end("make DataTable")

        functionTable = FunctionTable()

        for miningField in model.xpath("pmml:MiningSchema/pmml:MiningField"):
            miningField.replaceField(dataTable, functionTable, performanceTable)

        for calculable in model.calculableTrans():
            calculable.calculate(dataTable, functionTable, performanceTable)

        score = model.calculateScore(dataTable, functionTable, performanceTable)
        dataTable.score = score[None]
        if model.name is not None:
            for key, value in score.items():
                if key is None:
                    dataTable.fields[model.name] = value
                else:
                    dataTable.fields["%s.%s" % (model.name, key)] = value

        for outputField in self.xpath("../pmml:Output/pmml:OutputField"):
            displayName = outputField.get("displayName", outputField["name"])
            outputField.format(dataTable, functionTable, performanceTable, score)

        output = []
        for verificationField in verificationFields.values():
            observedOutput = dataTable.fields.get(verificationField["field"])

            if observedOutput is None:
                raise defs.PmmlValidationError("VerificationField references field \"%s\" but it was not produced by the model")
            fieldType = observedOutput.fieldType

            if fieldType.dataType == "object":
                try:
                    newArray = [float(x) for x in observedOutput.data]
                except ValueError:
                    pass
                else:
                    fieldType = FakeFieldType("double", "continuous")
                    observedOutput._data = newArray
                        
            for index in xrange(len(dataTable)):
                record = {"field": verificationField["field"], "index": index}

                record["expectedMissing"] = verificationField.mask[index]
                record["observedMissing"] = (observedOutput.mask is not None and observedOutput.mask[index] != defs.VALID)

                if record["expectedMissing"] != record["observedMissing"]:
                    record["success"] = False
                    output.append(record)

                elif not record["expectedMissing"]:
                    record["expectedValue"] = fieldType.stringToValue(verificationField.data[index])
                    record["observedValue"] = observedOutput.data[index]
                    record["expectedPythonValue"] = fieldType.valueToPython(record["expectedValue"])
                    record["observedPythonValue"] = fieldType.valueToPython(record["observedValue"])
                    record["expectedDisplayValue"] = fieldType.valueToString(record["expectedValue"])
                    record["observedDisplayValue"] = fieldType.valueToString(record["observedValue"])

                    if fieldType.optype == "continuous":
                        if (abs(record["expectedValue"]) <= verificationField.zeroThreshold) and (abs(record["observedValue"]) <= verificationField.zeroThreshold):
                            record["success"] = True
                        else:
                            record["success"] = ((record["expectedValue"] * (1.0 - verificationField.precision)) <= record["observedValue"] <= (record["expectedValue"] * (1.0 + verificationField.precision)))

                        if not record["success"] or showSuccess:
                            output.append(record)
                            
                    else:
                        if record["expectedValue"] != record["observedValue"]:
                            record["success"] = False
                            output.append(record)
                        else:
                            record["success"] = True
                            if showSuccess:
                                output.append(record)

        return output
Example #27
0
    def calculateScore(self, dataTable, functionTable, performanceTable):
        """Calculate the score of this model.

        This method is called by C{calculate} to separate operations
        that are performed by all models (in C{calculate}) from
        operations that are performed by specific models (in
        C{calculateScore}).

        @type dataTable: DataTable
        @param dataTable: The DataTable representing this model's lexical scope.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: dict of DataColumns
        @return: A dict mapping score names (C{None} for the predicted cluster id) to DataColumns.
        """

        performanceTable.begin("ClusteringModel")

        performanceTable.begin("set up")

        # Distribution-based clustering folds each cluster's covariance
        # matrix into the distance calculation (see the Covariances/Matrix
        # handling in the per-cluster loop below).
        distributionBased = (self["modelClass"] == "distributionBased")
        # Only fields not explicitly excluded with isCenterField="false"
        # participate in the cluster centers.
        clusteringFields = self.xpath(
            "pmml:ClusteringField[not(@isCenterField='false')]")
        fieldWeights = [
            clusteringField.get("fieldWeight",
                                defaultFromXsd=True,
                                convertType=True)
            for clusteringField in clusteringFields
        ]
        for fieldWeight in fieldWeights:
            if fieldWeight < 0.0:
                raise defs.PmmlValidationError(
                    "ClusteringField fieldWeights must all be non-negative (encountered %g)"
                    % fieldWeight)
        clusters = self.xpath("pmml:Cluster")
        comparisonMeasure = self.childOfClass(ComparisonMeasure)
        defaultCompareFunction = comparisonMeasure.get("compareFunction",
                                                       defaultFromXsd=True)
        metric = comparisonMeasure.childOfClass(PmmlClusteringMetric)
        # Tag name of the metric element; used only as a label for the
        # performance measurements surrounding the distance calculation.
        metrictag = metric.t

        performanceTable.end("set up")

        # String fields cannot be compared against numeric cluster centers.
        for clusteringField in clusteringFields:
            dataType = dataTable.fields[
                clusteringField["field"]].fieldType.dataType
            if dataType == "string":
                raise defs.PmmlValidationError(
                    "ClusteringField \"%s\" has dataType \"%s\", which cannot be used for clustering"
                    % (clusteringField["field"], dataType))

        # MissingValueWeights yield a per-row adjustment factor adjustM =
        # sum(all weights) / sum(weights of non-missing fields), with rows
        # whose denominator is zero defaulting to 1.0.
        missingValueWeights = self.childOfTag("MissingValueWeights")
        if missingValueWeights is None:
            adjustM = None

        else:
            performanceTable.begin("MissingValueWeights")

            missingWeights = missingValueWeights.childOfClass(
                PmmlArray).values(convertType=True)

            sumNMqi = NP("zeros", len(dataTable), dtype=NP.dtype(float))
            for clusteringField, missingWeight in zip(clusteringFields,
                                                      missingWeights):
                clusteringField.addToAdjustM(dataTable, functionTable,
                                             performanceTable, sumNMqi,
                                             missingWeight)

            adjustM = NP(sum(missingWeights) / sumNMqi)
            adjustM[NP(sumNMqi == 0.0)] = 1.0

            performanceTable.end("MissingValueWeights")

        # Rows with an INVALID value in any clustering field cannot be
        # scored; they are masked out of every output column below.
        anyInvalid = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
        for clusteringField in clusteringFields:
            mask = dataTable.fields[clusteringField["field"]].mask
            if mask is not None:
                NP("logical_or", anyInvalid, NP(mask == defs.INVALID),
                   anyInvalid)

        bestClusterId = None
        bestClusterAffinity = None
        allClusterAffinities = {}

        # Compute each row's distance to every cluster, keeping track of
        # the nearest cluster per row.
        for index, cluster in enumerate(clusters):
            array = cluster.childOfClass(PmmlArray)
            if array is None:
                raise defs.PmmlValidationError(
                    "Cluster must have an array to designate its center")

            centerStrings = array.values(convertType=False)
            if len(centerStrings) != len(clusteringFields):
                raise defs.PmmlValidationError(
                    "Cluster array has %d components, but there are %d ClusteringFields with isCenterField=true"
                    % (len(centerStrings), len(clusteringFields)))

            performanceTable.begin(metrictag)

            if distributionBased:
                matrix = cluster.xpath("pmml:Covariances/pmml:Matrix")
                if len(matrix) != 1:
                    raise defs.PmmlValidationError(
                        "In distribution-based clustering, all clusters must have a Covariances/Matrix"
                    )
                try:
                    covarianceMatrix = NP("array",
                                          matrix[0].values(),
                                          dtype=NP.dtype(float))
                except ValueError:
                    raise defs.PmmlValidationError(
                        "Covariances/Matrix must contain real numbers for distribution-based clustering"
                    )

            else:
                covarianceMatrix = None

            # Accumulate per-field comparisons into a per-row distance for
            # this cluster; "state" is scratch space owned by the metric.
            state = self._State()
            metric.initialize(state, len(dataTable), len(clusteringFields),
                              distributionBased)

            for clusteringField, centerString, fieldWeight in zip(
                    clusteringFields, centerStrings, fieldWeights):
                if isinstance(metric, PmmlClusteringMetricBinary):
                    metric.accumulateBinary(
                        state, dataTable.fields[clusteringField["field"]],
                        centerString, distributionBased)
                else:
                    # The field comparison is timed separately from the
                    # metric accumulation, hence the pause/unpause.
                    performanceTable.pause(metrictag)
                    cxy = clusteringField.compare(dataTable, functionTable,
                                                  performanceTable,
                                                  centerString,
                                                  defaultCompareFunction,
                                                  anyInvalid)
                    performanceTable.unpause(metrictag)
                    metric.accumulate(state, cxy, fieldWeight,
                                      distributionBased)

            distance = metric.finalizeDistance(state, adjustM,
                                               distributionBased,
                                               covarianceMatrix)
            del state

            performanceTable.end(metrictag)

            # On the first iteration bestClusterAffinity aliases distance,
            # so the comparison below is all-False and the initialization
            # stands; later iterations overwrite only where strictly better.
            if index == 0:
                bestClusterId = NP("ones", len(dataTable),
                                   dtype=NP.dtype(int))  # 1-based index
                bestClusterAffinity = distance

            better = NP(distance < bestClusterAffinity)
            bestClusterId[better] = index + 1  # 1-based index
            bestClusterAffinity[better] = distance[better]

            allClusterAffinities[cluster.get("id",
                                             "%d" % (index + 1))] = distance

        if not anyInvalid.any():
            scoreMask = None
        else:
            scoreMask = NP(anyInvalid * defs.INVALID)

        performanceTable.begin("set scores")
        score = {}

        # Predicted value: the id (or 1-based position) of the best cluster.
        performanceTable.begin("predictedValue")
        fieldType = FakeFieldType("string", "categorical")
        clusterIdentifiers = NP("empty", len(dataTable), dtype=fieldType.dtype)
        for index, cluster in enumerate(clusters):
            value = fieldType.stringToValue(
                cluster.get("id", "%d" % (index + 1)))
            clusterIdentifiers[NP(bestClusterId == (index + 1))] = value
        score[None] = DataColumn(fieldType, clusterIdentifiers, scoreMask)
        performanceTable.end("predictedValue")

        # The remaining output columns are produced only when requested
        # (self.subFields flags).
        if self.subFields["predictedDisplayValue"]:
            performanceTable.begin("predictedDisplayValue")
            fieldType = FakeFieldType("string", "categorical")
            clusterNames = NP("empty", len(dataTable), dtype=fieldType.dtype)
            for index, cluster in enumerate(clusters):
                value = fieldType.stringToValue(cluster.get("name", ""))
                clusterNames[NP(bestClusterId == (index + 1))] = value
            score["predictedDisplayValue"] = DataColumn(
                fieldType, clusterNames, scoreMask)
            performanceTable.end("predictedDisplayValue")

        if self.subFields["entity"]:
            performanceTable.begin("entity")
            fieldType = FakeFieldType("object", "any")
            entities = NP("empty", len(dataTable), dtype=fieldType.dtype)
            for index, cluster in enumerate(clusters):
                # NOTE(review): "value" is unused in this branch; the
                # Cluster element itself is stored in the output.
                value = fieldType.stringToValue(cluster.get("name", ""))
                indexPlusOne = index + 1
                for i in xrange(len(entities)):
                    if bestClusterId[i] == indexPlusOne:
                        entities[i] = cluster
            score["entity"] = DataColumn(fieldType, entities, scoreMask)
            performanceTable.end("entity")

        if self.subFields["clusterId"]:
            performanceTable.begin("clusterId")
            fieldType = FakeFieldType("integer", "continuous")
            score["clusterId"] = DataColumn(fieldType, bestClusterId,
                                            scoreMask)
            performanceTable.end("clusterId")

        if self.subFields["entityId"]:
            performanceTable.begin("entityId")
            fieldType = FakeFieldType("integer", "continuous")
            score["entityId"] = DataColumn(fieldType, bestClusterId, scoreMask)
            performanceTable.end("entityId")

        if self.subFields["clusterAffinity"]:
            performanceTable.begin("clusterAffinity")
            fieldType = FakeFieldType("double", "continuous")
            score["clusterAffinity"] = DataColumn(fieldType,
                                                  bestClusterAffinity,
                                                  scoreMask)
            performanceTable.end("clusterAffinity")

        if self.subFields["affinity"]:
            performanceTable.begin("affinity")
            fieldType = FakeFieldType("double", "continuous")
            score["affinity"] = DataColumn(fieldType, bestClusterAffinity,
                                           scoreMask)
            performanceTable.end("affinity")

        if self.subFields["all"]:
            performanceTable.begin("all")
            fieldType = FakeFieldType("double", "continuous")
            for identifier, distance in allClusterAffinities.items():
                score["all.%s" % identifier] = DataColumn(
                    fieldType, distance, scoreMask)
            performanceTable.end("all")

        performanceTable.end("set scores")
        performanceTable.end("ClusteringModel")
        return score
Example #28
0
class PlotHistogram(PmmlPlotContent):
    """Represents a 1d histogram of the data.

    PMML subelements:

      - PlotExpression role="data": the numeric or categorical data.
      - PlotNumericExpression role="weight": histogram weights.
      - PlotSelection: expression or predicate to filter the data
        before plotting.
      - Intervals: non-uniform (numerical) histogram bins.
      - Values: explicit (categorical) histogram values.
      - PlotSvgMarker: inline SVG for histograms drawn with markers,
        where the markers are SVG pictograms.

    PMML attributes:

      - svgId: id for the resulting SVG element.
      - stateId: key for persistent storage in a DataTableState.
      - numBins: number of histogram bins.
      - low: histogram low edge.
      - high: histogram high edge.
      - normalized: if "false", the histogram represents the number
        of counts in each bin; if "true", the histogram represents
        density, with a total integral (taking into account bin
        widths) of 1.0.
      - cumulative: if "false", the histogram approximates a
        probability density function (PDF) with flat-top bins;
        if "true", the histogram approximates a cumulative
        distribution function (CDF) with linear-top bins.
      - vertical: if "true", plot the "data" expression on the x
        axis and the counts/density/cumulative values on the y
        axis.
      - visualization: one of "skyline", "polyline", "smooth",
        "points", "errorbars".
      - gap: size of the space between histogram bars in SVG
        coordinates.
      - marker: marker to use for "points" visualization (see
        PlotScatter).
      - style: CSS style properties.
        
    CSS properties:
      - fill, fill-opacity: color of the histogram bars.
      - stroke, stroke-dasharray, stroke-dashoffset, stroke-linecap,
        stroke-linejoin, stroke-miterlimit, stroke-opacity,
        stroke-width: properties of the line drawing.
      - marker-size, marker-outline: marker style for "points"
        visualization.

    See the source code for the full XSD.
    """

    # CSS properties recognized in this element's "style" attribute.
    styleProperties = [
        "fill",
        "fill-opacity",
        "stroke",
        "stroke-dasharray",
        "stroke-dashoffset",
        "stroke-linecap",
        "stroke-linejoin",
        "stroke-miterlimit",
        "stroke-opacity",
        "stroke-width",
        "marker-size",
        "marker-outline",
    ]

    # Values used when the "style" attribute omits a property; also
    # interpolated into the XSD below as the default "style" string.
    styleDefaults = {
        "fill": "none",
        "stroke": "black",
        "marker-size": "5",
        "marker-outline": "none"
    }

    # XSD fragment validating the <PlotHistogram> element (subelements,
    # attributes, and the "visualization" enumeration).
    xsd = """<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">
    <xs:element name="PlotHistogram">
        <xs:complexType>
            <xs:sequence>
                <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded" />
                <xs:element ref="PlotExpression" minOccurs="1" maxOccurs="1" />
                <xs:element ref="PlotNumericExpression" minOccurs="0" maxOccurs="1" />
                <xs:element ref="PlotSelection" minOccurs="0" maxOccurs="1" />
                <xs:choice minOccurs="0" maxOccurs="1">
                    <xs:element ref="Interval" minOccurs="1" maxOccurs="unbounded" />
                    <xs:element ref="Value" minOccurs="1" maxOccurs="unbounded" />
                </xs:choice>
                <xs:element ref="PlotSvgMarker" minOccurs="0" maxOccurs="1" />
            </xs:sequence>
            <xs:attribute name="svgId" type="xs:string" use="optional" />
            <xs:attribute name="stateId" type="xs:string" use="optional" />
            <xs:attribute name="numBins" type="xs:positiveInteger" use="optional" />
            <xs:attribute name="low" type="xs:double" use="optional" />
            <xs:attribute name="high" type="xs:double" use="optional" />
            <xs:attribute name="normalized" type="xs:boolean" use="optional" default="false" />
            <xs:attribute name="cumulative" type="xs:boolean" use="optional" default="false" />
            <xs:attribute name="vertical" type="xs:boolean" use="optional" default="true" />
            <xs:attribute name="visualization" use="optional" default="skyline">
                <xs:simpleType>
                    <xs:restriction base="xs:string">
                        <xs:enumeration value="skyline" />
                        <xs:enumeration value="polyline" />
                        <xs:enumeration value="smooth" />
                        <xs:enumeration value="points" />
                        <xs:enumeration value="errorbars" />
                    </xs:restriction>
                </xs:simpleType>
            </xs:attribute>
            <xs:attribute name="gap" type="xs:double" use="optional" default="0.0" />
            <xs:attribute name="marker" type="PLOT-MARKER-TYPE" use="optional" default="circle" />
            <xs:attribute name="style" type="xs:string" use="optional" default="%s" />
        </xs:complexType>
    </xs:element>
</xs:schema>
""" % PlotStyle.toString(styleDefaults)

    # FieldTypes used for the histogram's numeric outputs (counts/densities
    # and bin coordinates); both are double/continuous.
    fieldType = FakeFieldType("double", "continuous")
    fieldTypeNumeric = FakeFieldType("double", "continuous")

    @staticmethod
    def establishBinType(fieldType, intervals, values):
        """Choose the histogram's binning strategy from its FieldType
        and any explicit <Interval> or <Value> elements.

        Explicit <Interval> elements force non-uniform numeric bins and
        explicit <Value> elements force an enumerated set of string bins;
        with neither, string data falls back to one bin per unique value
        and numeric data to a uniform scale.

        @type fieldType: FieldType
        @param fieldType: The FieldType of the plot expression.
        @type intervals: list of PmmlBinding
        @param intervals: The <Interval> elements; may be empty.
        @type values: list of PmmlBinding
        @param values: The <Value> elements; may be empty.
        @rtype: string
        @return: One of "nonuniform", "explicit", "unique", "scale".
        @raise PmmlValidationError: If the FieldType is incompatible with
            the requested binning.
        """

        if len(intervals) != 0:
            # Non-uniform bins only make sense on an ordered numeric axis.
            if fieldType.isnumeric() or fieldType.istemporal():
                return "nonuniform"
            raise defs.PmmlValidationError(
                "Explicit Intervals are intended for numerical data, not %r"
                % fieldType)

        if len(values) != 0:
            # Enumerated bins only make sense for categorical strings.
            if fieldType.isstring():
                return "explicit"
            raise defs.PmmlValidationError(
                "Explicit Values are intended for string data, not %r" %
                fieldType)

        if fieldType.isstring():
            return "unique"

        if fieldType.isnumeric() or fieldType.istemporal():
            return "scale"
        raise defs.PmmlValidationError(
            "PlotHistogram requires numerical or string data, not %r" %
            fieldType)

    @staticmethod
    def determineScaleBins(numBins, low, high, array):
        """Fill in the C{numBins}, C{low}, and C{high} of the histogram,
        deriving any values not explicitly given from the dataset.

        Explicitly set values always override implicit ones:
          - C{low}, C{high} default to the extrema of the dataset
            (padded by 20%% of the range, or widened when degenerate).
          - C{numBins} defaults to the Freedman-Diaconis heuristic,
            with a floor of 10 bins.

        @type numBins: int or None
        @param numBins: Input number of bins.
        @type low: number or None
        @param low: Low edge.
        @type high: number or None
        @param high: High edge.
        @type array: 1d Numpy array of numbers
        @param array: Dataset to use to implicitly derive values.
        @rtype: 3-tuple
        @return: C{numBins}, C{low}, C{high}
        """

        lowIsImplicit = low is None
        highIsImplicit = high is None

        if lowIsImplicit:
            low = float(array.min())
        if highIsImplicit:
            high = float(array.max())

        if low == high:
            # Degenerate range: widen by one unit on each side.
            low -= 1.0
            high += 1.0

        elif high < low:
            # Inverted range: repair whichever edge was implicit; if both
            # were explicit, the user made an error.
            if lowIsImplicit:
                low = high - 1.0
            elif highIsImplicit:
                high = low + 1.0
            else:
                raise defs.PmmlValidationError(
                    "PlotHistogram attributes low and high must be in the right order: low = %g, high = %g"
                    % (low, high))

        else:
            # Pad implicit edges by 20% of the (pre-padding) range so the
            # data does not touch the plot border.
            margin = 0.2 * (high - low)
            if lowIsImplicit:
                low -= margin
            if highIsImplicit:
                high += margin

        if numBins is None:
            # The Freedman-Diaconis rule: bin width 2*IQR/cbrt(N),
            # never fewer than 10 bins.
            q1, q3 = NP("percentile", array, [25.0, 75.0])
            binWidth = 2.0 * (q3 - q1) / math.pow(len(array), 1.0 / 3.0)
            if binWidth > 0.0:
                numBins = max(10, int(math.ceil((high - low) / binWidth)))
            else:
                numBins = 10

        return numBins, low, high

    @staticmethod
    def selectInterval(fieldType, array, index, lastIndex, interval, edges,
                       lastLimitPoint, lastClosed, lastInterval):
        """Select rows of an array within an interval as part of
        filling a non-uniform histogram.

        As a side effect, appends the interpreted (leftMargin,
        rightMargin) pair to C{edges}.

        @type fieldType: FieldType
        @param fieldType: FieldType used to interpret the bounds of the interval.
        @type array: 1d Numpy array
        @param array: Values to select.
        @type index: int
        @param index: Current bin index.
        @type lastIndex: int
        @param lastIndex: Index of the final bin (number of bins minus one).
        @type interval: PmmlBinding
        @param interval: PMML <Interval> element defining the interval.
        @type edges: list of 2-tuples
        @param edges: Pairs of interpreted C{leftMargin}, C{rightMargin} for the histogram.
        @type lastLimitPoint: number
        @param lastLimitPoint: Larger of the two last edges.  ("Limit point" because it may have been open or closed.)
        @type lastClosed: bool
        @param lastClosed: If True, the last limit point was closed.
        @type lastInterval: PmmlBinding
        @param lastInterval: PMML <Interval> for the last bin.
        @rtype: 4-tuple
        @return: C{selection} (1d Numpy array of bool, or None if the interval is unbounded on both ends), C{lastLimitPoint}, C{lastClosed}, C{lastInterval}
        """

        closure = interval["closure"]
        leftMargin = interval.get("leftMargin")
        rightMargin = interval.get("rightMargin")

        selection = None

        # An interval unbounded on both ends covers the whole axis, so it
        # is only legal when it is the one and only bin; bins are indexed
        # 0 through lastIndex, so "only bin" means lastIndex == 0.
        # (Bug fix: this check previously referenced an undefined name
        # "intervals", raising NameError instead of the validation error.)
        if leftMargin is None and rightMargin is None and lastIndex != 0:
            raise defs.PmmlValidationError(
                "If a histogram bin is unbounded on both ends, it must be the only bin"
            )

        if leftMargin is not None:
            try:
                leftMargin = fieldType.stringToValue(leftMargin)
            except ValueError:
                raise defs.PmmlValidationError(
                    "Improper value in Interval leftMargin specification: \"%s\""
                    % leftMargin)

            # "open*" closures exclude the left edge; "closed*" include it.
            if closure in ("openClosed", "openOpen"):
                if selection is None:
                    selection = NP(leftMargin < array)
                else:
                    NP("logical_and", selection, NP(leftMargin < array),
                       selection)

            elif closure in ("closedOpen", "closedClosed"):
                if selection is None:
                    selection = NP(leftMargin <= array)
                else:
                    NP("logical_and", selection, NP(leftMargin <= array),
                       selection)

            # Intervals must be in increasing order and must not overlap;
            # two intervals may share an endpoint only if at most one of
            # them is closed there.
            if lastLimitPoint is not None:
                if leftMargin < lastLimitPoint or (
                        leftMargin == lastLimitPoint and
                    (closure in ("closedOpen", "closedClosed"))
                        and lastClosed):
                    raise defs.PmmlValidationError(
                        "Intervals are out of order or overlap: %r and %r" %
                        (lastInterval, interval))

        elif index != 0:
            raise defs.PmmlValidationError(
                "Only the first Interval can have an open-ended leftMargin: %r"
                % interval)

        if rightMargin is not None:
            try:
                rightMargin = fieldType.stringToValue(rightMargin)
            except ValueError:
                raise defs.PmmlValidationError(
                    "Improper value in Interval rightMargin specification: \"%s\""
                    % rightMargin)

            if closure in ("openOpen", "closedOpen"):
                if selection is None:
                    selection = NP(array < rightMargin)
                else:
                    NP("logical_and", selection, NP(array < rightMargin),
                       selection)

            elif closure in ("openClosed", "closedClosed"):
                if selection is None:
                    selection = NP(array <= rightMargin)
                else:
                    NP("logical_and", selection, NP(array <= rightMargin),
                       selection)

            # Remember this right edge so the next interval can be checked
            # for ordering/overlap against it.
            lastLimitPoint = rightMargin
            lastClosed = (closure in ("openClosed", "closedClosed"))
            lastInterval = interval

        elif index != lastIndex:
            raise defs.PmmlValidationError(
                "Only the last Interval can have an open-ended rightMargin: %r"
                % interval)

        edges.append((leftMargin, rightMargin))

        return selection, lastLimitPoint, lastClosed, lastInterval

    def prepare(self, state, dataTable, functionTable, performanceTable,
                plotRange):
        """Prepare a plot element for drawing.

        This stage consists of calculating all quantities and
        determing the bounds of the data.  These bounds may be unioned
        with bounds from other plot elements that overlay this plot
        element, so the drawing (which requires a finalized coordinate
        system) cannot begin yet.

        This method modifies C{plotRange}.

        @type state: ad-hoc Python object
        @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
        @type dataTable: DataTable
        @param dataTable: Contains the data to plot.
        @type functionTable: FunctionTable
        @param functionTable: Defines functions that may be used to transform data for plotting.
        @type performanceTable: PerformanceTable
        @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process.
        @type plotRange: PlotRange
        @param plotRange: The bounding box of plot coordinates that this function will expand.
        """

        self.checkRoles(["data", "weight"])

        dataExpression = self.xpath("pmml:PlotExpression[@role='data']")
        weightExpression = self.xpath(
            "pmml:PlotNumericExpression[@role='weight']")
        cutExpression = self.xpath("pmml:PlotSelection")
        if len(dataExpression) != 1:
            raise defs.PmmlValidationError(
                "PlotHistogram requires a PlotNumericExpression with role \"data\""
            )

        dataColumn = dataExpression[0].evaluate(dataTable, functionTable,
                                                performanceTable)

        if len(weightExpression) == 0:
            weight = None
        elif len(weightExpression) == 1:
            weight = weightExpression[0].evaluate(dataTable, functionTable,
                                                  performanceTable)
        else:
            raise defs.PmmlValidationError(
                "PlotHistogram may not have more than one PlotNumericExpression with role \"data\""
            )

        if len(cutExpression) == 1:
            selection = cutExpression[0].select(dataTable, functionTable,
                                                performanceTable)
        else:
            selection = NP("ones", len(dataTable), NP.dtype(bool))

        performanceTable.begin("PlotHistogram prepare")
        self._saveContext(dataTable)

        if dataColumn.mask is not None:
            NP("logical_and", selection, NP(dataColumn.mask == defs.VALID),
               selection)

        if weight is not None and weight.mask is not None:
            NP("logical_and", selection, NP(weight.mask == defs.VALID),
               selection)

        array = dataColumn.data[selection]
        if weight is not None:
            weight = weight.data[selection]

        persistentState = {}
        stateId = self.get("stateId")
        if stateId is not None:
            if stateId in dataTable.state:
                persistentState = dataTable.state[stateId]
            else:
                dataTable.state[stateId] = persistentState

        intervals = self.xpath("pmml:Interval")
        values = self.xpath("pmml:Value")

        if "binType" not in persistentState:
            performanceTable.begin("establish binType")

            binType = self.establishBinType(dataColumn.fieldType, intervals,
                                            values)
            persistentState["binType"] = binType

            if binType == "nonuniform":
                persistentState["count"] = [0.0] * len(intervals)

            elif binType == "explicit":
                persistentState["count"] = [0.0] * len(values)

            elif binType == "unique":
                persistentState["count"] = {}

            elif binType == "scale":
                numBins = self.get("numBins", convertType=True)
                low = self.get("low", convertType=True)
                high = self.get("high", convertType=True)

                numBins, low, high = self.determineScaleBins(
                    numBins, low, high, array)

                persistentState["low"] = low
                persistentState["high"] = high
                persistentState["numBins"] = numBins
                persistentState["count"] = [0.0] * numBins

            performanceTable.end("establish binType")

        missingSum = 0.0
        if persistentState["binType"] == "nonuniform":
            performanceTable.begin("binType nonuniform")

            count = [0.0] * len(intervals)
            edges = []
            lastLimitPoint = None
            lastClosed = None
            lastInterval = None

            for index, interval in enumerate(intervals):
                selection, lastLimitPoint, lastClosed, lastInterval = self.selectInterval(
                    dataColumn.fieldType, array, index,
                    len(intervals) - 1, interval, edges, lastLimitPoint,
                    lastClosed, lastInterval)

                if selection is not None:
                    if weight is None:
                        count[index] += NP("count_nonzero", selection)
                    else:
                        count[index] += weight[selection].sum()

            persistentState["count"] = [
                x + y
                for x, y in itertools.izip(count, persistentState["count"])
            ]

            state.fieldType = self.fieldTypeNumeric
            state.count = persistentState["count"]
            state.edges = edges
            lowEdge = min(low for low, high in edges if low is not None)
            highEdge = max(high for low, high in edges if high is not None)

            performanceTable.end("binType nonuniform")

        elif persistentState["binType"] == "explicit":
            performanceTable.begin("binType explicit")

            count = [0.0] * len(values)
            displayValues = []

            for index, value in enumerate(values):
                internalValue = dataColumn.fieldType.stringToValue(
                    value["value"])
                displayValues.append(
                    value.get(
                        "displayValue",
                        dataColumn.fieldType.valueToString(internalValue,
                                                           displayValue=True)))

                selection = NP(array == internalValue)

                if weight is None:
                    count[index] += NP("count_nonzero", selection)
                else:
                    count[index] += weight[selection].sum()

            persistentState["count"] = [
                x + y
                for x, y in itertools.izip(count, persistentState["count"])
            ]

            state.fieldType = dataColumn.fieldType
            state.count = persistentState["count"]
            state.edges = displayValues

            performanceTable.end("binType explicit")

        elif persistentState["binType"] == "unique":
            performanceTable.begin("binType unique")

            uniques, inverse = NP("unique", array, return_inverse=True)
            if weight is None:
                counts = NP("bincount", inverse)
            else:
                counts = NP("bincount", inverse, weights=weight)

            persistentCount = persistentState["count"]
            for i, u in enumerate(uniques):
                string = dataColumn.fieldType.valueToString(u,
                                                            displayValue=False)

                if string in persistentCount:
                    persistentCount[string] += counts[i]
                else:
                    persistentCount[string] = counts[i]

            tosort = [(count, string)
                      for string, count in persistentCount.items()]
            tosort.sort(reverse=True)

            numBins = self.get("numBins", convertType=True)
            if numBins is not None:
                missingSum = sum(count for count, string in tosort[numBins:])
                tosort = tosort[:numBins]

            state.fieldType = dataColumn.fieldType
            state.count = [count for count, string in tosort]
            state.edges = [
                dataColumn.fieldType.valueToString(
                    dataColumn.fieldType.stringToValue(string),
                    displayValue=True) for count, string in tosort
            ]

            performanceTable.end("binType unique")

        elif persistentState["binType"] == "scale":
            performanceTable.begin("binType scale")

            numBins = persistentState["numBins"]
            low = persistentState["low"]
            high = persistentState["high"]
            binWidth = (high - low) / float(numBins)

            binAssignments = NP("array",
                                NP("floor", NP(NP(array - low) / binWidth)),
                                dtype=NP.dtype(int))
            binAssignments[NP(binAssignments > numBins)] = numBins
            binAssignments[NP(binAssignments < 0)] = numBins

            if len(binAssignments) == 0:
                count = NP("empty", 0, dtype=NP.dtype(float))
            else:
                if weight is None:
                    count = NP("bincount", binAssignments)
                else:
                    count = NP("bincount", binAssignments, weights=weight)

            if len(count) < numBins:
                padded = NP("zeros", numBins, dtype=NP.dtype(float))
                padded[:len(count)] = count
            else:
                padded = count

            persistentState["count"] = [
                x + y
                for x, y in itertools.izip(padded, persistentState["count"])
            ]

            state.fieldType = self.fieldTypeNumeric
            state.count = persistentState["count"]
            state.edges = [(low + i * binWidth, low + (i + 1) * binWidth)
                           for i in xrange(numBins)]
            lowEdge = low
            highEdge = high

            performanceTable.end("binType scale")

        if self.get("normalized", defaultFromXsd=True, convertType=True):
            if state.fieldType is self.fieldTypeNumeric:
                weightedValues = 0.0
                for (low,
                     high), value in itertools.izip(state.edges, state.count):
                    if low is not None and high is not None:
                        weightedValues += value / (high - low)

                newCount = []
                for (low, high), value in zip(state.edges, state.count):
                    if low is None or high is None:
                        newCount.append(0.0)
                    else:
                        newCount.append(value / (high - low) / weightedValues)

                state.count = newCount

            else:
                totalCount = sum(state.count) + missingSum
                state.count = [float(x) / totalCount for x in state.count]

        if self.get("cumulative", defaultFromXsd=True, convertType=True):
            maximum = sum(state.count)
        else:
            maximum = max(state.count)

        if self.get("vertical", defaultFromXsd=True, convertType=True):
            plotRange.yminPush(0.0, self.fieldType, sticky=True)

            if state.fieldType is self.fieldTypeNumeric:
                plotRange.xminPush(lowEdge, state.fieldType, sticky=True)
                plotRange.xmaxPush(highEdge, state.fieldType, sticky=True)
                plotRange.ymaxPush(maximum, state.fieldType, sticky=False)
            else:
                plotRange.expand(
                    NP("array", state.edges, dtype=NP.dtype(object)),
                    NP("ones", len(state.edges), dtype=NP.dtype(float)) *
                    maximum, state.fieldType, self.fieldType)

        else:
            plotRange.xminPush(0.0, self.fieldType, sticky=True)

            if state.fieldType is self.fieldTypeNumeric:
                plotRange.yminPush(lowEdge, state.fieldType, sticky=True)
                plotRange.ymaxPush(highEdge, state.fieldType, sticky=True)
                plotRange.xmaxPush(maximum, state.fieldType, sticky=False)
            else:
                plotRange.expand(
                    NP("ones", len(state.edges), dtype=NP.dtype(float)) *
                    maximum, NP("array", state.edges, dtype=NP.dtype(object)),
                    self.fieldType, state.fieldType)

        performanceTable.end("PlotHistogram prepare")

    def draw(self, state, plotCoordinates, plotDefinitions, performanceTable):
        """Draw the plot element.

        This stage consists of creating an SVG image of the
        pre-computed data.

        @type state: ad-hoc Python object
        @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
        @type plotCoordinates: PlotCoordinates
        @param plotCoordinates: The coordinate system in which this plot element will be placed.
        @type plotDefinitions: PlotDefinitions
        @param plotDefinitions: The dictionary of key-value pairs that forms the <defs> section of the SVG document.
        @type performanceTable: PerformanceTable
        @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process.
        @rtype: SvgBinding
        @return: An SVG fragment representing the fully drawn plot element.
        """

        svg = SvgBinding.elementMaker
        performanceTable.begin("PlotHistogram draw")

        cumulative = self.get("cumulative",
                              defaultFromXsd=True,
                              convertType=True)
        vertical = self.get("vertical", defaultFromXsd=True, convertType=True)
        visualization = self.get("visualization", defaultFromXsd=True)

        output = svg.g()
        if len(state.count) > 0:
            if state.fieldType is not self.fieldTypeNumeric:
                # Categorical histogram: re-order the counts to match the
                # tick-label order chosen by the coordinate system, then
                # replace the category labels in state.edges with unit-width
                # numeric bins centered on integer positions.
                if vertical:
                    strings = plotCoordinates.xstrings
                else:
                    strings = plotCoordinates.ystrings

                newCount = []
                for string in strings:
                    try:
                        index = state.edges.index(string)
                    except ValueError:
                        # category has no data; draw an empty bin for it
                        newCount.append(0.0)
                    else:
                        newCount.append(state.count[index])

                state.count = newCount
                state.edges = [(i - 0.5, i + 0.5)
                               for i in xrange(len(strings))]

            # Each bin is represented by four corners in data coordinates:
            # A and D sit on the baseline (value 0) at the bin's low and high
            # edges, B and C carry the bin's count at those same edges.
            # Unbounded interval edges become +/-inf.  In cumulative mode,
            # NP("roll", ..., 1) shifts the running total so each bin's
            # leading edge starts at the previous bin's total (a staircase).
            if vertical:
                Ax = NP("array", [
                    low if low is not None else float("-inf")
                    for low, high in state.edges
                ],
                        dtype=NP.dtype(float))
                Bx = NP(Ax.copy())
                Cx = NP("array", [
                    high if high is not None else float("inf")
                    for low, high in state.edges
                ],
                        dtype=NP.dtype(float))
                Dx = NP(Cx.copy())
                Ay = NP("zeros", len(state.count), dtype=NP.dtype(float))
                if cumulative:
                    Cy = NP("cumsum",
                            NP("array", state.count, dtype=NP.dtype(float)))
                    By = NP("roll", Cy, 1)
                    By[0] = 0.0
                else:
                    By = NP("array", state.count, dtype=NP.dtype(float))
                    Cy = NP(By.copy())
                Dy = NP(Ay.copy())

            else:
                # Horizontal histogram: the roles of x and y are swapped.
                if cumulative:
                    Cx = NP("cumsum",
                            NP("array", state.count, dtype=NP.dtype(float)))
                    Bx = NP("roll", Cx, 1)
                    Bx[0] = 0.0
                else:
                    Bx = NP("array", state.count, dtype=NP.dtype(float))
                    Cx = NP(Bx.copy())
                Ax = NP("zeros", len(state.count), dtype=NP.dtype(float))
                Dx = NP(Ax.copy())
                Ay = NP("array", [
                    low if low is not None else float("-inf")
                    for low, high in state.edges
                ],
                        dtype=NP.dtype(float))
                By = NP(Ay.copy())
                Cy = NP("array", [
                    high if high is not None else float("inf")
                    for low, high in state.edges
                ],
                        dtype=NP.dtype(float))
                Dy = NP(Cy.copy())

            # Transform all four corner sets into SVG coordinates at once.
            AX, AY = plotCoordinates(Ax, Ay)
            BX, BY = plotCoordinates(Bx, By)
            CX, CY = plotCoordinates(Cx, Cy)
            DX, DY = plotCoordinates(Dx, Dy)

            if visualization == "skyline":
                # Classic bar outline: trace A -> B -> C -> D per bin,
                # merging shared edges between adjacent bins.
                gap = self.get("gap", defaultFromXsd=True, convertType=True)

                # Shrink every bar symmetrically, but only if the gap does
                # not consume the narrowest bar entirely.
                if vertical:
                    if gap > 0.0 and NP(
                            NP(DX - gap / 2.0) -
                            NP(AX + gap / 2.0)).min() > 0.0:
                        AX += gap / 2.0
                        BX += gap / 2.0
                        CX -= gap / 2.0
                        DX -= gap / 2.0
                else:
                    if gap > 0.0 and NP(
                            NP(AY + gap / 2.0) -
                            NP(DY - gap / 2.0)).min() > 0.0:
                        AY -= gap / 2.0
                        BY -= gap / 2.0
                        CY += gap / 2.0
                        DY += gap / 2.0

                pathdata = []
                # nextIsMoveto: pen is lifted, the next drawn bin must start
                # with an SVG "M" (moveto) instead of continuing a line.
                nextIsMoveto = True
                for i in xrange(len(state.count)):
                    iprev = i - 1
                    inext = i + 1

                    # Empty bins lift the pen; close off the previous bin's
                    # trailing edge first if one was being drawn.
                    if vertical and By[i] == 0.0 and Cy[i] == 0.0:
                        if i > 0 and not nextIsMoveto:
                            pathdata.append("L %r %r" % (DX[iprev], DY[iprev]))
                        nextIsMoveto = True

                    elif not vertical and Bx[i] == 0.0 and Cx[i] == 0.0:
                        if i > 0 and not nextIsMoveto:
                            pathdata.append("L %r %r" % (DX[iprev], DY[iprev]))
                        nextIsMoveto = True

                    else:
                        # Start a new segment when the pen is lifted, bins
                        # are gapped, or this bin is not flush with the last.
                        if nextIsMoveto or gap > 0.0 or (
                                vertical and DX[iprev] != AX[i]) or (
                                    not vertical and DY[iprev] != AY[i]):
                            pathdata.append("M %r %r" % (AX[i], AY[i]))
                            nextIsMoveto = False

                        pathdata.append("L %r %r" % (BX[i], BY[i]))
                        pathdata.append("L %r %r" % (CX[i], CY[i]))

                        # Drop to the baseline only when the next bin does
                        # not continue directly from this bin's edge.
                        if i == len(state.count) - 1 or gap > 0.0 or (
                                vertical and DX[i] != AX[inext]) or (
                                    not vertical and DY[i] != AY[inext]):
                            pathdata.append("L %r %r" % (DX[i], DY[i]))

                style = self.getStyleState()
                # marker-* styles only apply to the "points" visualization
                del style["marker-size"]
                del style["marker-outline"]
                output.append(
                    svg.path(d=" ".join(pathdata),
                             style=PlotStyle.toString(style)))

            elif visualization == "polyline":
                # Connect the midpoints of the bin tops with straight lines,
                # anchored to the baseline at both ends.
                pathdata = []
                for i in xrange(len(state.count)):
                    if i == 0:
                        pathdata.append("M %r %r" % (AX[i], AY[i]))

                    pathdata.append("L %r %r" % ((BX[i] + CX[i]) / 2.0,
                                                 (BY[i] + CY[i]) / 2.0))

                    if i == len(state.count) - 1:
                        pathdata.append("L %r %r" % (DX[i], DY[i]))

                style = self.getStyleState()
                del style["marker-size"]
                del style["marker-outline"]
                output.append(
                    svg.path(d=" ".join(pathdata),
                             style=PlotStyle.toString(style)))

            elif visualization == "smooth":
                # Fit a smooth curve through the bin-top midpoints.
                smoothingSamples = math.ceil(len(state.count) / 2.0)

                BCX = NP(NP(BX + CX) / 2.0)
                BCY = NP(NP(BY + CY) / 2.0)

                # Endpoints of the curve are anchored at the baseline.
                xarray = NP("array", [AX[0]] + list(BCX) + [DX[-1]],
                            dtype=NP.dtype(float))
                yarray = NP("array", [AY[0]] + list(BCY) + [DY[-1]],
                            dtype=NP.dtype(float))
                samples = NP("linspace",
                             AX[0],
                             DX[-1],
                             int(smoothingSamples),
                             endpoint=True)
                smoothingScale = abs(DX[-1] - AX[0]) / smoothingSamples

                xlist, ylist, dxlist, dylist = PlotCurve.pointsToSmoothCurve(
                    xarray, yarray, samples, smoothingScale, False)

                pathdata = PlotCurve.formatPathdata(xlist, ylist,
                                                    dxlist, dylist,
                                                    PlotCoordinates(), False,
                                                    True)

                # Fill and stroke are emitted as two separate paths so the
                # fill can be closed down to the baseline independently.
                style = self.getStyleState()
                fillStyle = dict(
                    (x, style[x]) for x in style if x.startswith("fill"))
                fillStyle["stroke"] = "none"
                strokeStyle = dict(
                    (x, style[x]) for x in style if x.startswith("stroke"))

                if style["fill"] != "none" and len(pathdata) > 0:
                    if vertical:
                        firstPoint = plotCoordinates(Ax[0], 0.0)
                        lastPoint = plotCoordinates(Dx[-1], 0.0)
                    else:
                        firstPoint = plotCoordinates(0.0, Ay[0])
                        lastPoint = plotCoordinates(0.0, Dy[-1])

                    # Same curve, but starting and ending on the baseline so
                    # the filled region is properly enclosed.
                    pathdata2 = [
                        "M %r %r" % firstPoint, pathdata[0].replace("M", "L")
                    ]
                    pathdata2.extend(pathdata[1:])
                    pathdata2.append(pathdata[-1])
                    pathdata2.append("L %r %r" % lastPoint)

                    output.append(
                        svg.path(d=" ".join(pathdata2),
                                 style=PlotStyle.toString(fillStyle)))

                output.append(
                    svg.path(d=" ".join(pathdata),
                             style=PlotStyle.toString(strokeStyle)))

            elif visualization == "points":
                # Draw a marker at each bin-top midpoint.
                currentStyle = PlotStyle.toDict(self.get("style") or {})
                style = self.getStyleState()
                if "fill" not in currentStyle:
                    style["fill"] = "black"

                BCX = NP(NP(BX + CX) / 2.0)
                BCY = NP(NP(BY + CY) / 2.0)

                svgId = self.get("svgId")
                if svgId is None:
                    svgIdMarker = plotDefinitions.uniqueName()
                else:
                    svgIdMarker = svgId + ".marker"

                # The marker is defined once in <defs> and referenced by
                # xlink:href for every point.
                marker = PlotScatter.makeMarker(
                    svgIdMarker, self.get("marker", defaultFromXsd=True),
                    style, self.childOfTag("PlotSvgMarker"))
                plotDefinitions[marker.get("id")] = marker

                markerReference = "#" + marker.get("id")
                output.extend(
                    svg.use(
                        **{
                            "x": repr(x),
                            "y": repr(y),
                            defs.XLINK_HREF: markerReference
                        }) for x, y in itertools.izip(BCX, BCY))

            else:
                raise NotImplementedError("TODO: add 'errorbars'")

        svgId = self.get("svgId")
        if svgId is not None:
            output["id"] = svgId

        performanceTable.end("PlotHistogram draw")
        return output
Example #29
0
    def evaluate(self, dataTable, functionTable, performanceTable):
        """Evaluate the expression, using a DataTable as input.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.
        """

        performanceTable.begin("Discretize")

        dataColumn = dataTable.fields[self["field"]]
        if dataColumn.fieldType.dataType in ("object", "string", "boolean"):
            raise defs.PmmlValidationError("Discretize requires a numeric input field, but \"%s\" is" % dataColumn.fieldType.dataType)

        fieldType = FakeFieldType(self.get("dataType", "string"), self.get("optype", self._optype))
        # bin values discovered below are appended to fieldType.values
        fieldType._newValuesAllowed = True

        defaultValue = self.get("defaultValue")
        if defaultValue is not None:
            defaultValue = fieldType.stringToValue(defaultValue)

        # Start with every row missing (or equal to the default value), then
        # overwrite the rows selected by each DiscretizeBin's Interval.
        data = NP("empty", len(dataTable), dtype=fieldType.dtype)
        mask = NP("empty", len(dataTable), dtype=defs.maskType)
        if defaultValue is None:
            mask[:] = defs.MISSING
        else:
            data[:] = defaultValue
            mask[:] = defs.VALID

        for discretizeBin in self.childrenOfTag("DiscretizeBin"):
            try:
                binValue = fieldType.stringToValue(discretizeBin["binValue"])
            except ValueError:
                raise defs.PmmlValidationError("Cannot cast DiscretizeBin binValue \"%s\" as %s %s" % (discretizeBin["binValue"], fieldType.optype, fieldType.dataType))

            fieldType.values.append(FakeFieldValue(value=binValue))

            interval = discretizeBin.childOfTag("Interval")

            closure = interval["closure"]
            leftMargin = interval.get("leftMargin")
            rightMargin = interval.get("rightMargin")
            selection = None

            # Build the row selection for this bin by ANDing the left- and
            # right-margin conditions; a missing margin means unbounded.
            if leftMargin is not None:
                try:
                    leftMargin = dataColumn.fieldType.stringToValue(leftMargin)
                except ValueError:
                    raise defs.PmmlValidationError("Improper value in Interval leftMargin specification: \"%s\"" % leftMargin)

                if closure in ("openClosed", "openOpen"):
                    if selection is None:
                        selection = NP(leftMargin < dataColumn.data)
                    else:
                        NP("logical_and", selection, NP(leftMargin < dataColumn.data), selection)

                elif closure in ("closedOpen", "closedClosed"):
                    if selection is None:
                        selection = NP(leftMargin <= dataColumn.data)
                    else:
                        NP("logical_and", selection, NP(leftMargin <= dataColumn.data), selection)

            if rightMargin is not None:
                try:
                    rightMargin = dataColumn.fieldType.stringToValue(rightMargin)
                except ValueError:
                    raise defs.PmmlValidationError("Improper value in Interval rightMargin specification: \"%s\"" % rightMargin)

                if closure in ("openOpen", "closedOpen"):
                    if selection is None:
                        selection = NP(dataColumn.data < rightMargin)
                    else:
                        NP("logical_and", selection, NP(dataColumn.data < rightMargin), selection)

                elif closure in ("openClosed", "closedClosed"):
                    if selection is None:
                        selection = NP(dataColumn.data <= rightMargin)
                    else:
                        NP("logical_and", selection, NP(dataColumn.data <= rightMargin), selection)

            if selection is not None:
                # Bug fix: only restrict to VALID rows when the input column
                # actually has a mask.  dataColumn.mask is None when all rows
                # are valid, and (None == defs.VALID) is the scalar False,
                # which would have zeroed the entire selection.
                if dataColumn.mask is not None:
                    NP("logical_and", selection, NP(dataColumn.mask == defs.VALID), selection)
                data[selection] = binValue
                mask[selection] = defs.VALID

        # Propagate MISSING/INVALID states from the input, if it has any.
        if dataColumn.mask is not None:
            mask[NP(dataColumn.mask == defs.MISSING)] = defs.MISSING
            mask[NP(dataColumn.mask == defs.INVALID)] = defs.INVALID

        data, mask = FieldCastMethods.applyMapMissingTo(fieldType, data, mask, self.get("mapMissingTo"))

        performanceTable.end("Discretize")
        return DataColumn(fieldType, data, mask)
Example #30
0
    def verify(self, showSuccess=False, performanceTable=None):
        """Run the model verification tests defined by this element.

        The output is a list of results (all results or only failures,
        depending on C{showSuccess}), each of which is a dictionary of
        field names to values.  Fields are:

          - "success": was the comparison successful?
          - "expectedMissing", "observedMissing": is the
             expected/observed value missing?
          - "expectedValue", "observedValue": result as an internal
             value.
          - "expectedPythonValue", "observedPythonValue": result as a
             Python value.
          - "expectedDisplayValue", "observedDisplayValue": result as
             a string displayValue.

        Only "success", "expectedMissing", and "observedMissing" appear
        if the "is missing?" comparison was unsuccessful.

        @type showSuccess: bool
        @param showSuccess: If True, emit output even if the tests are successful.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: JSON-like list of dicts
        @return: As described above.
        """

        # Collect the VerificationFields: these name the model outputs to
        # check and the table columns that hold their expected values.
        verificationFields = {}
        for verificationField in self.xpath(
                "pmml:VerificationFields/pmml:VerificationField"):
            verificationField.column = verificationField.get(
                "column", verificationField["field"])
            verificationField.precision = verificationField.get(
                "precision", defaultFromXsd=True, convertType=True)
            verificationField.zeroThreshold = verificationField.get(
                "zeroThreshold", defaultFromXsd=True, convertType=True)

            verificationField.data = []
            verificationField.mask = []
            verificationFields[verificationField.column] = verificationField

        # Walk the embedded table, splitting its columns into expected
        # outputs (named by a VerificationField) and model inputs, padding
        # columns absent from a row so that all stay aligned row-by-row.
        inputData = {}
        inputMask = {}
        index = -1  # bug fix: stays -1 for an empty table (was a NameError)
        for index, row in enumerate(
                self.childOfClass(TableInterface).iterate()):
            for columnName, columnValue in row.items():
                verificationField = verificationFields.get(columnName)

                if verificationField is not None:
                    while len(verificationField.data) < index:
                        verificationField.data.append(defs.PADDING)
                        verificationField.mask.append(True)

                    verificationField.data.append(columnValue)
                    verificationField.mask.append(False)

                else:
                    inputDataField = inputData.get(columnName)
                    if inputDataField is None:
                        inputDataField = []
                        inputData[columnName] = inputDataField
                        inputMask[columnName] = []
                    inputMaskField = inputMask[columnName]

                    while len(inputDataField) < index:
                        inputDataField.append(defs.PADDING)
                        inputMaskField.append(True)

                    inputDataField.append(columnValue)
                    inputMaskField.append(False)

        # The table has index + 1 rows, so pad every column to that length.
        # Bug fix: these loops padded only to index, leaving any column that
        # was absent from the final row one entry short.
        for verificationField in verificationFields.values():
            while len(verificationField.data) < index + 1:
                verificationField.data.append(defs.PADDING)
                verificationField.mask.append(True)

        for columnName in inputData:
            inputDataField = inputData[columnName]
            inputMaskField = inputMask[columnName]
            while len(inputDataField) < index + 1:
                inputDataField.append(defs.PADDING)
                inputMaskField.append(True)

        # Expected-output columns are also fed into the DataTable so they
        # are available by name.
        for columnName, verificationField in verificationFields.items():
            inputData[columnName] = verificationField.data
            inputMask[columnName] = verificationField.mask

        model = self.getparent()

        if performanceTable is None:
            performanceTable = FakePerformanceTable()

        performanceTable.begin("make DataTable")
        dataTable = DataTable(model, inputData, inputMask, inputState=None)
        performanceTable.end("make DataTable")

        functionTable = FunctionTable()

        # Run the model: mining schema, transformations, score, output.
        for miningField in model.xpath("pmml:MiningSchema/pmml:MiningField"):
            miningField.replaceField(dataTable, functionTable,
                                     performanceTable)

        for calculable in model.calculableTrans():
            calculable.calculate(dataTable, functionTable, performanceTable)

        score = model.calculateScore(dataTable, functionTable,
                                     performanceTable)
        dataTable.score = score[None]
        if model.name is not None:
            for key, value in score.items():
                if key is None:
                    dataTable.fields[model.name] = value
                else:
                    dataTable.fields["%s.%s" % (model.name, key)] = value

        for outputField in self.xpath("../pmml:Output/pmml:OutputField"):
            outputField.format(dataTable, functionTable, performanceTable,
                               score)

        # Compare each observed output column against its expected values.
        output = []
        for verificationField in verificationFields.values():
            observedOutput = dataTable.fields.get(verificationField["field"])

            if observedOutput is None:
                # Bug fix: the "%s" placeholder was never filled in.
                raise defs.PmmlValidationError(
                    "VerificationField references field \"%s\" but it was not produced by the model"
                    % verificationField["field"])
            fieldType = observedOutput.fieldType

            # Generic "object" outputs are coerced to doubles when every
            # value can be interpreted as a number.
            if fieldType.dataType == "object":
                try:
                    newArray = [float(x) for x in observedOutput.data]
                except ValueError:
                    pass
                else:
                    fieldType = FakeFieldType("double", "continuous")
                    observedOutput._data = newArray

            for index in xrange(len(dataTable)):
                record = {"field": verificationField["field"], "index": index}

                record["expectedMissing"] = verificationField.mask[index]
                record["observedMissing"] = (
                    observedOutput.mask is not None
                    and observedOutput.mask[index] != defs.VALID)

                if record["expectedMissing"] != record["observedMissing"]:
                    # missing-ness disagrees: report failure without values
                    record["success"] = False
                    output.append(record)

                elif not record["expectedMissing"]:
                    record["expectedValue"] = fieldType.stringToValue(
                        verificationField.data[index])
                    record["observedValue"] = observedOutput.data[index]
                    record["expectedPythonValue"] = fieldType.valueToPython(
                        record["expectedValue"])
                    record["observedPythonValue"] = fieldType.valueToPython(
                        record["observedValue"])
                    record["expectedDisplayValue"] = fieldType.valueToString(
                        record["expectedValue"])
                    record["observedDisplayValue"] = fieldType.valueToString(
                        record["observedValue"])

                    if fieldType.optype == "continuous":
                        # Both near zero: use the absolute zeroThreshold
                        # instead of a relative-precision comparison.
                        if (abs(record["expectedValue"]) <=
                                verificationField.zeroThreshold) and (
                                    abs(record["observedValue"]) <=
                                    verificationField.zeroThreshold):
                            record["success"] = True
                        else:
                            # Relative-precision window.  Bug fix: order the
                            # bounds with min/max so the test can also pass
                            # for negative expected values, for which
                            # e*(1 - p) > e*(1 + p) and the original chained
                            # comparison could never succeed.
                            bound1 = (record["expectedValue"] *
                                      (1.0 - verificationField.precision))
                            bound2 = (record["expectedValue"] *
                                      (1.0 + verificationField.precision))
                            record["success"] = (
                                min(bound1, bound2) <=
                                record["observedValue"] <=
                                max(bound1, bound2))

                        if not record["success"] or showSuccess:
                            output.append(record)

                    else:
                        # Categorical/ordinal outputs must match exactly.
                        if record["expectedValue"] != record["observedValue"]:
                            record["success"] = False
                            output.append(record)
                        else:
                            record["success"] = True
                            if showSuccess:
                                output.append(record)

        return output
Example #31
0
class BaselineModel(PmmlModel):
    """BaselineModel implements the baseline model in PMML, which is a
    collection of change-detection routines.

    U{PMML specification<http://www.dmg.org/v4-1/BaselineModel.html>}.
    """

    # Every test statistic produced by this model is reported as a continuous double.
    scoreType = FakeFieldType("double", "continuous")

    def calculateScore(self, dataTable, functionTable, performanceTable):
        """Calculate the score of this model.

        This method is called by C{calculate} to separate operations
        that are performed by all models (in C{calculate}) from
        operations that are performed by specific models (in
        C{calculateScore}).

        @type dataTable: DataTable
        @param dataTable: The DataTable representing this model's lexical scope.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataColumn
        @return: A DataColumn containing the score.
        """

        # The single <TestDistributions> child names both the statistic to
        # compute and the field it is computed over.
        testDistributions = self.childOfTag("TestDistributions")
        testStatistic = testDistributions.get("testStatistic")

        performanceTable.begin("BaselineModel %s" % testStatistic)

        fieldName = testDistributions.get("field")
        dataColumn = dataTable.fields[fieldName]

        # Dispatch to the statistic-specific routine; each receives the
        # DataTable's state object so stateful statistics (CUSUM) can persist.
        if testStatistic == "zValue":
            score = self.zValue(testDistributions, fieldName, dataColumn,
                                dataTable.state, performanceTable)

        elif testStatistic == "CUSUM":
            score = self.cusum(testDistributions, fieldName, dataColumn,
                               dataTable.state, performanceTable)

        else:
            raise NotImplementedError("TODO: add more testStatistics")

        performanceTable.end("BaselineModel %s" % testStatistic)
        return score

    def zValue(self, testDistributions, fieldName, dataColumn, state,
               performanceTable):
        """Calculate the score of a zValue TestStatistic.

        @type testDistributions: PmmlBinding
        @param testDistributions: The <TestDistributions> element.
        @type fieldName: string
        @param fieldName: The field name (for error messages).
        @type dataColumn: DataColumn
        @param dataColumn: The field.
        @type state: DataTableState
        @param state: The persistent state object (not used).
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: dict
        @return: A dictionary mapping PMML "feature" strings to DataColumns; zValue only defines the None key ("predictedValue").
        """

        # The z-value is only meaningful for numeric fields.
        if dataColumn.fieldType.dataType in ("object", "string", "boolean",
                                             "date", "time", "dateTime"):
            raise TypeError(
                "Field \"%s\" has dataType \"%s\", which is incompatible with BaselineModel.zValue"
                % (fieldName, dataColumn.fieldType.dataType))

        # Any baseline distribution element qualifies as long as it carries
        # both a mean and a variance attribute.
        distributions = testDistributions.xpath(
            "pmml:Baseline/*[@mean and @variance]")
        if len(distributions) == 0:
            raise defs.PmmlValidationError(
                "BaselineModel zValue requires a distribution with a mean and a variance"
            )

        distribution = distributions[0]
        mean = float(distribution.get("mean"))
        variance = float(distribution.get("variance"))
        if variance <= 0.0:
            raise defs.PmmlValidationError(
                "Variance must be positive, not %g" % variance)

        # z = (x - mean) / stddev, computed vectorially; the input mask is
        # passed through unchanged.
        return {
            None:
            DataColumn(self.scoreType,
                       NP(NP(dataColumn.data - mean) / math.sqrt(variance)),
                       dataColumn.mask)
        }

    def cusum(self, testDistributions, fieldName, dataColumn, state,
              performanceTable):
        """Calculate the score of a CUSUM TestStatistic.

        The CUSUM cumulative sum is a stateful calculation: each row
        depends on the result of the previous row.  To continue
        calculations through multiple calls to C{calc} or
        C{calculate}, pass a DataTableState object and give the
        BaselineModel a C{stateId} attribute.  The C{stateId} is not
        valid in strict PMML, but it can be inserted after validation
        or used in custom-ODG models (C{from augustus.odg import *}).

        @type testDistributions: PmmlBinding
        @param testDistributions: The <TestDistributions> element.
        @type fieldName: string
        @param fieldName: The field name (for error messages).
        @type dataColumn: DataColumn
        @param dataColumn: The field.
        @type state: DataTableState
        @param state: The persistent state object, which is used to initialize the start state and save the end state of the cumulative sum.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: dict
        @return: A dictionary mapping PMML "feature" strings to DataColumns; CUSUM only defines the None key ("predictedValue").
        """

        # Only Gaussian and Poisson distributions are supported for the
        # baseline/alternate pair.
        baseline = testDistributions.xpath(
            "pmml:Baseline/pmml:GaussianDistribution | pmml:Baseline/pmml:PoissonDistribution"
        )
        alternate = testDistributions.xpath(
            "pmml:Alternate/pmml:GaussianDistribution | pmml:Alternate/pmml:PoissonDistribution"
        )

        if len(baseline) == 0 or len(alternate) == 0:
            raise defs.PmmlValidationError(
                "BaselineModel CUSUM requires a Baseline and an Alternate that are either GaussianDistribution or PoissonDistribution"
            )

        # Per-row log-likelihood ratio: log P_alt(x) - log P_base(x).
        ratios = alternate[0].logpdf(dataColumn.data) - baseline[0].logpdf(
            dataColumn.data)
        if dataColumn.mask is None:
            good = NP("ones", len(dataColumn), dtype=NP.dtype(bool))
        else:
            good = NP(dataColumn.mask == defs.VALID)

        # Resume the cumulative sum from a previous calculation if a stateId
        # was supplied and a saved value exists; otherwise start at 0.0.
        stateId = self.get("stateId")
        last = None
        if stateId is not None:
            last = state.get(stateId)
        if last is None:
            last = 0.0

        resetValue = testDistributions.get("resetValue",
                                           defaultFromXsd=True,
                                           convertType=True)

        output = NP("empty", len(dataColumn), dtype=NP.dtype(float))

        # Sequential by necessity: each row's sum depends on the previous
        # row's, floored at resetValue.  Invalid rows carry the sum forward
        # without updating it.
        performanceTable.begin("fill CUSUM")
        for index in xrange(len(dataColumn)):
            if good[index]:
                last = max(resetValue, last + ratios[index])
            output[index] = last
        performanceTable.end("fill CUSUM")

        # Persist the end state so a later call can continue the sum.
        if stateId is not None:
            state[stateId] = last

        return {None: DataColumn(self.scoreType, output, None)}
Example #32
0
    def evaluate(self, dataTable, functionTable, performanceTable):
        """Evaluate the Discretize expression, using a DataTable as input.

        Maps a numeric input field onto discrete bin values defined by
        <DiscretizeBin>/<Interval> children; rows falling into no bin
        receive the defaultValue (or MISSING if none is given).

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.
        """

        performanceTable.begin("Discretize")

        dataColumn = dataTable.fields[self["field"]]
        if dataColumn.fieldType.dataType in ("object", "string", "boolean"):
            raise defs.PmmlValidationError("Discretize requires a numeric input field, but \"%s\" is not numeric" % dataColumn.fieldType.dataType)

        fieldType = FakeFieldType(self.get("dataType", "string"), self.get("optype", self._optype))
        fieldType._newValuesAllowed = True

        defaultValue = self.get("defaultValue")
        if defaultValue is not None:
            defaultValue = fieldType.stringToValue(defaultValue)

        # Start every row at the default: MISSING when no defaultValue is
        # given, otherwise VALID with the default value filled in.
        data = NP("empty", len(dataTable), dtype=fieldType.dtype)
        mask = NP("empty", len(dataTable), dtype=defs.maskType)
        if defaultValue is None:
            mask[:] = defs.MISSING
        else:
            data[:] = defaultValue
            mask[:] = defs.VALID

        # Precompute input validity once.  A column with no mask is entirely
        # valid; previously `dataColumn.mask == defs.VALID` was evaluated on a
        # None mask, yielding a scalar False that zeroed every bin selection.
        if dataColumn.mask is None:
            inputValid = None
        else:
            inputValid = NP(dataColumn.mask == defs.VALID)

        for discretizeBin in self.childrenOfTag("DiscretizeBin"):
            try:
                binValue = fieldType.stringToValue(discretizeBin["binValue"])
            except ValueError:
                raise defs.PmmlValidationError("Cannot cast DiscretizeBin binValue \"%s\" as %s %s" % (discretizeBin["binValue"], fieldType.optype, fieldType.dataType))

            fieldType.values.append(FakeFieldValue(value=binValue))

            interval = discretizeBin.childOfTag("Interval")

            closure = interval["closure"]
            leftMargin = interval.get("leftMargin")
            rightMargin = interval.get("rightMargin")
            # selection accumulates the conjunction of the margin conditions;
            # a missing margin leaves that side unbounded.
            selection = None

            if leftMargin is not None:
                try:
                    leftMargin = dataColumn.fieldType.stringToValue(leftMargin)
                except ValueError:
                    raise defs.PmmlValidationError("Improper value in Interval leftMargin specification: \"%s\"" % leftMargin)

                if closure in ("openClosed", "openOpen"):
                    if selection is None:
                        selection = NP(leftMargin < dataColumn.data)
                    else:
                        NP("logical_and", selection, NP(leftMargin < dataColumn.data), selection)

                elif closure in ("closedOpen", "closedClosed"):
                    if selection is None:
                        selection = NP(leftMargin <= dataColumn.data)
                    else:
                        NP("logical_and", selection, NP(leftMargin <= dataColumn.data), selection)

            if rightMargin is not None:
                try:
                    rightMargin = dataColumn.fieldType.stringToValue(rightMargin)
                except ValueError:
                    raise defs.PmmlValidationError("Improper value in Interval rightMargin specification: \"%s\"" % rightMargin)

                if closure in ("openOpen", "closedOpen"):
                    if selection is None:
                        selection = NP(dataColumn.data < rightMargin)
                    else:
                        NP("logical_and", selection, NP(dataColumn.data < rightMargin), selection)

                elif closure in ("openClosed", "closedClosed"):
                    if selection is None:
                        selection = NP(dataColumn.data <= rightMargin)
                    else:
                        NP("logical_and", selection, NP(dataColumn.data <= rightMargin), selection)

            if selection is not None:
                # Restrict the bin to valid input rows (only when the input
                # actually carries a mask).
                if inputValid is not None:
                    NP("logical_and", selection, inputValid, selection)
                data[selection] = binValue
                mask[selection] = defs.VALID

        # Propagate MISSING/INVALID from the input; skipped when the input
        # column has no mask (all rows valid).
        if dataColumn.mask is not None:
            mask[NP(dataColumn.mask == defs.MISSING)] = defs.MISSING
            mask[NP(dataColumn.mask == defs.INVALID)] = defs.INVALID

        data, mask = FieldCastMethods.applyMapMissingTo(fieldType, data, mask, self.get("mapMissingTo"))

        performanceTable.end("Discretize")
        return DataColumn(fieldType, data, mask)
Example #33
0
    def replaceField(self, dataTable, functionTable, performanceTable):
        """Replace a field in the DataTable for outlier removal,
        missing value handling, and invalid value treatment.

        @type dataTable: DataTable
        @param dataTable: The pre-built DataTable.
        @type functionTable: FunctionTable
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        """

        dataColumn = dataTable.fields.get(self.name)
        if dataColumn is None:
            return

        performanceTable.begin("MiningField")

        # Cast the column if the MiningField requests a different optype.
        optype = self.get("optype", dataColumn.fieldType.optype)
        if optype != dataColumn.fieldType.optype:
            dataColumn = FieldCastMethods.cast(
                FakeFieldType(dataColumn.fieldType.dataType, optype),
                dataColumn)

        data = dataColumn.data
        mask = dataColumn.mask

        outliers = self.get("outliers")

        lowValue = self.get("lowValue")
        if lowValue is not None:
            lowValue = dataColumn.fieldType.stringToValue(lowValue)

            if outliers == "asMissingValues":
                selection = NP(dataColumn.data < lowValue)
                mask = FieldCastMethods.outliersAsMissing(
                    mask, dataColumn.mask, selection)

            elif outliers == "asExtremeValues":
                selection = NP(dataColumn.data < lowValue)
                # Copy-on-write: clone the array only the first time it is
                # modified, then clamp.  The clamp must be OUTSIDE the copy
                # guard -- previously it was inside, so it was skipped
                # whenever the array had already been copied.
                if data is dataColumn.data:
                    data = NP("copy", data)
                    data.setflags(write=True)
                data[selection] = lowValue

        highValue = self.get("highValue")
        if highValue is not None:
            highValue = dataColumn.fieldType.stringToValue(highValue)

            if outliers == "asMissingValues":
                selection = NP(dataColumn.data > highValue)
                mask = FieldCastMethods.outliersAsMissing(
                    mask, dataColumn.mask, selection)

            elif outliers == "asExtremeValues":
                selection = NP(dataColumn.data > highValue)
                # Same copy-on-write pattern; with the clamp inside the guard
                # the high clamp was silently dropped whenever the low clamp
                # had already copied the array.
                if data is dataColumn.data:
                    data = NP("copy", data)
                    data.setflags(write=True)
                data[selection] = highValue

        mask = FieldCastMethods.applyInvalidValueTreatment(
            mask, self.get("invalidValueTreatment"))
        data, mask = FieldCastMethods.applyMapMissingTo(
            dataColumn.fieldType, data, mask,
            self.get("missingValueReplacement"))

        dataTable.fields.replaceField(
            self.name, DataColumn(dataColumn.fieldType, data, mask))
        performanceTable.end("MiningField")
Example #34
0
class NormDiscrete(PmmlExpression):
    """NormDiscrete is the PMML indicator-function expression for
    categorical fields: it yields 1 on rows where the field equals a
    given value and 0 everywhere else.

    U{PMML specification<http://www.dmg.org/v4-1/Transformations.html>}.
    """

    _fieldType = FakeFieldType("integer", "continuous")

    def evaluate(self, dataTable, functionTable, performanceTable):
        """Evaluate the expression, using a DataTable as input.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.
        """

        performanceTable.begin("NormDiscrete")

        column = dataTable.fields[self["field"]]
        target = column.fieldType.stringToValue(self["value"])

        # Equality comparison cast to the integer output type is the 0/1 indicator.
        indicator = NP("array",
                       NP(column.data == target),
                       dtype=self._fieldType.dtype)
        indicator, outputMask = FieldCastMethods.applyMapMissingTo(
            self._fieldType, indicator, column.mask, self.get("mapMissingTo"))

        performanceTable.end("NormDiscrete")
        return DataColumn(self._fieldType, indicator, outputMask)

    @staticmethod
    def fanOutByValue(modelLoader, fieldName, dataColumn, prefix=None):
        """Create a suite of NormDiscrete transformations, one
        indicator function for each unique value in a categorical
        dataset.

        @type modelLoader: ModelLoader
        @param modelLoader: The ModelLoader used to create the new PMML nodes.
        @type fieldName: FieldName
        @param fieldName: The name of the categorical field to fan out, used in the names of the new fields.
        @type dataColumn: DataColumn
        @param dataColumn: The categorical dataset.
        @type prefix: string or None
        @param prefix: The PMML prefix, used to create an lxml.etree.ElementMaker.
        """

        E = (modelLoader.elementMaker()
             if prefix is None
             else modelLoader.elementMaker(prefix))

        # Collect the distinct observed values, restricted to valid rows
        # whenever the column carries a mask.
        if dataColumn.mask is not None:
            observed = dataColumn.data[NP(dataColumn.mask == defs.VALID)]
        else:
            observed = dataColumn.data
        uniqueValues = NP("unique", observed)

        fannedOut = []
        for uniqueValue in uniqueValues:
            asString = dataColumn.fieldType.valueToString(uniqueValue)
            # One DerivedField per value, named "<field>.<value>".
            fannedOut.append(
                E.DerivedField(E.NormDiscrete(field=fieldName,
                                              value=asString),
                               name=("%s.%s" % (fieldName, asString)),
                               dataType="integer",
                               optype="continuous"))

        return fannedOut
Example #35
0
    def format(self, subTable, functionTable, performanceTable, score):
        """Extract or post-process output for the output field of a DataTable.

        @type subTable: DataTable
        @param subTable: The DataTable associated with this local lexical scope.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @type score: dict
        @param score: Dictionary mapping PMML score "feature" strings to DataColumns.  This dictionary always contains a None key, which is the basic feature ("predictedValue").
        @rtype: DataColumn
        @return: The output that would go into an output field of a DataTable.
        """

        performanceTable.begin("OutputField")

        # No feature attribute: pass an existing field through by name.
        feature = self.get("feature")
        if feature is None:
            dataColumn = subTable.fields[self["name"]]

        # The basic feature is the None key of the score dictionary.
        elif feature == "predictedValue":
            dataColumn = score[None]

        elif feature == "predictedDisplayValue":
            # Convert each predicted value to its string representation;
            # dtype=object because string lengths vary.
            original = score[None]
            toString = original.fieldType.valueToString
            data = NP("empty", len(subTable), dtype=NP.dtype(object))
            for i, x in enumerate(original.data):
                data[i] = toString(x)
            dataColumn = DataColumn(FakeFieldType("string", "continuous"),
                                    data, None)

        elif feature == "transformedValue":
            expression = self.childOfClass(PmmlExpression)
            if expression is None:
                raise defs.PmmlValidationError(
                    "OutputField with feature \"transformedValue\" requires an EXPRESSION"
                )

            # Pause this timer so the expression's own timing is not
            # double-counted under "OutputField".
            performanceTable.pause("OutputField")
            dataColumn = expression.evaluate(subTable, functionTable,
                                             performanceTable)
            performanceTable.unpause("OutputField")

        elif feature == "decision":
            decisions = self.childOfTag("Decisions")
            if decisions is None:
                raise defs.PmmlValidationError(
                    "OutputField with feature \"decision\" requires a Decisions block"
                )

            performanceTable.pause("OutputField")
            dataColumn = self.childOfClass(PmmlExpression).evaluate(
                subTable, functionTable, performanceTable)
            performanceTable.unpause("OutputField")

            if dataColumn.mask is None:
                valid = None
            else:
                valid = NP(dataColumn.mask == defs.VALID)

            # Output holds the matching <Decision> ELEMENTS themselves (not
            # their values), so the field type is generic object/any; rows
            # matching no Decision stay MISSING.
            fieldType = FakeFieldType("object", "any")
            data = NP("empty", len(subTable), dtype=fieldType.dtype)
            mask = NP(
                NP("ones", len(subTable), dtype=defs.maskType) * defs.MISSING)

            for decision in decisions.childrenOfTag("Decision"):
                value = dataColumn.fieldType.stringToValue(decision["value"])

                selection = NP(dataColumn.data == value)
                if valid is not None:
                    NP("logical_and", selection, valid, selection)

                for i in xrange(len(data)):
                    if selection[i]:
                        data[i] = decision

                mask[selection] = defs.VALID

            # NOTE(review): collapsing to mask=None when mask.any() is falsy
            # assumes defs.VALID encodes as 0 (all-VALID => all zeros) --
            # confirm against defs.
            if not mask.any():
                mask = None

            dataColumn = DataColumn(fieldType, data, mask)

        # Any other feature the model computed is looked up directly.
        elif feature in score:
            dataColumn = score[feature]

        else:
            # Walk up two levels to name the enclosing model in the error;
            # presumably .t is the element's tag name -- verify in PmmlBinding.
            model = self.getparent()
            if model is not None: model = model.getparent()

            if model is None:
                model = "(orphaned OutputField; no parent model)"
            else:
                model = model.t

            raise defs.PmmlValidationError(
                "Models of type %s do not produce \"%s\" features (or at least, it is not yet implemented by Augustus)"
                % (model, feature))

        # Cast to the requested dataType/optype, except for features whose
        # output type is fixed by construction above.
        dataType = self.get("dataType", dataColumn.fieldType.dataType)
        optype = self.get("optype", dataColumn.fieldType.optype)
        if (dataType != dataColumn.fieldType.dataType
                or optype != dataColumn.fieldType.optype) and feature not in (
                    "predictedDisplayValue", "decision"):
            dataColumn = FieldCastMethods.cast(FakeFieldType(dataType, optype),
                                               dataColumn)

        # Feature-derived columns are also published into the DataTable under
        # displayName (falling back to name); feature-None columns already
        # exist there.
        if feature is not None:
            subTable.fields[self.get("displayName", self["name"])] = dataColumn

        performanceTable.end("OutputField")
        return dataColumn