def singleton(self, inputData, inputMask=None, inputState=None):
    """Create a single-row DataTable for event-based processes.

    This static method is an alternative to the DataTable constructor: it creates a DataTable with only one row and uses the Python data type of each C{inputData} value to define a field type, rather than an explicit C{context}.

    @type inputData: dict-like mapping from strings to single values (not lists)
    @param inputData: A single data record.
    @type inputMask: dict-like mapping from strings to single C{defs.maskType} values (not lists), or None
    @param inputMask: A single mask.
    @type inputState: DataTableState or None
    @param inputState: Initial state of the DataTable.  To continue a previous calculation, use the C{dataTable.state} from the previous calculation.
    """

    dataColumns = OrderedDict()
    for fieldName in sorted(inputData.keys()):
        value = inputData[fieldName]

        # bool must be tested before int because bool is a subclass of int
        if isinstance(value, basestring):
            fieldType = FakeFieldType("string", "continuous")
        elif isinstance(value, bool):
            fieldType = FakeFieldType("boolean", "continuous")
        elif isinstance(value, float):
            fieldType = FakeFieldType("double", "continuous")
        elif isinstance(value, int):
            fieldType = FakeFieldType("integer", "continuous")
        # TODO: PMML date types (when passed a datetime.datetime object)
        else:
            fieldType = FakeFieldType("object", "any")

        data = NP("empty", 1, dtype=fieldType.dtype)
        data[0] = value

        if inputMask is None or inputMask.get(fieldName) is None:
            mask = None
        else:
            mask = NP("empty", 1, dtype=defs.maskType)
            mask[0] = inputMask.get(fieldName)

        dataColumns[fieldName] = DataColumn(fieldType, data, mask)

    dataTable = DataTable.__new__(DataTable)
    dataTable._configure(dataColumns, inputState)
    return dataTable
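# A minimal standalone sketch (not part of the method above) of why the
# bool test must come before the int test in singleton(): bool is a
# subclass of int in Python, so isinstance(True, int) is True and a
# boolean value would otherwise be typed as "integer".
def _demo_singleton_type_order():
    for value, expected in [(u"hello", "string"), (True, "boolean"),
                            (3.14, "double"), (3, "integer")]:
        if isinstance(value, basestring):
            inferred = "string"
        elif isinstance(value, bool):      # before int!
            inferred = "boolean"
        elif isinstance(value, float):
            inferred = "double"
        elif isinstance(value, int):
            inferred = "integer"
        else:
            inferred = "object"
        assert inferred == expected, (value, inferred, expected)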
def functionAverageFake(self, value, howmany, fieldType):
    """Averages rows in a DataColumn when it is known that there are no matches.

    @type value: 2-tuple of numbers
    @param value: The starting (numerator, denominator) state, which is also the final state because no rows match.
    @type howmany: int
    @param howmany: Number of rows.
    @type fieldType: FieldType
    @param fieldType: The type of field to emulate.
    @rtype: DataColumn
    @return: The faked results.
    """

    # the emulated field is always re-typed as a continuous double
    fieldType = FakeFieldType("double", "continuous")

    numerator = NP("empty", howmany, dtype=fieldType.dtype)
    denominator = NP("empty", howmany, dtype=fieldType.dtype)
    numerator[:] = value[0]
    denominator[:] = value[1]

    data = NP(numerator / denominator)
    if value[1] == 0:
        mask = NP("empty", howmany, dtype=defs.maskType)
        mask[:] = defs.INVALID
    else:
        mask = None

    return DataColumn(fieldType, data, mask)
def functionAverage(self, dataColumn, whereMask, groupSelection, getstate, setstate):
    """Averages rows in a DataColumn, possibly with an SQL where mask and groupField.

    @type dataColumn: DataColumn
    @param dataColumn: The input data column.
    @type whereMask: 1d Numpy array of bool, or None
    @param whereMask: The result of the SQL where selection.
    @type groupSelection: 1d Numpy array of bool, or None
    @param groupSelection: Rows corresponding to a particular value of the groupField.
    @type getstate: callable function
    @param getstate: Retrieve starting values from the DataTableState.
    @type setstate: callable function
    @param setstate: Store ending values to the DataTableState.
    @rtype: DataColumn
    @return: A column of averaged rows.
    """

    fieldType = FakeFieldType("double", "continuous")

    if dataColumn.fieldType.dataType not in ("integer", "float", "double"):
        raise defs.PmmlValidationError("Aggregate function \"average\" requires a numeric input field: \"integer\", \"float\", \"double\"")

    denominator = NP("ones", len(dataColumn), dtype=fieldType.dtype)
    if dataColumn.mask is not None:
        NP("logical_and", denominator, NP(dataColumn.mask == defs.VALID), denominator)
    if whereMask is not None:
        NP("logical_and", denominator, whereMask, denominator)
    if groupSelection is not None:
        NP("logical_and", denominator, groupSelection, denominator)

    numerator = NP("multiply", denominator, dataColumn.data)

    if getstate is not None and len(dataColumn) > 0:
        startingState = getstate()
        if startingState is not None:
            startingNumerator, startingDenominator = startingState
            numerator[0] += startingNumerator
            denominator[0] += startingDenominator

    numerator = NP("cumsum", numerator)
    denominator = NP("cumsum", denominator)

    data = NP(numerator / denominator)
    mask = NP(NP("logical_not", NP("isfinite", data)) * defs.INVALID)
    if not mask.any():
        mask = None

    if setstate is not None and len(dataColumn) > 0:
        setstate((numerator[-1], denominator[-1]))

    return DataColumn(fieldType, data, mask)
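# A minimal standalone sketch (plain NumPy, hypothetical names) of the
# running-average bookkeeping above: selected rows contribute 1 to the
# cumulative denominator and their value to the cumulative numerator,
# and a prior (numerator, denominator) state is folded into element 0
# so the average continues seamlessly across batches.  The real method
# additionally masks non-finite ratios (e.g. 0/0) as INVALID.
def _demo_running_average(values, selected, state=(0.0, 0.0)):
    import numpy as np
    denominator = selected.astype(float)
    numerator = denominator * values
    numerator[0] += state[0]
    denominator[0] += state[1]
    numerator = np.cumsum(numerator)
    denominator = np.cumsum(denominator)
    newState = (numerator[-1], denominator[-1])
    return numerator / denominator, newState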
def functionMultiset(self, dataColumn, whereMask, groupSelection, getstate, setstate):
    """Derives a multiset of rows in a DataColumn, possibly with an SQL where mask and groupField.

    @type dataColumn: DataColumn
    @param dataColumn: The input data column.
    @type whereMask: 1d Numpy array of bool, or None
    @param whereMask: The result of the SQL where selection.
    @type groupSelection: 1d Numpy array of bool, or None
    @param groupSelection: Rows corresponding to a particular value of the groupField.
    @type getstate: callable function
    @param getstate: Retrieve starting values from the DataTableState.
    @type setstate: callable function
    @param setstate: Store ending values to the DataTableState.
    @rtype: DataColumn of dict objects
    @return: A column of cumulative multisets.
    """

    fieldType = FakeFieldType("object", "any")

    selection = NP("ones", len(dataColumn), dtype=NP.dtype(bool))
    if dataColumn.mask is not None:
        selection = NP("logical_and", selection, NP(dataColumn.mask == defs.VALID))
    if whereMask is not None:
        NP("logical_and", selection, whereMask, selection)
    if groupSelection is not None:
        NP("logical_and", selection, groupSelection, selection)

    multiset = {}
    if getstate is not None:
        startingState = getstate()
        if startingState is not None:
            multiset = startingState
    current = dict(multiset)

    data = NP("empty", len(dataColumn), dtype=NP.dtype(object))

    toPython = dataColumn.fieldType.valueToPython
    for i, x in enumerate(dataColumn.data):
        if selection[i]:
            value = toPython(x)
            if value not in multiset:
                multiset[value] = 0
            multiset[value] += 1
            current = dict(multiset)
        data[i] = current

    if setstate is not None:
        setstate(multiset)

    return DataColumn(fieldType, data, None)
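# A minimal standalone sketch (plain Python, hypothetical names) of the
# cumulative multiset above: each selected row increments a counter, and
# every row records a snapshot of the counts so far, so row i carries
# the multiset of all selected values up to and including row i.
def _demo_cumulative_multiset(values, selected, state=None):
    multiset = dict(state) if state is not None else {}
    snapshots = []
    current = dict(multiset)
    for value, keep in zip(values, selected):
        if keep:
            multiset[value] = multiset.get(value, 0) + 1
            current = dict(multiset)
        snapshots.append(current)
    return snapshots, multiset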
def functionSum(self, dataColumn, whereMask, groupSelection, getstate, setstate):
    """Adds up rows in a DataColumn, possibly with an SQL where mask and groupField.

    @type dataColumn: DataColumn
    @param dataColumn: The input data column.
    @type whereMask: 1d Numpy array of bool, or None
    @param whereMask: The result of the SQL where selection.
    @type groupSelection: 1d Numpy array of bool, or None
    @param groupSelection: Rows corresponding to a particular value of the groupField.
    @type getstate: callable function
    @param getstate: Retrieve starting values from the DataTableState.
    @type setstate: callable function
    @param setstate: Store ending values to the DataTableState.
    @rtype: DataColumn
    @return: A column of added rows.
    """

    fieldType = FakeFieldType("double", "continuous")

    if dataColumn.fieldType.dataType not in ("integer", "float", "double"):
        raise defs.PmmlValidationError("Aggregate function \"sum\" requires a numeric input field: \"integer\", \"float\", \"double\"")

    ones = NP("ones", len(dataColumn), dtype=fieldType.dtype)
    if dataColumn.mask is not None:
        NP("logical_and", ones, NP(dataColumn.mask == defs.VALID), ones)
    if whereMask is not None:
        NP("logical_and", ones, whereMask, ones)
    if groupSelection is not None:
        NP("logical_and", ones, groupSelection, ones)

    NP("multiply", ones, dataColumn.data, ones)

    if getstate is not None and len(dataColumn) > 0:
        startingState = getstate()
        if startingState is not None:
            ones[0] += startingState

    data = NP("cumsum", ones)

    if setstate is not None and len(dataColumn) > 0:
        setstate(data[-1])

    return DataColumn(fieldType, data, None)
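# A minimal standalone sketch (plain NumPy, hypothetical names) of the
# trick functionSum uses: the validity/where/group selections are ANDed
# into an array of ones (deselected rows become 0.0), which is then
# multiplied by the data, so deselected rows contribute nothing to the
# cumulative sum.
def _demo_masked_cumsum(values, selected, state=None):
    import numpy as np
    ones = np.ones(len(values))
    np.logical_and(ones, selected, ones)   # deselected rows -> 0.0
    np.multiply(ones, values, ones)
    if state is not None:
        ones[0] += state                   # carry the previous batch's total
    data = np.cumsum(ones)
    return data, data[-1]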
def functionMultisetFake(self, value, howmany, fieldType):
    """Derives a multiset of rows in a DataColumn when it is known that there are no matches.

    @type value: dict
    @param value: The starting multiset state, which is also the final state because no rows match.
    @type howmany: int
    @param howmany: Number of rows.
    @type fieldType: FieldType
    @param fieldType: The type of field to emulate.
    @rtype: DataColumn
    @return: The faked results.
    """

    # the emulated field is always re-typed as a generic object field
    fieldType = FakeFieldType("object", "any")

    data = NP("empty", howmany, dtype=fieldType.dtype)
    data[:] = value

    return DataColumn(fieldType, data, None)
class Constant(object):
    """Equivalent of a PMML <Constant> element."""

    def __init__(self, dataType, value):
        self.fieldType = FakeFieldType(dataType, "continuous")
        self.value = value

    def evaluate(self, dataTable, functionTable, performanceTable):
        data = NP("empty", len(dataTable), dtype=self.fieldType.dtype)
        data[:] = self.value
        return self.fieldType.toDataColumn(data, None)

    def __repr__(self):
        return repr(self.value)

    def asPmml(self, E):
        return E.Constant(str(self.value), dataType=self.fieldType.dataType)
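# A minimal standalone sketch of how an element factory like the E
# passed to Constant.asPmml can be built with lxml (assuming lxml is the
# XML library in use, as elsewhere in this codebase).
def _demo_constant_as_pmml():
    from lxml import etree
    from lxml.builder import ElementMaker
    E = ElementMaker()
    element = E.Constant("3.14", dataType="double")
    # serializes as <Constant dataType="double">3.14</Constant>
    return etree.tostring(element)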
def emptyDataTable(self):
    """Construct an empty DataTable from the serialized DataTableFields and DataTableState.

    @rtype: DataTable
    @return: An empty DataTable, suitable for PmmlPlotContent.prepare.
    """

    context = {}
    inputData = {}
    inputState = self.unserializeState()
    for name, value in inputState.iteritems():
        if name.endswith(".context"):
            for fieldName, (dataType, optype) in value.iteritems():
                context[fieldName] = FakeFieldType(dataType, optype)
                inputData[fieldName] = []

    return DataTable(context, inputData, inputState=inputState)
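# A minimal standalone sketch (hypothetical state keys and values) of the
# convention emptyDataTable relies on: state entries whose keys end in
# ".context" map field names to (dataType, optype) pairs, from which the
# empty table's context and empty input columns are rebuilt.
def _demo_context_from_state():
    inputState = {"myPlot.context": {"x": ("double", "continuous"),
                                     "category": ("string", "categorical")}}
    context, inputData = {}, {}
    for name, value in inputState.items():
        if name.endswith(".context"):
            for fieldName, (dataType, optype) in value.items():
                context[fieldName] = (dataType, optype)   # stand-in for FakeFieldType
                inputData[fieldName] = []
    return context, inputData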
def functionCount(self, dataColumn, whereMask, groupSelection, getstate, setstate):
    """Counts rows in a DataColumn, possibly with an SQL where mask and groupField.

    @type dataColumn: DataColumn
    @param dataColumn: The input data column.
    @type whereMask: 1d Numpy array of bool, or None
    @param whereMask: The result of the SQL where selection.
    @type groupSelection: 1d Numpy array of bool, or None
    @param groupSelection: Rows corresponding to a particular value of the groupField.
    @type getstate: callable function
    @param getstate: Retrieve starting values from the DataTableState.
    @type setstate: callable function
    @param setstate: Store ending values to the DataTableState.
    @rtype: DataColumn
    @return: A column of counted rows.
    """

    fieldType = FakeFieldType("integer", "continuous")

    ones = NP("ones", len(dataColumn), dtype=fieldType.dtype)
    if dataColumn.mask is not None:
        NP("logical_and", ones, NP(dataColumn.mask == defs.VALID), ones)
    if whereMask is not None:
        NP("logical_and", ones, whereMask, ones)
    if groupSelection is not None:
        NP("logical_and", ones, groupSelection, ones)

    if getstate is not None and len(dataColumn) > 0:
        startingState = getstate()
        if startingState is not None:
            ones[0] += startingState

    data = NP("cumsum", ones)

    if setstate is not None and len(dataColumn) > 0:
        setstate(data[-1])

    return DataColumn(fieldType, data, None)
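# A minimal standalone sketch (plain NumPy, hypothetical names) of how
# the getstate/setstate callbacks let functionCount continue across
# event batches: the last cumulative count of one batch seeds element 0
# of the next.
def _demo_streaming_count():
    import numpy as np
    state = {"count": None}
    getstate = lambda: state["count"]
    def setstate(value):
        state["count"] = value

    for batch in [np.array([True, False, True]), np.array([True, True, False])]:
        ones = batch.astype(int)
        if getstate() is not None:
            ones[0] += getstate()
        data = np.cumsum(ones)
        setstate(data[-1])
    return state["count"]   # 4 selected rows in total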
def prepare(self, state, dataTable, functionTable, performanceTable, plotRange):
    """Prepare a plot element for drawing.

    This stage consists of calculating all quantities and determining the bounds of the data.  These bounds may be unioned with bounds from other plot elements that overlay this plot element, so the drawing (which requires a finalized coordinate system) cannot begin yet.

    This method modifies C{plotRange}.

    @type state: ad-hoc Python object
    @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
    @type dataTable: DataTable
    @param dataTable: Contains the data to plot.
    @type functionTable: FunctionTable
    @param functionTable: Defines functions that may be used to transform data for plotting.
    @type performanceTable: PerformanceTable
    @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process.
    @type plotRange: PlotRange
    @param plotRange: The bounding box of plot coordinates that this function will expand.
    """

    self._saveContext(dataTable)

    for directive in self.xpath("pmml:PlotLine"):
        try:
            x1 = float(directive["x1"])
            y1 = float(directive["y1"])
            x2 = float(directive["x2"])
            y2 = float(directive["y2"])
        except ValueError:
            pass
        else:
            fieldType = FakeFieldType("double", "continuous")
            plotRange.xminPush(x1, fieldType, sticky=False)
            plotRange.yminPush(y1, fieldType, sticky=False)
            plotRange.xmaxPush(x2, fieldType, sticky=False)
            plotRange.ymaxPush(y2, fieldType, sticky=False)
class PlotCurve(PmmlPlotContent):
    """Represents a curve defined by mathematical formulae or a jagged line/smooth curve through a set of data points.

    PMML subelements for a 1d formula:

      - PlotFormula role="y(x)"
      - PlotFormula role="dy/dx" (optional)

    PMML subelements for a parametric formula:

      - PlotFormula role="x(t)"
      - PlotFormula role="y(t)"
      - PlotFormula role="dx/dt" (optional)
      - PlotFormula role="dy/dt" (optional)

    PMML subelements for a fit to data points:

      - PlotNumericExpression role="x"
      - PlotNumericExpression role="y"
      - PlotNumericExpression role="dx" (optional)
      - PlotNumericExpression role="dy" (optional)
      - PlotSelection (optional)

    PMML attributes:

      - svgId: id for the resulting SVG element.
      - stateId: key for persistent storage in a DataTableState.
      - low: low edge of domain (in x or t) for mathematical formulae.
      - high: high edge of domain (in x or t) for mathematical formulae.
      - numSamples: number of locations to sample for mathematical formulae.
      - samplingMethod: "uniform", "random", or "adaptive".
      - loop: if "true", draw a closed loop that connects the first and last points.
      - smooth: if "false", draw a jagged line between each data point; if "true", fit a smooth curve.
      - smoothingScale: size of the smoothing scale in units of the domain (in x or t).
      - style: CSS style properties.

    CSS properties:

      - fill, fill-opacity: color under the curve.
      - stroke, stroke-dasharray, stroke-dashoffset, stroke-linecap, stroke-linejoin, stroke-miterlimit, stroke-opacity, stroke-width: properties of the line drawing.

    See the source code for the full XSD.
    """

    styleProperties = ["fill", "fill-opacity",
                       "stroke", "stroke-dasharray", "stroke-dashoffset",
                       "stroke-linecap", "stroke-linejoin",
                       "stroke-miterlimit", "stroke-opacity", "stroke-width",
                       ]

    styleDefaults = {"fill": "none", "stroke": "black"}

    xsd = """<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">
    <xs:element name="PlotCurve">
        <xs:complexType>
            <xs:sequence>
                <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded" />
                <xs:choice minOccurs="1" maxOccurs="1">
                    <xs:element ref="PlotFormula" minOccurs="1" maxOccurs="4" />
                    <xs:sequence>
                        <xs:element ref="PlotNumericExpression" minOccurs="1" maxOccurs="4" />
                        <xs:element ref="PlotSelection" minOccurs="0" maxOccurs="1" />
                    </xs:sequence>
                </xs:choice>
            </xs:sequence>
            <xs:attribute name="svgId" type="xs:string" use="optional" />
            <xs:attribute name="stateId" type="xs:string" use="optional" />
            <xs:attribute name="low" type="xs:double" use="optional" />
            <xs:attribute name="high" type="xs:double" use="optional" />
            <xs:attribute name="numSamples" type="xs:positiveInteger" use="optional" default="100" />
            <xs:attribute name="samplingMethod" use="optional" default="uniform">
                <xs:simpleType>
                    <xs:restriction base="xs:string">
                        <xs:enumeration value="uniform" />
                        <xs:enumeration value="random" />
                        <xs:enumeration value="adaptive" />
                    </xs:restriction>
                </xs:simpleType>
            </xs:attribute>
            <xs:attribute name="loop" type="xs:boolean" use="optional" default="false" />
            <xs:attribute name="smooth" type="xs:boolean" use="optional" default="true" />
            <xs:attribute name="smoothingScale" type="xs:double" use="optional" default="1.0" />
            <xs:attribute name="style" type="xs:string" use="optional" default="%s" />
        </xs:complexType>
    </xs:element>
</xs:schema>
""" % PlotStyle.toString(styleDefaults)

    xfieldType = FakeFieldType("double", "continuous")

    @classmethod
    def expressionsToPoints(cls, expression, derivative, samples, loop, functionTable, performanceTable):
        """Evaluate a set of given string-based formulae to generate numeric points.

        This is used to plot mathematical curves.

        @type expression: 1- or 2-tuple of strings
        @param expression: If a 1-tuple, the string is passed to Formula and interpreted as y(x); if a 2-tuple, the strings are passed to Formula and interpreted as x(t), y(t).
        @type derivative: 1- or 2-tuple of strings (same length as C{expression})
        @param derivative: Strings are passed to Formula and interpreted as dy/dx (if a 1-tuple) or dx/dt, dy/dt (if a 2-tuple).
        @type samples: 1d Numpy array
        @param samples: Values of x or t at which to evaluate the expression or expressions.
        @type loop: bool
        @param loop: If False, disconnect the end of the set of points from the beginning.
        @type functionTable: FunctionTable
        @param functionTable: Functions that may be used to perform the calculation.
        @type performanceTable: PerformanceTable
        @param performanceTable: Measures and records performance (time and memory consumption) of the process.
        @rtype: 6-tuple
        @return: C{xlist}, C{ylist}, C{dxlist}, C{dylist} (1d Numpy arrays), C{xfieldType}, C{yfieldType} (FieldTypes).
        """

        if len(expression) == 1:
            sampleTable = DataTable({"x": "double"}, {"x": samples})
            parsed = Formula.parse(expression[0])

            ydataColumn = parsed.evaluate(sampleTable, functionTable, performanceTable)
            if not ydataColumn.fieldType.isnumeric() and not ydataColumn.fieldType.istemporal():
                raise defs.PmmlValidationError("PlotFormula y(x) must return a numeric expression, not %r" % ydataColumn.fieldType)

            xfieldType = cls.xfieldType
            yfieldType = ydataColumn.fieldType

            selection = None
            if ydataColumn.mask is not None:
                selection = NP(ydataColumn.mask == defs.VALID)

            if derivative[0] is None:
                if selection is None:
                    xlist = samples
                    ylist = ydataColumn.data
                else:
                    xlist = samples[selection]
                    ylist = ydataColumn.data[selection]

                # central differences for the tangents
                dxlist = NP((NP("roll", xlist, -1) - NP("roll", xlist, 1)) / 2.0)
                dylist = NP((NP("roll", ylist, -1) - NP("roll", ylist, 1)) / 2.0)
                if not loop:
                    dxlist[0] = 0.0
                    dxlist[-1] = 0.0
                    dylist[0] = 0.0
                    dylist[-1] = 0.0

            else:
                parsed = Formula.parse(derivative[0])
                dydataColumn = parsed.evaluate(sampleTable, functionTable, performanceTable)
                if not dydataColumn.fieldType.isnumeric() and not dydataColumn.fieldType.istemporal():
                    raise defs.PmmlValidationError("PlotFormula dy/dx must return a numeric expression, not %r" % dydataColumn.fieldType)

                if dydataColumn.mask is not None:
                    if selection is None:
                        selection = NP(dydataColumn.mask == defs.VALID)
                    else:
                        NP("logical_and", selection, NP(dydataColumn.mask == defs.VALID), selection)

                if selection is None:
                    xlist = samples
                    ylist = ydataColumn.data
                    dxlist = NP((NP("roll", xlist, -1) - NP("roll", xlist, 1)) / 2.0)
                    dylist = dydataColumn.data
                else:
                    xlist = samples[selection]
                    ylist = ydataColumn.data[selection]
                    dxlist = NP((NP("roll", xlist, -1) - NP("roll", xlist, 1)) / 2.0)
                    dylist = NP(dydataColumn.data[selection] * dxlist)

                if not loop:
                    dxlist[0] = 0.0
                    dxlist[-1] = 0.0
                    dylist[0] = 0.0
                    dylist[-1] = 0.0

        elif len(expression) == 2:
            sampleTable = DataTable({"t": "double"}, {"t": samples})

            parsed = Formula.parse(expression[0])
            xdataColumn = parsed.evaluate(sampleTable, functionTable, performanceTable)
            if not xdataColumn.fieldType.isnumeric() and not xdataColumn.fieldType.istemporal():
                raise defs.PmmlValidationError("PlotFormula x(t) must return a numeric expression, not %r" % xdataColumn.fieldType)

            parsed = Formula.parse(expression[1])
            ydataColumn = parsed.evaluate(sampleTable, functionTable, performanceTable)
            if not ydataColumn.fieldType.isnumeric() and not ydataColumn.fieldType.istemporal():
                raise defs.PmmlValidationError("PlotFormula y(t) must return a numeric expression, not %r" % ydataColumn.fieldType)

            xfieldType = xdataColumn.fieldType
            yfieldType = ydataColumn.fieldType

            selection = None
            if xdataColumn.mask is not None:
                selection = NP(xdataColumn.mask == defs.VALID)
            if ydataColumn.mask is not None:
                if selection is None:
                    selection = NP(ydataColumn.mask == defs.VALID)
                else:
                    NP("logical_and", selection, NP(ydataColumn.mask == defs.VALID), selection)

            if derivative[0] is None:
                if selection is None:
                    xlist = xdataColumn.data
                    ylist = ydataColumn.data
                else:
                    xlist = xdataColumn.data[selection]
                    ylist = ydataColumn.data[selection]

                dxlist = NP((NP("roll", xlist, -1) - NP("roll", xlist, 1)) / 2.0)
                dylist = NP((NP("roll", ylist, -1) - NP("roll", ylist, 1)) / 2.0)
                if not loop:
                    dxlist[0] = 0.0
                    dxlist[-1] = 0.0
                    dylist[0] = 0.0
                    dylist[-1] = 0.0

            else:
                parsed = Formula.parse(derivative[0])
                dxdataColumn = parsed.evaluate(sampleTable, functionTable, performanceTable)
                if not dxdataColumn.fieldType.isnumeric() and not dxdataColumn.fieldType.istemporal():
                    raise defs.PmmlValidationError("PlotFormula dx/dt must return a numeric expression, not %r" % dxdataColumn.fieldType)

                parsed = Formula.parse(derivative[1])
                dydataColumn = parsed.evaluate(sampleTable, functionTable, performanceTable)
                if not dydataColumn.fieldType.isnumeric() and not dydataColumn.fieldType.istemporal():
                    raise defs.PmmlValidationError("PlotFormula dy/dt must return a numeric expression, not %r" % dydataColumn.fieldType)

                if dxdataColumn.mask is not None:
                    if selection is None:
                        selection = NP(dxdataColumn.mask == defs.VALID)
                    else:
                        NP("logical_and", selection, NP(dxdataColumn.mask == defs.VALID), selection)
                if dydataColumn.mask is not None:
                    if selection is None:
                        selection = NP(dydataColumn.mask == defs.VALID)
                    else:
                        NP("logical_and", selection, NP(dydataColumn.mask == defs.VALID), selection)

                if selection is None:
                    dt = NP((NP("roll", samples, -1) - NP("roll", samples, 1)) / 2.0)
                    xlist = xdataColumn.data
                    ylist = ydataColumn.data
                    dxlist = NP(dxdataColumn.data * dt)
                    dylist = NP(dydataColumn.data * dt)
                else:
                    dt = NP((NP("roll", samples[selection], -1) - NP("roll", samples[selection], 1)) / 2.0)
                    xlist = xdataColumn.data[selection]
                    ylist = ydataColumn.data[selection]
                    dxlist = NP(dxdataColumn.data[selection] * dt)
                    dylist = NP(dydataColumn.data[selection] * dt)

                if not loop:
                    dxlist[0] = 0.0
                    dxlist[-1] = 0.0
                    dylist[0] = 0.0
                    dylist[-1] = 0.0

        return xlist, ylist, dxlist, dylist, xfieldType, yfieldType

    @staticmethod
    def pointsToSmoothCurve(xarray, yarray, samples, smoothingScale, loop):
        """Fit a smooth line through a set of given numeric points with a characteristic smoothing scale.

        This is a non-parametric locally linear fit, used to plot data as a smooth line.

        @type xarray: 1d Numpy array of numbers
        @param xarray: Array of x values.
        @type yarray: 1d Numpy array of numbers
        @param yarray: Array of y values.
        @type samples: 1d Numpy array of numbers
        @param samples: Locations at which to fit the C{xarray} and C{yarray} with best-fit positions and derivatives.
        @type smoothingScale: number
        @param smoothingScale: Standard deviation of the Gaussian kernel used to smooth the locally linear fit.
        @type loop: bool
        @param loop: If False, disconnect the end of the fitted curve from the beginning.
        @rtype: 4-tuple of 1d Numpy arrays
        @return: C{xlist}, C{ylist}, C{dxlist}, C{dylist} appropriate for C{formatPathdata}.
        """

        ylist = []
        dylist = []

        for sample in samples:
            # Gaussian kernel weights centered on this sample
            weights = NP(NP(NP("exp", NP(NP(-0.5 * NP("power", NP(xarray - sample), 2)) / NP(smoothingScale * smoothingScale))) / smoothingScale) / (math.sqrt(2.0 * math.pi)))

            # weighted linear least-squares fit at this sample
            sum1 = weights.sum()
            sumx = NP(weights * xarray).sum()
            sumxx = NP(weights * NP(xarray * xarray)).sum()
            sumy = NP(weights * yarray).sum()
            sumxy = NP(weights * NP(xarray * yarray)).sum()

            delta = (sum1 * sumxx) - (sumx * sumx)
            intercept = ((sumxx * sumy) - (sumx * sumxy)) / delta
            slope = ((sum1 * sumxy) - (sumx * sumy)) / delta

            ylist.append(intercept + (sample * slope))
            dylist.append(slope)

        xlist = samples
        ylist = NP("array", ylist, dtype=NP.dtype(float))
        dxlist = NP((NP("roll", xlist, -1) - NP("roll", xlist, 1)) / 2.0)
        dylist = NP("array", dylist, dtype=NP.dtype(float)) * dxlist
        if not loop:
            dxlist[0] = 0.0
            dxlist[-1] = 0.0
            dylist[0] = 0.0
            dylist[-1] = 0.0

        return xlist, ylist, dxlist, dylist

    @staticmethod
    def formatPathdata(xlist, ylist, dxlist, dylist, plotCoordinates, loop, smooth):
        """Compute SVG path data from position and derivatives lists.

        @type xlist: 1d Numpy array of numbers
        @param xlist: Array of x values at each point t.
        @type ylist: 1d Numpy array of numbers
        @param ylist: Array of y values at each point t.
        @type dxlist: 1d Numpy array of numbers
        @param dxlist: Array of dx/dt derivatives at each point t.
        @type dylist: 1d Numpy array of numbers
        @param dylist: Array of dy/dt derivatives at each point t.
        @type plotCoordinates: PlotCoordinates
        @param plotCoordinates: Coordinate system to convert the points.
        @type loop: bool
        @param loop: If True, the last point should be connected to the first point.
        @type smooth: bool
        @param smooth: If True, use the derivatives (C{dxlist} and C{dylist}) to define Bezier curves between the points; otherwise, draw straight lines.
        @rtype: list of strings
        @return: When concatenated with spaces, the return type is appropriate for an SVG path's C{d} attribute.
        """

        pathdata = []
        if not smooth:
            X, Y = plotCoordinates(xlist, ylist)

            nextIsMoveto = True
            for x, y in itertools.izip(X, Y):
                if nextIsMoveto:
                    pathdata.append("M %r %r" % (x, y))
                    nextIsMoveto = False
                else:
                    pathdata.append("L %r %r" % (x, y))

            if loop:
                pathdata.append("Z")

        else:
            # cubic Bezier control points from the tangents: C1 is a third
            # of the way along the previous point's outgoing tangent, C2 a
            # third of the way back along the current point's incoming one
            C1x = NP("roll", xlist, 1) + NP("roll", dxlist, 1) / 3.0
            C1y = NP("roll", ylist, 1) + NP("roll", dylist, 1) / 3.0
            C2x = xlist - dxlist / 3.0
            C2y = ylist - dylist / 3.0

            X, Y = plotCoordinates(xlist, ylist)
            C1X, C1Y = plotCoordinates(C1x, C1y)
            C2X, C2Y = plotCoordinates(C2x, C2y)

            nextIsMoveto = True
            for x, y, c1x, c1y, c2x, c2y in itertools.izip(X, Y, C1X, C1Y, C2X, C2Y):
                if nextIsMoveto:
                    pathdata.append("M %r %r" % (x, y))
                    nextIsMoveto = False
                else:
                    pathdata.append("C %r %r %r %r %r %r" % (c1x, c1y, c2x, c2y, x, y))

            if loop:
                pathdata.append("Z")

        return pathdata

    def generateSamples(self, low, high):
        """Used by C{prepare} to generate an array of samples.

        @type low: number
        @param low: Minimum value to sample.
        @type high: number
        @param high: Maximum value to sample.
        @rtype: 1d Numpy array
        @return: An array of uniform, random, or adaptive samples of an interval.
        """

        numSamples = self.get("numSamples", defaultFromXsd=True, convertType=True)
        samplingMethod = self.get("samplingMethod", defaultFromXsd=True)

        if samplingMethod == "uniform":
            samples = NP("linspace", low, high, numSamples, endpoint=True)
        elif samplingMethod == "random":
            samples = NP(NP(NP(NP.random.rand(numSamples)) * (high - low)) + low)
            samples.sort()
        else:
            raise NotImplementedError("TODO: add 'adaptive'")

        return samples

    def prepare(self, state, dataTable, functionTable, performanceTable, plotRange):
        """Prepare a plot element for drawing.

        This stage consists of calculating all quantities and determining the bounds of the data.  These bounds may be unioned with bounds from other plot elements that overlay this plot element, so the drawing (which requires a finalized coordinate system) cannot begin yet.

        This method modifies C{plotRange}.

        @type state: ad-hoc Python object
        @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
        @type dataTable: DataTable
        @param dataTable: Contains the data to plot.
        @type functionTable: FunctionTable
        @param functionTable: Defines functions that may be used to transform data for plotting.
        @type performanceTable: PerformanceTable
        @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process.
        @type plotRange: PlotRange
        @param plotRange: The bounding box of plot coordinates that this function will expand.
        """

        self.checkRoles(["y(x)", "dy/dx", "x(t)", "y(t)", "dx/dt", "dy/dt", "x", "y", "dx", "dy"])

        performanceTable.begin("PlotCurve prepare")
        self._saveContext(dataTable)

        yofx = self.xpath("pmml:PlotFormula[@role='y(x)']")
        dydx = self.xpath("pmml:PlotFormula[@role='dy/dx']")
        xoft = self.xpath("pmml:PlotFormula[@role='x(t)']")
        yoft = self.xpath("pmml:PlotFormula[@role='y(t)']")
        dxdt = self.xpath("pmml:PlotFormula[@role='dx/dt']")
        dydt = self.xpath("pmml:PlotFormula[@role='dy/dt']")
        nx = self.xpath("pmml:PlotNumericExpression[@role='x']")
        ny = self.xpath("pmml:PlotNumericExpression[@role='y']")
        ndx = self.xpath("pmml:PlotNumericExpression[@role='dx']")
        ndy = self.xpath("pmml:PlotNumericExpression[@role='dy']")
        cutExpression = self.xpath("pmml:PlotSelection")

        if len(yofx) + len(dydx) + len(xoft) + len(yoft) + len(dxdt) + len(dydt) > 0:
            if len(yofx) == 1 and len(dydx) == 0 and len(xoft) == 0 and len(yoft) == 0 and len(dxdt) == 0 and len(dydt) == 0:
                expression = (yofx[0].text,)
                derivative = (None,)
            elif len(yofx) == 1 and len(dydx) == 1 and len(xoft) == 0 and len(yoft) == 0 and len(dxdt) == 0 and len(dydt) == 0:
                expression = (yofx[0].text,)
                derivative = (dydx[0].text,)
            elif len(yofx) == 0 and len(dydx) == 0 and len(xoft) == 1 and len(yoft) == 1 and len(dxdt) == 0 and len(dydt) == 0:
                expression = xoft[0].text, yoft[0].text
                derivative = None, None
            elif len(yofx) == 0 and len(dydx) == 0 and len(xoft) == 1 and len(yoft) == 1 and len(dxdt) == 1 and len(dydt) == 1:
                expression = xoft[0].text, yoft[0].text
                derivative = dxdt[0].text, dydt[0].text
            else:
                raise defs.PmmlValidationError("The only allowed combinations of PlotFormulae are: \"y(x)\", \"y(x) dy/dx\", \"x(t) y(t)\", and \"x(t) y(t) dx/dt dy/dt\"")

            low = self.get("low", convertType=True)
            high = self.get("high", convertType=True)
            if low is None or high is None:
                raise defs.PmmlValidationError("The \"low\" and \"high\" attributes are required for PlotCurves defined by formulae")

            samples = self.generateSamples(low, high)

            loop = self.get("loop", defaultFromXsd=True, convertType=True)
            state.x, state.y, state.dx, state.dy, xfieldType, yfieldType = self.expressionsToPoints(expression, derivative, samples, loop, functionTable, performanceTable)

        else:
            performanceTable.pause("PlotCurve prepare")
            if len(ndx) == 1:
                dxdataColumn = ndx[0].evaluate(dataTable, functionTable, performanceTable)
            else:
                dxdataColumn = None
            if len(ndy) == 1:
                dydataColumn = ndy[0].evaluate(dataTable, functionTable, performanceTable)
            else:
                dydataColumn = None
            performanceTable.unpause("PlotCurve prepare")

            if len(nx) == 0 and len(ny) == 1:
                performanceTable.pause("PlotCurve prepare")
                ydataColumn = ny[0].evaluate(dataTable, functionTable, performanceTable)
                performanceTable.unpause("PlotCurve prepare")

                if len(cutExpression) == 1:
                    performanceTable.pause("PlotCurve prepare")
                    selection = cutExpression[0].select(dataTable, functionTable, performanceTable)
                    performanceTable.unpause("PlotCurve prepare")
                else:
                    selection = NP("ones", len(ydataColumn.data), NP.dtype(bool))

                if ydataColumn.mask is not None:
                    selection = NP("logical_and", selection, NP(ydataColumn.mask == defs.VALID), selection)
                if dxdataColumn is not None and dxdataColumn.mask is not None:
                    selection = NP("logical_and", selection, NP(dxdataColumn.mask == defs.VALID), selection)
                if dydataColumn is not None and dydataColumn.mask is not None:
                    selection = NP("logical_and", selection, NP(dydataColumn.mask == defs.VALID), selection)

                yarray = ydataColumn.data[selection]

                # implicit x coordinates: 0, 1, 2, ...
                xarray = NP("ones", len(yarray), dtype=NP.dtype(float))
                xarray[0] = 0.0
                xarray = NP("cumsum", xarray)

                dxarray, dyarray = None, None
                if dxdataColumn is not None:
                    dxarray = dxdataColumn.data[selection]
                if dydataColumn is not None:
                    dyarray = dydataColumn.data[selection]

                xfieldType = self.xfieldType
                yfieldType = ydataColumn.fieldType

            elif len(nx) == 1 and len(ny) == 1:
                performanceTable.pause("PlotCurve prepare")
                xdataColumn = nx[0].evaluate(dataTable, functionTable, performanceTable)
                ydataColumn = ny[0].evaluate(dataTable, functionTable, performanceTable)
                performanceTable.unpause("PlotCurve prepare")

                if len(cutExpression) == 1:
                    performanceTable.pause("PlotCurve prepare")
                    selection = cutExpression[0].select(dataTable, functionTable, performanceTable)
                    performanceTable.unpause("PlotCurve prepare")
                else:
                    selection = NP("ones", len(ydataColumn.data), NP.dtype(bool))

                if xdataColumn.mask is not None:
                    selection = NP("logical_and", selection, NP(xdataColumn.mask == defs.VALID), selection)
                if ydataColumn.mask is not None:
                    selection = NP("logical_and", selection, NP(ydataColumn.mask == defs.VALID), selection)
                if dxdataColumn is not None and dxdataColumn.mask is not None:
                    selection = NP("logical_and", selection, NP(dxdataColumn.mask == defs.VALID), selection)
                if dydataColumn is not None and dydataColumn.mask is not None:
                    selection = NP("logical_and", selection, NP(dydataColumn.mask == defs.VALID), selection)

                xarray = xdataColumn.data[selection]
                yarray = ydataColumn.data[selection]

                dxarray, dyarray = None, None
                if dxdataColumn is not None:
                    dxarray = dxdataColumn.data[selection]
                if dydataColumn is not None:
                    dyarray = dydataColumn.data[selection]

                xfieldType = xdataColumn.fieldType
                yfieldType = ydataColumn.fieldType

            else:
                raise defs.PmmlValidationError("The only allowed combinations of PlotNumericExpressions are: \"y(x)\" and \"x(t) y(t)\"")

            persistentState = {}
            stateId = self.get("stateId")
            if stateId is not None:
                if stateId in dataTable.state:
                    persistentState = dataTable.state[stateId]
                    xarray = NP("concatenate", [xarray, persistentState["x"]])
                    yarray = NP("concatenate", [yarray, persistentState["y"]])
                    if dxarray is not None:
                        dxarray = NP("concatenate", [dxarray, persistentState["dx"]])
                    if dyarray is not None:
                        dyarray = NP("concatenate", [dyarray, persistentState["dy"]])
                else:
                    dataTable.state[stateId] = persistentState

                persistentState["x"] = xarray
                persistentState["y"] = yarray
                if dxarray is not None:
                    persistentState["dx"] = dxarray
                if dyarray is not None:
                    persistentState["dy"] = dyarray

            smooth = self.get("smooth", defaultFromXsd=True, convertType=True)
            if not smooth:
                if dyarray is not None and dxarray is None:
                    dxarray = NP((NP("roll", xarray, -1) - NP("roll", xarray, 1)) / 2.0)
                    dyarray = dyarray * dxarray

                loop = self.get("loop", defaultFromXsd=True, convertType=True)
                if dxarray is not None and not loop:
                    dxarray[0] = 0.0
                    dxarray[-1] = 0.0
                if dyarray is not None and not loop:
                    dyarray[0] = 0.0
                    dyarray[-1] = 0.0

                state.x = xarray
                state.y = yarray
                state.dx = dxarray
                state.dy = dyarray

            else:
                smoothingScale = self.get("smoothingScale", defaultFromXsd=True, convertType=True)
                loop = self.get("loop", defaultFromXsd=True, convertType=True)

                samples = self.generateSamples(xarray.min(), xarray.max())
                state.x, state.y, state.dx, state.dy = self.pointsToSmoothCurve(xarray, yarray, samples, smoothingScale, loop)

        if plotRange is not None:
            plotRange.expand(state.x, state.y, xfieldType, yfieldType)

        performanceTable.end("PlotCurve prepare")

    def draw(self, state, plotCoordinates, plotDefinitions, performanceTable):
        """Draw the plot element.

        This stage consists of creating an SVG image of the pre-computed data.

        @type state: ad-hoc Python object
        @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
        @type plotCoordinates: PlotCoordinates
        @param plotCoordinates: The coordinate system in which this plot element will be placed.
        @type plotDefinitions: PlotDefinitions
        @param plotDefinitions: The dictionary of key-value pairs that forms the <defs> section of the SVG document.
        @type performanceTable: PerformanceTable
        @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process.
        @rtype: SvgBinding
        @return: An SVG fragment representing the fully drawn plot element.
        """

        svg = SvgBinding.elementMaker
        performanceTable.begin("PlotCurve draw")

        loop = self.get("loop", defaultFromXsd=True, convertType=True)
        pathdata = self.formatPathdata(state.x, state.y, state.dx, state.dy, plotCoordinates, loop, (state.dx is not None and state.dy is not None))

        output = svg.g()

        style = self.getStyleState()
        strokeStyle = dict((x, style[x]) for x in style if x.startswith("stroke"))
        fillStyle = dict((x, style[x]) for x in style if x.startswith("fill"))
        fillStyle["stroke"] = "none"

        if style["fill"] != "none":
            if len(self.xpath("pmml:PlotFormula[@role='y(x)']")) > 0 and len(pathdata) > 1:
                # close the fill region down to the y == 0 axis
                firstPoint = plotCoordinates(state.x[0], 0.0)
                lastPoint = plotCoordinates(state.x[-1], 0.0)

                X0, Y0 = plotCoordinates(state.x[0], state.y[0])

                pathdata2 = ["M %r %r" % firstPoint]
                pathdata2.append("L %r %r" % (X0, Y0))
                pathdata2.extend(pathdata[1:])
                pathdata2.append("L %r %r" % lastPoint)

                output.append(svg.path(d=" ".join(pathdata2), style=PlotStyle.toString(fillStyle)))
            else:
                output.append(svg.path(d=" ".join(pathdata), style=PlotStyle.toString(fillStyle)))

        output.append(svg.path(d=" ".join(pathdata), style=PlotStyle.toString(strokeStyle)))

        svgId = self.get("svgId")
        if svgId is not None:
            output["id"] = svgId

        performanceTable.end("PlotCurve draw")
        return output
class PlotSvgContent(PmmlPlotContent):
    """PlotSvgContent represents an SVG image embedded in a coordinate system.

    PMML subelements:

      - SvgBinding for inline SVG.

    PMML attributes:

      - svgId: id for the resulting SVG element.
      - fileName: for external SVG.
      - x1: left edge.
      - y1: bottom edge.
      - x2: right edge.
      - y2: top edge.

    Inline and external SVG are mutually exclusive.

    See the source code for the full XSD.
    """

    xsd = """<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">
    <xs:element name="PlotSvgContent">
        <xs:complexType>
            <xs:complexContent>
                <xs:restriction base="xs:anyType">
                    <xs:sequence>
                        <xs:any minOccurs="0" maxOccurs="1" processContents="skip" />
                    </xs:sequence>
                    <xs:attribute name="svgId" type="xs:string" use="optional" />
                    <xs:attribute name="fileName" type="xs:string" use="optional" />
                    <xs:attribute name="x1" type="xs:double" use="required" />
                    <xs:attribute name="y1" type="xs:double" use="required" />
                    <xs:attribute name="x2" type="xs:double" use="required" />
                    <xs:attribute name="y2" type="xs:double" use="required" />
                </xs:restriction>
            </xs:complexContent>
        </xs:complexType>
    </xs:element>
</xs:schema>
"""

    fieldTypeNumeric = FakeFieldType("double", "continuous")

    def prepare(self, state, dataTable, functionTable, performanceTable, plotRange):
        """Prepare a plot element for drawing.

        This stage consists of calculating all quantities and determining the bounds of the data.  These bounds may be unioned with bounds from other plot elements that overlay this plot element, so the drawing (which requires a finalized coordinate system) cannot begin yet.

        This method modifies C{plotRange}.

        @type state: ad-hoc Python object
        @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
        @type dataTable: DataTable
        @param dataTable: Contains the data to plot.
        @type functionTable: FunctionTable
        @param functionTable: Defines functions that may be used to transform data for plotting.
        @type performanceTable: PerformanceTable
        @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process.
        @type plotRange: PlotRange
        @param plotRange: The bounding box of plot coordinates that this function will expand.
        """

        self._saveContext(dataTable)

        x1 = float(self["x1"])
        y1 = float(self["y1"])
        x2 = float(self["x2"])
        y2 = float(self["y2"])

        if x1 >= x2 or y1 >= y2:
            raise defs.PmmlValidationError("x1 must be less than x2 and y1 must be less than y2")

        if plotRange.xStrictlyPositive or plotRange.yStrictlyPositive:
            raise defs.PmmlValidationError("PlotSvgContent can only be properly displayed in linear coordinates")

        plotRange.xminPush(x1, self.fieldTypeNumeric, sticky=True)
        plotRange.yminPush(y1, self.fieldTypeNumeric, sticky=True)
        plotRange.xmaxPush(x2, self.fieldTypeNumeric, sticky=True)
        plotRange.ymaxPush(y2, self.fieldTypeNumeric, sticky=True)

    def draw(self, state, plotCoordinates, plotDefinitions, performanceTable):
        """Draw the plot element.

        This stage consists of creating an SVG image of the pre-computed data.

        @type state: ad-hoc Python object
        @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
        @type plotCoordinates: PlotCoordinates
        @param plotCoordinates: The coordinate system in which this plot element will be placed.
        @type plotDefinitions: PlotDefinitions
        @param plotDefinitions: The dictionary of key-value pairs that forms the <defs> section of the SVG document.
        @type performanceTable: PerformanceTable
        @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process.
        @rtype: SvgBinding
        @return: An SVG fragment representing the fully drawn plot element.
        """

        svg = SvgBinding.elementMaker

        x1 = float(self["x1"])
        y1 = float(self["y1"])
        x2 = float(self["x2"])
        y2 = float(self["y2"])

        inlineSvg = self.getchildren()
        fileName = self.get("fileName")
        if len(inlineSvg) == 1 and fileName is None:
            svgBinding = inlineSvg[0]
        elif len(inlineSvg) == 0 and fileName is not None:
            svgBinding = SvgBinding.loadXml(fileName)
        else:
            raise defs.PmmlValidationError("PlotSvgContent should specify an inline SVG or a fileName but not both or neither")

        sx1, sy1, sx2, sy2 = PlotSvgAnnotation.findSize(svgBinding)
        subCoordinates = PlotCoordinatesWindow(plotCoordinates, sx1, sy1, sx2, sy2, x1, y1, x2 - x1, y2 - y1)

        tx0, ty0 = subCoordinates(0.0, 0.0)
        tx1, ty1 = subCoordinates(1.0, 1.0)
        transform = "translate(%r, %r) scale(%r, %r)" % (tx0, ty0, tx1 - tx0, ty1 - ty0)

        attribs = {"transform": transform}
        svgId = self.get("svgId")
        if svgId is not None:
            attribs["id"] = svgId
        if "style" in svgBinding.attrib:
            attribs["style"] = svgBinding.attrib["style"]

        return svg.g(*(copy.deepcopy(svgBinding).getchildren()), **attribs)
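# A minimal standalone sketch (hypothetical names) of how draw() derives
# an SVG transform from two mapped points: for an affine coordinate map f
# with no rotation or skew, f(0, 0) gives the translation and
# f(1, 1) - f(0, 0) gives the per-axis scale, which is exactly what
# "translate(tx, ty) scale(sx, sy)" encodes.
def _demo_transform_from_map():
    f = lambda x, y: (10.0 + 2.0 * x, 50.0 - 5.0 * y)   # some affine map
    tx0, ty0 = f(0.0, 0.0)
    tx1, ty1 = f(1.0, 1.0)
    return "translate(%r, %r) scale(%r, %r)" % (tx0, ty0, tx1 - tx0, ty1 - ty0)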
def calculateScore(self, dataTable, functionTable, performanceTable):
    """Calculate the score of this model.

    This method is called by C{calculate} to separate operations that are performed by all models (in C{calculate}) from operations that are performed by specific models (in C{calculateScore}).

    @type dataTable: DataTable
    @param dataTable: The DataTable representing this model's lexical scope.
    @type functionTable: FunctionTable or None
    @param functionTable: A table of functions.
    @type performanceTable: PerformanceTable or None
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @rtype: DataColumn
    @return: A DataColumn containing the score.
    """

    performanceTable.begin("TreeModel")
    performanceTable.begin("set up")

    missingValueStrategy = self.get("missingValueStrategy", defaultFromXsd=True)
    if missingValueStrategy == "lastPrediction":
        missingValueStrategy = Node.LAST_PREDICTION
    elif missingValueStrategy == "nullPrediction":
        missingValueStrategy = Node.NULL_PREDICTION
    elif missingValueStrategy == "defaultChild":
        missingValueStrategy = Node.DEFAULT_CHILD
    elif missingValueStrategy == "weightedConfidence":
        missingValueStrategy = Node.WEIGHTED_CONFIDENCE
    elif missingValueStrategy == "aggregateNodes":
        missingValueStrategy = Node.AGGREGATE_NODES
    elif missingValueStrategy == "none":
        missingValueStrategy = Node.NONE

    missingValuePenalty = self.get("missingValuePenalty", defaultFromXsd=True, convertType=True)

    noTrueChildStrategy = self.get("noTrueChildStrategy", defaultFromXsd=True)
    if noTrueChildStrategy == "returnNullPrediction":
        noTrueChildStrategy = Node.RETURN_NULL_PREDICTION
    elif noTrueChildStrategy == "returnLastPrediction":
        noTrueChildStrategy = Node.RETURN_LAST_PREDICTION

    if self["functionName"] == "classification":
        fieldType = FakeFieldType("string", "categorical")
    elif self["functionName"] == "regression":
        fieldType = FakeFieldType("double", "continuous")
    else:
        raise defs.PmmlValidationError("TreeModel functionName may only be \"classification\" or \"regression\", not \"%s\"" % self["functionName"])

    performanceTable.end("set up")

    score = {None: DataColumn(fieldType,
                              NP("empty", len(dataTable), dtype=fieldType.dtype),
                              NP("ones", len(dataTable), dtype=defs.maskType))}
    score[None]._unlock()

    if self.subFields["entity"]:
        fieldType = FakeFieldType("object", "any")
        score["entity"] = DataColumn(fieldType,
                                     NP("empty", len(dataTable), dtype=fieldType.dtype),
                                     NP("ones", len(dataTable), dtype=defs.maskType))
        score["entity"]._unlock()

    if self.subFields["entityId"]:
        fieldType = FakeFieldType("string", "categorical")
        score["entityId"] = DataColumn(fieldType,
                                       NP("empty", len(dataTable), dtype=fieldType.dtype),
                                       NP("ones", len(dataTable), dtype=defs.maskType))
        score["entityId"]._unlock()

    if self.subFields["confidence"]:
        fieldType = FakeFieldType("double", "continuous")
        score["confidence"] = DataColumn(fieldType,
                                         NP("empty", len(dataTable), dtype=fieldType.dtype),
                                         NP("ones", len(dataTable), dtype=defs.maskType))
        score["confidence"]._unlock()

        fieldType = FakeFieldType("double", "continuous")
        score["penaltyProduct"] = DataColumn(fieldType,
                                             NP("ones", len(dataTable), dtype=fieldType.dtype),
                                             None)
        score["penaltyProduct"]._unlock()

    if self.subFields["probability"]:
        fieldType = FakeFieldType("double", "continuous")
        score["probability"] = DataColumn(fieldType,
                                          NP("empty", len(dataTable), dtype=fieldType.dtype),
                                          NP("ones", len(dataTable), dtype=defs.maskType))
        score["probability"]._unlock()

    node = self.childOfClass(Node)
    selection = node.evaluatePredicate(dataTable, functionTable, performanceTable, returnUnknowns=False)
    node.applyScore(dataTable, functionTable, performanceTable, selection, score, missingValueStrategy, missingValuePenalty, noTrueChildStrategy)

    if "confidence" in score:
        score["confidence"]._data *= score["penaltyProduct"].data
        del score["penaltyProduct"]

    for field in score.values():
        if not field.mask.any():
            field._mask = None
        else:
            field._mask *= defs.INVALID
        field._lock()

    performanceTable.end("TreeModel")
    return score
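# A minimal standalone sketch of replacing the if/elif chains above with
# a lookup table; the keys are the PMML enumeration strings for
# missingValueStrategy, and the values are stand-ins for the Node
# constants used by the real code.
_MISSING_VALUE_STRATEGIES = {
    "lastPrediction": "LAST_PREDICTION",
    "nullPrediction": "NULL_PREDICTION",
    "defaultChild": "DEFAULT_CHILD",
    "weightedConfidence": "WEIGHTED_CONFIDENCE",
    "aggregateNodes": "AGGREGATE_NODES",
    "none": "NONE",
}

def _demo_strategy_lookup(attributeValue):
    try:
        return _MISSING_VALUE_STRATEGIES[attributeValue]
    except KeyError:
        raise ValueError("unrecognized missingValueStrategy: %r" % attributeValue)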
class PlotHeatMap(PmmlPlotContent): """Represents a 2d heat map of a mathematical formula or a 2d histogram of data. PMML subelements for mathematical function plotting: - PlotFormula role="z(x,y)" PMML subelements for 2d histograms: - PlotNumericExpression role="x" - PlotNumericExpression role="y" - PlotNumericExpression role="zweight" (optional) - PlotSelection: expression or predicate to filter the data before plotting. PMML subelements for plotting the mean of a third coordinate z: - PlotNumericExpression role="x" - PlotNumericExpression role="y" - PlotNumericExpression role="zmean" - PlotSelection: expression or predicate to filter the data before plotting. PMML attribute: - svgId: id for the resulting SVG element. - stateId: key for persistent storage in a DataTableState. - xbins: number of histogram bins in the x direction. - ybins: number of histogram bins in the y direction. - xlow: low edge of the x range of the histogram. - ylow: low edge of the y range of the histogram. - xhigh: high edge of the x range of the histogram. - yhigh: high edge of the y range of the histogram. - imageRendering: "optimizeQuality", "optimizeSpeed" - onePixelBeyondBorder: if "true", extend the image beyond the border by one pixel. This is to work around a feature of many SVG viewers that blend the borders of a raster image into the background. See the source code for the full XSD. """ xsd = """<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"> <xs:element name="PlotHeatMap"> <xs:complexType> <xs:sequence> <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded" /> <xs:choice minOccurs="1" maxOccurs="1"> <xs:element ref="PlotFormula" minOccurs="1" maxOccurs="1" /> <xs:sequence> <xs:element ref="PlotNumericExpression" minOccurs="2" maxOccurs="3" /> <xs:element ref="PlotSelection" minOccurs="0" maxOccurs="1" /> </xs:sequence> </xs:choice> </xs:sequence> <xs:attribute name="svgId" type="xs:string" use="optional" /> <xs:attribute name="stateId" type="xs:string" use="optional" /> <xs:attribute name="xbins" type="xs:positiveInteger" use="optional" /> <xs:attribute name="ybins" type="xs:positiveInteger" use="optional" /> <xs:attribute name="xlow" type="xs:double" use="optional" /> <xs:attribute name="ylow" type="xs:double" use="optional" /> <xs:attribute name="xhigh" type="xs:double" use="optional" /> <xs:attribute name="yhigh" type="xs:double" use="optional" /> <xs:attribute name="imageRendering" use="optional" default="optimizeQuality"> <xs:simpleType> <xs:restriction base="xs:string"> <xs:enumeration value="optimizeQuality" /> <xs:enumeration value="optimizeSpeed" /> </xs:restriction> </xs:simpleType> </xs:attribute> <xs:attribute name="onePixelBeyondBorder" type="xs:boolean" use="optional" default="true" /> </xs:complexType> </xs:element> </xs:schema> """ xyfieldType = FakeFieldType("double", "continuous") zfieldType = FakeFieldType("double", "continuous") def prepare(self, state, dataTable, functionTable, performanceTable, plotRange): """Prepare a plot element for drawing. This stage consists of calculating all quantities and determing the bounds of the data. These bounds may be unioned with bounds from other plot elements that overlay this plot element, so the drawing (which requires a finalized coordinate system) cannot begin yet. This method modifies C{plotRange}. @type state: ad-hoc Python object @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage. 
This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState. @type dataTable: DataTable @param dataTable: Contains the data to plot. @type functionTable: FunctionTable @param functionTable: Defines functions that may be used to transform data for plotting. @type performanceTable: PerformanceTable @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process. @type plotRange: PlotRange @param plotRange: The bounding box of plot coordinates that this function will expand. """ self.checkRoles(["z(x,y)", "x", "y", "zmean", "zweight"]) performanceTable.begin("PlotHeatMap prepare") self._saveContext(dataTable) zofxy = self.xpath("pmml:PlotFormula[@role='z(x,y)']") xexpr = self.xpath("pmml:PlotNumericExpression[@role='x']") yexpr = self.xpath("pmml:PlotNumericExpression[@role='y']") zmean = self.xpath("pmml:PlotNumericExpression[@role='zmean']") zweight = self.xpath("pmml:PlotNumericExpression[@role='zweight']") cutExpression = self.xpath("pmml:PlotSelection") if len(zofxy) == 1 and len(xexpr) == 0 and len(yexpr) == 0 and len( zmean) == 0 and len(zweight) == 0: xbins = self.get("xbins", convertType=True) xlow = self.get("xlow", convertType=True) xhigh = self.get("xhigh", convertType=True) ybins = self.get("ybins", convertType=True) ylow = self.get("ylow", convertType=True) yhigh = self.get("yhigh", convertType=True) if xbins is None or xlow is None or xhigh is None or ybins is None or ylow is None or yhigh is None: raise defs.PmmlValidationError( "xbins, xlow, xhigh, ybins, ylow, and yhigh are required for HeatMaps of a mathematical formula" ) if xlow >= xhigh or ylow >= yhigh: raise defs.PmmlValidationError( "xlow must be less than xhigh and ylow must be less than yhigh" ) if plotRange.xStrictlyPositive or plotRange.yStrictlyPositive: raise defs.PmmlValidationError( "PlotHeatMap can only be properly displayed in linear x, y coordinates" ) xbinWidth = (xhigh - xlow) / float(xbins) ybinWidth = (yhigh - ylow) / float(ybins) xarray = NP("tile", NP("linspace", xlow, xhigh, xbins, endpoint=True), ybins) yarray = NP("repeat", NP("linspace", ylow, yhigh, ybins, endpoint=True), xbins) sampleTable = DataTable({ "x": "double", "y": "double" }, { "x": xarray, "y": yarray }) parsed = Formula.parse(zofxy[0].text) performanceTable.pause("PlotHeatMap prepare") zdataColumn = parsed.evaluate(sampleTable, functionTable, performanceTable) performanceTable.unpause("PlotHeatMap prepare") if not zdataColumn.fieldType.isnumeric(): raise defs.PmmlValidationError( "PlotFormula z(x,y) must return a numeric expression, not %r" % zdataColumn.fieldType) selection = NP("isfinite", zdataColumn.data) if zdataColumn.mask is not None: NP("logical_and", selection, NP(zdataColumn.mask == defs.VALID), selection) if plotRange.zStrictlyPositive: NP("logical_and", selection, NP(zdataColumn.data > 0.0), selection) gooddata = zdataColumn.data[selection] plotRange.zminPush(gooddata.min(), zdataColumn.fieldType, sticky=False) plotRange.zmaxPush(gooddata.max(), zdataColumn.fieldType, sticky=False) state.zdata = zdataColumn.data state.zmask = NP("logical_not", selection) * defs.INVALID elif len(zofxy) == 0 and len(xexpr) == 1 and len(yexpr) == 1: performanceTable.pause("PlotHeatMap prepare") xdataColumn = xexpr[0].evaluate(dataTable, functionTable, performanceTable) ydataColumn = yexpr[0].evaluate(dataTable, functionTable, performanceTable) performanceTable.unpause("PlotHeatMap prepare") xbins = self.get("xbins", convertType=True) 
xlow = self.get("xlow", convertType=True) xhigh = self.get("xhigh", convertType=True) ybins = self.get("ybins", convertType=True) ylow = self.get("ylow", convertType=True) yhigh = self.get("yhigh", convertType=True) if len(xdataColumn) > 0: if xlow is None: xlow = NP("nanmin", xdataColumn.data) if xhigh is None: xhigh = NP("nanmax", xdataColumn.data) if ylow is None: ylow = NP("nanmin", ydataColumn.data) if yhigh is None: yhigh = NP("nanmax", ydataColumn.data) else: if xlow is None: xlow = 0.0 if xhigh is None: xhigh = 1.0 if ylow is None: ylow = 0.0 if yhigh is None: yhigh = 1.0 if xbins is None: q1, q3 = NP("percentile", xdataColumn.data, [25.0, 75.0]) binWidth = 2.0 * (q3 - q1) / math.pow(len(xdataColumn.data), 1.0 / 3.0) if binWidth > 0.0: xbins = max(10, int(math.ceil((xhigh - xlow) / binWidth))) else: xbins = 10 if ybins is None: q1, q3 = NP("percentile", ydataColumn.data, [25.0, 75.0]) binWidth = 2.0 * (q3 - q1) / math.pow(len(ydataColumn.data), 1.0 / 3.0) if binWidth > 0.0: ybins = max(10, int(math.ceil((yhigh - ylow) / binWidth))) else: ybins = 10 if xlow >= xhigh or ylow >= yhigh: raise defs.PmmlValidationError( "xlow must be less than xhigh and ylow must be less than yhigh" ) if plotRange.xStrictlyPositive or plotRange.yStrictlyPositive: raise defs.PmmlValidationError( "PlotHeatMap can only be properly displayed in linear x, y coordinates" ) persistentState = {} stateId = self.get("stateId") if stateId is not None: if stateId in dataTable.state: persistentState = dataTable.state[stateId] else: dataTable.state[stateId] = persistentState if len(zmean) == 0: if "xbins" in persistentState: xbins = persistentState["xbins"] if "xlow" in persistentState: xlow = persistentState["xlow"] if "xhigh" in persistentState: xhigh = persistentState["xhigh"] if "ybins" in persistentState: ybins = persistentState["ybins"] if "ylow" in persistentState: ylow = persistentState["ylow"] if "yhigh" in persistentState: yhigh = persistentState["yhigh"] persistentState["xbins"] = xbins persistentState["xlow"] = xlow persistentState["xhigh"] = xhigh persistentState["ybins"] = ybins persistentState["ylow"] = ylow persistentState["yhigh"] = yhigh xbinWidth = (xhigh - xlow) / float(xbins) ybinWidth = (yhigh - ylow) / float(ybins) mask = NP("ones", len(dataTable), dtype=NP.dtype(float)) if xdataColumn.mask is not None: NP("multiply", mask, (xdataColumn.mask == defs.VALID), mask) if ydataColumn.mask is not None: NP("multiply", mask, (ydataColumn.mask == defs.VALID), mask) if len(cutExpression) == 1: performanceTable.pause("PlotHeatMap prepare") NP( "multiply", mask, cutExpression[0].select(dataTable, functionTable, performanceTable), mask) performanceTable.unpause("PlotHeatMap prepare") if len(zmean) == 0 and len(zweight) == 0: histogram, xedges, yedges = NP("histogram2d", ydataColumn.data, xdataColumn.data, bins=(ybins, xbins), range=[[ylow, yhigh], [xlow, xhigh]], weights=mask) if len(dataTable) == 0: # work around Numpy <= 1.6.1 bug histogram = NP("zeros", (ybins, xbins), dtype=NP.dtype(float)) if "histogram" in persistentState: persistentState["histogram"] = NP( persistentState["histogram"] + histogram) else: persistentState["histogram"] = histogram histogram = persistentState["histogram"] if plotRange.zStrictlyPositive: zmin = 0.1 else: zmin = 0.0 zmax = NP("nanmax", histogram) plotRange.zminPush(zmin, self.zfieldType, sticky=True) if zmax > zmin: plotRange.zmaxPush(zmax, self.zfieldType, sticky=False) elif len(zmean) == 0 and len(zweight) == 1: performanceTable.pause("PlotHeatMap prepare") 
weightsDataColumn = zweight[0].evaluate( dataTable, functionTable, performanceTable) performanceTable.unpause("PlotHeatMap prepare") if weightsDataColumn.mask is not None: NP("multiply", mask, (weightsDataColumn.mask == defs.VALID), mask) weights = NP(weightsDataColumn.data * mask) histogram, xedges, yedges = NP("histogram2d", ydataColumn.data, xdataColumn.data, bins=(ybins, xbins), range=[[ylow, yhigh], [xlow, xhigh]], weights=weights) if "histogram" in persistentState: persistentState["histogram"] = NP( persistentState["histogram"] + histogram) else: persistentState["histogram"] = histogram histogram = persistentState["histogram"] if plotRange.zStrictlyPositive: w = weights[NP(weights > 0.0)] if len(w) > 0: zmin = 0.1 * NP("nanmin", w) else: zmin = 0.1 else: zmin = 0.0 zmax = NP("nanmax", histogram) plotRange.zminPush(zmin, self.zfieldType, sticky=True) if zmax > zmin: plotRange.zmaxPush(zmax, self.zfieldType, sticky=False) elif len(zmean) == 1 and len(zweight) == 0: performanceTable.pause("PlotHeatMap prepare") zdataColumn = zmean[0].evaluate(dataTable, functionTable, performanceTable) performanceTable.unpause("PlotHeatMap prepare") if zdataColumn.mask is not None: NP("multiply", mask, (zdataColumn.mask == defs.VALID), mask) weights = NP(zdataColumn.data * mask) numer, xedges, yedges = NP("histogram2d", ydataColumn.data, xdataColumn.data, bins=(ybins, xbins), range=[[ylow, yhigh], [xlow, xhigh]], weights=weights) denom, xedges, yedges = NP("histogram2d", ydataColumn.data, xdataColumn.data, bins=(ybins, xbins), range=[[ylow, yhigh], [xlow, xhigh]], weights=mask) if "numer" in persistentState: persistentState["numer"] = NP(persistentState["numer"] + numer) persistentState["denom"] = NP(persistentState["denom"] + denom) else: persistentState["numer"] = numer persistentState["denom"] = denom numer = persistentState["numer"] denom = persistentState["denom"] histogram = numer / denom selection = NP("isfinite", histogram) if plotRange.zStrictlyPositive: NP("logical_and", selection, NP(histogram > 0.0), selection) if NP("count_nonzero", selection) > 0: gooddata = histogram[selection] plotRange.zminPush(gooddata.min(), self.zfieldType, sticky=False) plotRange.zmaxPush(gooddata.max(), self.zfieldType, sticky=False) else: raise defs.PmmlValidationError( "The only allowed combinations of PlotFormula/PlotNumericExpressions are: \"z(x,y)\" (function), \"x y\" (histogram), \"x y zmean\" (mean of z in x y bins), \"x y zweight\" (weighted x y histogram)" ) state.zdata = NP("reshape", histogram, xbins * ybins) state.zmask = None else: raise defs.PmmlValidationError( "The only allowed combinations of PlotFormula/PlotNumericExpressions are: \"z(x,y)\" (function), \"x y\" (histogram), \"x y zmean\" (mean of z in x y bins), \"x y zweight\" (weighted x y histogram)" ) plotRange.xminPush(xlow, self.xyfieldType, sticky=True) plotRange.yminPush(ylow, self.xyfieldType, sticky=True) plotRange.xmaxPush(xhigh, self.xyfieldType, sticky=True) plotRange.ymaxPush(yhigh, self.xyfieldType, sticky=True) state.xbins = xbins state.xlow = xlow state.xhigh = xhigh state.ybins = ybins state.ylow = ylow state.yhigh = yhigh performanceTable.end("PlotHeatMap prepare") def draw(self, state, plotCoordinates, plotDefinitions, performanceTable): """Draw the plot element. This stage consists of creating an SVG image of the pre-computed data. @type state: ad-hoc Python object @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage. 
This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState. @type plotCoordinates: PlotCoordinates @param plotCoordinates: The coordinate system in which this plot element will be placed. @type plotDefinitions: PlotDefinitions @param plotDefinitions: The dictionary of key-value pairs that forms the <defs> section of the SVG document. @type performanceTable: PerformanceTable @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process. @rtype: SvgBinding @return: An SVG fragment representing the fully drawn plot element. """ svg = SvgBinding.elementMaker svgId = self.get("svgId") if svgId is None: output = svg.g() else: output = svg.g(id=svgId) if not hasattr(plotCoordinates, "zmin"): return output performanceTable.begin("PlotHeatMap draw") xbins = state.xbins xlow = state.xlow xhigh = state.xhigh ybins = state.ybins ylow = state.ylow yhigh = state.yhigh reddata = NP("empty", len(state.zdata), dtype=NP.uint8) greendata = NP("empty", len(state.zdata), dtype=NP.uint8) bluedata = NP("empty", len(state.zdata), dtype=NP.uint8) alphadata = NP("empty", len(state.zdata), dtype=NP.uint8) if len(plotCoordinates.gradient) == 0: offsets = [0.0, 1.0] reds = [255, 0] greens = [255, 0] blues = [255, 255] alphas = [255, 255] else: offsets = [float(g["offset"]) for g in plotCoordinates.gradient] reds = [ min(int(math.floor(256 * float(g["red"]))), 255) for g in plotCoordinates.gradient ] greens = [ min(int(math.floor(256 * float(g["green"]))), 255) for g in plotCoordinates.gradient ] blues = [ min(int(math.floor(256 * float(g["blue"]))), 255) for g in plotCoordinates.gradient ] alphas = [ min(int(math.floor(256 * float(g.get("opacity", 1.0)))), 255) for g in plotCoordinates.gradient ] if not plotCoordinates.zlog: normalized = NP( NP(state.zdata - plotCoordinates.zmin) / (plotCoordinates.zmax - plotCoordinates.zmin)) else: normalized = NP( NP( NP("log10", state.zdata) - NP("log10", plotCoordinates.zmin)) / NP( NP("log10", plotCoordinates.zmax) - NP("log10", plotCoordinates.zmin))) for index in xrange(len(offsets) - 1): if index == 0: under = NP(normalized < offsets[index]) reddata[under] = reds[index] greendata[under] = greens[index] bluedata[under] = blues[index] alphadata[under] = alphas[index] if index == len(offsets) - 2: over = NP(normalized >= offsets[index + 1]) reddata[over] = reds[index + 1] greendata[over] = greens[index + 1] bluedata[over] = blues[index + 1] alphadata[over] = alphas[index + 1] selection = NP(normalized >= offsets[index]) NP("logical_and", selection, NP(normalized < offsets[index + 1]), selection) subset = NP(NP(normalized[selection]) - offsets[index]) norm = 1.
/ (offsets[index + 1] - offsets[index]) reddata[selection] = NP( "array", NP( NP(subset * ((reds[index + 1] - reds[index]) * norm)) + reds[index]), dtype=NP.uint8) greendata[selection] = NP( "array", NP( NP(subset * ((greens[index + 1] - greens[index]) * norm)) + greens[index]), dtype=NP.uint8) bluedata[selection] = NP( "array", NP( NP(subset * ((blues[index + 1] - blues[index]) * norm)) + blues[index]), dtype=NP.uint8) alphadata[selection] = NP( "array", NP( NP(subset * ((alphas[index + 1] - alphas[index]) * norm)) + alphas[index]), dtype=NP.uint8) badpixels = NP("isnan", normalized) NP("logical_or", badpixels, NP("isinf", normalized), badpixels) if state.zmask is not None: NP("logical_or", badpixels, NP(state.zmask != defs.VALID), badpixels) alphadata[badpixels] = 0 X1, Y1 = plotCoordinates(xlow, ylow) X2, Y2 = plotCoordinates(xhigh, yhigh) onePixelBeyondBorder = self.get("onePixelBeyondBorder", defaultFromXsd=True, convertType=True) if onePixelBeyondBorder: Xwidth = (X2 - X1) / xbins Yheight = (Y1 - Y2) / ybins X1 -= Xwidth X2 += Xwidth Y1 += Yheight Y2 -= Yheight arrayToPng = ArrayToPng() arrayToPng.putdata(xbins, ybins, reddata, greendata, bluedata, alphadata, flipy=True, onePixelBeyondBorder=onePixelBeyondBorder) output.append( svg.image( **{ defs.XLINK_HREF: "data:image/png;base64," + arrayToPng.b64encode(), "x": repr(X1), "y": repr(Y2), "width": repr(X2 - X1), "height": repr(Y1 - Y2), "image-rendering": self.get("imageRendering", defaultFromXsd=True), "preserveAspectRatio": "none" })) performanceTable.end("PlotHeatMap draw") return output
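# --- Illustrative sketch (not part of the original source). The draw()
# method above normalizes z into [0, 1] and linearly interpolates each
# 8-bit color channel between adjacent gradient stops. A standalone
# version of that per-segment interpolation, assuming plain numpy in
# place of this codebase's NP wrapper:
#
#     import numpy
#
#     def channelAtStops(normalized, offsets, channel):
#         """Interpolate one 8-bit color channel over gradient stops."""
#         out = numpy.empty(len(normalized), dtype=numpy.uint8)
#         out[normalized < offsets[0]] = channel[0]
#         out[normalized >= offsets[-1]] = channel[-1]
#         for i in range(len(offsets) - 1):
#             sel = (normalized >= offsets[i]) & (normalized < offsets[i + 1])
#             frac = (normalized[sel] - offsets[i]) / (offsets[i + 1] - offsets[i])
#             out[sel] = (channel[i] + frac * (channel[i + 1] - channel[i])).astype(numpy.uint8)
#         return out
#
#     # channelAtStops(numpy.linspace(0.0, 1.0, 5), [0.0, 1.0], [255, 0])
#     # gives [255, 191, 127, 63, 0]: a white-to-black ramp for that channel.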
def evaluate(self, dataTable, functionTable, performanceTable): """Evaluate the expression, using a DataTable as input. @type dataTable: DataTable @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression. @type functionTable: FunctionTable @param functionTable: The FunctionTable, containing any functions that might be called in this expression. @type performanceTable: PerformanceTable @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation. @rtype: DataColumn @return: The result of the calculation as a DataColumn. """ function = self["function"] groupField = self.get("groupField") if groupField is None: performanceTable.begin("Aggregate %s" % function) else: performanceTable.begin("Aggregate %s groupField" % function) dataColumn = dataTable.fields[self["field"]] whereMask = self.where(dataTable, functionTable, performanceTable) stateId = self.get("stateId") if groupField is None: if stateId is None: getstate = None setstate = None else: def getstate(): return dataTable.state.get(stateId) def setstate(value): dataTable.state[stateId] = value if function == "count": dataColumn = self.functionCount(dataColumn, whereMask, None, getstate, setstate) elif function == "sum": dataColumn = self.functionSum(dataColumn, whereMask, None, getstate, setstate) elif function == "average": dataColumn = self.functionAverage(dataColumn, whereMask, None, getstate, setstate) elif function == "min": dataColumn = self.functionMin(dataColumn, whereMask, None, getstate, setstate) elif function == "max": dataColumn = self.functionMax(dataColumn, whereMask, None, getstate, setstate) elif function == "multiset": dataColumn = self.functionMultiset(dataColumn, whereMask, None, getstate, setstate) performanceTable.end("Aggregate %s" % function) return dataColumn else: groupColumn = dataTable.fields[groupField] if groupColumn.mask is None: validGroup = groupColumn.data else: validGroup = groupColumn.data[NP( groupColumn.mask == defs.VALID)] if stateId is not None: state = dataTable.state.get(stateId) if state is None: record = {} else: record = state valuesSeen = dict((stringValue, False) for stringValue in record) groupTables = {} groupColumnFieldType = None for groupValue in NP("unique", validGroup): groupSelection = NP(groupColumn.data == groupValue) if groupColumn.mask is not None: NP("logical_and", groupSelection, NP(groupColumn.mask == defs.VALID), groupSelection) groupColumnFieldType = groupColumn.fieldType stringValue = groupColumnFieldType.valueToString(groupValue) if stringValue in record: def getstate(): return record[stringValue] else: getstate = None def setstate(value): record[stringValue] = value valuesSeen[stringValue] = True value = groupColumnFieldType.valueToPython(groupValue) if function == "count": groupTables[value] = self.functionCount( dataColumn, whereMask, groupSelection, getstate, setstate) elif function == "sum": groupTables[value] = self.functionSum( dataColumn, whereMask, groupSelection, getstate, setstate) elif function == "average": groupTables[value] = self.functionAverage( dataColumn, whereMask, groupSelection, getstate, setstate) elif function == "min": groupTables[value] = self.functionMin( dataColumn, whereMask, groupSelection, getstate, setstate) elif function == "max": groupTables[value] = self.functionMax( dataColumn, whereMask, groupSelection, getstate, setstate) elif function == "multiset": groupTables[value] = self.functionMultiset( dataColumn, whereMask, groupSelection, getstate, setstate) if 
stateId is not None: dataTable.state[stateId] = record for stringValue in valuesSeen: if not valuesSeen[stringValue]: value = groupColumnFieldType.valueToPython( groupColumnFieldType.stringToValue(stringValue)) if function == "count": groupTables[value] = self.functionCountFake( record[stringValue], len(dataTable), dataColumn.fieldType) elif function == "sum": groupTables[value] = self.functionSumFake( record[stringValue], len(dataTable), dataColumn.fieldType) elif function == "average": groupTables[value] = self.functionAverageFake( record[stringValue], len(dataTable), dataColumn.fieldType) elif function in ("min", "max"): groupTables[value] = self.functionMinMaxFake( record[stringValue], len(dataTable), dataColumn.fieldType) elif function == "multiset": groupTables[value] = self.functionMultisetFake( record[stringValue], len(dataTable), dataColumn.fieldType) performanceTable.begin("Aggregate %s groupField collect" % function) fieldType = FakeFieldType("object", "any") data = NP("empty", len(dataTable), dtype=NP.dtype(object)) if function == "count": for i in xrange(len(dataTable)): data[i] = dict((value, table.data[i]) for value, table in groupTables.items() if table.data[i] != 0) elif function == "sum": for i in xrange(len(dataTable)): data[i] = dict((value, table.data[i]) for value, table in groupTables.items() if table.data[i] != 0.0) elif function == "average": for i in xrange(len(dataTable)): data[i] = dict( (value, table.data[i]) for value, table in groupTables.items() if table.data[i] > 0.0 or table.data[i] <= 0.0) elif function in ("min", "max"): for table in groupTables.values(): if table.mask is None: table._mask = NP("zeros", len(table), dtype=defs.maskType) for i in xrange(len(dataTable)): data[i] = dict((value, table.data[i]) for value, table in groupTables.items() if table.mask[i] == defs.VALID) elif function == "multiset": for i in xrange(len(dataTable)): data[i] = dict((value, table.data[i]) for value, table in groupTables.items() if len(table.data[i]) > 0) performanceTable.end("Aggregate %s groupField collect" % function) performanceTable.end("Aggregate %s groupField" % function) return DataColumn(fieldType, data, None)
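# --- Illustrative sketch (not part of the original source). The grouped
# branch of evaluate() above builds one boolean groupSelection per unique
# value of the groupField and hands it to the aggregation functions, which
# produce running (row-by-row) results. The underlying mask-and-cumsum
# idea, reduced to a running per-group sum with plain numpy (hypothetical
# names):
#
#     import numpy
#
#     values = numpy.array([1.0, 2.0, 3.0, 4.0])
#     groups = numpy.array(["a", "b", "a", "b"])
#
#     runningSums = {}
#     for groupValue in numpy.unique(groups):
#         groupSelection = (groups == groupValue)
#         # rows of other groups contribute zero, then accumulate downward
#         runningSums[groupValue] = numpy.cumsum(values * groupSelection)
#
#     # runningSums["a"] == [1., 1., 4., 4.]; runningSums["b"] == [0., 2., 2., 6.]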
def evaluate(self, dataTable, functionTable, performanceTable): """Evaluate the expression, using a DataTable as input. @type dataTable: DataTable @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression. @type functionTable: FunctionTable @param functionTable: The FunctionTable, containing any functions that might be called in this expression. @type performanceTable: PerformanceTable @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation. @rtype: DataColumn @return: The result of the calculation as a DataColumn. """ performanceTable.begin("MapValues") fieldType = FakeFieldType(self.get("dataType", "string"), self.get("optype", self._optype)) fieldType._newValuesAllowed = True defaultValue = self.get("defaultValue") if defaultValue is not None: defaultValue = fieldType.stringToValue(defaultValue) data = NP("empty", len(dataTable), dtype=fieldType.dtype) if defaultValue is not None: data[:] = defaultValue outputColumn = self["outputColumn"] columnNameToField = {} for fieldColumnPair in self.childrenOfTag("FieldColumnPair"): dataColumn = dataTable.fields[fieldColumnPair["field"]] columnNameToField[fieldColumnPair["column"]] = dataColumn # cache partial selections because they'll be used over and over in intersecting sets dataSelections = {} missingSelections = {} coverage = NP("zeros", len(dataTable), dtype=NP.dtype(bool)) for index, row in enumerate(self.childOfClass(TableInterface).iterate()): outputValue = row.get(outputColumn) if outputValue is None: raise defs.PmmlValidationError("MapValues has outputColumn \"%s\" but a column with that name does not appear in row %d of the table" % (outputColumn, index)) del row[outputColumn] outputValue = fieldType.stringToValue(outputValue) # this is an intersection of all matching columns selection = NP("ones", len(dataTable), dtype=NP.dtype(bool)) for columnName, columnValueString in row.items(): dataColumn = columnNameToField.get(columnName) if dataColumn is not None: columnValue = dataColumn.fieldType.stringToValue(columnValueString) # one cached data array per column (name, value) pair if (columnName, columnValueString) not in dataSelections: selectData = NP(dataColumn.data == columnValue) if dataColumn.mask is not None: NP("logical_and", selectData, NP(dataColumn.mask == defs.VALID), selectData) dataSelections[columnName, columnValueString] = selectData NP("logical_and", selection, dataSelections[columnName, columnValueString], selection) # one cached mask array per column name ("missing" has only one possible value, though I consider any non-VALID "missing") if columnName not in missingSelections and dataColumn.mask is not None: missingSelections[columnName] = NP(dataColumn.mask != defs.VALID) # set the intersection to the output value data[selection] = outputValue NP("logical_or", coverage, selection, coverage) missing = NP("zeros", len(dataTable), dtype=NP.dtype(bool)) for missingSelection in missingSelections.values(): NP("logical_or", missing, missingSelection, missing) coverage -= missing mask = missing * defs.MISSING data, mask = FieldCastMethods.applyMapMissingTo(fieldType, data, mask, self.get("mapMissingTo")) if defaultValue is None: NP("logical_not", coverage, coverage) if mask is None: mask = NP(coverage * defs.MISSING) else: mask[coverage] = defs.MISSING performanceTable.end("MapValues") return DataColumn(fieldType, data, mask)
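# --- Illustrative sketch (not part of the original source). MapValues
# above turns each table row into the intersection of per-column equality
# selections and writes the row's output value into that selection; records
# never matched keep defaultValue. The same idea, standalone, with plain
# numpy and hypothetical column names:
#
#     import numpy
#
#     color = numpy.array(["red", "red", "blue"])
#     size = numpy.array(["S", "L", "S"])
#     columns = {"color": color, "size": size}
#     table = [({"color": "red", "size": "S"}, 1),
#              ({"color": "blue", "size": "S"}, 2)]
#
#     out = numpy.zeros(3, dtype=int)    # 0 plays the role of defaultValue
#     for row, outputValue in table:
#         selection = numpy.ones(3, dtype=bool)
#         for columnName, columnValue in row.items():
#             selection &= (columns[columnName] == columnValue)
#         out[selection] = outputValue
#
#     # out == [1, 0, 2]: records 1 and 3 matched table rows; record 2 kept the default.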
def calculateScore(self, dataTable, functionTable, performanceTable): """Calculate the score of this model. This method is called by C{calculate} to separate operations that are performed by all models (in C{calculate}) from operations that are performed by specific models (in C{calculateScore}). @type dataTable: DataTable @param dataTable: The DataTable representing this model's lexical scope. @type functionTable: FunctionTable or None @param functionTable: A table of functions. @type performanceTable: PerformanceTable or None @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation. @rtype: dict @return: A dictionary mapping score field names (None for the primary score) to DataColumns. """ performanceTable.begin("ClusteringModel") performanceTable.begin("set up") distributionBased = (self["modelClass"] == "distributionBased") clusteringFields = self.xpath("pmml:ClusteringField[not(@isCenterField='false')]") fieldWeights = [clusteringField.get("fieldWeight", defaultFromXsd=True, convertType=True) for clusteringField in clusteringFields] for fieldWeight in fieldWeights: if fieldWeight < 0.0: raise defs.PmmlValidationError("ClusteringField fieldWeights must all be non-negative (encountered %g)" % fieldWeight) clusters = self.xpath("pmml:Cluster") comparisonMeasure = self.childOfClass(ComparisonMeasure) defaultCompareFunction = comparisonMeasure.get("compareFunction", defaultFromXsd=True) metric = comparisonMeasure.childOfClass(PmmlClusteringMetric) metrictag = metric.t performanceTable.end("set up") for clusteringField in clusteringFields: dataType = dataTable.fields[clusteringField["field"]].fieldType.dataType if dataType == "string": raise defs.PmmlValidationError("ClusteringField \"%s\" has dataType \"%s\", which cannot be used for clustering" % (clusteringField["field"], dataType)) missingValueWeights = self.childOfTag("MissingValueWeights") if missingValueWeights is None: adjustM = None else: performanceTable.begin("MissingValueWeights") missingWeights = missingValueWeights.childOfClass(PmmlArray).values(convertType=True) sumNMqi = NP("zeros", len(dataTable), dtype=NP.dtype(float)) for clusteringField, missingWeight in zip(clusteringFields, missingWeights): clusteringField.addToAdjustM(dataTable, functionTable, performanceTable, sumNMqi, missingWeight) adjustM = NP(sum(missingWeights) / sumNMqi) adjustM[NP(sumNMqi == 0.0)] = 1.0 performanceTable.end("MissingValueWeights") anyInvalid = NP("zeros", len(dataTable), dtype=NP.dtype(bool)) for clusteringField in clusteringFields: mask = dataTable.fields[clusteringField["field"]].mask if mask is not None: NP("logical_or", anyInvalid, NP(mask == defs.INVALID), anyInvalid) bestClusterId = None bestClusterAffinity = None allClusterAffinities = {} for index, cluster in enumerate(clusters): array = cluster.childOfClass(PmmlArray) if array is None: raise defs.PmmlValidationError("Cluster must have an array to designate its center") centerStrings = array.values(convertType=False) if len(centerStrings) != len(clusteringFields): raise defs.PmmlValidationError("Cluster array has %d components, but there are %d ClusteringFields with isCenterField=true" % (len(centerStrings), len(clusteringFields))) performanceTable.begin(metrictag) if distributionBased: matrix = cluster.xpath("pmml:Covariances/pmml:Matrix") if len(matrix) != 1: raise defs.PmmlValidationError("In distribution-based clustering, all clusters must have a Covariances/Matrix") try: covarianceMatrix = NP("array", matrix[0].values(), dtype=NP.dtype(float)) except ValueError: raise
defs.PmmlValidationError("Covariances/Matrix must contain real numbers for distribution-based clustering") else: covarianceMatrix = None state = self._State() metric.initialize(state, len(dataTable), len(clusteringFields), distributionBased) for clusteringField, centerString, fieldWeight in zip(clusteringFields, centerStrings, fieldWeights): if isinstance(metric, PmmlClusteringMetricBinary): metric.accumulateBinary(state, dataTable.fields[clusteringField["field"]], centerString, distributionBased) else: performanceTable.pause(metrictag) cxy = clusteringField.compare(dataTable, functionTable, performanceTable, centerString, defaultCompareFunction, anyInvalid) performanceTable.unpause(metrictag) metric.accumulate(state, cxy, fieldWeight, distributionBased) distance = metric.finalizeDistance(state, adjustM, distributionBased, covarianceMatrix) del state performanceTable.end(metrictag) if index == 0: bestClusterId = NP("ones", len(dataTable), dtype=NP.dtype(int)) # 1-based index bestClusterAffinity = distance better = NP(distance < bestClusterAffinity) bestClusterId[better] = index + 1 # 1-based index bestClusterAffinity[better] = distance[better] allClusterAffinities[cluster.get("id", "%d" % (index + 1))] = distance if not anyInvalid.any(): scoreMask = None else: scoreMask = NP(anyInvalid * defs.INVALID) performanceTable.begin("set scores") score = {} performanceTable.begin("predictedValue") fieldType = FakeFieldType("string", "categorical") clusterIdentifiers = NP("empty", len(dataTable), dtype=fieldType.dtype) for index, cluster in enumerate(clusters): value = fieldType.stringToValue(cluster.get("id", "%d" % (index + 1))) clusterIdentifiers[NP(bestClusterId == (index + 1))] = value score[None] = DataColumn(fieldType, clusterIdentifiers, scoreMask) performanceTable.end("predictedValue") if self.subFields["predictedDisplayValue"]: performanceTable.begin("predictedDisplayValue") fieldType = FakeFieldType("string", "categorical") clusterNames = NP("empty", len(dataTable), dtype=fieldType.dtype) for index, cluster in enumerate(clusters): value = fieldType.stringToValue(cluster.get("name", "")) clusterNames[NP(bestClusterId == (index + 1))] = value score["predictedDisplayValue"] = DataColumn(fieldType, clusterNames, scoreMask) performanceTable.end("predictedDisplayValue") if self.subFields["entity"]: performanceTable.begin("entity") fieldType = FakeFieldType("object", "any") entities = NP("empty", len(dataTable), dtype=fieldType.dtype) for index, cluster in enumerate(clusters): value = fieldType.stringToValue(cluster.get("name", "")) indexPlusOne = index + 1 for i in xrange(len(entities)): if bestClusterId[i] == indexPlusOne: entities[i] = cluster score["entity"] = DataColumn(fieldType, entities, scoreMask) performanceTable.end("entity") if self.subFields["clusterId"]: performanceTable.begin("clusterId") fieldType = FakeFieldType("integer", "continuous") score["clusterId"] = DataColumn(fieldType, bestClusterId, scoreMask) performanceTable.end("clusterId") if self.subFields["entityId"]: performanceTable.begin("entityId") fieldType = FakeFieldType("integer", "continuous") score["entityId"] = DataColumn(fieldType, bestClusterId, scoreMask) performanceTable.end("entityId") if self.subFields["clusterAffinity"]: performanceTable.begin("clusterAffinity") fieldType = FakeFieldType("double", "continuous") score["clusterAffinity"] = DataColumn(fieldType, bestClusterAffinity, scoreMask) performanceTable.end("clusterAffinity") if self.subFields["affinity"]: performanceTable.begin("affinity") fieldType = 
FakeFieldType("double", "continuous") score["affinity"] = DataColumn(fieldType, bestClusterAffinity, scoreMask) performanceTable.end("affinity") if self.subFields["all"]: performanceTable.begin("all") fieldType = FakeFieldType("double", "continuous") for identifier, distance in allClusterAffinities.items(): score["all.%s" % identifier] = DataColumn(fieldType, distance, scoreMask) performanceTable.end("all") performanceTable.end("set scores") performanceTable.end("ClusteringModel") return score
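# --- Illustrative sketch (not part of the original source). The scoring
# loop above evaluates one distance array per cluster and keeps a running
# best (1-based) cluster id and affinity, as in the better/bestClusterId
# updates. Reduced to a plain squared-Euclidean metric with numpy
# (hypothetical data; no field weights, compare functions, or
# missing-value adjustment):
#
#     import numpy
#
#     points = numpy.array([[0.0, 0.0], [5.0, 5.0], [0.9, 1.1]])
#     centers = [numpy.array([0.0, 0.0]),
#                numpy.array([1.0, 1.0]),
#                numpy.array([5.0, 5.0])]
#
#     bestClusterId = numpy.ones(len(points), dtype=int)   # 1-based index
#     bestAffinity = ((points - centers[0]) ** 2).sum(axis=1)
#     for index, center in enumerate(centers[1:], start=2):
#         distance = ((points - center) ** 2).sum(axis=1)
#         better = distance < bestAffinity
#         bestClusterId[better] = index
#         bestAffinity[better] = distance[better]
#
#     # bestClusterId == [1, 3, 2]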
def __init__(self, dataType, value): self.fieldType = FakeFieldType(dataType, "continuous") self.value = value
def __init__(self, context, inputData, inputMask=None, inputState=None): """Create a DataTable from a type-context, input data, possible input masks, and possible input states. For maximum flexibility, very few assumptions are made about the format of C{inputData}. It need only have a structure that is equivalent to a dictionary mapping strings (field names) to lists of values (data columns). Numpy U{record arrays<http://docs.scipy.org/doc/numpy/user/basics.rec.html>}, U{NpzFiles <http://docs.scipy.org/doc/numpy/reference/generated/numpy.savez.html>}, and U{Pandas data frames<http://pandas.pydata.org/>} effectively present their data in this format because:: inputData[fieldName] yields a column of values. Regardless of the input type, these values are then interpreted by the C{context} to set their PMML type. The length of the resulting DataTable is equal to the length of the shortest DataColumn. Generally, one should use equal-length arrays to build a DataTable. @type context: PmmlBinding, FieldType, string, dict, or None @param context: If a rooted PmmlBinding, use the PMML's DataDictionary to interpret C{inputData}. If a FieldType, use that FieldType to interpret all fields. If a string, use that dataType (e.g. "integer", "dateDaysSince[1960]") to interpret all fields. If a dictionary from field names to FieldTypes or dataType strings, use them on a per-field basis. Otherwise, assume a FieldType from the Numpy C{dtype}. The last option only works if all C{inputData} columns are Numpy arrays. @type inputData: any dict-like mapping from strings to lists @param inputData: Maps field names (strings) to columns of data (lists or Numpy arrays) that are interpreted by C{context}. @type inputMask: dict-like mapping from strings to lists of bool, or None @param inputMask: If None, missing data are identified by C{NaN} values in the C{inputData} (Pandas convention). Otherwise, C{NaN} or a True value in the corresponding C{inputMask} labels a data item as MISSING. @type inputState: DataTableState or None @param inputState: Initial state of the DataTable. To continue a previous calculation, use the C{dataTable.state} from the previous calculation. @raise TypeError: If the C{inputData} columns are not Numpy arrays and a C{context} is not given, this method raises an error.
""" if isinstance(context, PmmlBinding) and len( context.xpath("ancestor-or-self::pmml:PMML")) != 0: # get types from PMML dataColumns = OrderedDict() for fieldName, fieldDefinition in context.fieldContext().items(): fieldType = FieldType(fieldDefinition) try: dataField = inputData[fieldName] except KeyError: dataField = None else: try: maskField = inputMask[fieldName] except (KeyError, TypeError): maskField = None if dataField is not None: dataColumns[fieldName] = fieldType.toDataColumn( dataField, maskField) else: if not isinstance(context, dict): context = dict((x, context) for x in inputData) if all(isinstance(x, FieldType) for x in context.values()): # FieldTypes provided explicitly dataColumns = OrderedDict() for fieldName in sorted(context.keys()): data = inputData[fieldName] if inputMask is None: mask = None else: mask = inputMask[fieldName] dataColumns[fieldName] = context[fieldName].toDataColumn( data, mask) elif all(isinstance(x, basestring) for x in context.values()): # FieldTypes provided by dataType name dataColumns = OrderedDict() for fieldName in sorted(context.keys()): data = inputData[fieldName] if inputMask is None: mask = None else: mask = inputMask[fieldName] if context[fieldName] == "string": fieldType = FakeFieldType(context[fieldName], "categorical") else: fieldType = FakeFieldType(context[fieldName], "continuous") dataColumns[fieldName] = fieldType.toDataColumn(data, mask) elif all( isinstance(inputData[x], NP.ndarray) for x in inputData.keys()): # FieldTypes provided by NumPy types dataColumns = OrderedDict() for fieldName in sorted(context.keys()): data = inputData[fieldName] if inputMask is None: mask = None else: mask = inputMask[fieldName] if data.dtype in (NP.object, NP.object0, NP.object_, NP.str, NP.str_, NP.string0, NP.string_) or re.match( "\|S[0-9]+", str( data.dtype)) is not None: fieldType = FakeFieldType("string", "categorical") elif data.dtype in (NP.int, NP.int0, NP.int8, NP.int16, NP.int32, NP.int64, NP.int_, NP.integer): fieldType = FakeFieldType("integer", "continuous") elif data.dtype in (NP.float, NP.__getattr__("float16", noneIfMissing=True), NP.float32): fieldType = FakeFieldType("float", "continuous") elif data.dtype in (NP.float64, NP.float128, NP.float_, NP.double): fieldType = FakeFieldType("double", "continuous") elif data.dtype in (NP.bool, NP.bool8, NP.bool_): fieldType = FakeFieldType("boolean", "continuous") else: raise TypeError("Unrecognized NumPy dtype: %r" % data.dtype) dataColumns[fieldName] = fieldType.toDataColumn(data, mask) else: raise TypeError( "Context must be PMML (anchored by a <PMML> ancestor), a dictionary of FieldType objects, dataType strings, or inputData must consist entirely of NumPy arrays" ) self._configure(dataColumns, inputState)
class NormContinuous(PmmlExpression): """NormContinuous implements an expression that performs piecewise linear, everywhere continuous, transformations on a continuous field. U{PMML specification<http://www.dmg.org/v4-1/Transformations.html>}. """ _fieldType = FakeFieldType("double", "continuous") def transformSelection(self, linearNorm1, linearNorm2, indata, outdata, selection): """Linearly transform a subset of the dataset as part of an overall piecewise linear transformation. @type linearNorm1: PmmlBinding @param linearNorm1: The left-side <LinearNorm> object. @type linearNorm2: PmmlBinding @param linearNorm2: The right-side <LinearNorm> object. @type indata: 1d Numpy array @param indata: Unselected input data. @type outdata: 1d Numpy array @param outdata: Output data, modified by this function. @type selection: 1d Numpy array of bool @param selection: The Numpy selector for this piecewise region. """ a1 = linearNorm1.orig b1 = linearNorm1.norm a2 = linearNorm2.orig b2 = linearNorm2.norm outdata[selection] = NP( b1 + NP(NP(NP(indata[selection] - a1) / NP(a2 - a1)) * NP(b2 - b1))) def evaluate(self, dataTable, functionTable, performanceTable): """Evaluate the expression, using a DataTable as input. @type dataTable: DataTable @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression. @type functionTable: FunctionTable @param functionTable: The FunctionTable, containing any functions that might be called in this expression. @type performanceTable: PerformanceTable @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation. @rtype: DataColumn @return: The result of the calculation as a DataColumn. """ performanceTable.begin("NormContinuous") dataColumn = dataTable.fields[self["field"]] if dataColumn.fieldType.dataType in ("object", "string", "boolean"): raise defs.PmmlValidationError( "NormContinuous requires a numeric input field, but the input field has dataType \"%s\"" % dataColumn.fieldType.dataType) outliers = self.get("outliers") linearNorms = self.childrenOfTag("LinearNorm") for linearNorm in linearNorms: linearNorm.orig = float(linearNorm["orig"]) linearNorm.norm = float(linearNorm["norm"]) linearNorms.sort(lambda x, y: cmp(x.orig, y.orig) ) # technically, it's invalid if not already sorted data = NP("empty", len(dataTable), self._fieldType.dtype) mask = dataColumn.mask # extrapolate before the first LinearNorm selection = NP(dataColumn.data <= linearNorms[0].orig) if outliers == "asMissingValues": mask = FieldCastMethods.outliersAsMissing(mask, dataColumn.mask, selection) elif outliers == "asExtremeValues": data[selection] = linearNorms[0].norm else: self.transformSelection(linearNorms[0], linearNorms[1], dataColumn.data, data, selection) for i in xrange(len(linearNorms) - 1): selection = NP(linearNorms[i].orig < dataColumn.data) NP("logical_and", selection, NP(dataColumn.data <= linearNorms[i + 1].orig), selection) self.transformSelection(linearNorms[i], linearNorms[i + 1], dataColumn.data, data, selection) selection = NP(linearNorms[-1].orig < dataColumn.data) if outliers == "asMissingValues": mask = FieldCastMethods.outliersAsMissing(mask, dataColumn.mask, selection) elif outliers == "asExtremeValues": data[selection] = linearNorms[-1].norm else: self.transformSelection(linearNorms[-2], linearNorms[-1], dataColumn.data, data, selection) data, mask = FieldCastMethods.applyMapMissingTo( self._fieldType, data, mask, self.get("mapMissingTo")) performanceTable.end("NormContinuous") return DataColumn(self._fieldType, data, mask)
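# --- Illustrative sketch (not part of the original source). Between the
# outermost LinearNorms, the transformSelection calls above compute the
# same piecewise-linear interpolation as numpy.interp; the behaviors only
# differ outside the table, where NormContinuous extrapolates (or applies
# the outliers treatment) while numpy.interp clamps:
#
#     import numpy
#
#     orig = [0.0, 10.0, 20.0]   # LinearNorm orig values (sorted)
#     norm = [0.0, 1.0, 5.0]     # corresponding LinearNorm norm values
#
#     x = numpy.array([5.0, 15.0])
#     y = numpy.interp(x, orig, norm)
#     # y == [0.5, 3.0], matching transformSelection on the interior segments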
class PlotBoxAndWhisker(PmmlPlotContent): """Represents a "box-and-whiskers" plot or a "profile histogram." PMML subelements: - PlotExpression role="sliced": expression to be sliced like a histogram. - PlotNumericExpression role="profiled": expression to be profiled in each slice. - PlotSelection: expression or predicate to filter the data before plotting. - Intervals: non-uniform (numerical) histogram bins. - Values: explicit (categorical) histogram values. PMML attributes: - svgId: id for the resulting SVG element. - stateId: key for persistent storage in a DataTableState. - numBins: number of histogram bins. - low: histogram low edge. - high: histogram high edge. - levels: "percentage" for quartile-like box-and-whiskers, "standardDeviation" for mean and standard deviation, as in a profile histogram. - lowWhisker: bottom of the lower whisker, usually the 0th percentile (absolute minimum). - lowBox: bottom of the box, usually the 25th percentile. - midLine: middle line of the box, usually the median. - highBox: top of the box, usually the 75th percentile. - highWhisker: top of the upper whisker, usually the 100th percentile (absolute maximum). - vertical: if "true", plot the "sliced" expression on the x axis and the "profiled" expression on the y axis. - gap: size of the space between boxes in SVG coordinates. - style: CSS style properties. CSS properties: - fill, fill-opacity: color of the box. - stroke, stroke-dasharray, stroke-dashoffset, stroke-linecap, stroke-linejoin, stroke-miterlimit, stroke-opacity, stroke-width: properties of the line drawing the box and the whiskers. See the source code for the full XSD. """ styleProperties = ["fill", "fill-opacity", "stroke", "stroke-dasharray", "stroke-dashoffset", "stroke-linecap", "stroke-linejoin", "stroke-miterlimit", "stroke-opacity", "stroke-width", ] styleDefaults = {"fill": "none", "stroke": "black"} xsd = """<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"> <xs:element name="PlotBoxAndWhisker"> <xs:complexType> <xs:sequence> <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded" /> <xs:element ref="PlotExpression" minOccurs="1" maxOccurs="1" /> <xs:element ref="PlotNumericExpression" minOccurs="1" maxOccurs="1" /> <xs:element ref="PlotSelection" minOccurs="0" maxOccurs="1" /> <xs:choice minOccurs="0" maxOccurs="1"> <xs:element ref="Interval" minOccurs="1" maxOccurs="unbounded" /> <xs:element ref="Value" minOccurs="1" maxOccurs="unbounded" /> </xs:choice> </xs:sequence> <xs:attribute name="svgId" type="xs:string" use="optional" /> <xs:attribute name="stateId" type="xs:string" use="optional" /> <xs:attribute name="numBins" type="xs:positiveInteger" use="optional" /> <xs:attribute name="low" type="xs:double" use="optional" /> <xs:attribute name="high" type="xs:double" use="optional" /> <xs:attribute name="levels" use="optional" default="percentage"> <xs:simpleType> <xs:restriction base="xs:string"> <xs:enumeration value="percentage" /> <xs:enumeration value="standardDeviation" /> </xs:restriction> </xs:simpleType> </xs:attribute> <xs:attribute name="lowWhisker" type="xs:double" use="optional" default="0" /> <xs:attribute name="lowBox" type="xs:double" use="optional" default="25" /> <xs:attribute name="midLine" type="xs:double" use="optional" default="50" /> <xs:attribute name="highBox" type="xs:double" use="optional" default="75" /> <xs:attribute name="highWhisker" type="xs:double" use="optional" default="100" /> <xs:attribute name="vertical" type="xs:boolean" use="optional" default="true" /> <xs:attribute name="gap" 
type="xs:double" use="optional" default="10" /> <xs:attribute name="style" type="xs:string" use="optional" default="%s" /> </xs:complexType> </xs:element> </xs:schema> """ % PlotStyle.toString(styleDefaults) fieldTypeNumeric = FakeFieldType("double", "continuous") def prepare(self, state, dataTable, functionTable, performanceTable, plotRange): """Prepare a plot element for drawing. This stage consists of calculating all quantities and determing the bounds of the data. These bounds may be unioned with bounds from other plot elements that overlay this plot element, so the drawing (which requires a finalized coordinate system) cannot begin yet. This method modifies C{plotRange}. @type state: ad-hoc Python object @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage. This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState. @type dataTable: DataTable @param dataTable: Contains the data to plot. @type functionTable: FunctionTable @param functionTable: Defines functions that may be used to transform data for plotting. @type performanceTable: PerformanceTable @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process. @type plotRange: PlotRange @param plotRange: The bounding box of plot coordinates that this function will expand. """ self.checkRoles(["sliced", "profiled"]) slicedExpression = self.xpath("pmml:PlotExpression[@role='sliced']") profiledExpression = self.xpath("pmml:PlotNumericExpression[@role='profiled']") cutExpression = self.xpath("pmml:PlotSelection") if len(slicedExpression) != 1: raise defs.PmmlValidationError("PlotHistogram requires a PlotExpression with role \"sliced\"") if len(profiledExpression) != 1: raise defs.PmmlValidationError("PlotHistogram requires a PlotNumericExpression with role \"profiled\"") slicedDataColumn = slicedExpression[0].evaluate(dataTable, functionTable, performanceTable) profiledDataColumn = profiledExpression[0].evaluate(dataTable, functionTable, performanceTable) if len(cutExpression) == 1: selection = cutExpression[0].select(dataTable, functionTable, performanceTable) else: selection = NP("ones", len(dataTable), NP.dtype(bool)) performanceTable.begin("PlotBoxAndWhisker prepare") self._saveContext(dataTable) if slicedDataColumn.mask is not None: NP("logical_and", selection, NP(slicedDataColumn.mask == defs.VALID), selection) if profiledDataColumn.mask is not None: NP("logical_and", selection, NP(profiledDataColumn.mask == defs.VALID), selection) slicedArray = slicedDataColumn.data[selection] profiledArray = profiledDataColumn.data[selection] persistentState = {} stateId = self.get("stateId") if stateId is not None: if stateId in dataTable.state: persistentState = dataTable.state[stateId] else: dataTable.state[stateId] = persistentState intervals = self.xpath("pmml:Interval") values = self.xpath("pmml:Value") if "binType" not in persistentState: performanceTable.begin("establish binType") binType = PlotHistogram.establishBinType(slicedDataColumn.fieldType, intervals, values) persistentState["binType"] = binType if binType == "nonuniform": persistentState["distributions"] = [NP("empty", 0, dtype=profiledDataColumn.fieldType.dtype) for x in xrange(len(intervals))] elif binType == "explicit": persistentState["distributions"] = [NP("empty", 0, dtype=profiledDataColumn.fieldType.dtype) for x in xrange(len(values))] elif binType == "unique": persistentState["distributions"] = 
{} elif binType == "scale": numBins = self.get("numBins", convertType=True) low = self.get("low", convertType=True) high = self.get("high", convertType=True) numBins, low, high = PlotHistogram.determineScaleBins(numBins, low, high, slicedArray) persistentState["low"] = low persistentState["high"] = high persistentState["numBins"] = numBins persistentState["distributions"] = [NP("empty", 0, dtype=profiledDataColumn.fieldType.dtype) for x in xrange(numBins)] performanceTable.end("establish binType") if persistentState["binType"] == "nonuniform": performanceTable.begin("binType nonuniform") distributions = [None] * len(intervals) state.edges = [] lastLimitPoint = None lastClosed = None lastInterval = None for index, interval in enumerate(intervals): selection, lastLimitPoint, lastClosed, lastInterval = PlotHistogram.selectInterval(slicedDataColumn.fieldType, slicedArray, index, len(intervals) - 1, interval, state.edges, lastLimitPoint, lastClosed, lastInterval) if selection is None: distributions[index] = profiledArray else: distributions[index] = profiledArray[selection] persistentState["distributions"] = [NP("concatenate", [x, y]) for x, y in itertools.izip(persistentState["distributions"], distributions)] distributions = persistentState["distributions"] lowEdge = min(low for low, high in state.edges if low is not None) highEdge = max(high for low, high in state.edges if high is not None) state.slicedFieldType = self.fieldTypeNumeric performanceTable.end("binType nonuniform") elif persistentState["binType"] == "explicit": performanceTable.begin("binType explicit") distributions = [None] * len(values) displayValues = [] for index, value in enumerate(values): internalValue = slicedDataColumn.fieldType.stringToValue(value["value"]) displayValues.append(value.get("displayValue", slicedDataColumn.fieldType.valueToString(internalValue, displayValue=True))) selection = NP(slicedArray == internalValue) distributions[index] = profiledArray[selection] persistentState["distributions"] = [NP("concatenate", [x, y]) for x, y in itertools.izip(persistentState["distributions"], distributions)] distributions = persistentState["distributions"] state.edges = displayValues state.slicedFieldType = slicedDataColumn.fieldType performanceTable.end("binType explicit") elif persistentState["binType"] == "unique": performanceTable.begin("binType unique") uniques, inverse = NP("unique", slicedArray, return_inverse=True) persistentDistributions = persistentState["distributions"] for i, u in enumerate(uniques): string = slicedDataColumn.fieldType.valueToString(u, displayValue=False) selection = NP(inverse == i) if string in persistentDistributions: persistentDistributions[string] = NP("concatenate", [persistentDistributions[string], profiledArray[selection]]) else: persistentDistributions[string] = profiledArray[selection] tosort = [(len(distribution), string) for string, distribution in persistentDistributions.items()] tosort.sort(reverse=True) numBins = self.get("numBins", convertType=True) if numBins is not None: tosort = tosort[:numBins] distributions = [persistentDistributions[string] for count, string in tosort] state.edges = [slicedDataColumn.fieldType.valueToString(slicedDataColumn.fieldType.stringToValue(string), displayValue=True) for count, string in tosort] state.slicedFieldType = slicedDataColumn.fieldType performanceTable.end("binType unique") elif persistentState["binType"] == "scale": performanceTable.begin("binType scale") numBins = persistentState["numBins"] low = persistentState["low"] high = 
persistentState["high"] binWidth = (high - low) / float(numBins) binAssignments = NP("array", NP("floor", NP(NP(slicedArray - low)/binWidth)), dtype=NP.dtype(int)) distributions = [None] * numBins for index in xrange(numBins): selection = NP(binAssignments == index) distributions[index] = profiledArray[selection] persistentState["distributions"] = [NP("concatenate", [x, y]) for x, y in itertools.izip(persistentState["distributions"], distributions)] distributions = persistentState["distributions"] state.edges = [(low + i*binWidth, low + (i + 1)*binWidth) for i in xrange(numBins)] lowEdge = low highEdge = high state.slicedFieldType = self.fieldTypeNumeric performanceTable.end("binType scale") levels = self.get("levels", defaultFromXsd=True) lowWhisker = self.get("lowWhisker", defaultFromXsd=True, convertType=True) lowBox = self.get("lowBox", defaultFromXsd=True, convertType=True) midLine = self.get("midLine", defaultFromXsd=True, convertType=True) highBox = self.get("highBox", defaultFromXsd=True, convertType=True) highWhisker = self.get("highWhisker", defaultFromXsd=True, convertType=True) state.ranges = [] minProfiled = None maxProfiled = None for distribution in distributions: if levels == "percentage": if len(distribution) > 0: state.ranges.append(NP("percentile", distribution, [lowWhisker, lowBox, midLine, highBox, highWhisker])) else: state.ranges.append(None) elif levels == "standardDeviation": mu = NP("mean", distribution) sigma = NP("std", distribution, ddof=1) if NP("isfinite", sigma) and sigma > 0.0: state.ranges.append([(lowWhisker - mu)/sigma, (lowBox - mu)/sigma, (midLine - mu)/sigma, (highBox - mu)/sigma, (highWhisker - mu)/sigma]) else: state.ranges.append(None) if state.ranges[-1] is not None: if minProfiled is None: minProfiled = min(state.ranges[-1]) maxProfiled = max(state.ranges[-1]) else: minProfiled = min(minProfiled, min(state.ranges[-1])) maxProfiled = max(maxProfiled, max(state.ranges[-1])) state.profiledFieldType = profiledDataColumn.fieldType if self.get("vertical", defaultFromXsd=True, convertType=True): if state.slicedFieldType is self.fieldTypeNumeric: plotRange.xminPush(lowEdge, state.slicedFieldType, sticky=False) plotRange.xmaxPush(highEdge, state.slicedFieldType, sticky=False) if minProfiled is not None: plotRange.yminPush(minProfiled, state.profiledFieldType, sticky=False) plotRange.ymaxPush(maxProfiled, state.profiledFieldType, sticky=False) else: strings = NP("array", state.edges, dtype=NP.dtype(object)) if minProfiled is not None: values = NP("ones", len(state.edges), dtype=state.profiledFieldType.dtype) * maxProfiled values[0] = minProfiled else: values = NP("zeros", len(state.edges), dtype=state.profiledFieldType.dtype) plotRange.expand(strings, values, state.slicedFieldType, state.profiledFieldType) else: if state.slicedFieldType is self.fieldTypeNumeric: plotRange.yminPush(lowEdge, state.slicedFieldType, sticky=False) plotRange.ymaxPush(highEdge, state.slicedFieldType, sticky=False) if minProfiled is not None: plotRange.xminPush(minProfiled, state.profiledFieldType, sticky=False) plotRange.xmaxPush(maxProfiled, state.profiledFieldType, sticky=False) else: strings = NP("array", state.edges, dtype=NP.dtype(object)) if minProfiled is not None: values = NP("ones", len(state.edges), dtype=state.profiledFieldType.dtype) * maxProfiled values[0] = minProfiled else: values = NP("zeros", len(state.edges), dtype=state.profiledFieldType.dtype) plotRange.expand(values, strings, state.profiledFieldType, state.slicedFieldType) 
performanceTable.end("PlotBoxAndWhisker prepare") def draw(self, state, plotCoordinates, plotDefinitions, performanceTable): """Draw the plot element. This stage consists of creating an SVG image of the pre-computed data. @type state: ad-hoc Python object @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage. This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState. @type plotCoordinates: PlotCoordinates @param plotCoordinates: The coordinate system in which this plot element will be placed. @type plotDefinitions: PlotDefinitions @type plotDefinitions: The dictionary of key-value pairs that forms the <defs> section of the SVG document. @type performanceTable: PerformanceTable @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process. @rtype: SvgBinding @return: An SVG fragment representing the fully drawn plot element. """ svg = SvgBinding.elementMaker performanceTable.begin("PlotBoxAndWhisker draw") vertical = self.get("vertical", defaultFromXsd=True, convertType=True) gap = self.get("gap", defaultFromXsd=True, convertType=True) if state.slicedFieldType is not self.fieldTypeNumeric: if vertical: strings = plotCoordinates.xstrings else: strings = plotCoordinates.ystrings newRanges = [] for string in strings: try: index = state.edges.index(string) except ValueError: newRanges.append(None) else: newRanges.append(state.ranges[index]) state.ranges = newRanges state.edges = [(i - 0.5, i + 0.5) for i in xrange(len(strings))] lowEdge = NP("array", [low if low is not None else float("-inf") for low, high in state.edges], dtype=NP.dtype(float)) highEdge = NP("array", [high if high is not None else float("inf") for low, high in state.edges], dtype=NP.dtype(float)) selection = NP("array", [levels is not None for levels in state.ranges], dtype=NP.dtype(bool)) lowEdge = lowEdge[selection] highEdge = highEdge[selection] lowWhisker = NP("array", [levels[0] for levels in state.ranges if levels is not None], dtype=state.profiledFieldType.dtype) lowBox = NP("array", [levels[1] for levels in state.ranges if levels is not None], dtype=state.profiledFieldType.dtype) midLine = NP("array", [levels[2] for levels in state.ranges if levels is not None], dtype=state.profiledFieldType.dtype) highBox = NP("array", [levels[3] for levels in state.ranges if levels is not None], dtype=state.profiledFieldType.dtype) highWhisker = NP("array", [levels[4] for levels in state.ranges if levels is not None], dtype=state.profiledFieldType.dtype) output = svg.g() if len(lowEdge) > 0: if vertical: Ax = lowEdge Bx = lowEdge Cx = lowEdge Dx = highEdge Ex = highEdge Fx = highEdge Gx = NP(NP(lowEdge + highEdge) / 2.0) Hx = Gx Ix = Gx Jx = Gx Ay = lowBox By = midLine Cy = highBox Dy = lowBox Ey = midLine Fy = highBox Gy = lowWhisker Hy = lowBox Iy = highBox Jy = highWhisker else: Ax = lowBox Bx = midLine Cx = highBox Dx = lowBox Ex = midLine Fx = highBox Gx = lowWhisker Hx = lowBox Ix = highBox Jx = highWhisker Ay = lowEdge By = lowEdge Cy = lowEdge Dy = highEdge Ey = highEdge Fy = highEdge Gy = NP(NP(lowEdge + highEdge) / 2.0) Hy = Gy Iy = Gy Jy = Gy AX, AY = plotCoordinates(Ax, Ay) BX, BY = plotCoordinates(Bx, By) CX, CY = plotCoordinates(Cx, Cy) DX, DY = plotCoordinates(Dx, Dy) EX, EY = plotCoordinates(Ex, Ey) FX, FY = plotCoordinates(Fx, Fy) GX, GY = plotCoordinates(Gx, Gy) HX, HY = plotCoordinates(Hx, Hy) IX, IY = plotCoordinates(Ix, Iy) JX, JY = 
plotCoordinates(Jx, Jy) if vertical: if gap > 0.0 and NP(NP(DX - gap/2.0) - NP(AX + gap/2.0)).min() > 0.0: AX += gap/2.0 BX += gap/2.0 CX += gap/2.0 DX -= gap/2.0 EX -= gap/2.0 FX -= gap/2.0 else: if gap > 0.0 and NP(NP(DY - gap/2.0) - NP(AY + gap/2.0)).min() > 0.0: AY += gap/2.0 BY += gap/2.0 CY += gap/2.0 DY -= gap/2.0 EY -= gap/2.0 FY -= gap/2.0 style = self.getStyleState() strokeStyle = dict((x, style[x]) for x in style if x.startswith("stroke")) strokeStyle["fill"] = "none" style = PlotStyle.toString(style) strokeStyle = PlotStyle.toString(strokeStyle) for i in xrange(len(lowEdge)): pathdata = ["M %r %r" % (HX[i], HY[i]), "L %r %r" % (AX[i], AY[i]), "L %r %r" % (BX[i], BY[i]), "L %r %r" % (CX[i], CY[i]), "L %r %r" % (IX[i], IY[i]), "L %r %r" % (FX[i], FY[i]), "L %r %r" % (EX[i], EY[i]), "L %r %r" % (DX[i], DY[i]), "L %r %r" % (HX[i], HY[i]), "Z"] output.append(svg.path(d=" ".join(pathdata), style=style)) output.append(svg.path(d="M %r %r L %r %r" % (BX[i], BY[i], EX[i], EY[i]), style=strokeStyle)) output.append(svg.path(d="M %r %r L %r %r" % (HX[i], HY[i], GX[i], GY[i]), style=strokeStyle)) output.append(svg.path(d="M %r %r L %r %r" % (IX[i], IY[i], JX[i], JY[i]), style=strokeStyle)) if vertical: width = (DX[i] - AX[i]) / 4.0 output.append(svg.path(d="M %r %r L %r %r" % (GX[i] - width, GY[i], GX[i] + width, GY[i]), style=strokeStyle)) output.append(svg.path(d="M %r %r L %r %r" % (JX[i] - width, JY[i], JX[i] + width, JY[i]), style=strokeStyle)) else: width = (DY[i] - AY[i]) / 4.0 output.append(svg.path(d="M %r %r L %r %r" % (GX[i], GY[i] - width, GX[i], GY[i] + width), style=strokeStyle)) output.append(svg.path(d="M %r %r L %r %r" % (JX[i], JY[i] - width, JX[i], JY[i] + width), style=strokeStyle)) performanceTable.end("PlotBoxAndWhisker draw") svgId = self.get("svgId") if svgId is not None: output["id"] = svgId return output
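# --- Illustrative sketch (not part of the original source). With the
# default levels="percentage" attributes, prepare() computes each box from
# five percentiles of the slice's profiled distribution, which draw()
# renders as whiskers, box, and middle line:
#
#     import numpy
#
#     distribution = numpy.array([1.0, 2.0, 3.0, 4.0, 5.0])
#     lowWhisker, lowBox, midLine, highBox, highWhisker = numpy.percentile(
#         distribution, [0.0, 25.0, 50.0, 75.0, 100.0])
#     # (1.0, 2.0, 3.0, 4.0, 5.0): whiskers at the extremes, box at the
#     # quartiles, middle line at the median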
def __init__(self, fileNames, namesToFieldTypes=None, namesToAvroPaths=None, inputState=None, chunkSize=1000000): if InputStream is None: raise RuntimeError( "The optional augustus.avrostream module is required for \"AvroDataTableStream\" but it hasn't been installed or the Avro C++ library is not accessible;%sRecommendations: re-build Augustus with \"python setup.py install --with-avrostream\" or correct your LD_LIBRARY_PATH" % os.linesep) if isinstance(fileNames, basestring): self.fileNames = glob.glob(fileNames) if len(self.fileNames) == 0: raise IOError("No files matched the fileName pattern \"%s\"" % fileNames) else: self.fileNames = fileNames self.schema = None for fileName in self.fileNames: inputStream = InputStream() inputStream.start(fileName, 0, {}, {}) try: schema = json.loads(inputStream.schema()) if self.schema is not None and schema != self.schema: raise ValueError( "these files do not all have the same schema") self.schema = schema except Exception: raise finally: inputStream.close() if self.schema["type"] != "record": raise TypeError( "Top level of schema must describe a record, not %r" % self.schema) if namesToFieldTypes is None: if namesToAvroPaths is None: namesToFieldTypes = dict( (x["name"], None) for x in self.schema["fields"]) # If no parameters are given and this is a map-reduce result, drill down and get the values. if set(namesToFieldTypes.keys()) == set(["key", "value"]) and [ x["type"] for x in self.schema["fields"] if x["name"] == "key" ][0] == "string" and [ x["type"] for x in self.schema["fields"] if x["name"] == "value" ][0]["type"] == "record": del namesToFieldTypes["value"] namesToAvroPaths = {"key": ("key", )} for x in [ x["type"] for x in self.schema["fields"] if x["name"] == "value" ][0]["fields"]: name = x["name"] if name != "key": namesToFieldTypes[name] = None namesToAvroPaths[name] = ("value", name) else: namesToFieldTypes = dict((x, None) for x in namesToAvroPaths) if isinstance(namesToFieldTypes, (list, tuple)): namesToFieldTypes = dict((x, None) for x in namesToFieldTypes) if namesToAvroPaths is None: self.namesToAvroPaths = {} for name in namesToFieldTypes: self.namesToAvroPaths[name] = (name, ) else: self.namesToAvroPaths = dict(namesToAvroPaths) for name, path in self.namesToAvroPaths.items(): if isinstance(path, basestring): self.namesToAvroPaths[name] = (path, ) self.namesToFieldTypes = dict(namesToFieldTypes) for name, fieldType in namesToFieldTypes.items(): schemaObject = self.schema path = self.namesToAvroPaths[name] for pathname in path: if schemaObject["type"] == "record": pass elif isinstance( schemaObject["type"], dict) and schemaObject["type"].get("type") == "record": schemaObject = schemaObject["type"] else: raise LookupError("path %r not found in the schema" % (path, )) fieldNames = [x["name"] for x in schemaObject["fields"]] if pathname not in fieldNames: raise LookupError("path %r not found in the schema" % (path, )) schemaObject, = (x for x in schemaObject["fields"] if x["name"] == pathname) avroType = schemaObject["type"] if isinstance(avroType, dict): avroType = avroType["type"] if avroType == "enum": values = [ FakeFieldValue(x) for x in schemaObject["type"]["symbols"] ] else: values = [] if fieldType == "string": self.namesToFieldTypes[name] = FakeFieldType( "string", "continuous") elif fieldType == "categorical": self.namesToFieldTypes[name] = FakeFieldType("string", "categorical", values=values) self._setupMaps(self.namesToFieldTypes[name]) elif fieldType == "ordinal": self.namesToFieldTypes[name] = 
FakeFieldType("string", "ordinal", values=values) self._setupMaps(self.namesToFieldTypes[name]) elif isinstance(fieldType, basestring): self.namesToFieldTypes[name] = FakeFieldType( fieldType, "continuous") elif fieldType is None: if avroType in ("null", "record", "array", "map", "fixed"): del self.namesToFieldTypes[name] del self.namesToAvroPaths[name] elif avroType in ("boolean", "int", "long"): self.namesToFieldTypes[name] = FakeFieldType( "integer", "continuous") elif avroType in ("float", "double"): self.namesToFieldTypes[name] = FakeFieldType( "double", "continuous") elif avroType in ("bytes", "string"): self.namesToFieldTypes[name] = FakeFieldType( "string", "continuous") elif avroType == "enum": self.namesToFieldTypes[name] = FakeFieldType("string", "categorical", values=values) self._setupMaps(self.namesToFieldTypes[name]) else: raise TypeError("Unrecognized Avro type: %s" % avroType) if name in self.namesToFieldTypes: fieldType = self.namesToFieldTypes[name] if not isinstance(fieldType, FieldType): raise TypeError("namesToFieldTypes must map to FieldTypes") # TODO: make this more sensible if fieldType.dataType in ("date", "time", "dateTime", "dateDaysSince[0]", "dateDaysSince[1960]", "dateDaysSince[1970]", "dateDaysSince[1980]", "timeSeconds", "dateTimeSecondsSince[0]", "dateTimeSecondsSince[1960]", "dateTimeSecondsSince[1970]", "dateTimeSecondsSince[1980]"): raise NotImplementedError if fieldType.dataType == "object": raise TypeError( "PMML type %r and Avro type \"%s\" are incompatible" % (fieldType, avroType)) elif fieldType.dataType == "string": if fieldType.optype == "continuous": if avroType not in ("boolean", "int", "long", "float", "double", "string", "bytes"): raise TypeError( "PMML type %r and Avro type \"%s\" are incompatible" % (fieldType, avroType)) elif fieldType.optype == "categorical": if avroType != "enum": raise TypeError( "PMML type %r and Avro type \"%s\" are incompatible" % (fieldType, avroType)) elif fieldType.optype == "ordinal": if avroType != "enum": raise TypeError( "PMML type %r and Avro type \"%s\" are incompatible" % (fieldType, avroType)) elif fieldType.dataType in ("boolean", "integer", "dateDaysSince[0]", "dateDaysSince[1960]", "dateDaysSince[1970]", "dateDaysSince[1980]", "timeSeconds", "dateTimeSecondsSince[0]", "dateTimeSecondsSince[1960]", "dateTimeSecondsSince[1970]", "dateTimeSecondsSince[1980]"): if avroType not in ("boolean", "int", "long"): raise TypeError( "PMML type %r and Avro type \"%s\" are incompatible" % (fieldType, avroType)) elif fieldType.dataType in ("float", "double"): if avroType not in ("boolean", "int", "long", "float", "double"): raise TypeError( "PMML type %r and Avro type \"%s\" are incompatible" % (fieldType, avroType)) elif fieldType.dataType == "boolean": raise TypeError( "PMML type %r and Avro type \"%s\" are incompatible" % (fieldType, avroType)) elif fieldType.dataType in ("date", "time", "dateTime"): if avroType != "string": raise TypeError( "PMML type %r and Avro type \"%s\" are incompatible" % (fieldType, avroType)) self.inputState = inputState self.chunkSize = chunkSize
_YEAR = 365 * _DAY _monthName = { 1: "Jan", 2: "Feb", 3: "Mar", 4: "Apr", 5: "May", 6: "Jun", 7: "Jul", 8: "Aug", 9: "Sep", 10: "Oct", 11: "Nov", 12: "Dec" } _fieldType = FakeFieldType("dateTime", "continuous") @staticmethod def _explicitTimeTicks(low, high, initialize, skip, bigTick, contextGranularity, firstIsContext, anyContext, renderContext, renderOther): lowDateTime = PlotTickMarks._fieldType.valueToPython(low) highDateTime = PlotTickMarks._fieldType.valueToPython(high) ticks = {} miniticks = [] runner = PlotTickMarks._fieldType.valueToPython(low).replace( **initialize) while runner <= highDateTime: td = runner - FakeFieldType._dateTimeOrigin
class MiningModel(PmmlModel): """MiningModel implements segmentation, the application of a large pool of models to a dataset, with models selected for individual data records by the data's features. U{PMML specification<http://www.dmg.org/v4-1/MultipleModels.html>}. """ scoreType = FakeFieldType("object", "any") scoreTypeSegment = FakeFieldType("object", "any") scoreTypeCardinality = FakeFieldType("integer", "continuous") SELECT_ALL = object() MEDIAN = object() SUM = object() AVERAGE = object() WEIGHTED_AVERAGE = object() MAJORITY_VOTE = object() WEIGHTED_MAJORITY_VOTE = object() def calculateScore(self, dataTable, functionTable, performanceTable): """Calculate the score of this model. This method is called by C{calculate} to separate operations that are performed by all models (in C{calculate}) from operations that are performed by specific models (in C{calculateScore}). @type dataTable: DataTable @param dataTable: The DataTable representing this model's lexical scope. @type functionTable: FunctionTable or None @param functionTable: A table of functions. @type performanceTable: PerformanceTable or None @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation. @rtype: dict @return: A dictionary mapping score field names (None for the primary score) to DataColumns. """ segmentation = self.childOfTag("Segmentation") if segmentation is None: return dataTable multipleModelMethod = segmentation.get("multipleModelMethod") if multipleModelMethod == "selectAll": return self._selectAllMedianMajority(dataTable, functionTable, performanceTable, segmentation, self.SELECT_ALL) elif multipleModelMethod == "median": return self._selectAllMedianMajority(dataTable, functionTable, performanceTable, segmentation, self.MEDIAN) elif multipleModelMethod == "majorityVote": return self._selectAllMedianMajority(dataTable, functionTable, performanceTable, segmentation, self.MAJORITY_VOTE) elif multipleModelMethod == "weightedMajorityVote": return self._selectAllMedianMajority(dataTable, functionTable, performanceTable, segmentation, self.WEIGHTED_MAJORITY_VOTE) elif multipleModelMethod == "selectFirst": return self._selectFirst(dataTable, functionTable, performanceTable, segmentation) elif multipleModelMethod == "sum": return self._sumAverageWeighted(dataTable, functionTable, performanceTable, segmentation, self.SUM) elif multipleModelMethod == "average": return self._sumAverageWeighted(dataTable, functionTable, performanceTable, segmentation, self.AVERAGE) elif multipleModelMethod == "weightedAverage": return self._sumAverageWeighted(dataTable, functionTable, performanceTable, segmentation, self.WEIGHTED_AVERAGE) elif multipleModelMethod == "max": return self._selectMax(dataTable, functionTable, performanceTable, segmentation) else: raise NotImplementedError( "multipleModelMethod \"%s\" has not been implemented" % multipleModelMethod) def _selectAllMedianMajority(self, dataTable, functionTable, performanceTable, segmentation, which): """Used by C{calculateScore}.""" if which is self.SELECT_ALL: performanceLabel = "Segmentation selectAll" elif which is self.MEDIAN: performanceLabel = "Segmentation median" elif which is self.MAJORITY_VOTE: performanceLabel = "Segmentation majorityVote" elif which is self.WEIGHTED_MAJORITY_VOTE: performanceLabel = "Segmentation weightedMajorityVote" performanceTable.begin(performanceLabel) scores = [[] for x in xrange(len(dataTable))] if which is self.SELECT_ALL: segments = [[] for x in xrange(len(dataTable))] newOutputData = {} for segment in segmentation.childrenOfTag("Segment",
iterator=True): performanceTable.pause(performanceLabel) selection = segment.childOfClass(PmmlPredicate).evaluate( dataTable, functionTable, performanceTable) performanceTable.unpause(performanceLabel) if not selection.any(): continue segmentName = segment.get("id") indexes = NP("nonzero", selection)[0] subTable = dataTable.subTable(selection) subModel = segment.childOfClass(PmmlModel) performanceTable.pause(performanceLabel) subModel.calculate(subTable, functionTable, performanceTable) performanceTable.unpause(performanceLabel) if which is self.MEDIAN and subTable.score.fieldType.dataType in ( "string", "boolean", "object"): raise defs.PmmlValidationError( "Segmentation with multipleModelMethod=\"median\" cannot be applied to models that produce dataType \"%s\"" % subTable.score.fieldType.dataType) scoreData = subTable.score.data scoreMask = subTable.score.mask indexesUsed = indexes if which is self.SELECT_ALL: for subIndex, index in enumerate(indexes): if scoreMask is None or scoreMask[subIndex] == defs.VALID: scores[index].append(scoreData[subIndex]) segments[index].append(segmentName) elif which is self.MEDIAN: for subIndex, index in enumerate(indexes): if scoreMask is None or scoreMask[subIndex] == defs.VALID: scores[index].append(scoreData[subIndex]) elif which in (self.MAJORITY_VOTE, self.WEIGHTED_MAJORITY_VOTE): if which is self.MAJORITY_VOTE: weight = 1.0 else: weight = float(segment.get("weight", 1.0)) for subIndex, index in enumerate(indexes): if scoreMask is None or scoreMask[subIndex] == defs.VALID: newValue = scoreData[subIndex] score = scores[index] found = False for pair in score: if pair[0] == newValue: pair[1] += weight found = True break if not found: score.append([newValue, weight]) if which is self.SELECT_ALL: for fieldName, dataColumn in subTable.output.items(): newData = newOutputData.get(fieldName) if newData is None: newData = [[] for x in xrange(len(dataTable))] newOutputData[fieldName] = newData dataColumnData = dataColumn.data dataColumnMask = dataColumn.mask for subIndex, index in enumerate(indexes): if scoreMask is None or scoreMask[ subIndex] == defs.VALID: if dataColumnMask is None or dataColumnMask[ subIndex] == defs.VALID: newData[index].append(dataColumnData[subIndex]) else: newData[index].append(None) if which is self.SELECT_ALL: for fieldName, newData in newOutputData.items(): finalNewData = NP("empty", len(dataTable), dtype=NP.dtype(object)) for index, newDatum in enumerate(newData): finalNewData[index] = tuple(newDatum) dataTable.output[fieldName] = DataColumn( self.scoreType, finalNewData, None) finalScoresData = NP("empty", len(dataTable), dtype=NP.dtype(object)) for index, score in enumerate(scores): finalScoresData[index] = tuple(score) finalScores = DataColumn(self.scoreType, finalScoresData, None) if self.name is None: performanceTable.end(performanceLabel) return {None: finalScores} else: finalSegmentsData = NP("empty", len(dataTable), dtype=NP.dtype(object)) for index, segment in enumerate(segments): finalSegmentsData[index] = tuple(segment) performanceTable.end(performanceLabel) return { None: finalScores, "segment": DataColumn(self.scoreTypeSegment, finalSegmentsData, None) } elif which is self.MEDIAN: finalScoresData = NP("empty", len(dataTable), dtype=NP.dtype(object)) finalScoresMask = NP("empty", len(dataTable), dtype=defs.maskType) for index, score in enumerate(scores): if len(score) > 0: finalScoresData[index] = NP("median", score) finalScoresMask[index] = defs.VALID else: finalScoresMask[index] = defs.INVALID if not 
finalScoresMask.any(): finalScoresMask = None finalScores = DataColumn(self.scoreType, finalScoresData, finalScoresMask) performanceTable.end(performanceLabel) return {None: finalScores} elif which in (self.MAJORITY_VOTE, self.WEIGHTED_MAJORITY_VOTE): finalScoresData = NP("empty", len(dataTable), dtype=NP.dtype(object)) finalScoresMask = NP("empty", len(dataTable), dtype=defs.maskType) cardinality = NP("empty", len(dataTable), dtype=self.scoreTypeCardinality.dtype) for index, score in enumerate(scores): bestN, bestValue = None, None for value, N in score: if bestN is None or N > bestN: bestN = N bestValue = value if bestN is not None: finalScoresData[index] = bestValue finalScoresMask[index] = defs.VALID cardinality[index] = bestN else: finalScoresMask[index] = defs.INVALID cardinality[index] = 0 if not finalScoresMask.any(): finalScoresMask = None finalScores = DataColumn(self.scoreType, finalScoresData, finalScoresMask) if self.name is None: performanceTable.end(performanceLabel) return {None: finalScores} else: finalCardinality = DataColumn(self.scoreTypeCardinality, cardinality, None) performanceTable.end(performanceLabel) return {None: finalScores, "cardinality": finalCardinality} def _selectFirst(self, dataTable, functionTable, performanceTable, segmentation): """Used by C{calculateScore}.""" performanceTable.begin("Segmentation selectFirst") scoresData = NP("empty", len(dataTable), dtype=NP.dtype(object)) scoresMask = NP("zeros", len(dataTable), dtype=defs.maskType) unfilled = NP("ones", len(dataTable), dtype=NP.dtype(bool)) segments = NP("empty", len(dataTable), dtype=NP.dtype(object)) newOutputData = [] for segment in segmentation.childrenOfTag("Segment", iterator=True): performanceTable.pause("Segmentation selectFirst") selection = segment.childOfClass(PmmlPredicate).evaluate( dataTable, functionTable, performanceTable) performanceTable.unpause("Segmentation selectFirst") NP("logical_and", selection, unfilled, selection) if not selection.any(): continue subTable = dataTable.subTable(selection) subModel = segment.childOfClass(PmmlModel) performanceTable.pause("Segmentation selectFirst") subModel.calculate(subTable, functionTable, performanceTable) performanceTable.unpause("Segmentation selectFirst") scoresData[selection] = subTable.score.data if subTable.score.mask is not None: scoresMask[selection] = subTable.score.mask else: scoresMask[selection] = defs.VALID segmentName = segment.get("id") if segmentName is not None: segments[selection] = segmentName for fieldName, dataColumn in subTable.output.items(): if fieldName not in dataTable.output: data = NP("empty", len(dataTable), dtype=dataColumn.fieldType.dtype) data[selection] = dataColumn.data mask = NP( NP("ones", len(dataTable), dtype=defs.maskType) * defs.MISSING) if dataColumn.mask is None: mask[selection] = defs.VALID else: mask[selection] = dataColumn.mask newDataColumn = DataColumn(dataColumn.fieldType, data, mask) newDataColumn._unlock() dataTable.output[fieldName] = newDataColumn newOutputData.append(newDataColumn) else: newDataColumn = dataTable.output[fieldName] newDataColumn.data[selection] = dataColumn.data if dataColumn.mask is None: newDataColumn.mask[selection] = defs.VALID else: newDataColumn.mask[selection] = dataColumn.mask unfilled -= selection if not unfilled.any(): break for newDataColumn in newOutputData: if not newDataColumn.mask.any(): newDataColumn._mask = None newDataColumn._lock() if not scoresMask.any(): scoresMask = None scores = DataColumn(self.scoreType, scoresData, scoresMask) if self.name is 
None: performanceTable.end("Segmentation selectFirst") return {None: scores} else: performanceTable.end("Segmentation selectFirst") return { None: scores, "segment": DataColumn(self.scoreTypeSegment, segments, None) } def _sumAverageWeighted(self, dataTable, functionTable, performanceTable, segmentation, which): """Used by C{calculateScore}.""" if which is self.SUM: performanceLabel = "Segmentation sum" elif which is self.AVERAGE: performanceLabel = "Segmentation average" elif which is self.WEIGHTED_AVERAGE: performanceLabel = "Segmentation weightedAverage" performanceTable.begin(performanceLabel) scoresData = NP("zeros", len(dataTable), dtype=NP.dtype(object)) if which is not self.SUM: denominator = NP("zeros", len(dataTable), dtype=NP.dtype(float)) invalid = NP("zeros", len(dataTable), dtype=NP.dtype(bool)) for segment in segmentation.childrenOfTag("Segment", iterator=True): performanceTable.pause(performanceLabel) selection = segment.childOfClass(PmmlPredicate).evaluate( dataTable, functionTable, performanceTable) performanceTable.unpause(performanceLabel) if not selection.any(): continue subTable = dataTable.subTable(selection) subModel = segment.childOfClass(PmmlModel) performanceTable.pause(performanceLabel) subModel.calculate(subTable, functionTable, performanceTable) performanceTable.unpause(performanceLabel) if subTable.score.fieldType.dataType in ("string", "boolean", "object"): raise defs.PmmlValidationError( "Segmentation with multipleModelMethod=\"%s\" cannot be applied to models that produce dataType \"%s\"" % (self.childOfTag("Segmentation").get( "multipleModelMethod"), subTable.score.fieldType.dataType)) # ignore invalid in matches (like the built-in "+" and "avg" Apply functions) if subTable.score.mask is not None: NP("logical_and", selection, NP(subTable.score.mask == defs.VALID), selection) if which is self.SUM: scoresData[selection] += subTable.score.data if which is self.AVERAGE: scoresData[selection] += subTable.score.data denominator[selection] += 1.0 elif which is self.WEIGHTED_AVERAGE: weight = float(segment.get("weight", 1.0)) scoresData[selection] += (subTable.score.data * weight) denominator[selection] += weight if subTable.score.mask is not None: invalid[selection] = NP("logical_or", invalid[selection], NP(subTable.score.mask != defs.VALID)) if which is not self.SUM: NP("logical_or", invalid, NP(denominator == 0.0), invalid) valid = NP("logical_not", invalid) scoresData[valid] /= denominator[valid] if invalid.any(): scoresMask = NP( NP("array", invalid, dtype=defs.maskType) * defs.INVALID) else: scoresMask = None scores = DataColumn(self.scoreType, scoresData, scoresMask) performanceTable.end(performanceLabel) return {None: scores} def _selectMax(self, dataTable, functionTable, performanceTable, segmentation): """Used by C{calculateScore}.""" performanceTable.begin("Segmentation max") scoresData = NP("empty", len(dataTable), dtype=NP.dtype(object)) filled = NP("zeros", len(dataTable), dtype=NP.dtype(bool)) unfilled = NP("ones", len(dataTable), dtype=NP.dtype(bool)) newOutputData = [] for segment in segmentation.childrenOfTag("Segment", iterator=True): performanceTable.pause("Segmentation max") selection = segment.childOfClass(PmmlPredicate).evaluate( dataTable, functionTable, performanceTable) performanceTable.unpause("Segmentation max") if not selection.any(): continue subTable = dataTable.subTable(selection) subModel = segment.childOfClass(PmmlModel) performanceTable.pause("Segmentation max") subModel.calculate(subTable, functionTable, performanceTable) 
performanceTable.unpause("Segmentation max") if subTable.score.fieldType.dataType in ("string", "boolean", "object"): raise defs.PmmlValidationError( "Segmentation with multipleModelMethod=\"max\" cannot be applied to models that produce dataType \"%s\"" % subTable.score.fieldType.dataType) # ignore invalid in matches (like the built-in "min" Apply function) if subTable.score.mask is not None: NP("logical_and", selection, NP(subTable.score.mask == defs.VALID), selection) selectionFilled = NP("logical_and", selection, filled) selectionUnfilled = NP("logical_and", selection, unfilled) filled_selection = filled[selection] unfilled_selection = unfilled[selection] left, right = subTable.score.data[filled_selection], scoresData[ selectionFilled] condition = NP(left > right) scoresData[selectionFilled] = NP("where", condition, left, right) scoresData[selectionUnfilled] = subTable.score.data[ unfilled_selection] for fieldName, dataColumn in subTable.output.items(): if fieldName not in dataTable.output: data = NP("empty", len(dataTable), dtype=dataColumn.fieldType.dtype) data[selectionUnfilled] = dataColumn.data mask = NP( NP("ones", len(dataTable), dtype=defs.maskType) * defs.MISSING) if dataColumn.mask is None: mask[selectionUnfilled] = defs.VALID else: mask[selectionUnfilled] = dataColumn.mask newDataColumn = DataColumn(dataColumn.fieldType, data, mask) newDataColumn._unlock() dataTable.output[fieldName] = newDataColumn newOutputData.append(newDataColumn) else: newDataColumn = dataTable.output[fieldName] newDataColumn.data[selectionFilled] = NP( "where", condition, dataColumn.data[filled_selection], newDataColumn.data[selectionFilled]) newDataColumn.data[selectionUnfilled] = dataColumn.data[ unfilled_selection] if dataColumn.mask is None: newDataColumn.mask[selectionUnfilled] = defs.VALID else: newDataColumn.mask[selectionUnfilled] = dataColumn.mask filled += selectionUnfilled unfilled -= selectionUnfilled for newDataColumn in newOutputData: if not newDataColumn.mask.any(): newDataColumn._mask = None newDataColumn._lock() if filled.all(): scoresMask = None else: scoresMask = NP(NP("logical_not", filled) * defs.MISSING) scores = DataColumn(self.scoreType, scoresData, scoresMask) performanceTable.end("Segmentation max") return {None: scores}
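# The segment-combining logic above is easier to see in isolation. Below is a
# minimal, self-contained sketch of the weightedMajorityVote accumulation used
# by _selectAllMedianMajority: each segment contributes a (value, weight) vote
# per row, and the final score is the value with the largest accumulated
# weight. Plain Python, no Augustus classes; the function name and data layout
# are illustrative, not part of the library API.

def weighted_majority_vote(votes_per_row):
    """votes_per_row: one list of (value, weight) votes per data row.

    Returns one (bestValue, bestWeight) pair per row; rows with no votes
    yield (None, 0.0), mirroring the INVALID case above.
    """
    results = []
    for votes in votes_per_row:
        tally = {}
        for value, weight in votes:
            tally[value] = tally.get(value, 0.0) + weight   # accumulate weight per candidate value
        if tally:
            best = max(tally, key=tally.get)                # value with the largest total weight
            results.append((best, tally[best]))
        else:
            results.append((None, 0.0))                     # no segment matched this row
    return results

# Two rows: the first is matched by three segments, the second by none.
print(weighted_majority_vote([[("yes", 1.0), ("no", 0.5), ("yes", 0.25)], []]))
# -> [('yes', 1.25), (None, 0.0)]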
def fieldType(self): dataType = self.get("dataType") if dataType is None: return FakeFieldType("string", "continuous") else: return FakeFieldType(dataType, "continuous")
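# The fallback above means an absent dataType attribute yields a continuous
# string field. A tiny sketch of the same rule with a plain dict standing in
# for the PMML element (illustrative only; the real element is an lxml proxy
# whose get() behaves like dict.get()):

def field_type_of(element):
    dataType = element.get("dataType")
    if dataType is None:
        return ("string", "continuous")   # default when dataType is not declared
    return (dataType, "continuous")

print(field_type_of({}))                      # -> ('string', 'continuous')
print(field_type_of({"dataType": "double"}))  # -> ('double', 'continuous')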
def verify(self, showSuccess=False, performanceTable=None): """Run the model verification tests defined by this element. The output is a list of results (all results or only failures, depending on C{showSuccess}), each of which is a dictionary of field names to values. Fields are: - "success": was the comparison successful? - "expectedMissing", "observedMissing": is the expected/observed value missing? - "expectedValue", "observedValue": result as an internal value. - "expectedPythonValue", "observedPythonValue": result as a Python value. - "expectedDisplayValue", "observedDisplayValue": result as a string displayValue. Only "success", "expectedMissing", and "observedMissing" appear if the "is missing?" comparison was unsuccessful. @type showSuccess: bool @param showSuccess: If True, emit output even if the tests are successful. @type performanceTable: PerformanceTable @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation. @rtype: JSON-like list of dicts @return: As described above. """ verificationFields = {} for verificationField in self.xpath("pmml:VerificationFields/pmml:VerificationField"): verificationField.column = verificationField.get("column", verificationField["field"]) verificationField.precision = verificationField.get("precision", defaultFromXsd=True, convertType=True) verificationField.zeroThreshold = verificationField.get("zeroThreshold", defaultFromXsd=True, convertType=True) verificationField.data = [] verificationField.mask = [] verificationFields[verificationField.column] = verificationField inputData = {} inputMask = {} for index, row in enumerate(self.childOfClass(TableInterface).iterate()): for columnName, columnValue in row.items(): verificationField = verificationFields.get(columnName) if verificationField is not None: while len(verificationField.data) < index: verificationField.data.append(defs.PADDING) verificationField.mask.append(True) verificationField.data.append(columnValue) verificationField.mask.append(False) else: inputDataField = inputData.get(columnName) if inputDataField is None: inputDataField = [] inputData[columnName] = inputDataField inputMask[columnName] = [] inputMaskField = inputMask[columnName] while len(inputDataField) < index: inputDataField.append(defs.PADDING) inputMaskField.append(True) inputDataField.append(columnValue) inputMaskField.append(False) for verificationField in verificationFields.values(): while len(verificationField.data) < index: verificationField.data.append(defs.PADDING) verificationField.mask.append(True) for columnName in inputData: inputDataField = inputData[columnName] inputMaskField = inputMask[columnName] while len(inputDataField) < index: inputDataField.append(defs.PADDING) inputMaskField.append(True) for columnName, verificationField in verificationFields.items(): inputData[columnName] = verificationField.data inputMask[columnName] = verificationField.mask model = self.getparent() if performanceTable is None: performanceTable = FakePerformanceTable() performanceTable.begin("make DataTable") dataTable = DataTable(model, inputData, inputMask, inputState=None) performanceTable.end("make DataTable") functionTable = FunctionTable() for miningField in model.xpath("pmml:MiningSchema/pmml:MiningField"): miningField.replaceField(dataTable, functionTable, performanceTable) for calculable in model.calculableTrans(): calculable.calculate(dataTable, functionTable, performanceTable) score = model.calculateScore(dataTable, functionTable, performanceTable) dataTable.score = score[None] if 
model.name is not None: for key, value in score.items(): if key is None: dataTable.fields[model.name] = value else: dataTable.fields["%s.%s" % (model.name, key)] = value for outputField in self.xpath("../pmml:Output/pmml:OutputField"): displayName = outputField.get("displayName", outputField["name"]) outputField.format(dataTable, functionTable, performanceTable, score) output = [] for verificationField in verificationFields.values(): observedOutput = dataTable.fields.get(verificationField["field"]) if observedOutput is None: raise defs.PmmlValidationError("VerificationField references field \"%s\" but it was not produced by the model" % verificationField["field"]) fieldType = observedOutput.fieldType if fieldType.dataType == "object": try: newArray = [float(x) for x in observedOutput.data] except ValueError: pass else: fieldType = FakeFieldType("double", "continuous") observedOutput._data = newArray for index in xrange(len(dataTable)): record = {"field": verificationField["field"], "index": index} record["expectedMissing"] = verificationField.mask[index] record["observedMissing"] = (observedOutput.mask is not None and observedOutput.mask[index] != defs.VALID) if record["expectedMissing"] != record["observedMissing"]: record["success"] = False output.append(record) elif not record["expectedMissing"]: record["expectedValue"] = fieldType.stringToValue(verificationField.data[index]) record["observedValue"] = observedOutput.data[index] record["expectedPythonValue"] = fieldType.valueToPython(record["expectedValue"]) record["observedPythonValue"] = fieldType.valueToPython(record["observedValue"]) record["expectedDisplayValue"] = fieldType.valueToString(record["expectedValue"]) record["observedDisplayValue"] = fieldType.valueToString(record["observedValue"]) if fieldType.optype == "continuous": if (abs(record["expectedValue"]) <= verificationField.zeroThreshold) and (abs(record["observedValue"]) <= verificationField.zeroThreshold): record["success"] = True else: record["success"] = ((record["expectedValue"] * (1.0 - verificationField.precision)) <= record["observedValue"] <= (record["expectedValue"] * (1.0 + verificationField.precision))) if not record["success"] or showSuccess: output.append(record) else: if record["expectedValue"] != record["observedValue"]: record["success"] = False output.append(record) else: record["success"] = True if showSuccess: output.append(record) return output
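# The continuous-field comparison in verify() accepts an observed value when
# it falls inside a relative precision band around the expected value, with an
# absolute zeroThreshold escape near zero (1e-6 and 1e-16 are the PMML XSD
# defaults for precision and zeroThreshold). A standalone sketch of that rule;
# note that the inline form above, expected*(1-precision) <= observed <=
# expected*(1+precision), assumes a non-negative expected value, so this
# sketch orders the band with min/max to cover both signs:

def values_match(expected, observed, precision=1e-6, zeroThreshold=1e-16):
    if abs(expected) <= zeroThreshold and abs(observed) <= zeroThreshold:
        return True   # both effectively zero: relative comparison would be meaningless
    bounds = (expected * (1.0 - precision), expected * (1.0 + precision))
    return min(bounds) <= observed <= max(bounds)

print(values_match(1.0, 1.0000005))    # True: within the relative band
print(values_match(-2.0, -2.000001))   # True: band stays well-ordered for negatives
print(values_match(0.0, 1e-20))        # True: both under zeroThreshold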
def calculateScore(self, dataTable, functionTable, performanceTable): """Calculate the score of this model. This method is called by C{calculate} to separate operations that are performed by all models (in C{calculate}) from operations that are performed by specific models (in C{calculateScore}). @type dataTable: DataTable @param dataTable: The DataTable representing this model's lexical scope. @type functionTable: FunctionTable or None @param functionTable: A table of functions. @type performanceTable: PerformanceTable or None @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation. @rtype: dict @return: A dictionary mapping PMML "feature" strings to DataColumns; the None key holds the predicted value. """ performanceTable.begin("ClusteringModel") performanceTable.begin("set up") distributionBased = (self["modelClass"] == "distributionBased") clusteringFields = self.xpath("pmml:ClusteringField[not(@isCenterField='false')]") fieldWeights = [clusteringField.get("fieldWeight", defaultFromXsd=True, convertType=True) for clusteringField in clusteringFields] for fieldWeight in fieldWeights: if fieldWeight < 0.0: raise defs.PmmlValidationError("ClusteringField fieldWeights must all be non-negative (encountered %g)" % fieldWeight) clusters = self.xpath("pmml:Cluster") comparisonMeasure = self.childOfClass(ComparisonMeasure) defaultCompareFunction = comparisonMeasure.get("compareFunction", defaultFromXsd=True) metric = comparisonMeasure.childOfClass(PmmlClusteringMetric) metrictag = metric.t performanceTable.end("set up") for clusteringField in clusteringFields: dataType = dataTable.fields[clusteringField["field"]].fieldType.dataType if dataType == "string": raise defs.PmmlValidationError("ClusteringField \"%s\" has dataType \"%s\", which cannot be used for clustering" % (clusteringField["field"], dataType)) missingValueWeights = self.childOfTag("MissingValueWeights") if missingValueWeights is None: adjustM = None else: performanceTable.begin("MissingValueWeights") missingWeights = missingValueWeights.childOfClass(PmmlArray).values(convertType=True) sumNMqi = NP("zeros", len(dataTable), dtype=NP.dtype(float)) for clusteringField, missingWeight in zip(clusteringFields, missingWeights): clusteringField.addToAdjustM(dataTable, functionTable, performanceTable, sumNMqi, missingWeight) adjustM = NP(sum(missingWeights) / sumNMqi) adjustM[NP(sumNMqi == 0.0)] = 1.0 performanceTable.end("MissingValueWeights") anyInvalid = NP("zeros", len(dataTable), dtype=NP.dtype(bool)) for clusteringField in clusteringFields: mask = dataTable.fields[clusteringField["field"]].mask if mask is not None: NP("logical_or", anyInvalid, NP(mask == defs.INVALID), anyInvalid) bestClusterId = None bestClusterAffinity = None allClusterAffinities = {} for index, cluster in enumerate(clusters): array = cluster.childOfClass(PmmlArray) if array is None: raise defs.PmmlValidationError("Cluster must have an array to designate its center") centerStrings = array.values(convertType=False) if len(centerStrings) != len(clusteringFields): raise defs.PmmlValidationError("Cluster array has %d components, but there are %d ClusteringFields with isCenterField=true" % (len(centerStrings), len(clusteringFields))) performanceTable.begin(metrictag) if distributionBased: matrix = cluster.xpath("pmml:Covariances/pmml:Matrix") if len(matrix) != 1: raise defs.PmmlValidationError("In distribution-based clustering, all clusters must have a Covariances/Matrix") try: covarianceMatrix = NP("array", matrix[0].values(), dtype=NP.dtype(float)) except ValueError: raise
defs.PmmlValidationError( "Covariances/Matrix must contain real numbers for distribution-based clustering" ) else: covarianceMatrix = None state = self._State() metric.initialize(state, len(dataTable), len(clusteringFields), distributionBased) for clusteringField, centerString, fieldWeight in zip( clusteringFields, centerStrings, fieldWeights): if isinstance(metric, PmmlClusteringMetricBinary): metric.accumulateBinary( state, dataTable.fields[clusteringField["field"]], centerString, distributionBased) else: performanceTable.pause(metrictag) cxy = clusteringField.compare(dataTable, functionTable, performanceTable, centerString, defaultCompareFunction, anyInvalid) performanceTable.unpause(metrictag) metric.accumulate(state, cxy, fieldWeight, distributionBased) distance = metric.finalizeDistance(state, adjustM, distributionBased, covarianceMatrix) del state performanceTable.end(metrictag) if index == 0: bestClusterId = NP("ones", len(dataTable), dtype=NP.dtype(int)) # 1-based index bestClusterAffinity = distance better = NP(distance < bestClusterAffinity) bestClusterId[better] = index + 1 # 1-based index bestClusterAffinity[better] = distance[better] allClusterAffinities[cluster.get("id", "%d" % (index + 1))] = distance if not anyInvalid.any(): scoreMask = None else: scoreMask = NP(anyInvalid * defs.INVALID) performanceTable.begin("set scores") score = {} performanceTable.begin("predictedValue") fieldType = FakeFieldType("string", "categorical") clusterIdentifiers = NP("empty", len(dataTable), dtype=fieldType.dtype) for index, cluster in enumerate(clusters): value = fieldType.stringToValue( cluster.get("id", "%d" % (index + 1))) clusterIdentifiers[NP(bestClusterId == (index + 1))] = value score[None] = DataColumn(fieldType, clusterIdentifiers, scoreMask) performanceTable.end("predictedValue") if self.subFields["predictedDisplayValue"]: performanceTable.begin("predictedDisplayValue") fieldType = FakeFieldType("string", "categorical") clusterNames = NP("empty", len(dataTable), dtype=fieldType.dtype) for index, cluster in enumerate(clusters): value = fieldType.stringToValue(cluster.get("name", "")) clusterNames[NP(bestClusterId == (index + 1))] = value score["predictedDisplayValue"] = DataColumn( fieldType, clusterNames, scoreMask) performanceTable.end("predictedDisplayValue") if self.subFields["entity"]: performanceTable.begin("entity") fieldType = FakeFieldType("object", "any") entities = NP("empty", len(dataTable), dtype=fieldType.dtype) for index, cluster in enumerate(clusters): value = fieldType.stringToValue(cluster.get("name", "")) indexPlusOne = index + 1 for i in xrange(len(entities)): if bestClusterId[i] == indexPlusOne: entities[i] = cluster score["entity"] = DataColumn(fieldType, entities, scoreMask) performanceTable.end("entity") if self.subFields["clusterId"]: performanceTable.begin("clusterId") fieldType = FakeFieldType("integer", "continuous") score["clusterId"] = DataColumn(fieldType, bestClusterId, scoreMask) performanceTable.end("clusterId") if self.subFields["entityId"]: performanceTable.begin("entityId") fieldType = FakeFieldType("integer", "continuous") score["entityId"] = DataColumn(fieldType, bestClusterId, scoreMask) performanceTable.end("entityId") if self.subFields["clusterAffinity"]: performanceTable.begin("clusterAffinity") fieldType = FakeFieldType("double", "continuous") score["clusterAffinity"] = DataColumn(fieldType, bestClusterAffinity, scoreMask) performanceTable.end("clusterAffinity") if self.subFields["affinity"]: performanceTable.begin("affinity") fieldType 
= FakeFieldType("double", "continuous") score["affinity"] = DataColumn(fieldType, bestClusterAffinity, scoreMask) performanceTable.end("affinity") if self.subFields["all"]: performanceTable.begin("all") fieldType = FakeFieldType("double", "continuous") for identifier, distance in allClusterAffinities.items(): score["all.%s" % identifier] = DataColumn( fieldType, distance, scoreMask) performanceTable.end("all") performanceTable.end("set scores") performanceTable.end("ClusteringModel") return score
class PlotHistogram(PmmlPlotContent): """Represents a 1d histogram of the data. PMML subelements: - PlotExpression role="data": the numeric or categorical data. - PlotNumericExpression role="weight": histogram weights. - PlotSelection: expression or predicate to filter the data before plotting. - Intervals: non-uniform (numerical) histogram bins. - Values: explicit (categorical) histogram values. - PlotSvgMarker: inline SVG for histograms drawn with markers, where the markers are SVG pictograms. PMML attributes: - svgId: id for the resulting SVG element. - stateId: key for persistent storage in a DataTableState. - numBins: number of histogram bins. - low: histogram low edge. - high: histogram high edge. - normalized: if "false", the histogram represents the number of counts in each bin; if "true", the histogram represents density, with a total integral (taking into account bin widths) of 1.0. - cumulative: if "false", the histogram approximates a probability density function (PDF) with flat-top bins; if "true", the histogram approximates a cumulative distribution function (CDF) with linear-top bins. - vertical: if "true", plot the "data" expression on the x axis and the counts/density/cumulative values on the y axis. - visualization: one of "skyline", "polyline", "smooth", "points", "errorbars". - gap: size of the space between histogram bars in SVG coordinates. - marker: marker to use for "points" visualization (see PlotScatter). - style: CSS style properties. CSS properties: - fill, fill-opacity: color of the histogram bars. - stroke, stroke-dasharray, stroke-dashoffset, stroke-linecap, stroke-linejoin, stroke-miterlimit, stroke-opacity, stroke-width: properties of the line drawing. - marker-size, marker-outline: marker style for "points" visualization. See the source code for the full XSD. 
""" styleProperties = [ "fill", "fill-opacity", "stroke", "stroke-dasharray", "stroke-dashoffset", "stroke-linecap", "stroke-linejoin", "stroke-miterlimit", "stroke-opacity", "stroke-width", "marker-size", "marker-outline", ] styleDefaults = { "fill": "none", "stroke": "black", "marker-size": "5", "marker-outline": "none" } xsd = """<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"> <xs:element name="PlotHistogram"> <xs:complexType> <xs:sequence> <xs:element ref="Extension" minOccurs="0" maxOccurs="unbounded" /> <xs:element ref="PlotExpression" minOccurs="1" maxOccurs="1" /> <xs:element ref="PlotNumericExpression" minOccurs="0" maxOccurs="1" /> <xs:element ref="PlotSelection" minOccurs="0" maxOccurs="1" /> <xs:choice minOccurs="0" maxOccurs="1"> <xs:element ref="Interval" minOccurs="1" maxOccurs="unbounded" /> <xs:element ref="Value" minOccurs="1" maxOccurs="unbounded" /> </xs:choice> <xs:element ref="PlotSvgMarker" minOccurs="0" maxOccurs="1" /> </xs:sequence> <xs:attribute name="svgId" type="xs:string" use="optional" /> <xs:attribute name="stateId" type="xs:string" use="optional" /> <xs:attribute name="numBins" type="xs:positiveInteger" use="optional" /> <xs:attribute name="low" type="xs:double" use="optional" /> <xs:attribute name="high" type="xs:double" use="optional" /> <xs:attribute name="normalized" type="xs:boolean" use="optional" default="false" /> <xs:attribute name="cumulative" type="xs:boolean" use="optional" default="false" /> <xs:attribute name="vertical" type="xs:boolean" use="optional" default="true" /> <xs:attribute name="visualization" use="optional" default="skyline"> <xs:simpleType> <xs:restriction base="xs:string"> <xs:enumeration value="skyline" /> <xs:enumeration value="polyline" /> <xs:enumeration value="smooth" /> <xs:enumeration value="points" /> <xs:enumeration value="errorbars" /> </xs:restriction> </xs:simpleType> </xs:attribute> <xs:attribute name="gap" type="xs:double" use="optional" default="0.0" /> <xs:attribute name="marker" type="PLOT-MARKER-TYPE" use="optional" default="circle" /> <xs:attribute name="style" type="xs:string" use="optional" default="%s" /> </xs:complexType> </xs:element> </xs:schema> """ % PlotStyle.toString(styleDefaults) fieldType = FakeFieldType("double", "continuous") fieldTypeNumeric = FakeFieldType("double", "continuous") @staticmethod def establishBinType(fieldType, intervals, values): """Determine the type of binning to use for a histogram with the given FieldType, Intervals, and Values. @type fieldType: FieldType @param fieldType: The FieldType of the plot expression. @type intervals: list of PmmlBinding @param intervals: The <Interval> elements; may be empty. @type values: list of PmmlBinding @param values: The <Value> elements; may be empty. @rtype: string @return: One of "nonuniform", "explicit", "unique", "scale". 
""" if len(intervals) > 0: if not fieldType.isnumeric() and not fieldType.istemporal(): raise defs.PmmlValidationError( "Explicit Intervals are intended for numerical data, not %r" % fieldType) return "nonuniform" elif len(values) > 0: if not fieldType.isstring(): raise defs.PmmlValidationError( "Explicit Values are intended for string data, not %r" % fieldType) return "explicit" elif fieldType.isstring(): return "unique" else: if not fieldType.isnumeric() and not fieldType.istemporal(): raise defs.PmmlValidationError( "PlotHistogram requires numerical or string data, not %r" % fieldType) return "scale" @staticmethod def determineScaleBins(numBins, low, high, array): """Determine the C{numBins}, C{low}, and C{high} of the histogram from explicitly set values where available and implicitly derived values where necessary. Explicitly set values always override implicit values derived from the dataset. - C{low}, C{high} implicit values are the extrema of the dataset. - C{numBins} implicit value is the Freedman-Diaconis heuristic for number of histogram bins. @type numBins: int or None @param numBins: Input number of bins. @type low: number or None @param low: Low edge. @type high: number or None @param high: High edge. @type array: 1d Numpy array of numbers @param array: Dataset to use to implicitly derive values. @rtype: 3-tuple @return: C{numBins}, C{low}, C{high} """ generateLow = (low is None) generateHigh = (high is None) if generateLow: low = float(array.min()) if generateHigh: high = float(array.max()) if low == high: low, high = low - 1.0, high + 1.0 elif high < low: if generateLow: low = high - 1.0 elif generateHigh: high = low + 1.0 else: raise defs.PmmlValidationError( "PlotHistogram attributes low and high must be in the right order: low = %g, high = %g" % (low, high)) else: if generateLow and generateHigh: low, high = low - 0.2 * (high - low), high + 0.2 * (high - low) elif generateLow: low = low - 0.2 * (high - low) elif generateHigh: high = high + 0.2 * (high - low) if numBins is None: # the Freedman-Diaconis rule q1, q3 = NP("percentile", array, [25.0, 75.0]) binWidth = 2.0 * (q3 - q1) / math.pow(len(array), 1.0 / 3.0) if binWidth > 0.0: numBins = max(10, int(math.ceil((high - low) / binWidth))) else: numBins = 10 return numBins, low, high @staticmethod def selectInterval(fieldType, array, index, lastIndex, interval, edges, lastLimitPoint, lastClosed, lastInterval): """Select rows of an array within an interval as part of filling a non-uniform histogram. @type fieldType: FieldType @param fieldType: FieldType used to interpret the bounds of the interval. @type array: 1d Numpy array @param array: Values to select. @type index: int @param index: Current bin index. @type lastIndex: int @param lastIndex: Previous bin index. @type interval: PmmlBinding @param interval: PMML <Interval> element defining the interval. @type edges: list of 2-tuples @param edges: Pairs of interpreted C{leftMargin}, C{rightMargin} for the histogram. @type lastLimitPoint: number @param lastLimitPoint: Larger of the two last edges. ("Limit point" because it may have been open or closed.) @type lastClosed: bool @param lastClosed: If True, the last limit point was closed. @type lastInterval: PmmlBinding @param lastInterval: PMML <Interval> for the last bin. 
@rtype: 4-tuple @return: C{selection} (1d Numpy array of bool), C{lastLimitPoint}, C{lastClosed}, C{lastInterval} """ closure = interval["closure"] leftMargin = interval.get("leftMargin") rightMargin = interval.get("rightMargin") selection = None if leftMargin is None and rightMargin is None and lastIndex != 0: raise defs.PmmlValidationError("If a histogram bin is unbounded on both ends, it must be the only bin") if leftMargin is not None: try: leftMargin = fieldType.stringToValue(leftMargin) except ValueError: raise defs.PmmlValidationError("Improper value in Interval leftMargin specification: \"%s\"" % leftMargin) if closure in ("openClosed", "openOpen"): if selection is None: selection = NP(leftMargin < array) else: NP("logical_and", selection, NP(leftMargin < array), selection) elif closure in ("closedOpen", "closedClosed"): if selection is None: selection = NP(leftMargin <= array) else: NP("logical_and", selection, NP(leftMargin <= array), selection) if lastLimitPoint is not None: if leftMargin < lastLimitPoint or (leftMargin == lastLimitPoint and (closure in ("closedOpen", "closedClosed")) and lastClosed): raise defs.PmmlValidationError("Intervals are out of order or overlap: %r and %r" % (lastInterval, interval)) elif index != 0: raise defs.PmmlValidationError("Only the first Interval can have an open-ended leftMargin: %r" % interval) if rightMargin is not None: try: rightMargin = fieldType.stringToValue(rightMargin) except ValueError: raise defs.PmmlValidationError("Improper value in Interval rightMargin specification: \"%s\"" % rightMargin) if closure in ("openOpen", "closedOpen"): if selection is None: selection = NP(array < rightMargin) else: NP("logical_and", selection, NP(array < rightMargin), selection) elif closure in ("openClosed", "closedClosed"): if selection is None: selection = NP(array <= rightMargin) else: NP("logical_and", selection, NP(array <= rightMargin), selection) lastLimitPoint = rightMargin lastClosed = (closure in ("openClosed", "closedClosed")) lastInterval = interval elif index != lastIndex: raise defs.PmmlValidationError("Only the last Interval can have an open-ended rightMargin: %r" % interval) edges.append((leftMargin, rightMargin)) return selection, lastLimitPoint, lastClosed, lastInterval def prepare(self, state, dataTable, functionTable, performanceTable, plotRange): """Prepare a plot element for drawing. This stage consists of calculating all quantities and determining the bounds of the data. These bounds may be unioned with bounds from other plot elements that overlay this plot element, so the drawing (which requires a finalized coordinate system) cannot begin yet. This method modifies C{plotRange}. @type state: ad-hoc Python object @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage. This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState. @type dataTable: DataTable @param dataTable: Contains the data to plot. @type functionTable: FunctionTable @param functionTable: Defines functions that may be used to transform data for plotting. @type performanceTable: PerformanceTable @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process. @type plotRange: PlotRange @param plotRange: The bounding box of plot coordinates that this function will expand.
""" self.checkRoles(["data", "weight"]) dataExpression = self.xpath("pmml:PlotExpression[@role='data']") weightExpression = self.xpath( "pmml:PlotNumericExpression[@role='weight']") cutExpression = self.xpath("pmml:PlotSelection") if len(dataExpression) != 1: raise defs.PmmlValidationError( "PlotHistogram requires a PlotNumericExpression with role \"data\"" ) dataColumn = dataExpression[0].evaluate(dataTable, functionTable, performanceTable) if len(weightExpression) == 0: weight = None elif len(weightExpression) == 1: weight = weightExpression[0].evaluate(dataTable, functionTable, performanceTable) else: raise defs.PmmlValidationError( "PlotHistogram may not have more than one PlotNumericExpression with role \"data\"" ) if len(cutExpression) == 1: selection = cutExpression[0].select(dataTable, functionTable, performanceTable) else: selection = NP("ones", len(dataTable), NP.dtype(bool)) performanceTable.begin("PlotHistogram prepare") self._saveContext(dataTable) if dataColumn.mask is not None: NP("logical_and", selection, NP(dataColumn.mask == defs.VALID), selection) if weight is not None and weight.mask is not None: NP("logical_and", selection, NP(weight.mask == defs.VALID), selection) array = dataColumn.data[selection] if weight is not None: weight = weight.data[selection] persistentState = {} stateId = self.get("stateId") if stateId is not None: if stateId in dataTable.state: persistentState = dataTable.state[stateId] else: dataTable.state[stateId] = persistentState intervals = self.xpath("pmml:Interval") values = self.xpath("pmml:Value") if "binType" not in persistentState: performanceTable.begin("establish binType") binType = self.establishBinType(dataColumn.fieldType, intervals, values) persistentState["binType"] = binType if binType == "nonuniform": persistentState["count"] = [0.0] * len(intervals) elif binType == "explicit": persistentState["count"] = [0.0] * len(values) elif binType == "unique": persistentState["count"] = {} elif binType == "scale": numBins = self.get("numBins", convertType=True) low = self.get("low", convertType=True) high = self.get("high", convertType=True) numBins, low, high = self.determineScaleBins( numBins, low, high, array) persistentState["low"] = low persistentState["high"] = high persistentState["numBins"] = numBins persistentState["count"] = [0.0] * numBins performanceTable.end("establish binType") missingSum = 0.0 if persistentState["binType"] == "nonuniform": performanceTable.begin("binType nonuniform") count = [0.0] * len(intervals) edges = [] lastLimitPoint = None lastClosed = None lastInterval = None for index, interval in enumerate(intervals): selection, lastLimitPoint, lastClosed, lastInterval = self.selectInterval( dataColumn.fieldType, array, index, len(intervals) - 1, interval, edges, lastLimitPoint, lastClosed, lastInterval) if selection is not None: if weight is None: count[index] += NP("count_nonzero", selection) else: count[index] += weight[selection].sum() persistentState["count"] = [ x + y for x, y in itertools.izip(count, persistentState["count"]) ] state.fieldType = self.fieldTypeNumeric state.count = persistentState["count"] state.edges = edges lowEdge = min(low for low, high in edges if low is not None) highEdge = max(high for low, high in edges if high is not None) performanceTable.end("binType nonuniform") elif persistentState["binType"] == "explicit": performanceTable.begin("binType explicit") count = [0.0] * len(values) displayValues = [] for index, value in enumerate(values): internalValue = dataColumn.fieldType.stringToValue( 
value["value"]) displayValues.append( value.get( "displayValue", dataColumn.fieldType.valueToString(internalValue, displayValue=True))) selection = NP(array == internalValue) if weight is None: count[index] += NP("count_nonzero", selection) else: count[index] += weight[selection].sum() persistentState["count"] = [ x + y for x, y in itertools.izip(count, persistentState["count"]) ] state.fieldType = dataColumn.fieldType state.count = persistentState["count"] state.edges = displayValues performanceTable.end("binType explicit") elif persistentState["binType"] == "unique": performanceTable.begin("binType unique") uniques, inverse = NP("unique", array, return_inverse=True) if weight is None: counts = NP("bincount", inverse) else: counts = NP("bincount", inverse, weights=weight) persistentCount = persistentState["count"] for i, u in enumerate(uniques): string = dataColumn.fieldType.valueToString(u, displayValue=False) if string in persistentCount: persistentCount[string] += counts[i] else: persistentCount[string] = counts[i] tosort = [(count, string) for string, count in persistentCount.items()] tosort.sort(reverse=True) numBins = self.get("numBins", convertType=True) if numBins is not None: missingSum = sum(count for count, string in tosort[numBins:]) tosort = tosort[:numBins] state.fieldType = dataColumn.fieldType state.count = [count for count, string in tosort] state.edges = [ dataColumn.fieldType.valueToString( dataColumn.fieldType.stringToValue(string), displayValue=True) for count, string in tosort ] performanceTable.end("binType unique") elif persistentState["binType"] == "scale": performanceTable.begin("binType scale") numBins = persistentState["numBins"] low = persistentState["low"] high = persistentState["high"] binWidth = (high - low) / float(numBins) binAssignments = NP("array", NP("floor", NP(NP(array - low) / binWidth)), dtype=NP.dtype(int)) binAssignments[NP(binAssignments > numBins)] = numBins binAssignments[NP(binAssignments < 0)] = numBins if len(binAssignments) == 0: count = NP("empty", 0, dtype=NP.dtype(float)) else: if weight is None: count = NP("bincount", binAssignments) else: count = NP("bincount", binAssignments, weights=weight) if len(count) < numBins: padded = NP("zeros", numBins, dtype=NP.dtype(float)) padded[:len(count)] = count else: padded = count persistentState["count"] = [ x + y for x, y in itertools.izip(padded, persistentState["count"]) ] state.fieldType = self.fieldTypeNumeric state.count = persistentState["count"] state.edges = [(low + i * binWidth, low + (i + 1) * binWidth) for i in xrange(numBins)] lowEdge = low highEdge = high performanceTable.end("binType scale") if self.get("normalized", defaultFromXsd=True, convertType=True): if state.fieldType is self.fieldTypeNumeric: weightedValues = 0.0 for (low, high), value in itertools.izip(state.edges, state.count): if low is not None and high is not None: weightedValues += value / (high - low) newCount = [] for (low, high), value in zip(state.edges, state.count): if low is None or high is None: newCount.append(0.0) else: newCount.append(value / (high - low) / weightedValues) state.count = newCount else: totalCount = sum(state.count) + missingSum state.count = [float(x) / totalCount for x in state.count] if self.get("cumulative", defaultFromXsd=True, convertType=True): maximum = sum(state.count) else: maximum = max(state.count) if self.get("vertical", defaultFromXsd=True, convertType=True): plotRange.yminPush(0.0, self.fieldType, sticky=True) if state.fieldType is self.fieldTypeNumeric: 
plotRange.xminPush(lowEdge, state.fieldType, sticky=True) plotRange.xmaxPush(highEdge, state.fieldType, sticky=True) plotRange.ymaxPush(maximum, state.fieldType, sticky=False) else: plotRange.expand(NP("array", state.edges, dtype=NP.dtype(object)), NP("ones", len(state.edges), dtype=NP.dtype(float)) * maximum, state.fieldType, self.fieldType) else: plotRange.xminPush(0.0, self.fieldType, sticky=True) if state.fieldType is self.fieldTypeNumeric: plotRange.yminPush(lowEdge, state.fieldType, sticky=True) plotRange.ymaxPush(highEdge, state.fieldType, sticky=True) plotRange.xmaxPush(maximum, state.fieldType, sticky=False) else: plotRange.expand(NP("ones", len(state.edges), dtype=NP.dtype(float)) * maximum, NP("array", state.edges, dtype=NP.dtype(object)), self.fieldType, state.fieldType) performanceTable.end("PlotHistogram prepare") def draw(self, state, plotCoordinates, plotDefinitions, performanceTable): """Draw the plot element. This stage consists of creating an SVG image of the pre-computed data. @type state: ad-hoc Python object @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage. This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState. @type plotCoordinates: PlotCoordinates @param plotCoordinates: The coordinate system in which this plot element will be placed. @type plotDefinitions: PlotDefinitions @param plotDefinitions: The dictionary of key-value pairs that forms the <defs> section of the SVG document. @type performanceTable: PerformanceTable @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process. @rtype: SvgBinding @return: An SVG fragment representing the fully drawn plot element.
""" svg = SvgBinding.elementMaker performanceTable.begin("PlotHistogram draw") cumulative = self.get("cumulative", defaultFromXsd=True, convertType=True) vertical = self.get("vertical", defaultFromXsd=True, convertType=True) visualization = self.get("visualization", defaultFromXsd=True) output = svg.g() if len(state.count) > 0: if state.fieldType is not self.fieldTypeNumeric: if vertical: strings = plotCoordinates.xstrings else: strings = plotCoordinates.ystrings newCount = [] for string in strings: try: index = state.edges.index(string) except ValueError: newCount.append(0.0) else: newCount.append(state.count[index]) state.count = newCount state.edges = [(i - 0.5, i + 0.5) for i in xrange(len(strings))] if vertical: Ax = NP("array", [ low if low is not None else float("-inf") for low, high in state.edges ], dtype=NP.dtype(float)) Bx = NP(Ax.copy()) Cx = NP("array", [ high if high is not None else float("inf") for low, high in state.edges ], dtype=NP.dtype(float)) Dx = NP(Cx.copy()) Ay = NP("zeros", len(state.count), dtype=NP.dtype(float)) if cumulative: Cy = NP("cumsum", NP("array", state.count, dtype=NP.dtype(float))) By = NP("roll", Cy, 1) By[0] = 0.0 else: By = NP("array", state.count, dtype=NP.dtype(float)) Cy = NP(By.copy()) Dy = NP(Ay.copy()) else: if cumulative: Cx = NP("cumsum", NP("array", state.count, dtype=NP.dtype(float))) Bx = NP("roll", Cx, 1) Bx[0] = 0.0 else: Bx = NP("array", state.count, dtype=NP.dtype(float)) Cx = NP(Bx.copy()) Ax = NP("zeros", len(state.count), dtype=NP.dtype(float)) Dx = NP(Ax.copy()) Ay = NP("array", [ low if low is not None else float("-inf") for low, high in state.edges ], dtype=NP.dtype(float)) By = NP(Ay.copy()) Cy = NP("array", [ high if high is not None else float("inf") for low, high in state.edges ], dtype=NP.dtype(float)) Dy = NP(Cy.copy()) AX, AY = plotCoordinates(Ax, Ay) BX, BY = plotCoordinates(Bx, By) CX, CY = plotCoordinates(Cx, Cy) DX, DY = plotCoordinates(Dx, Dy) if visualization == "skyline": gap = self.get("gap", defaultFromXsd=True, convertType=True) if vertical: if gap > 0.0 and NP( NP(DX - gap / 2.0) - NP(AX + gap / 2.0)).min() > 0.0: AX += gap / 2.0 BX += gap / 2.0 CX -= gap / 2.0 DX -= gap / 2.0 else: if gap > 0.0 and NP( NP(AY + gap / 2.0) - NP(DY - gap / 2.0)).min() > 0.0: AY -= gap / 2.0 BY -= gap / 2.0 CY += gap / 2.0 DY += gap / 2.0 pathdata = [] nextIsMoveto = True for i in xrange(len(state.count)): iprev = i - 1 inext = i + 1 if vertical and By[i] == 0.0 and Cy[i] == 0.0: if i > 0 and not nextIsMoveto: pathdata.append("L %r %r" % (DX[iprev], DY[iprev])) nextIsMoveto = True elif not vertical and Bx[i] == 0.0 and Cx[i] == 0.0: if i > 0 and not nextIsMoveto: pathdata.append("L %r %r" % (DX[iprev], DY[iprev])) nextIsMoveto = True else: if nextIsMoveto or gap > 0.0 or ( vertical and DX[iprev] != AX[i]) or ( not vertical and DY[iprev] != AY[i]): pathdata.append("M %r %r" % (AX[i], AY[i])) nextIsMoveto = False pathdata.append("L %r %r" % (BX[i], BY[i])) pathdata.append("L %r %r" % (CX[i], CY[i])) if i == len(state.count) - 1 or gap > 0.0 or ( vertical and DX[i] != AX[inext]) or ( not vertical and DY[i] != AY[inext]): pathdata.append("L %r %r" % (DX[i], DY[i])) style = self.getStyleState() del style["marker-size"] del style["marker-outline"] output.append( svg.path(d=" ".join(pathdata), style=PlotStyle.toString(style))) elif visualization == "polyline": pathdata = [] for i in xrange(len(state.count)): if i == 0: pathdata.append("M %r %r" % (AX[i], AY[i])) pathdata.append("L %r %r" % ((BX[i] + CX[i]) / 2.0, (BY[i] + CY[i]) / 2.0)) 
if i == len(state.count) - 1: pathdata.append("L %r %r" % (DX[i], DY[i])) style = self.getStyleState() del style["marker-size"] del style["marker-outline"] output.append( svg.path(d=" ".join(pathdata), style=PlotStyle.toString(style))) elif visualization == "smooth": smoothingSamples = math.ceil(len(state.count) / 2.0) BCX = NP(NP(BX + CX) / 2.0) BCY = NP(NP(BY + CY) / 2.0) xarray = NP("array", [AX[0]] + list(BCX) + [DX[-1]], dtype=NP.dtype(float)) yarray = NP("array", [AY[0]] + list(BCY) + [DY[-1]], dtype=NP.dtype(float)) samples = NP("linspace", AX[0], DX[-1], int(smoothingSamples), endpoint=True) smoothingScale = abs(DX[-1] - AX[0]) / smoothingSamples xlist, ylist, dxlist, dylist = PlotCurve.pointsToSmoothCurve( xarray, yarray, samples, smoothingScale, False) pathdata = PlotCurve.formatPathdata(xlist, ylist, dxlist, dylist, PlotCoordinates(), False, True) style = self.getStyleState() fillStyle = dict( (x, style[x]) for x in style if x.startswith("fill")) fillStyle["stroke"] = "none" strokeStyle = dict( (x, style[x]) for x in style if x.startswith("stroke")) if style["fill"] != "none" and len(pathdata) > 0: if vertical: firstPoint = plotCoordinates(Ax[0], 0.0) lastPoint = plotCoordinates(Dx[-1], 0.0) else: firstPoint = plotCoordinates(0.0, Ay[0]) lastPoint = plotCoordinates(0.0, Dy[-1]) pathdata2 = [ "M %r %r" % firstPoint, pathdata[0].replace("M", "L") ] pathdata2.extend(pathdata[1:]) pathdata2.append(pathdata[-1]) pathdata2.append("L %r %r" % lastPoint) output.append( svg.path(d=" ".join(pathdata2), style=PlotStyle.toString(fillStyle))) output.append( svg.path(d=" ".join(pathdata), style=PlotStyle.toString(strokeStyle))) elif visualization == "points": currentStyle = PlotStyle.toDict(self.get("style") or {}) style = self.getStyleState() if "fill" not in currentStyle: style["fill"] = "black" BCX = NP(NP(BX + CX) / 2.0) BCY = NP(NP(BY + CY) / 2.0) svgId = self.get("svgId") if svgId is None: svgIdMarker = plotDefinitions.uniqueName() else: svgIdMarker = svgId + ".marker" marker = PlotScatter.makeMarker( svgIdMarker, self.get("marker", defaultFromXsd=True), style, self.childOfTag("PlotSvgMarker")) plotDefinitions[marker.get("id")] = marker markerReference = "#" + marker.get("id") output.extend( svg.use( **{ "x": repr(x), "y": repr(y), defs.XLINK_HREF: markerReference }) for x, y in itertools.izip(BCX, BCY)) else: raise NotImplementedError("TODO: add 'errorbars'") svgId = self.get("svgId") if svgId is not None: output["id"] = svgId performanceTable.end("PlotHistogram draw") return output
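# Stripped of plotting, the "scale" binning path in prepare() reduces to:
# derive (numBins, low, high) via the Freedman-Diaconis rule when they are not
# given, assign each value to floor((x - low) / binWidth), and count with
# bincount. A condensed numpy sketch of just that path; the edge-widening of
# generated low/high and the overflow slot used above are omitted here, and
# out-of-range values are simply dropped:

import math
import numpy as np

def scale_histogram(array, numBins=None, low=None, high=None):
    array = np.asarray(array, dtype=float)
    if low is None:
        low = float(array.min())
    if high is None:
        high = float(array.max())
    if numBins is None:
        q1, q3 = np.percentile(array, [25.0, 75.0])
        binWidth = 2.0 * (q3 - q1) / len(array) ** (1.0 / 3.0)   # Freedman-Diaconis
        numBins = max(10, int(math.ceil((high - low) / binWidth))) if binWidth > 0.0 else 10
    binWidth = (high - low) / float(numBins)
    assignments = np.floor((array - low) / binWidth).astype(int)
    inRange = (assignments >= 0) & (assignments < numBins)      # drop out-of-range values
    counts = np.bincount(assignments[inRange], minlength=numBins)
    return counts, low, high

counts, low, high = scale_histogram(np.random.RandomState(0).normal(size=1000))
print("%d bins, %d values counted" % (len(counts), counts.sum()))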
def evaluate(self, dataTable, functionTable, performanceTable): """Evaluate the expression, using a DataTable as input. @type dataTable: DataTable @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression. @type functionTable: FunctionTable @param functionTable: The FunctionTable, containing any functions that might be called in this expression. @type performanceTable: PerformanceTable @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation. @rtype: DataColumn @return: The result of the calculation as a DataColumn. """ performanceTable.begin("Discretize") dataColumn = dataTable.fields[self["field"]] if dataColumn.fieldType.dataType in ("object", "string", "boolean"): raise defs.PmmlValidationError("Discretize requires a numeric input field, but this field's dataType is \"%s\"" % dataColumn.fieldType.dataType) fieldType = FakeFieldType(self.get("dataType", "string"), self.get("optype", self._optype)) fieldType._newValuesAllowed = True defaultValue = self.get("defaultValue") if defaultValue is not None: defaultValue = fieldType.stringToValue(defaultValue) data = NP("empty", len(dataTable), dtype=fieldType.dtype) mask = NP("empty", len(dataTable), dtype=defs.maskType) if defaultValue is None: mask[:] = defs.MISSING else: data[:] = defaultValue mask[:] = defs.VALID for discretizeBin in self.childrenOfTag("DiscretizeBin"): try: binValue = fieldType.stringToValue(discretizeBin["binValue"]) except ValueError: raise defs.PmmlValidationError("Cannot cast DiscretizeBin binValue \"%s\" as %s %s" % (discretizeBin["binValue"], fieldType.optype, fieldType.dataType)) fieldType.values.append(FakeFieldValue(value=binValue)) interval = discretizeBin.childOfTag("Interval") closure = interval["closure"] leftMargin = interval.get("leftMargin") rightMargin = interval.get("rightMargin") selection = None if leftMargin is not None: try: leftMargin = dataColumn.fieldType.stringToValue(leftMargin) except ValueError: raise defs.PmmlValidationError("Improper value in Interval leftMargin specification: \"%s\"" % leftMargin) if closure in ("openClosed", "openOpen"): if selection is None: selection = NP(leftMargin < dataColumn.data) else: NP("logical_and", selection, NP(leftMargin < dataColumn.data), selection) elif closure in ("closedOpen", "closedClosed"): if selection is None: selection = NP(leftMargin <= dataColumn.data) else: NP("logical_and", selection, NP(leftMargin <= dataColumn.data), selection) if rightMargin is not None: try: rightMargin = dataColumn.fieldType.stringToValue(rightMargin) except ValueError: raise defs.PmmlValidationError("Improper value in Interval rightMargin specification: \"%s\"" % rightMargin) if closure in ("openOpen", "closedOpen"): if selection is None: selection = NP(dataColumn.data < rightMargin) else: NP("logical_and", selection, NP(dataColumn.data < rightMargin), selection) elif closure in ("openClosed", "closedClosed"): if selection is None: selection = NP(dataColumn.data <= rightMargin) else: NP("logical_and", selection, NP(dataColumn.data <= rightMargin), selection) if selection is not None: if dataColumn.mask is not None: NP("logical_and", selection, NP(dataColumn.mask == defs.VALID), selection) data[selection] = binValue mask[selection] = defs.VALID if dataColumn.mask is not None: mask[NP(dataColumn.mask == defs.MISSING)] = defs.MISSING mask[NP(dataColumn.mask == defs.INVALID)] = defs.INVALID data, mask = FieldCastMethods.applyMapMissingTo(fieldType, data, mask, self.get("mapMissingTo")) performanceTable.end("Discretize") return DataColumn(fieldType, data, mask)
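# The per-bin selection in Discretize.evaluate above is a closure-dependent
# interval test repeated for each DiscretizeBin. Factored out, the four
# closure cases are easier to audit; this sketch uses plain numpy and an
# illustrative function name:

import numpy as np

def interval_selection(values, closure, leftMargin=None, rightMargin=None):
    values = np.asarray(values, dtype=float)
    selection = np.ones(len(values), dtype=bool)
    if leftMargin is not None:
        if closure in ("openClosed", "openOpen"):
            selection &= leftMargin < values     # open left bound excludes the margin
        else:   # "closedOpen", "closedClosed"
            selection &= leftMargin <= values
    if rightMargin is not None:
        if closure in ("openOpen", "closedOpen"):
            selection &= values < rightMargin    # open right bound excludes the margin
        else:   # "openClosed", "closedClosed"
            selection &= values <= rightMargin
    return selection

print(interval_selection([0.5, 1.0, 2.0], "closedOpen", leftMargin=1.0, rightMargin=2.0))
# -> [False  True False]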
def verify(self, showSuccess=False, performanceTable=None):
    """Run the model verification tests defined by this element.

    The output is a list of results (all results or only failures,
    depending on C{showSuccess}), each of which is a dictionary of
    field names to values.  Fields are:

      - "success": was the comparison successful?
      - "expectedMissing", "observedMissing": is the expected/observed
        value missing?
      - "expectedValue", "observedValue": result as an internal value.
      - "expectedPythonValue", "observedPythonValue": result as a
        Python value.
      - "expectedDisplayValue", "observedDisplayValue": result as a
        string displayValue.

    Only "success", "expectedMissing", and "observedMissing" appear if
    the "is missing?" comparison was unsuccessful.

    @type showSuccess: bool
    @param showSuccess: If True, emit output even if the tests are successful.
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @rtype: JSON-like list of dicts
    @return: As described above.
    """

    # Collect the <VerificationField> declarations, keyed by column name.
    verificationFields = {}
    for verificationField in self.xpath("pmml:VerificationFields/pmml:VerificationField"):
        verificationField.column = verificationField.get("column", verificationField["field"])
        verificationField.precision = verificationField.get("precision", defaultFromXsd=True, convertType=True)
        verificationField.zeroThreshold = verificationField.get("zeroThreshold", defaultFromXsd=True, convertType=True)
        verificationField.data = []
        verificationField.mask = []
        verificationFields[verificationField.column] = verificationField

    # Split the inline table into expected outputs (verification fields)
    # and model inputs, padding any gaps so all columns stay aligned.
    inputData = {}
    inputMask = {}
    for index, row in enumerate(self.childOfClass(TableInterface).iterate()):
        for columnName, columnValue in row.items():
            verificationField = verificationFields.get(columnName)
            if verificationField is not None:
                while len(verificationField.data) < index:
                    verificationField.data.append(defs.PADDING)
                    verificationField.mask.append(True)
                verificationField.data.append(columnValue)
                verificationField.mask.append(False)

            else:
                inputDataField = inputData.get(columnName)
                if inputDataField is None:
                    inputDataField = []
                    inputData[columnName] = inputDataField
                    inputMask[columnName] = []
                inputMaskField = inputMask[columnName]

                while len(inputDataField) < index:
                    inputDataField.append(defs.PADDING)
                    inputMaskField.append(True)
                inputDataField.append(columnValue)
                inputMaskField.append(False)

    # Pad any columns that ended short of the last row.
    for verificationField in verificationFields.values():
        while len(verificationField.data) < index:
            verificationField.data.append(defs.PADDING)
            verificationField.mask.append(True)

    for columnName in inputData:
        inputDataField = inputData[columnName]
        inputMaskField = inputMask[columnName]
        while len(inputDataField) < index:
            inputDataField.append(defs.PADDING)
            inputMaskField.append(True)

    for columnName, verificationField in verificationFields.items():
        inputData[columnName] = verificationField.data
        inputMask[columnName] = verificationField.mask

    model = self.getparent()

    if performanceTable is None:
        performanceTable = FakePerformanceTable()

    performanceTable.begin("make DataTable")
    dataTable = DataTable(model, inputData, inputMask, inputState=None)
    performanceTable.end("make DataTable")

    functionTable = FunctionTable()

    # Run the full scoring pipeline: mining schema, transformations, score.
    for miningField in model.xpath("pmml:MiningSchema/pmml:MiningField"):
        miningField.replaceField(dataTable, functionTable, performanceTable)

    for calculable in model.calculableTrans():
        calculable.calculate(dataTable, functionTable, performanceTable)

    score = model.calculateScore(dataTable, functionTable, performanceTable)
    dataTable.score = score[None]
    if model.name is not None:
        for key, value in score.items():
            if key is None:
                dataTable.fields[model.name] = value
            else:
                dataTable.fields["%s.%s" % (model.name, key)] = value

    for outputField in self.xpath("../pmml:Output/pmml:OutputField"):
        displayName = outputField.get("displayName", outputField["name"])
        outputField.format(dataTable, functionTable, performanceTable, score)

    # Compare observed results against the expected values, row by row.
    output = []
    for verificationField in verificationFields.values():
        observedOutput = dataTable.fields.get(verificationField["field"])
        if observedOutput is None:
            raise defs.PmmlValidationError(
                "VerificationField references field \"%s\" but it was not produced by the model"
                % verificationField["field"])
        fieldType = observedOutput.fieldType

        # If the observed column is generic objects, try to reinterpret it
        # as numbers so that continuous comparisons are possible.
        if fieldType.dataType == "object":
            try:
                newArray = [float(x) for x in observedOutput.data]
            except ValueError:
                pass
            else:
                fieldType = FakeFieldType("double", "continuous")
                observedOutput._data = newArray

        for index in xrange(len(dataTable)):
            record = {"field": verificationField["field"], "index": index}
            record["expectedMissing"] = verificationField.mask[index]
            record["observedMissing"] = (observedOutput.mask is not None
                                         and observedOutput.mask[index] != defs.VALID)

            if record["expectedMissing"] != record["observedMissing"]:
                record["success"] = False
                output.append(record)

            elif not record["expectedMissing"]:
                record["expectedValue"] = fieldType.stringToValue(verificationField.data[index])
                record["observedValue"] = observedOutput.data[index]
                record["expectedPythonValue"] = fieldType.valueToPython(record["expectedValue"])
                record["observedPythonValue"] = fieldType.valueToPython(record["observedValue"])
                record["expectedDisplayValue"] = fieldType.valueToString(record["expectedValue"])
                record["observedDisplayValue"] = fieldType.valueToString(record["observedValue"])

                if fieldType.optype == "continuous":
                    # Values within zeroThreshold of zero compare as equal;
                    # otherwise the observed value must fall within a
                    # relative precision band around the expected value.
                    if (abs(record["expectedValue"]) <= verificationField.zeroThreshold) and \
                       (abs(record["observedValue"]) <= verificationField.zeroThreshold):
                        record["success"] = True
                    else:
                        record["success"] = (
                            (record["expectedValue"] * (1.0 - verificationField.precision))
                            <= record["observedValue"] <=
                            (record["expectedValue"] * (1.0 + verificationField.precision)))
                    if not record["success"] or showSuccess:
                        output.append(record)

                else:
                    if record["expectedValue"] != record["observedValue"]:
                        record["success"] = False
                        output.append(record)
                    else:
                        record["success"] = True
                        if showSuccess:
                            output.append(record)

    return output
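# A minimal standalone sketch (not part of Augustus) of the per-row acceptance
# test that verify() applies to continuous fields, following the PMML
# ModelVerification rules.  The function name and defaults are illustrative
# only; min/max is used here so the band is oriented correctly for negative
# expected values, a detail the chained comparison in verify() assumes away.
def _toleranceCheckSketch(expected, observed, precision=1e-6, zeroThreshold=1e-16):
    # Both effectively zero: declare success without a relative comparison,
    # which would otherwise demand exact equality near zero.
    if abs(expected) <= zeroThreshold and abs(observed) <= zeroThreshold:
        return True
    # Relative band around the expected value.
    low = expected * (1.0 - precision)
    high = expected * (1.0 + precision)
    return min(low, high) <= observed <= max(low, high)

# _toleranceCheckSketch(3.141592, 3.1415921)   # True: within the band
# _toleranceCheckSketch(1e-20, 0.0)            # True: both below zeroThreshold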
class BaselineModel(PmmlModel):
    """BaselineModel implements the baseline model in PMML, which is a
    collection of change-detection routines.

    U{PMML specification<http://www.dmg.org/v4-1/BaselineModel.html>}.
    """

    scoreType = FakeFieldType("double", "continuous")

    def calculateScore(self, dataTable, functionTable, performanceTable):
        """Calculate the score of this model.

        This method is called by C{calculate} to separate operations
        that are performed by all models (in C{calculate}) from
        operations that are performed by specific models (in
        C{calculateScore}).

        @type dataTable: DataTable
        @param dataTable: The DataTable representing this model's lexical scope.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: dict
        @return: A dictionary mapping PMML "feature" strings to DataColumns containing the score.
        """

        testDistributions = self.childOfTag("TestDistributions")
        testStatistic = testDistributions.get("testStatistic")

        performanceTable.begin("BaselineModel %s" % testStatistic)

        fieldName = testDistributions.get("field")
        dataColumn = dataTable.fields[fieldName]

        if testStatistic == "zValue":
            score = self.zValue(testDistributions, fieldName, dataColumn, dataTable.state, performanceTable)
        elif testStatistic == "CUSUM":
            score = self.cusum(testDistributions, fieldName, dataColumn, dataTable.state, performanceTable)
        else:
            raise NotImplementedError("TODO: add more testStatistics")

        performanceTable.end("BaselineModel %s" % testStatistic)
        return score

    def zValue(self, testDistributions, fieldName, dataColumn, state, performanceTable):
        """Calculate the score of a zValue TestStatistic.

        @type testDistributions: PmmlBinding
        @param testDistributions: The <TestDistributions> element.
        @type fieldName: string
        @param fieldName: The field name (for error messages).
        @type dataColumn: DataColumn
        @param dataColumn: The field.
        @type state: DataTableState
        @param state: The persistent state object (not used).
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: dict
        @return: A dictionary mapping PMML "feature" strings to DataColumns; zValue only defines the None key ("predictedValue").
        """

        if dataColumn.fieldType.dataType in ("object", "string", "boolean", "date", "time", "dateTime"):
            raise TypeError("Field \"%s\" has dataType \"%s\", which is incompatible with BaselineModel.zValue"
                            % (fieldName, dataColumn.fieldType.dataType))

        distributions = testDistributions.xpath("pmml:Baseline/*[@mean and @variance]")
        if len(distributions) == 0:
            raise defs.PmmlValidationError("BaselineModel zValue requires a distribution with a mean and a variance")
        distribution = distributions[0]

        mean = float(distribution.get("mean"))
        variance = float(distribution.get("variance"))
        if variance <= 0.0:
            raise defs.PmmlValidationError("Variance must be positive, not %g" % variance)

        # z = (x - mean) / sqrt(variance), carrying the input mask through.
        return {None: DataColumn(self.scoreType,
                                 NP(NP(dataColumn.data - mean) / math.sqrt(variance)),
                                 dataColumn.mask)}

    def cusum(self, testDistributions, fieldName, dataColumn, state, performanceTable):
        """Calculate the score of a CUSUM TestStatistic.

        The CUSUM cumulative sum is a stateful calculation: each row
        depends on the result of the previous row.  To continue
        calculations through multiple calls to C{calc} or
        C{calculate}, pass a DataTableState object and give the
        BaselineModel a C{stateId} attribute.  The C{stateId} is not
        valid in strict PMML, but it can be inserted after validation
        or used in custom-ODG models (C{from augustus.odg import *}).

        @type testDistributions: PmmlBinding
        @param testDistributions: The <TestDistributions> element.
        @type fieldName: string
        @param fieldName: The field name (for error messages).
        @type dataColumn: DataColumn
        @param dataColumn: The field.
        @type state: DataTableState
        @param state: The persistent state object, which is used to initialize the start state and save the end state of the cumulative sum.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: dict
        @return: A dictionary mapping PMML "feature" strings to DataColumns; CUSUM only defines the None key ("predictedValue").
        """

        baseline = testDistributions.xpath("pmml:Baseline/pmml:GaussianDistribution | pmml:Baseline/pmml:PoissonDistribution")
        alternate = testDistributions.xpath("pmml:Alternate/pmml:GaussianDistribution | pmml:Alternate/pmml:PoissonDistribution")

        if len(baseline) == 0 or len(alternate) == 0:
            raise defs.PmmlValidationError("BaselineModel CUSUM requires a Baseline and an Alternate that are either GaussianDistribution or PoissonDistribution")

        # Per-row log-likelihood ratio of the alternate to the baseline.
        ratios = alternate[0].logpdf(dataColumn.data) - baseline[0].logpdf(dataColumn.data)

        if dataColumn.mask is None:
            good = NP("ones", len(dataColumn), dtype=NP.dtype(bool))
        else:
            good = NP(dataColumn.mask == defs.VALID)

        stateId = self.get("stateId")
        last = None
        if stateId is not None:
            last = state.get(stateId)
        if last is None:
            last = 0.0

        resetValue = testDistributions.get("resetValue", defaultFromXsd=True, convertType=True)

        output = NP("empty", len(dataColumn), dtype=NP.dtype(float))

        performanceTable.begin("fill CUSUM")
        for index in xrange(len(dataColumn)):
            if good[index]:
                last = max(resetValue, last + ratios[index])
            output[index] = last
        performanceTable.end("fill CUSUM")

        if stateId is not None:
            state[stateId] = last

        return {None: DataColumn(self.scoreType, output, None)}
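# A minimal standalone sketch (not part of Augustus) of the CUSUM recursion
# that cusum() fills row by row, using plain NumPy and a Gaussian
# log-likelihood written out by hand.  The means, sigma, and resetValue
# defaults below are assumptions for illustration only.
import numpy

def _gaussianLogpdfSketch(x, mean, sigma):
    # Log of the normal density, vectorized over x.
    return -0.5 * ((x - mean) / sigma) ** 2 - numpy.log(sigma * numpy.sqrt(2.0 * numpy.pi))

def _cusumSketch(values, baselineMean=0.0, alternateMean=1.0, sigma=1.0, resetValue=0.0):
    # Per-row log-likelihood ratio of the alternate to the baseline hypothesis.
    ratios = (_gaussianLogpdfSketch(values, alternateMean, sigma)
              - _gaussianLogpdfSketch(values, baselineMean, sigma))
    output = numpy.empty(len(values))
    last = 0.0
    for index, ratio in enumerate(ratios):
        # The running sum is clamped at resetValue, so the statistic recovers
        # quickly once the data return to the baseline regime.
        last = max(resetValue, last + ratio)
        output[index] = last
    return output

# _cusumSketch(numpy.array([0.1, -0.2, 1.1, 0.9, 1.2])) stays near zero at
# first and rises once the data start to favor the alternate distribution.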
def replaceField(self, dataTable, functionTable, performanceTable):
    """Replace a field in the DataTable for outlier removal, missing
    value handling, and invalid value treatment.

    @type dataTable: DataTable
    @param dataTable: The pre-built DataTable.
    @type functionTable: FunctionTable
    @param functionTable: A table of functions.
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    """

    dataColumn = dataTable.fields.get(self.name)
    if dataColumn is None:
        return

    performanceTable.begin("MiningField")

    # Cast to the MiningField's declared optype if it differs.
    optype = self.get("optype", dataColumn.fieldType.optype)
    if optype != dataColumn.fieldType.optype:
        dataColumn = FieldCastMethods.cast(FakeFieldType(dataColumn.fieldType.dataType, optype), dataColumn)

    data = dataColumn.data
    mask = dataColumn.mask

    outliers = self.get("outliers")

    lowValue = self.get("lowValue")
    if lowValue is not None:
        lowValue = dataColumn.fieldType.stringToValue(lowValue)

        if outliers == "asMissingValues":
            selection = NP(dataColumn.data < lowValue)
            mask = FieldCastMethods.outliersAsMissing(mask, dataColumn.mask, selection)

        elif outliers == "asExtremeValues":
            selection = NP(dataColumn.data < lowValue)
            # Copy before writing so the original column is not modified.
            if data is dataColumn.data:
                data = NP("copy", data)
                data.setflags(write=True)
            data[selection] = lowValue

    highValue = self.get("highValue")
    if highValue is not None:
        highValue = dataColumn.fieldType.stringToValue(highValue)

        if outliers == "asMissingValues":
            selection = NP(dataColumn.data > highValue)
            mask = FieldCastMethods.outliersAsMissing(mask, dataColumn.mask, selection)

        elif outliers == "asExtremeValues":
            selection = NP(dataColumn.data > highValue)
            if data is dataColumn.data:
                data = NP("copy", data)
                data.setflags(write=True)
            data[selection] = highValue

    mask = FieldCastMethods.applyInvalidValueTreatment(mask, self.get("invalidValueTreatment"))
    data, mask = FieldCastMethods.applyMapMissingTo(dataColumn.fieldType, data, mask, self.get("missingValueReplacement"))

    dataTable.fields.replaceField(self.name, DataColumn(dataColumn.fieldType, data, mask))

    performanceTable.end("MiningField")
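# A minimal standalone sketch (not part of Augustus) of the two PMML outlier
# treatments that replaceField() applies, expressed in plain NumPy.  The
# function name and the boolean-mask convention for "missing" are assumptions
# for illustration; Augustus uses its own mask codes (defs.VALID, etc.).
import numpy

def _outlierSketch(data, lowValue, highValue, treatment="asExtremeValues"):
    data = data.copy()
    outside = numpy.logical_or(data < lowValue, data > highValue)
    if treatment == "asExtremeValues":
        # Clip out-of-range values to the nearest boundary.
        numpy.clip(data, lowValue, highValue, out=data)
        return data, None
    elif treatment == "asMissingValues":
        # Keep the values but flag out-of-range rows as missing.
        return data, outside
    return data, None  # "asIs": leave the values untouched

# _outlierSketch(numpy.array([1.0, 50.0, -3.0]), 0.0, 10.0)
# -> (array([ 1., 10.,  0.]), None)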
class NormDiscrete(PmmlExpression):
    """NormDiscrete implements an expression that acts as an indicator
    function on categorical fields, returning 1 when a field is equal
    to a given value, 0 otherwise.

    U{PMML specification<http://www.dmg.org/v4-1/Transformations.html>}.
    """

    _fieldType = FakeFieldType("integer", "continuous")

    def evaluate(self, dataTable, functionTable, performanceTable):
        """Evaluate the expression, using a DataTable as input.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.
        """

        performanceTable.begin("NormDiscrete")

        dataColumn = dataTable.fields[self["field"]]
        value = dataColumn.fieldType.stringToValue(self["value"])

        # Equality test yields booleans; cast them to the integer fieldType.
        data = NP("array", NP(dataColumn.data == value), dtype=self._fieldType.dtype)
        data, mask = FieldCastMethods.applyMapMissingTo(self._fieldType, data, dataColumn.mask, self.get("mapMissingTo"))

        performanceTable.end("NormDiscrete")
        return DataColumn(self._fieldType, data, mask)

    @staticmethod
    def fanOutByValue(modelLoader, fieldName, dataColumn, prefix=None):
        """Create a suite of NormDiscrete transformations, one
        indicator function for each unique value in a categorical
        dataset.

        @type modelLoader: ModelLoader
        @param modelLoader: The ModelLoader used to create the new PMML nodes.
        @type fieldName: FieldName
        @param fieldName: The name of the categorical field to fan out, used in the names of the new fields.
        @type dataColumn: DataColumn
        @param dataColumn: The categorical dataset.
        @type prefix: string or None
        @param prefix: The PMML prefix, used to create an lxml.etree.ElementMaker.
        @rtype: list of PmmlBinding
        @return: The new DerivedField elements, one per unique value.
        """

        if prefix is None:
            E = modelLoader.elementMaker()
        else:
            E = modelLoader.elementMaker(prefix)

        # Only fan out over valid (non-missing, non-invalid) values.
        if dataColumn.mask is None:
            values = NP("unique", dataColumn.data)
        else:
            values = NP("unique", dataColumn.data[NP(dataColumn.mask == defs.VALID)])

        derivedFields = []
        for value in values:
            stringValue = dataColumn.fieldType.valueToString(value)
            normDiscrete = E.NormDiscrete(field=fieldName, value=stringValue)
            derivedField = E.DerivedField(normDiscrete,
                                          name=("%s.%s" % (fieldName, stringValue)),
                                          dataType="integer",
                                          optype="continuous")
            derivedFields.append(derivedField)
        return derivedFields
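# A minimal standalone sketch (not part of Augustus) of the indicator function
# that NormDiscrete evaluates: 1 where the field equals the reference value,
# 0 elsewhere.  The example field values are assumptions for illustration.
import numpy

_categories = numpy.array(["red", "green", "red", "blue"], dtype=object)
_indicator = (_categories == "red").astype(numpy.int64)   # array([1, 0, 1, 0])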
def format(self, subTable, functionTable, performanceTable, score):
    """Extract or post-process output for the output field of a DataTable.

    @type subTable: DataTable
    @param subTable: The DataTable associated with this local lexical scope.
    @type functionTable: FunctionTable or None
    @param functionTable: A table of functions.
    @type performanceTable: PerformanceTable or None
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @type score: dict
    @param score: Dictionary mapping PMML score "feature" strings to DataColumns.  This dictionary always contains a None key, which is the basic feature ("predictedValue").
    @rtype: DataColumn
    @return: The output that would go into an output field of a DataTable.
    """

    performanceTable.begin("OutputField")

    feature = self.get("feature")

    if feature is None:
        dataColumn = subTable.fields[self["name"]]

    elif feature == "predictedValue":
        dataColumn = score[None]

    elif feature == "predictedDisplayValue":
        # Convert each predicted value to its string representation.
        original = score[None]
        toString = original.fieldType.valueToString
        data = NP("empty", len(subTable), dtype=NP.dtype(object))
        for i, x in enumerate(original.data):
            data[i] = toString(x)
        dataColumn = DataColumn(FakeFieldType("string", "continuous"), data, None)

    elif feature == "transformedValue":
        expression = self.childOfClass(PmmlExpression)
        if expression is None:
            raise defs.PmmlValidationError("OutputField with feature \"transformedValue\" requires an EXPRESSION")

        performanceTable.pause("OutputField")
        dataColumn = expression.evaluate(subTable, functionTable, performanceTable)
        performanceTable.unpause("OutputField")

    elif feature == "decision":
        decisions = self.childOfTag("Decisions")
        if decisions is None:
            raise defs.PmmlValidationError("OutputField with feature \"decision\" requires a Decisions block")

        performanceTable.pause("OutputField")
        dataColumn = self.childOfClass(PmmlExpression).evaluate(subTable, functionTable, performanceTable)
        performanceTable.unpause("OutputField")

        if dataColumn.mask is None:
            valid = None
        else:
            valid = NP(dataColumn.mask == defs.VALID)

        # Rows that match no <Decision> value stay MISSING in the mask.
        fieldType = FakeFieldType("object", "any")
        data = NP("empty", len(subTable), dtype=fieldType.dtype)
        mask = NP(NP("ones", len(subTable), dtype=defs.maskType) * defs.MISSING)

        for decision in decisions.childrenOfTag("Decision"):
            value = dataColumn.fieldType.stringToValue(decision["value"])
            selection = NP(dataColumn.data == value)
            if valid is not None:
                NP("logical_and", selection, valid, selection)

            for i in xrange(len(data)):
                if selection[i]:
                    data[i] = decision
            mask[selection] = defs.VALID

        if not mask.any():
            mask = None
        dataColumn = DataColumn(fieldType, data, mask)

    elif feature in score:
        dataColumn = score[feature]

    else:
        model = self.getparent()
        if model is not None:
            model = model.getparent()
        if model is None:
            model = "(orphaned OutputField; no parent model)"
        else:
            model = model.t
        raise defs.PmmlValidationError("Models of type %s do not produce \"%s\" features (or at least, it is not yet implemented by Augustus)" % (model, feature))

    # Cast to the requested dataType/optype, except for features whose
    # output types are fixed by construction.
    dataType = self.get("dataType", dataColumn.fieldType.dataType)
    optype = self.get("optype", dataColumn.fieldType.optype)
    if (dataType != dataColumn.fieldType.dataType or optype != dataColumn.fieldType.optype) \
       and feature not in ("predictedDisplayValue", "decision"):
        dataColumn = FieldCastMethods.cast(FakeFieldType(dataType, optype), dataColumn)

    if feature is not None:
        subTable.fields[self.get("displayName", self["name"])] = dataColumn

    performanceTable.end("OutputField")
    return dataColumn
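# A minimal standalone sketch (not part of Augustus) of how format() dispatches
# on the OutputField "feature" attribute: the score dictionary maps feature
# strings to columns, with None holding the basic "predictedValue".  The
# function name is an assumption for illustration.
def _selectFeatureSketch(score, feature):
    if feature == "predictedValue":
        return score[None]            # the basic feature lives under the None key
    if feature in score:
        return score[feature]         # a model-specific feature column
    raise KeyError("model does not produce feature %r" % feature)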