Ejemplo n.º 1
0
    def evaluate(self, dataTable, functionTable, performanceTable):
        """Compute this expression over the rows of a DataTable.

        The referenced field is compared element-wise for equality with
        this element's C{value} attribute (converted with the field's own
        C{stringToValue}); the comparison result is stored with
        C{self._fieldType.dtype} and then passed through
        C{FieldCastMethods.applyMapMissingTo}.

        @type dataTable: DataTable
        @param dataTable: Source of the field named by this element's C{field} attribute.
        @type functionTable: FunctionTable
        @param functionTable: Accepted for interface compatibility; not used by this method.
        @type performanceTable: PerformanceTable
        @param performanceTable: Timer table; the work is recorded under the key "NormDiscrete".
        @rtype: DataColumn
        @return: A DataColumn of type C{self._fieldType} holding the comparison result.
        """

        timerKey = "NormDiscrete"
        performanceTable.begin(timerKey)

        inputColumn = dataTable.fields[self["field"]]
        target = inputColumn.fieldType.stringToValue(self["value"])

        # Element-wise equality test, stored with this expression's dtype.
        matches = NP(inputColumn.data == target)
        data = NP("array", matches, dtype=self._fieldType.dtype)

        data, mask = FieldCastMethods.applyMapMissingTo(
            self._fieldType, data, inputColumn.mask, self.get("mapMissingTo"))

        performanceTable.end(timerKey)
        return DataColumn(self._fieldType, data, mask)
Ejemplo n.º 2
0
    def evaluate(self, dataTable, functionTable, performanceTable, text=None):
        """Evaluate the expression, using a DataTable as input.

        The formula text is parsed with C{Formula.parse}, the parsed
        expression is evaluated, and, if the result carries a mask, the
        C{invalidValueTreatment} and C{mapMissingTo} treatments are applied.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @type text: string or None
        @param text: If None, use the text of this Formula object; otherwise, use C{text} instead.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.  When the evaluated column has no mask, that column is returned unchanged.
        """

        if text is None:
            text = self.text

        performanceTable.begin("Formula parse")
        parsed = Formula.parse(text)
        performanceTable.end("Formula parse")

        performanceTable.begin("Formula evaluate")
        dataColumn = parsed.evaluate(dataTable, functionTable, performanceTable)

        # Only a masked result needs the invalid/missing treatments; an
        # unmasked column is passed through as-is.
        if dataColumn.mask is not None:
            mask = FieldCastMethods.applyInvalidValueTreatment(dataColumn.mask, self.get("invalidValueTreatment"))
            data, mask = FieldCastMethods.applyMapMissingTo(dataColumn.fieldType, dataColumn.data, mask, self.get("mapMissingTo"))
            dataColumn = DataColumn(dataColumn.fieldType, data, mask)

        # Single exit point so that the "Formula evaluate" timer is always
        # closed, including on the unmasked path.
        performanceTable.end("Formula evaluate")
        return dataColumn
Ejemplo n.º 3
0
    def _toDataColumn_string(self, data, mask):
        """Build a DataColumn of this type whose entries are strings.

        The input is first converted with C{_toDataColumn_object}.  Then,
        entry by entry: a C{None} or float NaN entry is flagged
        C{defs.MISSING} (when a mask already exists, only if that entry is
        currently C{defs.VALID}); any other entry that is not a
        C{basestring} is replaced by its C{repr()}.  Finally the values
        and intervals are checked.

        @param data: Raw input values, as accepted by C{_toDataColumn_object}.
        @param mask: Raw input mask, or None.
        @rtype: DataColumn
        @return: A new DataColumn of this field type.
        """

        def isNull(item):
            return item is None or (isinstance(item, float) and math.isnan(item))

        dataColumn = self._toDataColumn_object(data, mask)
        data, mask = dataColumn.data, dataColumn.mask

        # The arrays are edited in place below.
        data.setflags(write=True)
        if mask is not None:
            mask.setflags(write=True)

        if mask is None:
            # No mask yet: allocate one lazily, on the first null entry.
            for index, item in enumerate(dataColumn.data):
                if isNull(item):
                    if mask is None:
                        mask = NP("zeros", len(data), dtype=defs.maskType)
                    mask[index] = defs.MISSING
                elif not isinstance(item, basestring):
                    data[index] = repr(item)

            if mask is not None:
                dataColumn._mask = mask

        else:
            # Existing mask: only VALID null entries become MISSING; a null
            # entry that is already flagged falls through to repr().
            for index, item in enumerate(dataColumn.data):
                if isNull(item) and mask[index] == defs.VALID:
                    mask[index] = defs.MISSING
                elif not isinstance(item, basestring):
                    data[index] = repr(item)

        data, mask = self._checkValues(data, mask)
        data, mask = self._checkIntervals(data, mask)

        return DataColumn(self, data, mask)
Ejemplo n.º 4
0
        def evaluate(self, dataTable, functionTable, performanceTable,
                     arguments):
            """Evaluate both arguments and combine them with the "arctan2" operation.

            @type dataTable: DataTable
            @param dataTable: Passed to each argument's C{evaluate}.
            @type functionTable: FunctionTable
            @param functionTable: Passed to each argument's C{evaluate}.
            @type performanceTable: PerformanceTable
            @param performanceTable: Timer table; the work after argument evaluation is recorded under this function's name.
            @type arguments: sequence of exactly two expressions
            @param arguments: The two operands, in order.
            @rtype: DataColumn
            @return: The "arctan2" result; its mask is built by C{DataColumn.mapAnyMissingInvalid} from both operand masks.
            """
            columns = []
            for argument in arguments:
                columns.append(
                    argument.evaluate(dataTable, functionTable,
                                      performanceTable))

            timerKey = "built-in \"%s\"" % self.name
            performanceTable.begin(timerKey)

            fieldType = self.fieldTypeFromSignature(columns)
            first, second = columns
            data = NP("arctan2", first.data, second.data)
            mask = DataColumn.mapAnyMissingInvalid([first.mask, second.mask])
            result = DataColumn(fieldType, data, mask)

            performanceTable.end(timerKey)
            return result
Ejemplo n.º 5
0
    def _toDataColumn_dateTime(self, data, mask):
        """Build a DataColumn of this type by converting each entry with C{stringToValue}.

        An entry is flagged C{defs.MISSING} when the incoming mask is
        set for it, when it is a float NaN, or when it is a string equal
        to "NAN" ignoring case.  An entry for which C{stringToValue}
        raises C{ValueError} or C{TypeError} is flagged C{defs.INVALID}.
        Flagged entries hold C{defs.PADDING}.  If nothing is flagged, the
        column has no mask.

        @param data: Raw input values.
        @param mask: Raw input mask, or None.
        @rtype: DataColumn
        @return: A new DataColumn of this field type.
        """
        data, mask = self._checkNumpy(data, mask, tryToCast=False)
        data, mask = self._checkNonNumpy(data, mask)

        size = len(data)
        converted = NP("empty", size, dtype=self.dtype)
        flags = NP("zeros", size, dtype=defs.maskType)

        for index, item in enumerate(data):
            if mask is not None and mask[index]:
                missing = True
            elif isinstance(item, float) and math.isnan(item):
                missing = True
            else:
                missing = isinstance(item, basestring) and item.upper() == "NAN"

            if missing:
                converted[index] = defs.PADDING
                flags[index] = defs.MISSING
                continue

            try:
                converted[index] = self.stringToValue(item)
            except (ValueError, TypeError):
                converted[index] = defs.PADDING
                flags[index] = defs.INVALID

        # Drop the mask entirely when every entry converted cleanly.
        data = converted
        mask = flags if flags.any() else None

        data, mask = self._checkValues(data, mask)
        data, mask = self._checkIntervals(data, mask)
        return DataColumn(self, data, mask)
Ejemplo n.º 6
0
    def _toDataColumn_dateTimeNumber(self, data, mask):
        """Build a numeric column, then scale it by C{self._factor} and shift it by C{self._offset}.

        @param data: Raw input values, as accepted by C{_toDataColumn_number}.
        @param mask: Raw input mask, or None.
        @rtype: DataColumn
        @return: A new DataColumn of this field type, after value and interval checks.
        """
        numeric = self._toDataColumn_number(data, mask)
        scaled = NP(numeric.data * self._factor)
        data = NP(scaled + self._offset)
        mask = numeric.mask

        data, mask = self._checkValues(data, mask)
        data, mask = self._checkIntervals(data, mask)
        return DataColumn(self, data, mask)
Ejemplo n.º 7
0
    def functionAverageFake(self, value, howmany, fieldType):
        """Averages rows in a DataColumn when it is known that there are no matches.

        Every row of the result holds C{value[0] / value[1]}.  When
        C{value[1]} is zero, every row is flagged C{defs.INVALID}.

        @type value: indexable pair of numbers
        @param value: C{value[0]} is used as the numerator and C{value[1]} as the denominator.
        @type howmany: int
        @param howmany: Number of rows.
        @type fieldType: FieldType
        @param fieldType: Ignored: the result always uses a "double"/"continuous" FakeFieldType.
        @rtype: DataColumn
        @return: The faked results.
        """

        fieldType = FakeFieldType("double", "continuous")

        top = NP("empty", howmany, dtype=fieldType.dtype)
        bottom = NP("empty", howmany, dtype=fieldType.dtype)
        top[:] = value[0]
        bottom[:] = value[1]
        data = NP(top / bottom)

        mask = None
        if value[1] == 0:
            # A zero denominator invalidates every row.
            mask = NP("empty", howmany, dtype=defs.maskType)
            mask[:] = defs.INVALID

        return DataColumn(fieldType, data, mask)
Ejemplo n.º 8
0
        def evaluate(self, dataTable, functionTable, performanceTable,
                     arguments):
            """Evaluate the arguments and combine them with C{self.applySkipMissing}.

            Rows still marked in the "all bad" tracker after
            C{applySkipMissing} are flagged C{defs.MISSING} (when a mask
            already exists, only where that mask is C{defs.VALID}).

            @type dataTable: DataTable
            @param dataTable: Passed to each argument's C{evaluate}; its length sets the number of output rows.
            @type functionTable: FunctionTable
            @param functionTable: Passed to each argument's C{evaluate}.
            @type performanceTable: PerformanceTable
            @param performanceTable: Timer table; the work after argument evaluation is recorded under this function's name.
            @type arguments: sequence of expressions
            @param arguments: The operands; checked by C{self.allBooleanType} with C{atleast=2}.
            @rtype: DataColumn
            @return: The combined result.
            """
            columns = []
            for argument in arguments:
                columns.append(
                    argument.evaluate(dataTable, functionTable,
                                      performanceTable))

            timerKey = "built-in \"%s\"" % self.name
            performanceTable.begin(timerKey)

            fieldType = self.allBooleanType(columns, atleast=2)

            numRows = len(dataTable)
            data = NP("zeros", numRows, dtype=fieldType.dtype)
            allbad = NP("ones", numRows, dtype=NP.dtype(bool))

            (data, allbad), mask = self.applySkipMissing((data, allbad), None,
                                                         columns)

            if allbad.any():
                if mask is not None:
                    # Restrict to rows that are still VALID, in place.
                    NP("logical_and", allbad, NP(mask == defs.VALID), allbad)
                    mask[allbad] = defs.MISSING
                else:
                    mask = allbad * defs.MISSING

            performanceTable.end(timerKey)
            return DataColumn(fieldType, data, mask)
Ejemplo n.º 9
0
        def evaluate(self, dataTable, functionTable, performanceTable,
                     arguments):
            """Evaluate the arguments and apply "cos" to the product of the first two.

            @type dataTable: DataTable
            @param dataTable: Passed to each argument's C{evaluate}.
            @type functionTable: FunctionTable
            @param functionTable: Passed to each argument's C{evaluate}.
            @type performanceTable: PerformanceTable
            @param performanceTable: Timer table; the work after argument evaluation is recorded under this function's name.
            @type arguments: sequence of expressions
            @param arguments: The operands; the first two are used.
            @rtype: DataColumn
            @return: The "cos" result; its mask is built by C{DataColumn.mapAnyMissingInvalid} from the first two operand masks.
            """
            columns = []
            for argument in arguments:
                columns.append(
                    argument.evaluate(dataTable, functionTable,
                                      performanceTable))

            timerKey = "built-in \"%s\"" % self.name
            performanceTable.begin(timerKey)

            fieldType = self.fieldTypeFromSignature(columns)
            data = NP("cos", columns[0].data * columns[1].data)
            mask = DataColumn.mapAnyMissingInvalid(
                [columns[0].mask, columns[1].mask])
            result = DataColumn(fieldType, data, mask)

            performanceTable.end(timerKey)
            return result
Ejemplo n.º 10
0
    def functionAverage(self, dataColumn, whereMask, groupSelection, getstate,
                        setstate):
        """Averages rows in a DataColumn, possibly with an SQL where mask and groupField.

        The result is a running average: cumulative sum of the selected
        values divided by the cumulative count of selected rows.  Rows
        where that ratio is not finite are flagged C{defs.INVALID}.

        @type dataColumn: DataColumn
        @param dataColumn: The input data column.
        @type whereMask: 1d Numpy array of bool, or None
        @param whereMask: The result of the SQL where selection.
        @type groupSelection: 1d Numpy array of bool, or None.
        @param groupSelection: Rows corresponding to a particular value of the groupField.
        @type getstate: callable function
        @param getstate: Retrieve starting values (a numerator, denominator pair, or None) from the DataTableState.
        @type setstate: callable function
        @param setstate: Store ending values (a numerator, denominator pair) to the DataTableState.
        @rtype: DataColumn
        @return: A column of averaged rows.
        @raise PmmlValidationError: If the input field's dataType is not "integer", "float" or "double".
        """

        fieldType = FakeFieldType("double", "continuous")

        numericTypes = ("integer", "float", "double")
        if dataColumn.fieldType.dataType not in numericTypes:
            raise defs.PmmlValidationError(
                "Aggregate function \"average\" requires a numeric input field: \"integer\", \"float\", \"double\""
            )

        numRows = len(dataColumn)

        # One unit of weight per row, zeroed wherever the row is masked or
        # excluded by a selection.
        selections = []
        if dataColumn.mask is not None:
            selections.append(NP(dataColumn.mask == defs.VALID))
        if whereMask is not None:
            selections.append(whereMask)
        if groupSelection is not None:
            selections.append(groupSelection)

        denominator = NP("ones", numRows, dtype=fieldType.dtype)
        for keep in selections:
            NP("logical_and", denominator, keep, denominator)

        numerator = NP("multiply", denominator, dataColumn.data)

        # Seed the first row with the totals carried over from earlier calls.
        if getstate is not None and numRows > 0:
            startingState = getstate()
            if startingState is not None:
                startingNumerator, startingDenominator = startingState
                numerator[0] += startingNumerator
                denominator[0] += startingDenominator

        numerator = NP("cumsum", numerator)
        denominator = NP("cumsum", denominator)

        data = NP(numerator / denominator)
        notFinite = NP("logical_not", NP("isfinite", data))
        mask = NP(notFinite * defs.INVALID)
        if not mask.any():
            mask = None

        if setstate is not None and numRows > 0:
            setstate((numerator[-1], denominator[-1]))

        return DataColumn(fieldType, data, mask)
Ejemplo n.º 11
0
    def calculate(self, dataTable, functionTable=None, performanceTable=None):
        """Perform a calculation directly on a pre-built DataTable.

        This method is intended for performance-critical cases where
        the DataTable would be built without having to analyze the
        PMML for field type context.

        This method modifies the input DataTable (its C{score},
        C{fields} and C{output}) and the FunctionTable.

        @type dataTable: DataTable
        @param dataTable: The pre-built DataTable.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions.  If None, a new FunctionTable is created.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.  If None, a FakePerformanceTable is used.
        @rtype: DataTable or DataColumn
        @return: When the model is not scorable, the input C{dataTable} (with an all-INVALID score column); otherwise C{dataTable.score}.  NOTE(review): the two paths return different kinds of object; confirm which one callers rely on before unifying them.
        """

        functionTable = FunctionTable() if functionTable is None else functionTable
        performanceTable = FakePerformanceTable() if performanceTable is None else performanceTable

        isScorable = self.get("isScorable", defaultFromXsd=True, convertType=True)
        if not isScorable:
            # Non-scorable model: every row gets a padded, INVALID score.
            numRows = len(dataTable)
            scoreType = self.scoreType
            padding = NP(NP("ones", numRows, dtype=self.scoreType.dtype) * defs.PADDING)
            invalid = NP(NP("ones", numRows, dtype=defs.maskType) * defs.INVALID)
            dataTable.score = DataColumn(scoreType, padding, invalid)
            return dataTable

        subTable = dataTable.subTable()

        miningFields = self.xpath("pmml:MiningSchema/pmml:MiningField")
        for miningField in miningFields:
            miningField.replaceField(subTable, functionTable, performanceTable)

        for calculable in self.calculableTrans():
            calculable.calculate(subTable, functionTable, performanceTable)

        score = self.calculateScore(subTable, functionTable, performanceTable)
        dataTable.score = score[None]

        # Publish each score feature as a field named after this model:
        # the None key under the bare name, the others as "name.key".
        if self.name is not None:
            for key, value in score.items():
                if key is None:
                    fieldName = self.name
                else:
                    fieldName = "%s.%s" % (self.name, key)
                dataTable.fields[fieldName] = value

        for outputField in self.xpath("pmml:Output/pmml:OutputField"):
            displayName = outputField.get("displayName", outputField["name"])
            formatted = outputField.format(subTable, functionTable, performanceTable, score)
            dataTable.output[displayName] = formatted

        # Outputs written to the sub-table are copied up to the caller's table.
        for fieldName in subTable.output:
            dataTable.output[fieldName] = subTable.output[fieldName]

        return dataTable.score
Ejemplo n.º 12
0
        def evaluate(self, dataTable, functionTable, performanceTable, arguments):
            """Evaluate both arguments and combine them with the "arctan2" operation.

            @type dataTable: DataTable
            @param dataTable: Passed to each argument's C{evaluate}.
            @type functionTable: FunctionTable
            @param functionTable: Passed to each argument's C{evaluate}.
            @type performanceTable: PerformanceTable
            @param performanceTable: Timer table; the work after argument evaluation is recorded under this function's name.
            @type arguments: sequence of exactly two expressions
            @param arguments: The two operands, in order.
            @rtype: DataColumn
            @return: The "arctan2" result; its mask is built by C{DataColumn.mapAnyMissingInvalid} from both operand masks.
            """
            evaluated = []
            for expression in arguments:
                evaluated.append(expression.evaluate(dataTable, functionTable, performanceTable))

            label = "built-in \"%s\"" % self.name
            performanceTable.begin(label)

            fieldType = self.fieldTypeFromSignature(evaluated)
            left, right = evaluated
            values = NP("arctan2", left.data, right.data)
            combinedMask = DataColumn.mapAnyMissingInvalid([left.mask, right.mask])
            dataColumn = DataColumn(fieldType, values, combinedMask)

            performanceTable.end(label)
            return dataColumn
Ejemplo n.º 13
0
    def functionMultiset(self, dataColumn, whereMask, groupSelection, getstate,
                         setstate):
        """Derives a multiset of rows in a DataColumn, possibly with an SQL where mask and groupField.

        Each output row holds a dict mapping the values seen so far to
        their counts.  Rows that are not selected repeat the most recent
        snapshot.  The dict returned by C{getstate}, if any, is updated in
        place and is the object passed to C{setstate}.

        @type dataColumn: DataColumn
        @param dataColumn: The input data column.
        @type whereMask: 1d Numpy array of bool, or None
        @param whereMask: The result of the SQL where selection.
        @type groupSelection: 1d Numpy array of bool, or None.
        @param groupSelection: Rows corresponding to a particular value of the groupField.
        @type getstate: callable function
        @param getstate: Retrieve starting values from the DataTableState.
        @type setstate: callable function
        @param setstate: Store ending values to the DataTableState.
        @rtype: DataColumn of dict objects
        @return: A column of multisetted rows.
        """

        fieldType = FakeFieldType("object", "any")
        numRows = len(dataColumn)

        selection = NP("ones", numRows, dtype=NP.dtype(bool))
        if dataColumn.mask is not None:
            selection = NP("logical_and", selection,
                           NP(dataColumn.mask == defs.VALID))

        for extra in (whereMask, groupSelection):
            if extra is not None:
                NP("logical_and", selection, extra, selection)

        previous = getstate() if getstate is not None else None
        multiset = previous if previous is not None else {}
        snapshot = dict(multiset)

        data = NP("empty", numRows, dtype=NP.dtype(object))

        toPython = dataColumn.fieldType.valueToPython
        for index, raw in enumerate(dataColumn.data):
            if selection[index]:
                key = toPython(raw)
                if key in multiset:
                    multiset[key] += 1
                else:
                    multiset[key] = 1
                # Each selected row gets its own copy so later counts do
                # not leak into earlier rows.
                snapshot = dict(multiset)
            data[index] = snapshot

        if setstate is not None:
            setstate(multiset)

        return DataColumn(fieldType, data, None)
Ejemplo n.º 14
0
    def _toDataColumn_number(self, data, mask):
        """Build a numeric DataColumn of this type from raw input.

        Three paths are taken, depending on the input:
          - C{data} is already a Numpy array of C{self.dtype}: NaN
            entries are flagged C{defs.MISSING}.
          - C{data} can be cast to C{self.dtype} as a whole: NaN entries
            and entries set in the incoming mask are flagged
            C{defs.MISSING}.
          - the whole-array cast raises C{ValueError} or C{TypeError}:
            entries are converted one by one; NaN (float or the string
            "NAN", ignoring case) becomes C{defs.MISSING}, C{None} and
            anything else that cannot be converted becomes
            C{defs.INVALID}, and such entries hold C{defs.PADDING}.

        @param data: Raw input values.
        @param mask: Raw input mask, or None.
        @rtype: DataColumn
        @return: A new DataColumn of this field type, after value and interval checks.
        """
        data, mask = self._checkNumpy(data, mask)
        if isinstance(data, NP.ndarray) and (mask is None or isinstance(mask, NP.ndarray)) and data.dtype == self.dtype:
            # Fast path: only NaN entries need to be flagged.  An existing
            # mask array is modified in place.
            mask2 = NP("isnan", data)
            if mask is None:
                mask = NP("array", mask2, defs.maskType) * defs.MISSING
            else:
                mask[mask2] = defs.MISSING

        else:
            data, mask = self._checkNonNumpy(data, mask)
            try:
                data = NP("array", data, dtype=self.dtype)
                # mask is handled in the else statement after the except block

            except (ValueError, TypeError):
                # The whole-array cast failed: convert entry by entry.
                data2 = NP("empty", len(data), dtype=self.dtype)
                if mask is None:
                    mask2 = NP("zeros", len(data), dtype=defs.maskType)
                else:
                    mask2 = NP("fromiter", ((defs.VALID if not m else defs.MISSING) for m in mask), dtype=defs.maskType, count=len(mask))

                for i, v in enumerate(data):
                    try:
                        data2[i] = v
                        if mask2[i] == defs.VALID and ((isinstance(v, float) and math.isnan(v)) or (isinstance(v, basestring) and v.upper() == "NAN")):
                            mask2[i] = defs.MISSING
                        if v is None:
                            # Route None through the same handling as an
                            # unconvertible value.
                            raise TypeError
                    except (ValueError, TypeError):
                        data2[i] = defs.PADDING
                        if mask2[i] == defs.VALID:
                            if (isinstance(v, float) and math.isnan(v)) or (isinstance(v, basestring) and v.upper() == "NAN"):
                                mask2[i] = defs.MISSING
                            else:
                                mask2[i] = defs.INVALID

                if not mask2.any():
                    mask2 = None

                data, mask = data2, mask2

            else:
                # The whole-array cast succeeded: flag NaN entries and any
                # entry set in the incoming mask.
                mask2 = NP("isnan", data)
                if mask is None:
                    # Scale by defs.MISSING, as the other two mask-building
                    # branches do, so that NaN rows carry the MISSING code
                    # rather than a raw boolean 1.
                    mask = NP(NP("array", mask2, defs.maskType) * defs.MISSING)
                else:
                    mask = NP(NP("array", NP("logical_or", mask2, NP("fromiter", (m != 0 for m in mask), dtype=NP.dtype(bool), count=len(mask))), defs.maskType) * defs.MISSING)
                if not mask.any():
                    mask = None

        data, mask = self._checkValues(data, mask)
        data, mask = self._checkIntervals(data, mask)
        return DataColumn(self, data, mask)
Ejemplo n.º 15
0
        def evaluate(self, dataTable, functionTable, performanceTable,
                     arguments):
            """Evaluate the arguments and apply "arctanh" to the first one.

            @type dataTable: DataTable
            @param dataTable: Passed to each argument's C{evaluate}.
            @type functionTable: FunctionTable
            @param functionTable: Passed to each argument's C{evaluate}.
            @type performanceTable: PerformanceTable
            @param performanceTable: Timer table; the work after argument evaluation is recorded under this function's name.
            @type arguments: sequence of expressions
            @param arguments: The operands; only the first is used in the calculation.
            @rtype: DataColumn
            @return: The "arctanh" result, with the mask produced by C{self.maskInvalid} from the result and the operand's mask.
            """
            columns = []
            for argument in arguments:
                columns.append(
                    argument.evaluate(dataTable, functionTable,
                                      performanceTable))

            timerKey = "built-in \"%s\"" % self.name
            performanceTable.begin(timerKey)

            fieldType = self.fieldTypeFromSignature(columns)
            operand = columns[0]
            data = NP("arctanh", operand.data)
            mask = self.maskInvalid(data, operand.mask)

            performanceTable.end(timerKey)
            return DataColumn(fieldType, data, mask)
Ejemplo n.º 16
0
        def evaluate(self, dataTable, functionTable, performanceTable,
                     arguments):
            """Evaluate the arguments and apply "sin" to the first one.

            @type dataTable: DataTable
            @param dataTable: Passed to each argument's C{evaluate}.
            @type functionTable: FunctionTable
            @param functionTable: Passed to each argument's C{evaluate}.
            @type performanceTable: PerformanceTable
            @param performanceTable: Timer table; the work after argument evaluation is recorded under this function's name.
            @type arguments: sequence of expressions
            @param arguments: The operands; only the first is used in the calculation.
            @rtype: DataColumn
            @return: The "sin" result, carrying the operand's mask unchanged.
            """
            columns = []
            for argument in arguments:
                columns.append(
                    argument.evaluate(dataTable, functionTable,
                                      performanceTable))

            timerKey = "built-in \"%s\"" % self.name
            performanceTable.begin(timerKey)

            fieldType = self.fieldTypeFromSignature(columns)
            operand = columns[0]
            result = DataColumn(fieldType, NP("sin", operand.data),
                                operand.mask)

            performanceTable.end(timerKey)
            return result
Ejemplo n.º 17
0
        def evaluate(self, dataTable, functionTable, performanceTable,
                     arguments):
            """Evaluate both arguments and combine them with the "floor_divide" operation.

            Rows whose right-hand operand equals 0.0 are flagged
            C{defs.INVALID}.

            @type dataTable: DataTable
            @param dataTable: Passed to each argument's C{evaluate}.
            @type functionTable: FunctionTable
            @param functionTable: Passed to each argument's C{evaluate}.
            @type performanceTable: PerformanceTable
            @param performanceTable: Timer table; the work after argument evaluation is recorded under this function's name.
            @type arguments: sequence of exactly two expressions
            @param arguments: Numerator and denominator, in that order.
            @rtype: DataColumn
            @return: The quotient; its mask is built by C{DataColumn.mapAnyMissingInvalid} from the zero-denominator flags and both operand masks.
            """
            columns = []
            for argument in arguments:
                columns.append(
                    argument.evaluate(dataTable, functionTable,
                                      performanceTable))

            timerKey = "built-in \"%s\"" % self.name
            performanceTable.begin(timerKey)

            fieldType = self.fieldTypeFromSignature(columns)
            numerator, denominator = columns

            divideByZero = NP(NP(denominator.data == 0.0) * defs.INVALID)
            if not divideByZero.any():
                divideByZero = None

            mask = DataColumn.mapAnyMissingInvalid(
                [divideByZero, numerator.mask, denominator.mask])

            quotient = NP("floor_divide", numerator.data, denominator.data)
            result = DataColumn(fieldType, quotient, mask)

            performanceTable.end(timerKey)
            return result
Ejemplo n.º 18
0
    def functionSum(self, dataColumn, whereMask, groupSelection, getstate,
                    setstate):
        """Adds up rows in a DataColumn, possibly with an SQL where mask and groupField.

        The result is a running (cumulative) sum over the selected rows;
        rows that are masked or not selected contribute zero.

        @type dataColumn: DataColumn
        @param dataColumn: The input data column.
        @type whereMask: 1d Numpy array of bool, or None
        @param whereMask: The result of the SQL where selection.
        @type groupSelection: 1d Numpy array of bool, or None.
        @param groupSelection: Rows corresponding to a particular value of the groupField.
        @type getstate: callable function
        @param getstate: Retrieve starting values from the DataTableState.
        @type setstate: callable function
        @param setstate: Store ending values to the DataTableState.
        @rtype: DataColumn
        @return: A column of added rows.
        @raise PmmlValidationError: If the input field's dataType is not "integer", "float" or "double".
        """

        fieldType = FakeFieldType("double", "continuous")

        numericTypes = ("integer", "float", "double")
        if dataColumn.fieldType.dataType not in numericTypes:
            raise defs.PmmlValidationError(
                "Aggregate function \"sum\" requires a numeric input field: \"integer\", \"float\", \"double\""
            )

        numRows = len(dataColumn)

        selections = []
        if dataColumn.mask is not None:
            selections.append(NP(dataColumn.mask == defs.VALID))
        if whereMask is not None:
            selections.append(whereMask)
        if groupSelection is not None:
            selections.append(groupSelection)

        # Starts as a 0/1 weight per row, then becomes the weighted values
        # themselves; all updates are written back into the same array.
        terms = NP("ones", numRows, dtype=fieldType.dtype)
        for keep in selections:
            NP("logical_and", terms, keep, terms)
        NP("multiply", terms, dataColumn.data, terms)

        # Seed the first row with the total carried over from earlier calls.
        if getstate is not None and numRows > 0:
            startingState = getstate()
            if startingState is not None:
                terms[0] += startingState

        data = NP("cumsum", terms)

        if setstate is not None and numRows > 0:
            setstate(data[-1])

        return DataColumn(fieldType, data, None)
Ejemplo n.º 19
0
    def singleton(self, inputData, inputMask=None, inputState=None):
        """Create a single-row DataTable for event-based processes.

        This method is similar to the DataTable constructor, but it
        creates a DataTable with only one row and it uses the Python
        data type of each C{inputData} value to define a type, rather
        than an explicit C{context}: strings map to "string", bools to
        "boolean", floats to "double", ints to "integer", and anything
        else to "object".

        @type inputData: dict-like mapping from strings to single values (not lists)
        @param inputData: A single data record.  Fields are added in sorted key order.
        @type inputMask: dict-like mapping from strings to single C{defs.maskType} values (not lists), or None
        @param inputMask: A single mask.  Fields that are absent, or mapped to None, get no mask.
        @type inputState: DataTableState or None
        @param inputState: Initial state of the DataTable.  To continue a previous calculation, use the C{dataTable.state} from the previous calculation.
        @rtype: DataTable
        @return: A new one-row DataTable.
        """

        dataColumns = OrderedDict()
        for fieldName in sorted(inputData.keys()):
            value = inputData[fieldName]

            if isinstance(value, basestring):
                fieldType = FakeFieldType("string", "continuous")
            elif isinstance(value, bool):
                # bool must be tested before int: bool is a subclass of
                # int, so the int test would otherwise claim every boolean.
                fieldType = FakeFieldType("boolean", "continuous")
            elif isinstance(value, float):
                fieldType = FakeFieldType("double", "continuous")
            elif isinstance(value, int):
                fieldType = FakeFieldType("integer", "continuous")

            # TODO: PMML date types (when passed a datetype.datetype object)

            else:
                fieldType = FakeFieldType("object", "any")

            data = NP("empty", 1, dtype=fieldType.dtype)
            data[0] = value

            maskValue = None if inputMask is None else inputMask.get(fieldName)
            if maskValue is None:
                mask = None
            else:
                mask = NP("empty", 1, dtype=defs.maskType)
                mask[0] = maskValue

            dataColumns[fieldName] = DataColumn(fieldType, data, mask)

        # Bypass the regular constructor and configure the table directly
        # from the prepared columns.
        dataTable = DataTable.__new__(DataTable)
        dataTable._configure(dataColumns, inputState)
        return dataTable
Ejemplo n.º 20
0
    def functionMinMaxFake(self, value, howmany, fieldType):
        """Minimizes or maximizes rows in a DataColumn when it is known that there are no matches.

        Every row of the result is set to C{value}; the result has no mask.

        @type value: number
        @param value: The value assigned to every row.
        @type howmany: int
        @param howmany: Number of rows.
        @type fieldType: FieldType
        @param fieldType: The type of field to emulate; supplies the dtype of the result.
        @rtype: DataColumn
        @return: The faked results.
        """

        constant = NP("empty", howmany, dtype=fieldType.dtype)
        constant[:] = value
        return DataColumn(fieldType, constant, None)
Ejemplo n.º 21
0
    def functionMultisetFake(self, value, howmany, fieldType):
        """Derives a multiset of rows in a DataColumn when it is known that there are no matches.

        Every row of the result is set to C{value}; the result has no mask.

        @type value: any
        @param value: The value assigned to every row.
        @type howmany: int
        @param howmany: Number of rows.
        @type fieldType: FieldType
        @param fieldType: Ignored: the result always uses an "object"/"any" FakeFieldType.
        @rtype: DataColumn
        @return: The faked results.
        """

        fieldType = FakeFieldType("object", "any")
        constant = NP("empty", howmany, dtype=fieldType.dtype)
        constant[:] = value
        return DataColumn(fieldType, constant, None)
Ejemplo n.º 22
0
        def evaluate(self, dataTable, functionTable, performanceTable, arguments):
            """Evaluate both arguments and combine them with the "floor_divide" operation.

            Rows whose right-hand operand equals 0.0 are flagged
            C{defs.INVALID}.

            @type dataTable: DataTable
            @param dataTable: Passed to each argument's C{evaluate}.
            @type functionTable: FunctionTable
            @param functionTable: Passed to each argument's C{evaluate}.
            @type performanceTable: PerformanceTable
            @param performanceTable: Timer table; the work after argument evaluation is recorded under this function's name.
            @type arguments: sequence of exactly two expressions
            @param arguments: Numerator and denominator, in that order.
            @rtype: DataColumn
            @return: The quotient; its mask is built by C{DataColumn.mapAnyMissingInvalid} from the zero-denominator flags and both operand masks.
            """
            evaluated = []
            for expression in arguments:
                evaluated.append(expression.evaluate(dataTable, functionTable, performanceTable))

            label = "built-in \"%s\"" % self.name
            performanceTable.begin(label)

            fieldType = self.fieldTypeFromSignature(evaluated)
            dividend, divisor = evaluated

            zeroFlags = NP(NP(divisor.data == 0.0) * defs.INVALID)
            if not zeroFlags.any():
                zeroFlags = None

            combinedMask = DataColumn.mapAnyMissingInvalid([zeroFlags, dividend.mask, divisor.mask])

            values = NP("floor_divide", dividend.data, divisor.data)
            dataColumn = DataColumn(fieldType, values, combinedMask)

            performanceTable.end(label)
            return dataColumn
Ejemplo n.º 23
0
    def zValue(self, testDistributions, fieldName, dataColumn, state,
               performanceTable):
        """Calculate the score of a zValue TestStatistic.

        The score is C{(data - mean) / sqrt(variance)}, using the mean
        and variance of the first Baseline distribution that declares
        both attributes.

        @type testDistributions: PmmlBinding
        @param testDistributions: The <TestDistributions> element.
        @type fieldName: string
        @param fieldName: The field name (for error messages).
        @type dataColumn: DataColumn
        @param dataColumn: The field.
        @type state: DataTableState
        @param state: The persistent state object (not used).
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation (not used by this method).
        @rtype: dict
        @return: A dictionary mapping PMML "feature" strings to DataColumns; zValue only defines the None key ("predictedValue").
        @raise TypeError: If the field's dataType is "object", "string", "boolean", "date", "time" or "dateTime".
        @raise PmmlValidationError: If no distribution has both a mean and a variance, or if the variance is not positive.
        """

        unsupported = ("object", "string", "boolean",
                       "date", "time", "dateTime")
        dataType = dataColumn.fieldType.dataType
        if dataType in unsupported:
            raise TypeError(
                "Field \"%s\" has dataType \"%s\", which is incompatible with BaselineModel.zValue"
                % (fieldName, dataType))

        candidates = testDistributions.xpath(
            "pmml:Baseline/*[@mean and @variance]")
        if len(candidates) == 0:
            raise defs.PmmlValidationError(
                "BaselineModel zValue requires a distribution with a mean and a variance"
            )

        # Only the first matching distribution is used.
        chosen = candidates[0]
        mean = float(chosen.get("mean"))
        variance = float(chosen.get("variance"))
        if variance <= 0.0:
            raise defs.PmmlValidationError(
                "Variance must be positive, not %g" % variance)

        sigma = math.sqrt(variance)
        zscores = NP(NP(dataColumn.data - mean) / sigma)
        return {None: DataColumn(self.scoreType, zscores, dataColumn.mask)}
Ejemplo n.º 24
0
        def evaluate(self, dataTable, functionTable, performanceTable,
                     arguments):
            """Test each row of the first argument against a regular expression.

            The second argument must be a Constant holding the pattern.
            Each row is matched with C{match} (anchored at the start of
            the string).  When the first argument's optype is
            "continuous" its values are matched directly; otherwise each
            value is first converted with the field type's
            C{valueToString}.

            @type dataTable: DataTable
            @param dataTable: Passed to the first argument's C{evaluate}; its length sets the number of output rows.
            @type functionTable: FunctionTable
            @param functionTable: Passed to the first argument's C{evaluate}.
            @type performanceTable: PerformanceTable
            @param performanceTable: Timer table; paused while the first argument is evaluated.
            @type arguments: sequence of exactly two expressions
            @param arguments: The expression to test and the Constant regex pattern.
            @rtype: DataColumn
            @return: A boolean column (True where the pattern matched), carrying the first argument's mask.
            @raise PmmlValidationError: If there are not exactly two arguments, if the second is not a Constant, or if the pattern cannot be compiled.
            """
            timerKey = "built-in \"%s\"" % self.name
            performanceTable.begin(timerKey)

            fieldType = self._typeReverseMap[BOOL]
            if len(arguments) != 2:
                raise defs.PmmlValidationError(
                    "Function \"like\" requires exactly two arguments")

            if not isinstance(arguments[1], Constant):
                raise defs.PmmlValidationError(
                    "Function \"like\" requires its second argument (the regex pattern) to be a Constant"
                )

            patternText = arguments[1].evaluateOne(convertType=False)
            try:
                pattern = re.compile(patternText)
            except re.error as err:
                # re.error is the exception class raised for a malformed
                # pattern; report it as a PMML validation problem.
                raise defs.PmmlValidationError(
                    "Could not compile regex pattern \"%s\": %s" %
                    (patternText, str(err)))

            # Do not charge the time spent evaluating the argument to
            # this function's timer.
            performanceTable.pause(timerKey)
            test = arguments[0].evaluate(dataTable, functionTable,
                                         performanceTable)
            performanceTable.unpause(timerKey)

            d = test.data
            numRows = len(dataTable)
            match = pattern.match
            if test.fieldType.optype == "continuous":
                data = NP("fromiter", (match(d[i]) is not None
                                       for i in xrange(numRows)),
                          dtype=fieldType.dtype,
                          count=numRows)

            else:
                ds = test.fieldType.valueToString
                data = NP("fromiter", (match(ds(d[i])) is not None
                                       for i in xrange(numRows)),
                          dtype=fieldType.dtype,
                          count=numRows)

            performanceTable.end(timerKey)
            return DataColumn(fieldType, data, test.mask)
Ejemplo n.º 25
0
    def _toDataColumn_object(self, data, mask):
        """Build a DataColumn of this type from raw input, keeping values as they are.

        If C{data} is already a Numpy array of C{self.dtype} (and the
        mask is None or a Numpy array) it is used directly.  Otherwise
        the data are converted to an array of C{self.dtype} and a mask is
        built in which float NaN entries, and entries with a nonzero
        incoming mask, are C{defs.MISSING}; the mask is dropped if no
        entry is flagged.

        @param data: Raw input values.
        @param mask: Raw input mask, or None.
        @rtype: DataColumn
        @return: A new DataColumn of this field type, after value and interval checks.
        """
        data, mask = self._checkNumpy(data, mask)

        alreadyTyped = (isinstance(data, NP.ndarray)
                        and (mask is None or isinstance(mask, NP.ndarray))
                        and data.dtype == self.dtype)

        if not alreadyTyped:
            def isNan(item):
                return isinstance(item, float) and math.isnan(item)

            data, mask = self._checkNonNumpy(data, mask)
            data = NP.array(data, dtype=self.dtype)

            if mask is None:
                flags = (defs.MISSING if isNan(d) else defs.VALID for d in data)
                mask = NP("fromiter", flags, dtype=defs.maskType, count=len(data))
            else:
                flags = (defs.MISSING if (m != 0 or isNan(data[i])) else defs.VALID for i, m in enumerate(mask))
                mask = NP("fromiter", flags, dtype=defs.maskType, count=len(mask))

            if not mask.any():
                mask = None

        data, mask = self._checkValues(data, mask)
        data, mask = self._checkIntervals(data, mask)
        return DataColumn(self, data, mask)
Ejemplo n.º 26
0
    def functionCount(self, dataColumn, whereMask, groupSelection, getstate,
                      setstate):
        """Produce a running count of the selected rows of a DataColumn.

        A row is counted when it is valid (if the column has a mask),
        passes the where mask (if any) and belongs to the group
        selection (if any).  Each output row holds the cumulative
        number of counted rows so far, added to the starting state
        when one is available.

        @type dataColumn: DataColumn
        @param dataColumn: The input data column.
        @type whereMask: 1d Numpy array of bool, or None
        @param whereMask: The result of the SQL where selection.
        @type groupSelection: 1d Numpy array of bool, or None.
        @param groupSelection: Rows corresponding to a particular value of the groupField.
        @type getstate: callable function, or None
        @param getstate: Retrieve starting values from the DataTableState.
        @type setstate: callable function, or None
        @param setstate: Store ending values to the DataTableState.
        @rtype: DataColumn
        @return: A column of counted rows.
        """

        numRows = len(dataColumn)
        countType = FakeFieldType("integer", "continuous")

        # one per row to begin with; every restriction zeroes the rows it excludes
        increments = NP("ones", numRows, dtype=countType.dtype)

        restrictions = []
        if dataColumn.mask is not None:
            restrictions.append(NP(dataColumn.mask == defs.VALID))
        if whereMask is not None:
            restrictions.append(whereMask)
        if groupSelection is not None:
            restrictions.append(groupSelection)

        for restriction in restrictions:
            NP("logical_and", increments, restriction, increments)

        # carry the count over from a previous calculation by seeding the first row
        if getstate is not None and numRows > 0:
            carriedOver = getstate()
            if carriedOver is not None:
                increments[0] += carriedOver

        runningCount = NP("cumsum", increments)

        if setstate is not None and numRows > 0:
            setstate(runningCount[-1])

        return DataColumn(countType, runningCount, None)
Ejemplo n.º 27
0
        def evaluate(self, dataTable, functionTable, performanceTable, arguments):
            """Test, row by row, whether the first argument lies between the second and third.

            The three arguments are evaluated to columns C{test},
            C{low} and C{high}; the result is true where
            C{test >= low} and C{test <= high}.  Object columns and
            continuous strings are compared directly, other strings
            are compared by their string representation, and
            everything else is compared with Numpy ufuncs.

            @rtype: DataColumn
            @return: The comparison result, masked wherever any of the three inputs is masked.
            """

            arguments = [argument.evaluate(dataTable, functionTable, performanceTable) for argument in arguments]
            performanceTable.begin("built-in \"%s\"" % self.name)

            fieldType = self.fieldTypeFromSignature(arguments)
            test, low, high = arguments

            def compareWith(bound, rowwise, ufuncName):
                # the same three-way choice of comparison strategy serves both bounds
                testType = test.fieldType

                if testType.dataType == "object" or (testType.dataType == "string" and testType.optype == "continuous" and bound.fieldType.optype == "continuous"):
                    lhs = test.data
                    rhs = bound.data
                    return NP("fromiter", (rowwise(lhs[i], rhs[i]) for i in xrange(len(dataTable))), dtype=fieldType.dtype, count=len(dataTable))

                if testType.dataType == "string":
                    lhs = test.data
                    rhs = bound.data
                    lhsToString = testType.valueToString
                    rhsToString = bound.fieldType.valueToString
                    return NP("fromiter", (rowwise(lhsToString(lhs[i]), rhsToString(rhs[i])) for i in xrange(len(dataTable))), dtype=fieldType.dtype, count=len(dataTable))

                return NP(ufuncName, test.data, bound.data)

            data = compareWith(low, lambda a, b: a >= b, "greater_equal")
            datahigh = compareWith(high, lambda a, b: a <= b, "less_equal")

            # combine the two half-tests in place
            NP("logical_and", data, datahigh, data)

            combinedMask = DataColumn.mapAnyMissingInvalid([test.mask, low.mask, high.mask])

            performanceTable.end("built-in \"%s\"" % self.name)
            return DataColumn(fieldType, data, combinedMask)
Ejemplo n.º 28
0
    def buildManually(self, fieldTypes, internalArrays, internalMasks=None,
                      inputState=None):
        """Assemble a DataTable directly from pre-built Numpy arrays
        that hold internal values rather than user-friendly values.
        For experts only.

        @type fieldTypes: dict of str to FieldTypes
        @param fieldTypes: Maps field names to their FieldType.
        @type internalArrays: dict of str to 1d Numpy arrays.
        @param internalArrays: Maps field names to the internal data.
        @type internalMasks: dict of str to 1d Numpy arrays, or None
        @param internalMasks: Maps field names to the masks, or None for no masks.
        @type inputState: DataTableState or None
        @param inputState: Initial state of the DataTable.  To continue a previous calculation, use the C{dataTable.state} from the previous calculation.
        @rtype: DataTable
        @return: The newly configured DataTable.
        @raise ValueError: If the C{fieldTypes}, C{internalArrays}, or C{internalMasks} have different field names, this function raises an error.
        """

        # no masks given: every field gets an explicit None mask
        if internalMasks is None:
            internalMasks = dict.fromkeys(internalArrays)

        names = set(fieldTypes)
        if not (names == set(internalArrays) and names == set(internalMasks)):
            raise ValueError(
                "Mismatch between fieldType names, internalArray names, or internalMasks names"
            )

        dataColumns = dict(
            (name, DataColumn(fieldTypes[name], internalArrays[name], internalMasks[name]))
            for name in sorted(fieldTypes))

        # created without running __init__; _configure sets the table up instead
        table = DataTable.__new__(DataTable)
        table._configure(dataColumns, inputState)
        return table
Ejemplo n.º 29
0
    def _toDataColumn_internal(self, data, mask):
        """Build a DataColumn by passing every item through C{stringToValue}.

        The whole input is converted in one pass first.  If that
        raises ValueError, the items are converted one at a time:
        float NaN items are marked C{defs.MISSING}, items that fail
        with ValueError or TypeError are marked C{defs.INVALID}, and
        both get C{defs.PADDING} as their data value.

        @param data: The values to convert.
        @param mask: The accompanying mask, or None.
        @rtype: DataColumn
        @return: The converted column; values and intervals are not checked here.
        """

        data, mask = self._checkNumpy(data, mask, tryToCast=False)
        data, mask = self._checkNonNumpy(data, mask)

        try:
            data = NP("fromiter", (self.stringToValue(item) for item in data), dtype=self.dtype, count=len(data))

        except ValueError:
            # the bulk conversion failed, so redo it item by item and flag the failures
            numItems = len(data)
            converted = NP("empty", numItems, dtype=self.dtype)

            if mask is None:
                flags = NP("zeros", numItems, dtype=defs.maskType)
            else:
                flags = NP("fromiter", (defs.MISSING if m else defs.VALID for m in mask), dtype=defs.maskType, count=len(mask))

            for index, item in enumerate(data):
                if isinstance(item, float) and math.isnan(item):
                    converted[index] = defs.PADDING
                    flags[index] = defs.MISSING
                    continue

                try:
                    converted[index] = self.stringToValue(item)
                except (ValueError, TypeError):
                    converted[index] = defs.PADDING
                    flags[index] = defs.INVALID

            data = converted
            mask = flags if flags.any() else None

        else:
            # bulk conversion succeeded; only the mask may still need to become an array
            if not (mask is None or isinstance(mask, NP.ndarray)):
                mask = NP("array", mask, dtype=defs.maskType)

        # unlike the other _toDataColumn methods, values and intervals are not
        # checked here: these were checked in _setup for categorical and ordinal strings
        return DataColumn(self, data, mask)
Ejemplo n.º 30
0
    def evaluate(self, dataTable, functionTable, performanceTable):
        """Evaluate this Apply expression by calling the function it names.

        The function is looked up in the FunctionTable and called with
        this element's PmmlExpression children as arguments; the
        C{invalidValueTreatment} and C{mapMissingTo} attributes are
        then applied to the function's result.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.
        @raise LookupError: If the named function is not in the FunctionTable.
        """

        performanceTable.begin("Apply")

        functionName = self.get("function")
        function = functionTable.get(functionName)
        if function is None:
            raise LookupError(
                "Apply references function \"%s\", but it does not exist" %
                functionName)

        arguments = self.childrenOfClass(PmmlExpression)

        # the function's own work is timed separately from "Apply"
        performanceTable.pause("Apply")
        result = function.evaluate(dataTable, functionTable,
                                   performanceTable, arguments)
        performanceTable.unpause("Apply")

        resultType = result.fieldType
        treatedMask = FieldCastMethods.applyInvalidValueTreatment(
            result.mask, self.get("invalidValueTreatment"))
        data, mask = FieldCastMethods.applyMapMissingTo(
            resultType, result.data, treatedMask, self.get("mapMissingTo"))

        performanceTable.end("Apply")
        return DataColumn(resultType, data, mask)
Ejemplo n.º 31
0
    def _selectMax(self, dataTable, functionTable, performanceTable, segmentation):
        """Score every row with the largest score among the segments that select it.

        Used by C{calculateScore}.

        Each Segment's predicate selects rows, its model is calculated
        on the corresponding sub-table, and the running maximum score
        is kept per row.  Output fields produced by the sub-models are
        created in, or merged into, C{dataTable.output}.

        @type dataTable: DataTable
        @param dataTable: The input data; its C{output} fields are created or modified in place.
        @type functionTable: FunctionTable
        @param functionTable: Passed on to the predicates and sub-models.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @param segmentation: The element whose "Segment" children are evaluated.
        @rtype: dict
        @return: A dictionary mapping None to the DataColumn of scores; rows that no segment scored are masked as C{defs.MISSING}.
        @raise PmmlValidationError: If a sub-model produces a string, boolean, or object score.
        """

        performanceTable.begin("Segmentation max")

        scoresData = NP("empty", len(dataTable), dtype=NP.dtype(object))
        # filled/unfilled are complementary: which rows already have a score
        filled = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
        unfilled = NP("ones", len(dataTable), dtype=NP.dtype(bool))

        newOutputData = []
        for segment in segmentation.childrenOfTag("Segment", iterator=True):
            performanceTable.pause("Segmentation max")
            selection = segment.childOfClass(PmmlPredicate).evaluate(dataTable, functionTable, performanceTable)
            performanceTable.unpause("Segmentation max")
            if not selection.any():
                continue

            subTable = dataTable.subTable(selection)
            subModel = segment.childOfClass(PmmlModel)
            performanceTable.pause("Segmentation max")
            subModel.calculate(subTable, functionTable, performanceTable)
            performanceTable.unpause("Segmentation max")

            if subTable.score.fieldType.dataType in ("string", "boolean", "object"):
                raise defs.PmmlValidationError("Segmentation with multipleModelMethod=\"max\" cannot be applied to models that produce dataType \"%s\"" % subTable.score.fieldType.dataType)

            # ignore invalid in matches (like the built-in "min" Apply function)
            # NOTE(review): this combines the full-length selection with the
            # sub-table's score mask; confirm that the two have compatible
            # lengths for sub-tables that select only some of the rows.
            if subTable.score.mask is not None:
                NP("logical_and", selection, NP(subTable.score.mask == defs.VALID), selection)

            # full-length selectors (index into dataTable-sized arrays) ...
            selectionFilled = NP("logical_and", selection, filled)
            selectionUnfilled = NP("logical_and", selection, unfilled)
            # ... and their sub-table-length counterparts (index into subTable arrays)
            filled_selection = filled[selection]
            unfilled_selection = unfilled[selection]

            left, right = subTable.score.data[filled_selection], scoresData[selectionFilled]
            condition = NP(left > right)
            scoresData[selectionFilled] = NP("where", condition, left, right)
            scoresData[selectionUnfilled] = subTable.score.data[unfilled_selection]

            for fieldName, dataColumn in subTable.output.items():
                if fieldName not in dataTable.output:
                    data = NP("empty", len(dataTable), dtype=dataColumn.fieldType.dtype)
                    # sub-table arrays must be narrowed with unfilled_selection so
                    # that they line up with the selectionUnfilled rows
                    data[selectionUnfilled] = dataColumn.data[unfilled_selection]

                    mask = NP(NP("ones", len(dataTable), dtype=defs.maskType) * defs.MISSING)
                    if dataColumn.mask is None:
                        mask[selectionUnfilled] = defs.VALID
                    else:
                        mask[selectionUnfilled] = dataColumn.mask[unfilled_selection]

                    newDataColumn = DataColumn(dataColumn.fieldType, data, mask)
                    newDataColumn._unlock()
                    dataTable.output[fieldName] = newDataColumn
                    newOutputData.append(newDataColumn)

                else:
                    newDataColumn = dataTable.output[fieldName]

                    newDataColumn.data[selectionFilled] = NP("where", condition, dataColumn.data[filled_selection], newDataColumn.data[selectionFilled])
                    newDataColumn.data[selectionUnfilled] = dataColumn.data[unfilled_selection]

                    if dataColumn.mask is None:
                        newDataColumn.mask[selectionUnfilled] = defs.VALID
                    else:
                        newDataColumn.mask[selectionUnfilled] = dataColumn.mask[unfilled_selection]

            filled += selectionUnfilled
            # boolean "-=" raises TypeError in current Numpy; since
            # selectionUnfilled is a subset of unfilled, "and not" is equivalent
            NP("logical_and", unfilled, NP("logical_not", selectionUnfilled), unfilled)

        # columns created here were left unlocked for in-place updates; finalize them
        for newDataColumn in newOutputData:
            if not newDataColumn.mask.any():
                newDataColumn._mask = None
            newDataColumn._lock()

        if filled.all():
            scoresMask = None
        else:
            scoresMask = NP(NP("logical_not", filled) * defs.MISSING)

        scores = DataColumn(self.scoreType, scoresData, scoresMask)

        performanceTable.end("Segmentation max")
        return {None: scores}
Ejemplo n.º 32
0
        def evaluate(self, dataTable, functionTable, performanceTable, arguments):
            """Evaluate the arguments and apply "cos" to the product of the first two.

            @rtype: DataColumn
            @return: The result, masked wherever either of the two inputs is masked.
            """

            arguments = [argument.evaluate(dataTable, functionTable, performanceTable) for argument in arguments]
            performanceTable.begin("built-in \"%s\"" % self.name)

            resultType = self.fieldTypeFromSignature(arguments)
            first, second = arguments[0], arguments[1]

            values = NP("cos", first.data * second.data)
            combinedMask = DataColumn.mapAnyMissingInvalid([first.mask, second.mask])
            result = DataColumn(resultType, values, combinedMask)

            performanceTable.end("built-in \"%s\"" % self.name)
            return result
Ejemplo n.º 33
0
    def evaluate(self, dataTable, functionTable, performanceTable):
        """Evaluate the expression, using a DataTable as input.

        Without a C{groupField}, the result is the column returned by
        the selected aggregate function.  With a C{groupField}, the
        aggregate is computed separately for each valid group value
        (and, through the "Fake" functions, for group values that are
        remembered in the state but absent from this data), and the
        result is an object column whose rows are dictionaries mapping
        group values to that group's aggregate at that row.

        If C{stateId} is set, starting values are read from, and
        ending values stored to, C{dataTable.state[stateId]}.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.
        """

        function = self["function"]
        groupField = self.get("groupField")

        if groupField is None:
            performanceTable.begin("Aggregate %s" % function)
        else:
            performanceTable.begin("Aggregate %s groupField" % function)

        dataColumn = dataTable.fields[self["field"]]
        whereMask = self.where(dataTable, functionTable, performanceTable)
        stateId = self.get("stateId")

        # An unrecognized function name selects nothing (None), which leaves
        # the input column or the per-group tables untouched.
        aggregate = {
            "count": self.functionCount,
            "sum": self.functionSum,
            "average": self.functionAverage,
            "min": self.functionMin,
            "max": self.functionMax,
            "multiset": self.functionMultiset,
        }.get(function)

        if groupField is None:
            if stateId is None:
                getstate = None
                setstate = None
            else:

                def getstate():
                    return dataTable.state.get(stateId)

                def setstate(value):
                    dataTable.state[stateId] = value

            if aggregate is not None:
                dataColumn = aggregate(dataColumn, whereMask, None,
                                       getstate, setstate)

            performanceTable.end("Aggregate %s" % function)
            return dataColumn

        groupColumn = dataTable.fields[groupField]
        # Taken up front (not inside the loop below) so that it is available
        # for remembered groups even when this data has no valid group values.
        groupColumnFieldType = groupColumn.fieldType

        if groupColumn.mask is None:
            validGroup = groupColumn.data
        else:
            validGroup = groupColumn.data[NP(
                groupColumn.mask == defs.VALID)]

        # Per-group state, keyed by the group value's string form.  Without a
        # stateId this is only a scratch dictionary that is never persisted.
        record = None
        if stateId is not None:
            record = dataTable.state.get(stateId)
        if record is None:
            record = {}

        valuesSeen = dict((stringValue, False) for stringValue in record)

        groupTables = {}
        for groupValue in NP("unique", validGroup):
            groupSelection = NP(groupColumn.data == groupValue)
            if groupColumn.mask is not None:
                NP("logical_and", groupSelection,
                   NP(groupColumn.mask == defs.VALID), groupSelection)

            stringValue = groupColumnFieldType.valueToString(groupValue)

            # The key is bound as a default argument so that each closure
            # refers to its own group rather than to the loop variable.
            if stringValue in record:

                def getstate(key=stringValue):
                    return record[key]
            else:
                getstate = None

            def setstate(value, key=stringValue):
                record[key] = value

            valuesSeen[stringValue] = True
            value = groupColumnFieldType.valueToPython(groupValue)

            if aggregate is not None:
                groupTables[value] = aggregate(dataColumn, whereMask,
                                               groupSelection, getstate,
                                               setstate)

        if stateId is not None:
            dataTable.state[stateId] = record

        # Groups remembered in the state but absent from this data get a table
        # built from their stored state by the matching "Fake" function.
        fakeAggregate = {
            "count": self.functionCountFake,
            "sum": self.functionSumFake,
            "average": self.functionAverageFake,
            "min": self.functionMinMaxFake,
            "max": self.functionMinMaxFake,
            "multiset": self.functionMultisetFake,
        }.get(function)

        for stringValue, seen in valuesSeen.items():
            if seen or fakeAggregate is None:
                continue
            value = groupColumnFieldType.valueToPython(
                groupColumnFieldType.stringToValue(stringValue))
            groupTables[value] = fakeAggregate(record[stringValue],
                                               len(dataTable),
                                               dataColumn.fieldType)

        performanceTable.begin("Aggregate %s groupField collect" %
                               function)

        fieldType = FakeFieldType("object", "any")
        data = NP("empty", len(dataTable), dtype=NP.dtype(object))

        # Each row becomes a dictionary {group value: aggregate}; a group is
        # left out of a row according to a per-function test.
        if function == "count":
            for i in xrange(len(dataTable)):
                data[i] = dict((value, table.data[i])
                               for value, table in groupTables.items()
                               if table.data[i] != 0)

        elif function == "sum":
            for i in xrange(len(dataTable)):
                data[i] = dict((value, table.data[i])
                               for value, table in groupTables.items()
                               if table.data[i] != 0.0)

        elif function == "average":
            # the two-sided comparison is false only for NaN
            for i in xrange(len(dataTable)):
                data[i] = dict(
                    (value, table.data[i])
                    for value, table in groupTables.items()
                    if table.data[i] > 0.0 or table.data[i] <= 0.0)

        elif function in ("min", "max"):
            # give every table an explicit mask so that it can be indexed below
            for table in groupTables.values():
                if table.mask is None:
                    table._mask = NP("zeros",
                                     len(table),
                                     dtype=defs.maskType)
            for i in xrange(len(dataTable)):
                data[i] = dict((value, table.data[i])
                               for value, table in groupTables.items()
                               if table.mask[i] == defs.VALID)

        elif function == "multiset":
            for i in xrange(len(dataTable)):
                data[i] = dict((value, table.data[i])
                               for value, table in groupTables.items()
                               if len(table.data[i]) > 0)

        performanceTable.end("Aggregate %s groupField collect" % function)
        performanceTable.end("Aggregate %s groupField" % function)
        return DataColumn(fieldType, data, None)
Ejemplo n.º 34
0
    def _selectFirst(self, dataTable, functionTable, performanceTable, segmentation):
        """Score every row with the first segment whose predicate selects it.

        Used by C{calculateScore}.

        Segments are visited in order; each one only receives the
        rows that no earlier segment has scored, and the loop stops as
        soon as every row is scored.  Output fields produced by the
        sub-models are created in, or merged into,
        C{dataTable.output}.

        @type dataTable: DataTable
        @param dataTable: The input data; its C{output} fields are created or modified in place.
        @type functionTable: FunctionTable
        @param functionTable: Passed on to the predicates and sub-models.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @param segmentation: The element whose "Segment" children are evaluated.
        @rtype: dict
        @return: A dictionary mapping None to the DataColumn of scores and, when C{self.name} is not None, "segment" to a DataColumn holding the "id" of the segment that scored each row.
        """

        performanceTable.begin("Segmentation selectFirst")

        scoresData = NP("empty", len(dataTable), dtype=NP.dtype(object))
        scoresMask = NP("zeros", len(dataTable), dtype=defs.maskType)
        # rows that have not been scored by any segment yet
        unfilled = NP("ones", len(dataTable), dtype=NP.dtype(bool))
        segments = NP("empty", len(dataTable), dtype=NP.dtype(object))

        newOutputData = []
        for segment in segmentation.childrenOfTag("Segment", iterator=True):
            performanceTable.pause("Segmentation selectFirst")
            selection = segment.childOfClass(PmmlPredicate).evaluate(dataTable, functionTable, performanceTable)
            performanceTable.unpause("Segmentation selectFirst")

            # restrict this segment to the rows that are still unscored
            NP("logical_and", selection, unfilled, selection)
            if not selection.any():
                continue

            subTable = dataTable.subTable(selection)
            subModel = segment.childOfClass(PmmlModel)
            performanceTable.pause("Segmentation selectFirst")

            subModel.calculate(subTable, functionTable, performanceTable)
            performanceTable.unpause("Segmentation selectFirst")

            scoresData[selection] = subTable.score.data
            if subTable.score.mask is not None:
                scoresMask[selection] = subTable.score.mask
            else:
                scoresMask[selection] = defs.VALID

            segmentName = segment.get("id")
            if segmentName is not None:
                segments[selection] = segmentName

            for fieldName, dataColumn in subTable.output.items():
                if fieldName not in dataTable.output:
                    data = NP("empty", len(dataTable), dtype=dataColumn.fieldType.dtype)
                    data[selection] = dataColumn.data

                    # rows outside this segment start out as MISSING
                    mask = NP(NP("ones", len(dataTable), dtype=defs.maskType) * defs.MISSING)
                    if dataColumn.mask is None:
                        mask[selection] = defs.VALID
                    else:
                        mask[selection] = dataColumn.mask

                    newDataColumn = DataColumn(dataColumn.fieldType, data, mask)
                    newDataColumn._unlock()
                    dataTable.output[fieldName] = newDataColumn
                    newOutputData.append(newDataColumn)

                else:
                    newDataColumn = dataTable.output[fieldName]

                    newDataColumn.data[selection] = dataColumn.data
                    if dataColumn.mask is None:
                        newDataColumn.mask[selection] = defs.VALID
                    else:
                        newDataColumn.mask[selection] = dataColumn.mask

            # boolean "-=" raises TypeError in current Numpy; since selection
            # is a subset of unfilled here, "and not" is equivalent
            NP("logical_and", unfilled, NP("logical_not", selection), unfilled)
            if not unfilled.any():
                break

        # columns created here were left unlocked for in-place updates; finalize them
        for newDataColumn in newOutputData:
            if not newDataColumn.mask.any():
                newDataColumn._mask = None
            newDataColumn._lock()

        if not scoresMask.any():
            scoresMask = None

        scores = DataColumn(self.scoreType, scoresData, scoresMask)

        performanceTable.end("Segmentation selectFirst")
        if self.name is None:
            return {None: scores}
        return {None: scores, "segment": DataColumn(self.scoreTypeSegment, segments, None)}
Ejemplo n.º 35
0
 def _fromDataColumn_timeSeconds(self, dataColumn):
     """Subtract the offset, reduce modulo C{_microsecondsPerDay}, divide by C{_factor}, and pass the result to C{_fromDataColumn_number}."""
     shifted = NP(dataColumn.data - self._offset)
     withinDay = NP("mod", shifted, self._microsecondsPerDay)
     scaled = NP(withinDay / float(self._factor))
     return self._fromDataColumn_number(DataColumn(self, scaled, dataColumn.mask))
Ejemplo n.º 36
0
    def functionMax(self, dataColumn, whereMask, groupSelection, getstate,
                    setstate):
        """Finds the running maximum of rows in a DataColumn, possibly with an SQL where mask and groupField.

        Each output row holds the largest selected value seen so far,
        starting from the stored state when one is available.  Rows
        that come before any value has been seen are marked
        C{defs.INVALID}.

        @type dataColumn: DataColumn
        @param dataColumn: The input data column.
        @type whereMask: 1d Numpy array of bool, or None
        @param whereMask: The result of the SQL where selection.
        @type groupSelection: 1d Numpy array of bool, or None.
        @param groupSelection: Rows corresponding to a particular value of the groupField.
        @type getstate: callable function, or None
        @param getstate: Retrieve starting values from the DataTableState.
        @type setstate: callable function, or None
        @param setstate: Store ending values to the DataTableState.
        @rtype: DataColumn
        @return: A column of maximized rows.
        @raise PmmlValidationError: If the input field is neither continuous nor ordinal.
        """

        fieldType = dataColumn.fieldType

        if fieldType.optype not in ("continuous", "ordinal"):
            raise defs.PmmlValidationError(
                "Aggregate function \"max\" requires a continuous or ordinal input field"
            )

        # rows that take part: valid, inside the where mask, inside the group
        if dataColumn.mask is None:
            selection = NP("ones", len(dataColumn), dtype=NP.dtype(bool))
        else:
            selection = NP(dataColumn.mask == defs.VALID)

        if whereMask is not None:
            NP("logical_and", selection, whereMask, selection)

        if groupSelection is not None:
            NP("logical_and", selection, groupSelection, selection)

        # None means "no value seen yet"
        maximum = None
        if getstate is not None:
            startingState = getstate()
            if startingState is not None:
                maximum = startingState

        data = NP("empty", len(dataColumn), dtype=fieldType.dtype)
        mask = NP("zeros", len(dataColumn), dtype=defs.maskType)

        for i, x in enumerate(dataColumn.data):
            if selection[i]:
                if maximum is None or x > maximum:
                    maximum = x
            if maximum is None:
                # nothing to report yet; data[i] is left unset and masked out
                mask[i] = defs.INVALID
            else:
                data[i] = maximum

        if not mask.any():
            mask = None

        if setstate is not None:
            setstate(maximum)

        return DataColumn(fieldType, data, mask)