コード例 #1
0
ファイル: Function.py プロジェクト: Huskyeder/augustus
    def maskInvalid(self, data, mask):
        """Helper method to replace NaN and infinite values with
        INVALID after a potentially dangerous operation.

        Example::

            result = NP("log", dataColumn.data)    # log(0) = -inf, log(-x) = nan
            resultMask = self.maskInvalid(result, dataColumn.mask)
            return DataColumn(fakeFieldType, result, resultMask)

        The input C{data} and C{mask} are not modified by this
        method; a substitute mask is returned.  Only entries that are
        VALID in the original mask can become INVALID; entries that
        already carry another mask value keep it.

        @type data: 1d Numpy array
        @param data: The dataset that may contain NaN and infinite values.
        @type mask: 1d Numpy array of C{defs.maskType}, or None
        @param mask: The original mask.
        @rtype: 1d Numpy array of C{defs.maskType}, or None
        @return: The new mask, or None if the resulting mask has no nonzero entries.
        """

        bad = NP("logical_not", NP("isfinite", data))
        if bad.any():
            if mask is None:
                mask = bad * defs.INVALID
            else:
                # restrict to entries that are currently VALID so that
                # existing non-VALID mask values are not overwritten
                NP("logical_and", bad, NP(mask == defs.VALID), bad)
                # always write into a copy: the caller's mask must stay
                # untouched even when it happens to be writeable
                mask = NP("copy", mask)
                mask.setflags(write=True)
                mask[bad] = defs.INVALID
        if mask is not None and not mask.any():
            mask = None
        return mask
コード例 #2
0
ファイル: Function.py プロジェクト: soedjais/augustus
    def maskInvalid(self, data, mask):
        """Helper method to replace NaN and infinite values with
        INVALID after a potentially dangerous operation.

        Example::

            result = NP("log", dataColumn.data)    # log(0) = -inf, log(-x) = nan
            resultMask = self.maskInvalid(result, dataColumn.mask)
            return DataColumn(fakeFieldType, result, resultMask)

        The input C{data} and C{mask} are not modified by this
        method; a substitute mask is returned.  Only entries that are
        VALID in the original mask can become INVALID; entries that
        already carry another mask value keep it.

        @type data: 1d Numpy array
        @param data: The dataset that may contain NaN and infinite values.
        @type mask: 1d Numpy array of C{defs.maskType}, or None
        @param mask: The original mask.
        @rtype: 1d Numpy array of C{defs.maskType}, or None
        @return: The new mask, or None if the resulting mask has no nonzero entries.
        """

        bad = NP("logical_not", NP("isfinite", data))
        if bad.any():
            if mask is None:
                mask = bad * defs.INVALID
            else:
                # restrict to entries that are currently VALID so that
                # existing non-VALID mask values are not overwritten
                NP("logical_and", bad, NP(mask == defs.VALID), bad)
                # always write into a copy: the caller's mask must stay
                # untouched even when it happens to be writeable
                mask = NP("copy", mask)
                mask.setflags(write=True)
                mask[bad] = defs.INVALID
        if mask is not None and not mask.any():
            mask = None
        return mask
コード例 #3
0
    def _toDataColumn_number(self, data, mask):
        """Build a DataColumn of C{self.dtype} from C{data} and an optional C{mask}.

        Three paths are taken, depending on the input:

          - C{data} is already a Numpy array of C{self.dtype} (and
            C{mask} is None or a Numpy array): NaN entries are marked
            MISSING and the arrays are used as they are.
          - C{data} converts to a Numpy array in one step: NaN entries
            and nonzero entries of the input mask are marked MISSING.
          - the one-step conversion fails: items are converted one by
            one; items that cannot be stored (including None) are
            replaced by C{defs.PADDING} and marked INVALID, or MISSING
            if they are NaN or the string "NAN" (any case).

        The result is then passed through C{self._checkValues} and
        C{self._checkIntervals}.

        @type data: Numpy array or other sequence
        @param data: The values to convert.
        @type mask: Numpy array, other sequence, or None
        @param mask: The input mask; nonzero/true entries are treated as MISSING on the conversion paths.
        @rtype: DataColumn
        @return: The new column.
        """

        data, mask = self._checkNumpy(data, mask)
        if isinstance(data, NP.ndarray) and (mask is None or isinstance(mask, NP.ndarray)) and data.dtype == self.dtype:
            # fast path: data already has the right dtype, only flag NaNs
            mask2 = NP("isnan", data)
            if mask is None:
                mask = NP("array", mask2, defs.maskType) * defs.MISSING
            else:
                mask[mask2] = defs.MISSING

        else:
            data, mask = self._checkNonNumpy(data, mask)
            try:
                data = NP("array", data, dtype=self.dtype)
                # mask is handled in the else statement after the except block

            except (ValueError, TypeError):
                # bulk conversion failed: convert item by item so that a
                # single bad value does not spoil the whole column
                data2 = NP("empty", len(data), dtype=self.dtype)
                if mask is None:
                    mask2 = NP("zeros", len(data), dtype=defs.maskType)
                else:
                    mask2 = NP("fromiter", ((defs.VALID if not m else defs.MISSING) for m in mask), dtype=defs.maskType, count=len(mask))

                for i, v in enumerate(data):
                    try:
                        data2[i] = v
                        if mask2[i] == defs.VALID and ((isinstance(v, float) and math.isnan(v)) or (isinstance(v, basestring) and v.upper() == "NAN")):
                            mask2[i] = defs.MISSING
                        if v is None:
                            # force None into the "cannot be stored" branch below
                            raise TypeError
                    except (ValueError, TypeError):
                        data2[i] = defs.PADDING
                        if mask2[i] == defs.VALID:
                            if (isinstance(v, float) and math.isnan(v)) or (isinstance(v, basestring) and v.upper() == "NAN"):
                                mask2[i] = defs.MISSING
                            else:
                                mask2[i] = defs.INVALID

                if not mask2.any():
                    mask2 = None

                data, mask = data2, mask2

            else:
                mask2 = NP("isnan", data)
                if mask is None:
                    # scale by defs.MISSING like the other branches do, so
                    # NaN entries carry the MISSING code rather than a raw 1
                    mask = NP(NP("array", mask2, defs.maskType) * defs.MISSING)
                else:
                    mask = NP(NP("array", NP("logical_or", mask2, NP("fromiter", (m != 0 for m in mask), dtype=NP.dtype(bool), count=len(mask))), defs.maskType) * defs.MISSING)
                if not mask.any():
                    mask = None

        data, mask = self._checkValues(data, mask)
        data, mask = self._checkIntervals(data, mask)
        return DataColumn(self, data, mask)
コード例 #4
0
        def evaluate(self, dataTable, functionTable, performanceTable,
                     arguments):
            """Evaluate this built-in function on boolean arguments.

            Each argument is evaluated first; the results must all be
            boolean (at least two of them, as enforced by
            C{self.allBooleanType}).  The combination itself is
            delegated to C{self.applySkipMissing}; rows still flagged
            in the C{allbad} array afterwards are marked MISSING
            (unless the mask already gives them a non-VALID value).

            @type dataTable: DataTable
            @param dataTable: The input data; its length sets the output length.
            @type functionTable: FunctionTable
            @param functionTable: Passed on to the arguments' C{evaluate}.
            @type performanceTable: PerformanceTable
            @param performanceTable: Used to time this built-in.
            @type arguments: list of expressions
            @param arguments: Objects with an C{evaluate} method, evaluated in order.
            @rtype: DataColumn
            @return: The result of the calculation.
            """

            columns = []
            for argument in arguments:
                columns.append(argument.evaluate(dataTable, functionTable, performanceTable))

            timerLabel = "built-in \"%s\"" % self.name
            performanceTable.begin(timerLabel)

            fieldType = self.allBooleanType(columns, atleast=2)

            numRows = len(dataTable)
            data = NP("zeros", numRows, dtype=fieldType.dtype)
            allbad = NP("ones", numRows, dtype=NP.dtype(bool))

            (data, allbad), mask = self.applySkipMissing((data, allbad), None,
                                                         columns)

            if allbad.any():
                if mask is not None:
                    # leave rows alone that already have a non-VALID mask value
                    NP("logical_and", allbad, NP(mask == defs.VALID), allbad)
                    mask[allbad] = defs.MISSING
                else:
                    mask = allbad * defs.MISSING

            performanceTable.end(timerLabel)
            return DataColumn(fieldType, data, mask)
コード例 #5
0
    def _toDataColumn_dateTime(self, data, mask):
        """Build a DataColumn from C{data} by converting every item with C{self.stringToValue}.

        An item is marked MISSING (and stored as C{defs.PADDING}) if
        the input mask is set for it, if it is a float NaN, or if it
        is the string "NAN" (any case).  An item whose conversion
        raises ValueError or TypeError is stored as C{defs.PADDING}
        and marked INVALID.  The result is then passed through
        C{self._checkValues} and C{self._checkIntervals}.

        @type data: sequence
        @param data: The values to convert.
        @type mask: sequence or None
        @param mask: The input mask; true entries are treated as MISSING.
        @rtype: DataColumn
        @return: The new column.
        """

        data, mask = self._checkNumpy(data, mask, tryToCast=False)
        data, mask = self._checkNonNumpy(data, mask)

        size = len(data)
        converted = NP("empty", size, dtype=self.dtype)
        flags = NP("zeros", size, dtype=defs.maskType)

        for index, item in enumerate(data):
            # same short-circuit order as a single "or" chain
            if mask is not None and mask[index]:
                missing = True
            elif isinstance(item, float) and math.isnan(item):
                missing = True
            else:
                missing = isinstance(item, basestring) and item.upper() == "NAN"

            if missing:
                converted[index] = defs.PADDING
                flags[index] = defs.MISSING
                continue

            try:
                converted[index] = self.stringToValue(item)
            except (ValueError, TypeError):
                converted[index] = defs.PADDING
                flags[index] = defs.INVALID

        # an all-zero mask is represented as None
        if flags.any():
            data, mask = converted, flags
        else:
            data, mask = converted, None

        data, mask = self._checkValues(data, mask)
        data, mask = self._checkIntervals(data, mask)
        return DataColumn(self, data, mask)
コード例 #6
0
ファイル: Aggregate.py プロジェクト: Huskyeder/augustus
    def functionMax(self, dataColumn, whereMask, groupSelection, getstate, setstate):
        """Finds the maximum of rows in a DataColumn, possibly with an SQL where mask and groupField.

        The result is a running maximum: row C{i} holds the largest
        selected value seen up to and including row C{i} (starting
        from the stored state, if any).  Rows for which no maximum is
        known yet are marked INVALID.

        @type dataColumn: DataColumn
        @param dataColumn: The input data column.
        @type whereMask: 1d Numpy array of bool, or None
        @param whereMask: The result of the SQL where selection.
        @type groupSelection: 1d Numpy array of bool, or None.
        @param groupSelection: Rows corresponding to a particular value of the groupField.
        @type getstate: callable function
        @param getstate: Retrieve starting values from the DataTableState.
        @type setstate: callable function
        @param setstate: Store ending values to the DataTableState.
        @rtype: DataColumn
        @return: A column of maximized rows.
        @raise PmmlValidationError: If the input field is neither continuous nor ordinal.
        """

        fieldType = dataColumn.fieldType

        if fieldType.optype not in ("continuous", "ordinal"):
            # the message used to say "min", copied from the minimum aggregate
            raise defs.PmmlValidationError("Aggregate function \"max\" requires a continuous or ordinal input field")

        # rows that take part in the maximum: VALID, and passing both optional filters
        if dataColumn.mask is None:
            selection = NP("ones", len(dataColumn), dtype=NP.dtype(bool))
        else:
            selection = NP(dataColumn.mask == defs.VALID)

        if whereMask is not None:
            NP("logical_and", selection, whereMask, selection)

        if groupSelection is not None:
            NP("logical_and", selection, groupSelection, selection)

        maximum = None
        if getstate is not None:
            startingState = getstate()
            if startingState is not None:
                maximum = startingState

        data = NP("empty", len(dataColumn), dtype=fieldType.dtype)
        mask = NP("zeros", len(dataColumn), dtype=defs.maskType)

        for i, x in enumerate(dataColumn.data):
            if selection[i]:
                if maximum is None or x > maximum:
                    maximum = x
            if maximum is None:
                # nothing selected so far: data[i] stays unset and is masked out
                mask[i] = defs.INVALID
            else:
                data[i] = maximum

        if not mask.any():
            mask = None

        if setstate is not None:
            setstate(maximum)

        return DataColumn(fieldType, data, mask)
コード例 #7
0
ファイル: Aggregate.py プロジェクト: soedjais/augustus
    def functionAverage(self, dataColumn, whereMask, groupSelection, getstate,
                        setstate):
        """Compute the running average of a DataColumn, honouring an optional SQL where mask and groupField.

        Row C{i} of the result is the cumulative sum of the selected
        values divided by the cumulative count of selected rows, both
        taken up to and including row C{i} and both seeded from the
        stored state when there is one.  Rows whose quotient is not
        finite are marked INVALID.

        @type dataColumn: DataColumn
        @param dataColumn: The input data column.
        @type whereMask: 1d Numpy array of bool, or None
        @param whereMask: The result of the SQL where selection.
        @type groupSelection: 1d Numpy array of bool, or None.
        @param groupSelection: Rows corresponding to a particular value of the groupField.
        @type getstate: callable function
        @param getstate: Retrieve starting values from the DataTableState.
        @type setstate: callable function
        @param setstate: Store ending values to the DataTableState.
        @rtype: DataColumn
        @return: A column of averaged rows.
        """

        fieldType = FakeFieldType("double", "continuous")

        if dataColumn.fieldType.dataType not in ("integer", "float", "double"):
            raise defs.PmmlValidationError(
                "Aggregate function \"average\" requires a numeric input field: \"integer\", \"float\", \"double\""
            )

        length = len(dataColumn)

        # per-row weight: 1 where the row is selected, 0 otherwise
        weights = NP("ones", length, dtype=fieldType.dtype)

        validity = None
        if dataColumn.mask is not None:
            validity = NP(dataColumn.mask == defs.VALID)

        for keep in (validity, whereMask, groupSelection):
            if keep is not None:
                NP("logical_and", weights, keep, weights)

        sums = NP("multiply", weights, dataColumn.data)

        if getstate is not None and length > 0:
            previous = getstate()
            if previous is not None:
                # fold the carried-over totals into the first row so that
                # the cumulative sums below continue from them
                previousSum, previousCount = previous
                sums[0] += previousSum
                weights[0] += previousCount

        sums = NP("cumsum", sums)
        weights = NP("cumsum", weights)

        data = NP(sums / weights)
        mask = NP(NP("logical_not", NP("isfinite", data)) * defs.INVALID)
        if not mask.any():
            mask = None

        if setstate is not None and length > 0:
            setstate((sums[-1], weights[-1]))

        return DataColumn(fieldType, data, mask)
コード例 #8
0
ファイル: Aggregate.py プロジェクト: Huskyeder/augustus
    def functionAverage(self, dataColumn, whereMask, groupSelection, getstate, setstate):
        """Compute the running average of a DataColumn, honouring an optional SQL where mask and groupField.

        Row C{i} of the result is the cumulative sum of the selected
        values divided by the cumulative count of selected rows, both
        taken up to and including row C{i} and both seeded from the
        stored state when there is one.  Rows whose quotient is not
        finite are marked INVALID.

        @type dataColumn: DataColumn
        @param dataColumn: The input data column.
        @type whereMask: 1d Numpy array of bool, or None
        @param whereMask: The result of the SQL where selection.
        @type groupSelection: 1d Numpy array of bool, or None.
        @param groupSelection: Rows corresponding to a particular value of the groupField.
        @type getstate: callable function
        @param getstate: Retrieve starting values from the DataTableState.
        @type setstate: callable function
        @param setstate: Store ending values to the DataTableState.
        @rtype: DataColumn
        @return: A column of averaged rows.
        """

        fieldType = FakeFieldType("double", "continuous")

        if dataColumn.fieldType.dataType not in ("integer", "float", "double"):
            raise defs.PmmlValidationError("Aggregate function \"average\" requires a numeric input field: \"integer\", \"float\", \"double\"")

        length = len(dataColumn)

        # per-row weight: 1 where the row is selected, 0 otherwise
        weights = NP("ones", length, dtype=fieldType.dtype)

        validity = None
        if dataColumn.mask is not None:
            validity = NP(dataColumn.mask == defs.VALID)

        for keep in (validity, whereMask, groupSelection):
            if keep is not None:
                NP("logical_and", weights, keep, weights)

        sums = NP("multiply", weights, dataColumn.data)

        if getstate is not None and length > 0:
            previous = getstate()
            if previous is not None:
                # fold the carried-over totals into the first row so that
                # the cumulative sums below continue from them
                previousSum, previousCount = previous
                sums[0] += previousSum
                weights[0] += previousCount

        sums = NP("cumsum", sums)
        weights = NP("cumsum", weights)

        data = NP(sums / weights)
        mask = NP(NP("logical_not", NP("isfinite", data)) * defs.INVALID)
        if not mask.any():
            mask = None

        if setstate is not None and length > 0:
            setstate((sums[-1], weights[-1]))

        return DataColumn(fieldType, data, mask)
コード例 #9
0
ファイル: FieldCastMethods.py プロジェクト: soedjais/augustus
    def applyMapMissingTo(fieldType, data, mask, mapMissingTo, overwrite=False):
        """Substitute a given value for every MISSING entry.

        Unless C{overwrite} is True, the original arrays are left
        untouched and modified copies are returned.  If C{mask} or
        C{mapMissingTo} is None, the inputs are returned unchanged.
        Example use::

            data, mask = dataColumn.data, dataColumn.mask
            data, mask = FieldCastMethods.applyMapMissingTo(dataColumn.fieldType, data, mask, "-999")
            return DataColumn(dataColumn.fieldType, data, mask)

        It can also be used in conjunction with other FieldCastMethods.

        @type fieldType: FieldType
        @param fieldType: The data fieldType (to interpret C{mapMissingTo}).
        @type data: 1d Numpy array
        @param data: The data.
        @type mask: 1d Numpy array of dtype defs.maskType, or None
        @param mask: The mask.
        @type mapMissingTo: string
        @param mapMissingTo: The replacement value, represented as a string (e.g. directly from a PMML attribute).
        @type overwrite: bool
        @param overwrite: If True, unlock the original data and mask and overwrite them in place.
        @rtype: 2-tuple of 1d Numpy arrays
        @return: The new data and mask; the mask is None if no nonzero entries remain.
        @raise PmmlValidationError: If C{mapMissingTo} cannot be converted by C{fieldType.stringToValue}.
        """

        # nothing is missing, or there is nothing to substitute
        if mask is None or mapMissingTo is None:
            return data, mask

        missing = NP(mask == defs.MISSING)
        try:
            replacement = fieldType.stringToValue(mapMissingTo)
        except ValueError as err:
            raise defs.PmmlValidationError("mapMissingTo string \"%s\" cannot be cast as %r: %s" % (mapMissingTo, fieldType, str(err)))

        if not overwrite:
            data = NP("copy", data)
            mask = NP("copy", mask)
        else:
            data.setflags(write=True)
            mask.setflags(write=True)

        data[missing] = replacement
        mask[missing] = defs.VALID

        if mask.any():
            return data, mask
        return data, None
コード例 #10
0
        def evaluate(self, dataTable, functionTable, performanceTable, arguments):
            """Evaluate this two-argument built-in as an element-wise floor division.

            Both arguments are evaluated first.  Rows whose right-hand
            value equals zero are flagged INVALID; that flag array is
            combined with both argument masks by
            C{DataColumn.mapAnyMissingInvalid}.

            @type dataTable: DataTable
            @param dataTable: The input data.
            @type functionTable: FunctionTable
            @param functionTable: Passed on to the arguments' C{evaluate}.
            @type performanceTable: PerformanceTable
            @param performanceTable: Used to time this built-in.
            @type arguments: list of expressions
            @param arguments: Exactly two objects with an C{evaluate} method (numerator, denominator).
            @rtype: DataColumn
            @return: The result of the calculation.
            """

            columns = []
            for argument in arguments:
                columns.append(argument.evaluate(dataTable, functionTable, performanceTable))

            timerLabel = "built-in \"%s\"" % self.name
            performanceTable.begin(timerLabel)

            fieldType = self.fieldTypeFromSignature(columns)
            left, right = columns

            divideByZero = NP(NP(right.data == 0.0) * defs.INVALID)
            if divideByZero.any():
                masks = [divideByZero, left.mask, right.mask]
            else:
                masks = [None, left.mask, right.mask]
            mask = DataColumn.mapAnyMissingInvalid(masks)

            quotient = NP("floor_divide", left.data, right.data)
            result = DataColumn(fieldType, quotient, mask)

            performanceTable.end(timerLabel)
            return result
コード例 #11
0
    def _checkIntervals(self, data, mask):
        """Mark values that violate a margin of any of C{self.intervals} as INVALID.

        Each interval is a mapping with a "closure" entry and optional
        "leftMargin" / "rightMargin" strings, which are converted with
        C{self.stringToValue}.  A value is flagged when it lies beyond
        a margin (or on an open one) of any interval.  When a mask is
        given, it is updated in place and only entries that are
        currently VALID are changed.

        @type data: 1d Numpy array
        @param data: The values to check.
        @type mask: 1d Numpy array of C{defs.maskType}, or None
        @param mask: The existing mask.
        @rtype: 2-tuple
        @return: The data (unchanged) and the resulting mask.
        @raise PmmlValidationError: If a margin string cannot be converted.
        """

        allIntervals = self.intervals
        if len(allIntervals) == 0:
            return data, mask

        # start with nothing flagged; each margin test can only add flags
        outside = NP("zeros", len(data), dtype=NP.dtype(bool))

        for interval in allIntervals:
            closure = interval["closure"]
            lower = interval.get("leftMargin")
            upper = interval.get("rightMargin")

            if lower is not None:
                try:
                    lower = self.stringToValue(lower)
                except ValueError:
                    raise defs.PmmlValidationError("Improper value in Interval leftMargin specification: \"%s\"" % lower)

                if closure in ("openOpen", "openClosed"):
                    outside[NP(data <= lower)] = True
                elif closure in ("closedClosed", "closedOpen"):
                    outside[NP(data < lower)] = True

            if upper is not None:
                try:
                    upper = self.stringToValue(upper)
                except ValueError:
                    raise defs.PmmlValidationError("Improper value in Interval rightMargin specification: \"%s\"" % upper)

                if closure in ("closedOpen", "openOpen"):
                    outside[NP(data >= upper)] = True
                elif closure in ("closedClosed", "openClosed"):
                    outside[NP(data > upper)] = True

        if not outside.any():
            return data, mask

        if mask is not None:
            # keep existing non-VALID mask values (e.g. MISSING) as they are
            NP("logical_and", outside, NP(mask == defs.VALID), outside)
            mask[outside] = defs.INVALID
            return data, mask

        return data, NP(outside * defs.INVALID)
コード例 #12
0
    def _toDataColumn_object(self, data, mask):
        """Build a DataColumn of C{self.dtype} from arbitrary objects.

        If C{data} is already a Numpy array of C{self.dtype} (and
        C{mask} is None or a Numpy array) it is used as it is.
        Otherwise C{data} is converted to an array and a new mask is
        built in which float NaN items, and items with a nonzero input
        mask, are MISSING.  The result is then passed through
        C{self._checkValues} and C{self._checkIntervals}.

        @type data: Numpy array or other sequence
        @param data: The values to convert.
        @type mask: Numpy array, other sequence, or None
        @param mask: The input mask.
        @rtype: DataColumn
        @return: The new column.
        """

        data, mask = self._checkNumpy(data, mask)

        alreadyNumpy = isinstance(data, NP.ndarray) and (mask is None or isinstance(mask, NP.ndarray)) and data.dtype == self.dtype

        if not alreadyNumpy:
            data, mask = self._checkNonNumpy(data, mask)
            data = NP.array(data, dtype=self.dtype)

            def isNaN(value):
                return isinstance(value, float) and math.isnan(value)

            if mask is None:
                flags = (defs.MISSING if isNaN(d) else defs.VALID for d in data)
                count = len(data)
            else:
                flags = (defs.MISSING if (m != 0 or isNaN(data[i])) else defs.VALID for i, m in enumerate(mask))
                count = len(mask)

            mask = NP("fromiter", flags, dtype=defs.maskType, count=count)
            if not mask.any():
                mask = None

        data, mask = self._checkValues(data, mask)
        data, mask = self._checkIntervals(data, mask)
        return DataColumn(self, data, mask)
コード例 #13
0
        def evaluate(self, dataTable, functionTable, performanceTable, arguments):
            """Evaluate this built-in function on boolean arguments.

            Each argument is evaluated first; the results must all be
            boolean (at least two of them, as enforced by
            C{self.allBooleanType}).  The combination itself is
            delegated to C{self.applySkipMissing}; rows still flagged
            in the C{allbad} array afterwards are marked MISSING
            (unless the mask already gives them a non-VALID value).

            @type dataTable: DataTable
            @param dataTable: The input data; its length sets the output length.
            @type functionTable: FunctionTable
            @param functionTable: Passed on to the arguments' C{evaluate}.
            @type performanceTable: PerformanceTable
            @param performanceTable: Used to time this built-in.
            @type arguments: list of expressions
            @param arguments: Objects with an C{evaluate} method, evaluated in order.
            @rtype: DataColumn
            @return: The result of the calculation.
            """

            columns = []
            for argument in arguments:
                columns.append(argument.evaluate(dataTable, functionTable, performanceTable))

            timerLabel = "built-in \"%s\"" % self.name
            performanceTable.begin(timerLabel)

            fieldType = self.allBooleanType(columns, atleast=2)

            numRows = len(dataTable)
            data = NP("zeros", numRows, dtype=fieldType.dtype)
            allbad = NP("ones", numRows, dtype=NP.dtype(bool))

            (data, allbad), mask = self.applySkipMissing((data, allbad), None, columns)

            if allbad.any():
                if mask is not None:
                    # leave rows alone that already have a non-VALID mask value
                    NP("logical_and", allbad, NP(mask == defs.VALID), allbad)
                    mask[allbad] = defs.MISSING
                else:
                    mask = allbad * defs.MISSING

            performanceTable.end(timerLabel)
            return DataColumn(fieldType, data, mask)
コード例 #14
0
    def _toDataColumn_internal(self, data, mask):
        """Build a DataColumn by converting every item with C{self.stringToValue}.

        A bulk conversion is tried first.  If it raises ValueError or
        TypeError, the items are converted one by one: float NaN items
        are stored as C{defs.PADDING} and marked MISSING, and items
        whose conversion fails are stored as C{defs.PADDING} and
        marked INVALID.

        Unlike the other C{_toDataColumn} variants, values and
        intervals are not checked here.

        @type data: sequence
        @param data: The values to convert.
        @type mask: sequence or None
        @param mask: The input mask; true entries are treated as MISSING in the item-by-item path.
        @rtype: DataColumn
        @return: The new column.
        """

        data, mask = self._checkNumpy(data, mask, tryToCast=False)
        data, mask = self._checkNonNumpy(data, mask)

        try:
            data = NP("fromiter", (self.stringToValue(d) for d in data), dtype=self.dtype, count=len(data))
            # mask is handled in the else statement after the except block

        except (ValueError, TypeError):
            # TypeError is caught as well, matching the per-item handler
            # below: a value that cannot be converted at all should be
            # marked INVALID rather than abort the whole column
            data2 = NP("empty", len(data), dtype=self.dtype)
            if mask is None:
                mask2 = NP("zeros", len(data), dtype=defs.maskType)
            else:
                mask2 = NP("fromiter", (defs.VALID if not m else defs.MISSING for m in mask), dtype=defs.maskType, count=len(mask))

            for i, v in enumerate(data):
                if isinstance(v, float) and math.isnan(v):
                    data2[i] = defs.PADDING
                    mask2[i] = defs.MISSING
                else:
                    try:
                        data2[i] = self.stringToValue(v)
                    except (ValueError, TypeError):
                        data2[i] = defs.PADDING
                        mask2[i] = defs.INVALID

            if not mask2.any():
                mask2 = None

            data, mask = data2, mask2

        else:
            if mask is not None and not isinstance(mask, NP.ndarray):
                mask = NP("array", mask, dtype=defs.maskType)

        # this is the only _toDataColumn that doesn't check values and intervals because these were checked in _setup for categorical and ordinal strings

        return DataColumn(self, data, mask)
コード例 #15
0
        def evaluate(self, dataTable, functionTable, performanceTable,
                     arguments):
            """Evaluate this two-argument built-in as an element-wise floor division.

            Both arguments are evaluated first.  Rows whose right-hand
            value equals zero are flagged INVALID; that flag array is
            combined with both argument masks by
            C{DataColumn.mapAnyMissingInvalid}.

            @type dataTable: DataTable
            @param dataTable: The input data.
            @type functionTable: FunctionTable
            @param functionTable: Passed on to the arguments' C{evaluate}.
            @type performanceTable: PerformanceTable
            @param performanceTable: Used to time this built-in.
            @type arguments: list of expressions
            @param arguments: Exactly two objects with an C{evaluate} method (numerator, denominator).
            @rtype: DataColumn
            @return: The result of the calculation.
            """

            columns = []
            for argument in arguments:
                columns.append(
                    argument.evaluate(dataTable, functionTable,
                                      performanceTable))

            timerLabel = "built-in \"%s\"" % self.name
            performanceTable.begin(timerLabel)

            fieldType = self.fieldTypeFromSignature(columns)
            left, right = columns

            divideByZero = NP(NP(right.data == 0.0) * defs.INVALID)
            if divideByZero.any():
                masks = [divideByZero, left.mask, right.mask]
            else:
                masks = [None, left.mask, right.mask]
            mask = DataColumn.mapAnyMissingInvalid(masks)

            quotient = NP("floor_divide", left.data, right.data)
            result = DataColumn(fieldType, quotient, mask)

            performanceTable.end(timerLabel)
            return result
コード例 #16
0
ファイル: MiningModel.py プロジェクト: soedjais/augustus
    def _selectFirst(self, dataTable, functionTable, performanceTable,
                     segmentation):
        """Used by C{calculateScore}.

        Walks the Segments in order.  Each row is scored by the first
        Segment whose predicate selects it: the predicate result is
        restricted to rows not yet filled, the segment's model is run
        on the corresponding sub-table, and its score and output
        fields are copied into full-length arrays.

        @type dataTable: DataTable
        @param dataTable: The input data; its C{output} is extended with the sub-models' output fields.
        @type functionTable: FunctionTable
        @param functionTable: Passed on to predicates and sub-models.
        @type performanceTable: PerformanceTable
        @param performanceTable: Used to time this step (paused while predicates and sub-models run).
        @type segmentation: PmmlBinding
        @param segmentation: The element whose "Segment" children are evaluated.
        @rtype: dict
        @return: Maps None to the score DataColumn and, if C{self.name} is not None, "segment" to a DataColumn of segment ids.
        """

        performanceTable.begin("Segmentation selectFirst")

        scoresData = NP("empty", len(dataTable), dtype=NP.dtype(object))
        scoresMask = NP("zeros", len(dataTable), dtype=defs.maskType)
        unfilled = NP("ones", len(dataTable), dtype=NP.dtype(bool))
        segments = NP("empty", len(dataTable), dtype=NP.dtype(object))

        newOutputData = []
        for segment in segmentation.childrenOfTag("Segment", iterator=True):
            performanceTable.pause("Segmentation selectFirst")
            selection = segment.childOfClass(PmmlPredicate).evaluate(
                dataTable, functionTable, performanceTable)
            performanceTable.unpause("Segmentation selectFirst")
            # only rows that no earlier segment has claimed
            NP("logical_and", selection, unfilled, selection)
            if not selection.any():
                continue

            subTable = dataTable.subTable(selection)
            subModel = segment.childOfClass(PmmlModel)
            performanceTable.pause("Segmentation selectFirst")

            subModel.calculate(subTable, functionTable, performanceTable)
            performanceTable.unpause("Segmentation selectFirst")

            scoresData[selection] = subTable.score.data
            if subTable.score.mask is not None:
                scoresMask[selection] = subTable.score.mask
            else:
                scoresMask[selection] = defs.VALID

            segmentName = segment.get("id")
            if segmentName is not None:
                segments[selection] = segmentName

            for fieldName, dataColumn in subTable.output.items():
                if fieldName not in dataTable.output:
                    # first time this output field appears: create a
                    # full-length column that is MISSING everywhere
                    # except in the rows filled by this segment
                    data = NP("empty",
                              len(dataTable),
                              dtype=dataColumn.fieldType.dtype)
                    data[selection] = dataColumn.data

                    mask = NP(
                        NP("ones", len(dataTable), dtype=defs.maskType) *
                        defs.MISSING)
                    if dataColumn.mask is None:
                        mask[selection] = defs.VALID
                    else:
                        mask[selection] = dataColumn.mask

                    newDataColumn = DataColumn(dataColumn.fieldType, data,
                                               mask)
                    # kept unlocked so later segments can fill in their rows
                    newDataColumn._unlock()
                    dataTable.output[fieldName] = newDataColumn
                    newOutputData.append(newDataColumn)

                else:
                    newDataColumn = dataTable.output[fieldName]

                    newDataColumn.data[selection] = dataColumn.data
                    if dataColumn.mask is None:
                        newDataColumn.mask[selection] = defs.VALID
                    else:
                        newDataColumn.mask[selection] = dataColumn.mask

            # Remove the rows just filled.  "unfilled -= selection" is a
            # boolean subtraction, which NumPy >= 1.13 rejects with a
            # TypeError; since selection is a subset of unfilled here,
            # "unfilled and not selection" gives the same result.
            NP("logical_and", unfilled, NP("logical_not", selection), unfilled)
            if not unfilled.any():
                break

        for newDataColumn in newOutputData:
            if not newDataColumn.mask.any():
                newDataColumn._mask = None
            newDataColumn._lock()

        if not scoresMask.any():
            scoresMask = None

        scores = DataColumn(self.scoreType, scoresData, scoresMask)

        performanceTable.end("Segmentation selectFirst")

        if self.name is None:
            return {None: scores}
        else:
            return {
                None: scores,
                "segment": DataColumn(self.scoreTypeSegment, segments, None)
            }
コード例 #17
0
ファイル: MiningModel.py プロジェクト: soedjais/augustus
    def _selectAllMedianMajority(self, dataTable, functionTable,
                                 performanceTable, segmentation, which):
        """Used by C{calculateScore}."""

        if which is self.SELECT_ALL:
            performanceLabel = "Segmentation selectAll"
        elif which is self.MEDIAN:
            performanceLabel = "Segmentation median"
        elif which is self.MAJORITY_VOTE:
            performanceLabel = "Segmentation majorityVote"
        elif which is self.WEIGHTED_MAJORITY_VOTE:
            performanceLabel = "Segmentation weightedMajorityVote"
        performanceTable.begin(performanceLabel)

        scores = [[] for x in xrange(len(dataTable))]
        if which is self.SELECT_ALL:
            segments = [[] for x in xrange(len(dataTable))]

        newOutputData = {}
        for segment in segmentation.childrenOfTag("Segment", iterator=True):
            performanceTable.pause(performanceLabel)
            selection = segment.childOfClass(PmmlPredicate).evaluate(
                dataTable, functionTable, performanceTable)
            performanceTable.unpause(performanceLabel)
            if not selection.any():
                continue

            segmentName = segment.get("id")
            indexes = NP("nonzero", selection)[0]

            subTable = dataTable.subTable(selection)
            subModel = segment.childOfClass(PmmlModel)

            performanceTable.pause(performanceLabel)
            subModel.calculate(subTable, functionTable, performanceTable)
            performanceTable.unpause(performanceLabel)

            if which is self.MEDIAN and subTable.score.fieldType.dataType in (
                    "string", "boolean", "object"):
                raise defs.PmmlValidationError(
                    "Segmentation with multipleModelMethod=\"median\" cannot be applied to models that produce dataType \"%s\""
                    % subTable.score.fieldType.dataType)

            scoreData = subTable.score.data
            scoreMask = subTable.score.mask
            indexesUsed = indexes
            if which is self.SELECT_ALL:
                for subIndex, index in enumerate(indexes):
                    if scoreMask is None or scoreMask[subIndex] == defs.VALID:
                        scores[index].append(scoreData[subIndex])
                        segments[index].append(segmentName)

            elif which is self.MEDIAN:
                for subIndex, index in enumerate(indexes):
                    if scoreMask is None or scoreMask[subIndex] == defs.VALID:
                        scores[index].append(scoreData[subIndex])

            elif which in (self.MAJORITY_VOTE, self.WEIGHTED_MAJORITY_VOTE):
                if which is self.MAJORITY_VOTE:
                    weight = 1.0
                else:
                    weight = float(segment.get("weight", 1.0))
                for subIndex, index in enumerate(indexes):
                    if scoreMask is None or scoreMask[subIndex] == defs.VALID:
                        newValue = scoreData[subIndex]
                        score = scores[index]
                        found = False
                        for pair in score:
                            if pair[0] == newValue:
                                pair[1] += weight
                                found = True
                                break
                        if not found:
                            score.append([newValue, weight])

            if which is self.SELECT_ALL:
                for fieldName, dataColumn in subTable.output.items():
                    newData = newOutputData.get(fieldName)
                    if newData is None:
                        newData = [[] for x in xrange(len(dataTable))]
                        newOutputData[fieldName] = newData

                    dataColumnData = dataColumn.data
                    dataColumnMask = dataColumn.mask
                    for subIndex, index in enumerate(indexes):
                        if scoreMask is None or scoreMask[
                                subIndex] == defs.VALID:
                            if dataColumnMask is None or dataColumnMask[
                                    subIndex] == defs.VALID:
                                newData[index].append(dataColumnData[subIndex])
                            else:
                                newData[index].append(None)

        if which is self.SELECT_ALL:
            for fieldName, newData in newOutputData.items():
                finalNewData = NP("empty",
                                  len(dataTable),
                                  dtype=NP.dtype(object))
                for index, newDatum in enumerate(newData):
                    finalNewData[index] = tuple(newDatum)
                dataTable.output[fieldName] = DataColumn(
                    self.scoreType, finalNewData, None)

            finalScoresData = NP("empty",
                                 len(dataTable),
                                 dtype=NP.dtype(object))
            for index, score in enumerate(scores):
                finalScoresData[index] = tuple(score)
            finalScores = DataColumn(self.scoreType, finalScoresData, None)

            if self.name is None:
                performanceTable.end(performanceLabel)
                return {None: finalScores}
            else:
                finalSegmentsData = NP("empty",
                                       len(dataTable),
                                       dtype=NP.dtype(object))
                for index, segment in enumerate(segments):
                    finalSegmentsData[index] = tuple(segment)

                performanceTable.end(performanceLabel)
                return {
                    None:
                    finalScores,
                    "segment":
                    DataColumn(self.scoreTypeSegment, finalSegmentsData, None)
                }

        elif which is self.MEDIAN:
            finalScoresData = NP("empty",
                                 len(dataTable),
                                 dtype=NP.dtype(object))
            finalScoresMask = NP("empty", len(dataTable), dtype=defs.maskType)
            for index, score in enumerate(scores):
                if len(score) > 0:
                    finalScoresData[index] = NP("median", score)
                    finalScoresMask[index] = defs.VALID
                else:
                    finalScoresMask[index] = defs.INVALID

            if not finalScoresMask.any():
                finalScoresMask = None
            finalScores = DataColumn(self.scoreType, finalScoresData,
                                     finalScoresMask)

            performanceTable.end(performanceLabel)
            return {None: finalScores}

        elif which in (self.MAJORITY_VOTE, self.WEIGHTED_MAJORITY_VOTE):
            finalScoresData = NP("empty",
                                 len(dataTable),
                                 dtype=NP.dtype(object))
            finalScoresMask = NP("empty", len(dataTable), dtype=defs.maskType)
            cardinality = NP("empty",
                             len(dataTable),
                             dtype=self.scoreTypeCardinality.dtype)

            for index, score in enumerate(scores):
                bestN, bestValue = None, None
                for value, N in score:
                    if bestN is None or N > bestN:
                        bestN = N
                        bestValue = value
                if bestN is not None:
                    finalScoresData[index] = bestValue
                    finalScoresMask[index] = defs.VALID
                    cardinality[index] = bestN
                else:
                    finalScoresMask[index] = defs.INVALID
                    cardinality[index] = 0

            if not finalScoresMask.any():
                finalScoresMask = None
            finalScores = DataColumn(self.scoreType, finalScoresData,
                                     finalScoresMask)

            if self.name is None:
                performanceTable.end(performanceLabel)
                return {None: finalScores}
            else:
                finalCardinality = DataColumn(self.scoreTypeCardinality,
                                              cardinality, None)

                performanceTable.end(performanceLabel)
                return {None: finalScores, "cardinality": finalCardinality}
コード例 #18
0
ファイル: MiningModel.py プロジェクト: soedjais/augustus
    def _sumAverageWeighted(self, dataTable, functionTable, performanceTable,
                            segmentation, which):
        """Combine the scores of all matching Segments by sum, average,
        or weighted average.

        Used by C{calculateScore}.

        @type dataTable: DataTable
        @param dataTable: The rows to score; each Segment's predicate selects the rows its model is applied to.
        @type functionTable: FunctionTable
        @param functionTable: A table of functions, handed to the predicates and sub-models.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @type segmentation: PMML element
        @param segmentation: The element whose "Segment" children are combined.
        @type which: singleton Python object
        @param which: One of C{self.SUM}, C{self.AVERAGE}, or C{self.WEIGHTED_AVERAGE}.
        @rtype: dict
        @return: A dictionary whose single key, None, maps to the combined score DataColumn.
        @raise PmmlValidationError: If a sub-model produces a "string", "boolean", or "object" score.
        """

        averaging = which is not self.SUM

        if not averaging:
            performanceLabel = "Segmentation sum"
        elif which is self.AVERAGE:
            performanceLabel = "Segmentation average"
        elif which is self.WEIGHTED_AVERAGE:
            performanceLabel = "Segmentation weightedAverage"
        performanceTable.begin(performanceLabel)

        numRows = len(dataTable)
        scoresData = NP("zeros", numRows, dtype=NP.dtype(object))
        if averaging:
            # running total of weights (or counts) contributing to each row
            denominator = NP("zeros", numRows, dtype=NP.dtype(float))
        invalid = NP("zeros", numRows, dtype=NP.dtype(bool))

        for segment in segmentation.childrenOfTag("Segment", iterator=True):
            # this label is paused while the predicate is being evaluated
            performanceTable.pause(performanceLabel)
            predicate = segment.childOfClass(PmmlPredicate)
            selection = predicate.evaluate(dataTable, functionTable,
                                           performanceTable)
            performanceTable.unpause(performanceLabel)
            if not selection.any():
                continue

            subTable = dataTable.subTable(selection)
            subModel = segment.childOfClass(PmmlModel)
            # ...and paused again while the segment's own model runs
            performanceTable.pause(performanceLabel)
            subModel.calculate(subTable, functionTable, performanceTable)
            performanceTable.unpause(performanceLabel)

            subScore = subTable.score
            producedType = subScore.fieldType.dataType
            if producedType in ("string", "boolean", "object"):
                method = self.childOfTag("Segmentation").get(
                    "multipleModelMethod")
                raise defs.PmmlValidationError(
                    "Segmentation with multipleModelMethod=\"%s\" cannot be applied to models that produce dataType \"%s\""
                    % (method, producedType))

            subMask = subScore.mask

            # ignore invalid in matches (like the built-in "+" and "avg" Apply functions)
            # NOTE(review): this combines selection with the sub-table's mask
            # elementwise, so it presumes both have the same length -- confirm.
            if subMask is not None:
                NP("logical_and", selection, NP(subMask == defs.VALID),
                   selection)

            subData = subScore.data
            if not averaging:
                scoresData[selection] += subData
            if which is self.AVERAGE:
                scoresData[selection] += subData
                denominator[selection] += 1.0
            elif which is self.WEIGHTED_AVERAGE:
                weight = float(segment.get("weight", 1.0))
                scoresData[selection] += (subData * weight)
                denominator[selection] += weight

            if subMask is not None:
                invalid[selection] = NP("logical_or", invalid[selection],
                                        NP(subMask != defs.VALID))

        if averaging:
            # rows that received no contribution cannot be averaged
            NP("logical_or", invalid, NP(denominator == 0.0), invalid)
            valid = NP("logical_not", invalid)
            scoresData[valid] /= denominator[valid]

        scoresMask = None
        if invalid.any():
            scoresMask = NP(
                NP("array", invalid, dtype=defs.maskType) * defs.INVALID)

        scores = DataColumn(self.scoreType, scoresData, scoresMask)

        performanceTable.end(performanceLabel)
        return {None: scores}
コード例 #19
0
ファイル: OutputField.py プロジェクト: Huskyeder/augustus
    def format(self, subTable, functionTable, performanceTable, score):
        """Build the DataColumn for this output field, selected by its "feature" attribute.

        With no "feature", the existing field named by this element's
        "name" is returned.  Otherwise the column is taken from
        C{score}, rendered to strings, computed from a child
        expression, or mapped onto Decision elements, and is also
        stored in C{subTable.fields} under "displayName" (falling back
        to "name").

        @type subTable: DataTable
        @param subTable: The DataTable associated with this local lexical scope.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @type score: dict
        @param score: Dictionary mapping PMML score "feature" strings to DataColumns.  This dictionary always contains a None key, which is the basic feature ("predictedValue").
        @rtype: DataColumn
        @return: The output that would go into an output field of a DataTable.
        @raise PmmlValidationError: If a required child element is missing or the feature is not available in C{score}.
        """

        performanceTable.begin("OutputField")

        feature = self.get("feature")
        if feature is None:
            result = subTable.fields[self["name"]]

        elif feature == "predictedValue":
            result = score[None]

        elif feature == "predictedDisplayValue":
            predicted = score[None]
            render = predicted.fieldType.valueToString
            displayed = NP("empty", len(subTable), dtype=NP.dtype(object))
            for position, rawValue in enumerate(predicted.data):
                displayed[position] = render(rawValue)
            result = DataColumn(FakeFieldType("string", "continuous"), displayed, None)

        elif feature == "transformedValue":
            expression = self.childOfClass(PmmlExpression)
            if expression is None:
                raise defs.PmmlValidationError("OutputField with feature \"transformedValue\" requires an EXPRESSION")

            # the expression is timed separately from this label
            performanceTable.pause("OutputField")
            result = expression.evaluate(subTable, functionTable, performanceTable)
            performanceTable.unpause("OutputField")

        elif feature == "decision":
            decisions = self.childOfTag("Decisions")
            if decisions is None:
                raise defs.PmmlValidationError("OutputField with feature \"decision\" requires a Decisions block")

            performanceTable.pause("OutputField")
            evaluated = self.childOfClass(PmmlExpression).evaluate(subTable, functionTable, performanceTable)
            performanceTable.unpause("OutputField")

            valid = None if evaluated.mask is None else NP(evaluated.mask == defs.VALID)

            objectType = FakeFieldType("object", "any")
            chosen = NP("empty", len(subTable), dtype=objectType.dtype)
            # every row starts out MISSING until some Decision matches it
            chosenMask = NP(NP("ones", len(subTable), dtype=defs.maskType) * defs.MISSING)

            for decision in decisions.childrenOfTag("Decision"):
                target = evaluated.fieldType.stringToValue(decision["value"])

                matches = NP(evaluated.data == target)
                if valid is not None:
                    NP("logical_and", matches, valid, matches)

                # store the Decision element itself in each matching row
                for row in xrange(len(chosen)):
                    if matches[row]:
                        chosen[row] = decision

                chosenMask[matches] = defs.VALID

            if not chosenMask.any():
                chosenMask = None

            result = DataColumn(objectType, chosen, chosenMask)

        elif feature in score:
            result = score[feature]

        else:
            parent = self.getparent()
            owner = parent.getparent() if parent is not None else None

            if owner is None:
                modelName = "(orphaned OutputField; no parent model)"
            else:
                modelName = owner.t

            raise defs.PmmlValidationError("Models of type %s do not produce \"%s\" features (or at least, it is not yet implemented by Augustus)" % (modelName, feature))

        # cast to the declared dataType/optype, except for the two
        # features whose column type is fixed by construction above
        currentType = result.fieldType
        dataType = self.get("dataType", currentType.dataType)
        optype = self.get("optype", currentType.optype)
        typeChanged = dataType != currentType.dataType or optype != currentType.optype
        if typeChanged and feature not in ("predictedDisplayValue", "decision"):
            result = FieldCastMethods.cast(FakeFieldType(dataType, optype), result)

        if feature is not None:
            subTable.fields[self.get("displayName", self["name"])] = result

        performanceTable.end("OutputField")
        return result
コード例 #20
0
ファイル: MiningModel.py プロジェクト: Huskyeder/augustus
    def _sumAverageWeighted(self, dataTable, functionTable, performanceTable, segmentation, which):
        """Combine the scores of all matching Segments by sum, average,
        or weighted average.

        Used by C{calculateScore}.

        @type dataTable: DataTable
        @param dataTable: The rows to score; each Segment's predicate selects the rows its model is applied to.
        @type functionTable: FunctionTable
        @param functionTable: A table of functions, handed to the predicates and sub-models.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @type segmentation: PMML element
        @param segmentation: The element whose "Segment" children are combined.
        @type which: singleton Python object
        @param which: One of C{self.SUM}, C{self.AVERAGE}, or C{self.WEIGHTED_AVERAGE}.
        @rtype: dict
        @return: A dictionary whose single key, None, maps to the combined score DataColumn.
        @raise PmmlValidationError: If a sub-model produces a "string", "boolean", or "object" score.
        """

        needsDenominator = which is not self.SUM

        if not needsDenominator:
            performanceLabel = "Segmentation sum"
        elif which is self.AVERAGE:
            performanceLabel = "Segmentation average"
        elif which is self.WEIGHTED_AVERAGE:
            performanceLabel = "Segmentation weightedAverage"
        performanceTable.begin(performanceLabel)

        rowCount = len(dataTable)
        scoresData = NP("zeros", rowCount, dtype=NP.dtype(object))
        if needsDenominator:
            # running total of weights (or counts) contributing to each row
            denominator = NP("zeros", rowCount, dtype=NP.dtype(float))
        invalid = NP("zeros", rowCount, dtype=NP.dtype(bool))

        for segment in segmentation.childrenOfTag("Segment", iterator=True):
            # this label is paused while the predicate is being evaluated
            performanceTable.pause(performanceLabel)
            predicate = segment.childOfClass(PmmlPredicate)
            selection = predicate.evaluate(dataTable, functionTable, performanceTable)
            performanceTable.unpause(performanceLabel)
            if not selection.any():
                continue

            subTable = dataTable.subTable(selection)
            subModel = segment.childOfClass(PmmlModel)
            # ...and paused again while the segment's own model runs
            performanceTable.pause(performanceLabel)
            subModel.calculate(subTable, functionTable, performanceTable)
            performanceTable.unpause(performanceLabel)

            segmentScore = subTable.score
            producedType = segmentScore.fieldType.dataType
            if producedType in ("string", "boolean", "object"):
                method = self.childOfTag("Segmentation").get("multipleModelMethod")
                raise defs.PmmlValidationError("Segmentation with multipleModelMethod=\"%s\" cannot be applied to models that produce dataType \"%s\"" % (method, producedType))

            segmentMask = segmentScore.mask

            # ignore invalid in matches (like the built-in "+" and "avg" Apply functions)
            # NOTE(review): this combines selection with the sub-table's mask
            # elementwise, so it presumes both have the same length -- confirm.
            if segmentMask is not None:
                NP("logical_and", selection, NP(segmentMask == defs.VALID), selection)

            segmentData = segmentScore.data
            if not needsDenominator:
                scoresData[selection] += segmentData
            if which is self.AVERAGE:
                scoresData[selection] += segmentData
                denominator[selection] += 1.0
            elif which is self.WEIGHTED_AVERAGE:
                weight = float(segment.get("weight", 1.0))
                scoresData[selection] += (segmentData * weight)
                denominator[selection] += weight

            if segmentMask is not None:
                invalid[selection] = NP("logical_or", invalid[selection], NP(segmentMask != defs.VALID))

        if needsDenominator:
            # rows that received no contribution cannot be averaged
            NP("logical_or", invalid, NP(denominator == 0.0), invalid)
            valid = NP("logical_not", invalid)
            scoresData[valid] /= denominator[valid]

        scoresMask = None
        if invalid.any():
            scoresMask = NP(NP("array", invalid, dtype=defs.maskType) * defs.INVALID)

        scores = DataColumn(self.scoreType, scoresData, scoresMask)

        performanceTable.end(performanceLabel)
        return {None: scores}
コード例 #21
0
ファイル: Aggregate.py プロジェクト: soedjais/augustus
    def functionMax(self, dataColumn, whereMask, groupSelection, getstate,
                    setstate):
        """Finds the running maximum of rows in a DataColumn, possibly with an SQL where mask and groupField.

        Row C{i} of the result holds the largest selected value seen in
        rows 0 through C{i}, seeded with the value returned by
        C{getstate} (if any).  Rows that precede the first available
        value are marked INVALID.

        @type dataColumn: DataColumn
        @param dataColumn: The input data column.
        @type whereMask: 1d Numpy array of bool, or None
        @param whereMask: The result of the SQL where selection.
        @type groupSelection: 1d Numpy array of bool, or None.
        @param groupSelection: Rows corresponding to a particular value of the groupField.
        @type getstate: callable function, or None
        @param getstate: Retrieve starting values from the DataTableState.
        @type setstate: callable function, or None
        @param setstate: Store ending values to the DataTableState.
        @rtype: DataColumn
        @return: A column of maximized rows.
        @raise PmmlValidationError: If the input field is neither continuous nor ordinal.
        """

        fieldType = dataColumn.fieldType

        if fieldType.optype not in ("continuous", "ordinal"):
            # the message must name this aggregate ("max"), not its sibling "min"
            raise defs.PmmlValidationError(
                "Aggregate function \"max\" requires a continuous or ordinal input field"
            )

        # a row participates only if it is VALID and passes both optional filters
        if dataColumn.mask is None:
            selection = NP("ones", len(dataColumn), dtype=NP.dtype(bool))
        else:
            selection = NP(dataColumn.mask == defs.VALID)

        if whereMask is not None:
            NP("logical_and", selection, whereMask, selection)

        if groupSelection is not None:
            NP("logical_and", selection, groupSelection, selection)

        # resume from the maximum stored by a previous call, if there is one
        maximum = None
        if getstate is not None:
            startingState = getstate()
            if startingState is not None:
                maximum = startingState

        data = NP("empty", len(dataColumn), dtype=fieldType.dtype)
        mask = NP("zeros", len(dataColumn), dtype=defs.maskType)

        for i, x in enumerate(dataColumn.data):
            if selection[i]:
                if maximum is None or x > maximum:
                    maximum = x
            if maximum is None:
                # no value seen yet: data[i] stays uninitialized and is masked out
                mask[i] = defs.INVALID
            else:
                data[i] = maximum

        # an all-zero mask is represented as None
        if not mask.any():
            mask = None

        if setstate is not None:
            setstate(maximum)

        return DataColumn(fieldType, data, mask)
コード例 #22
0
ファイル: MiningModel.py プロジェクト: Huskyeder/augustus
    def _selectAllMedianMajority(self, dataTable, functionTable, performanceTable, segmentation, which):
        """Combine the scores of all matching Segments by selectAll,
        median, majorityVote, or weightedMajorityVote.

        Used by C{calculateScore}.

        For C{self.SELECT_ALL}, C{dataTable.output} is also updated:
        each output field produced by a sub-model is replaced by a
        column of tuples collecting that field's values across the
        matching segments (None where the sub-model's value is not
        valid).

        @type dataTable: DataTable
        @param dataTable: The rows to score; each Segment's predicate selects the rows its model is applied to.
        @type functionTable: FunctionTable
        @param functionTable: A table of functions, handed to the predicates and sub-models.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @type segmentation: PMML element
        @param segmentation: The element whose "Segment" children are combined.
        @type which: singleton Python object
        @param which: One of C{self.SELECT_ALL}, C{self.MEDIAN}, C{self.MAJORITY_VOTE}, or C{self.WEIGHTED_MAJORITY_VOTE}.
        @rtype: dict
        @return: A dictionary mapping None to the combined score DataColumn.  When C{self.name} is not None, selectAll adds a "segment" column (tuples of segment ids) and the majority votes add a "cardinality" column (the winning weight).
        @raise PmmlValidationError: If the method is median and a sub-model produces a "string", "boolean", or "object" score.
        """

        if which is self.SELECT_ALL:
            performanceLabel = "Segmentation selectAll"
        elif which is self.MEDIAN:
            performanceLabel = "Segmentation median"
        elif which is self.MAJORITY_VOTE:
            performanceLabel = "Segmentation majorityVote"
        elif which is self.WEIGHTED_MAJORITY_VOTE:
            performanceLabel = "Segmentation weightedMajorityVote"
        performanceTable.begin(performanceLabel)

        # one accumulator list per row of dataTable; its contents depend on the method:
        # raw scores (selectAll, median) or [value, weight] pairs (majority votes)
        scores = [[] for x in xrange(len(dataTable))]
        if which is self.SELECT_ALL:
            segments = [[] for x in xrange(len(dataTable))]

        # selectAll only: fieldName -> per-row lists of sub-model output values
        newOutputData = {}
        for segment in segmentation.childrenOfTag("Segment", iterator=True):
            performanceTable.pause(performanceLabel)
            selection = segment.childOfClass(PmmlPredicate).evaluate(dataTable, functionTable, performanceTable)
            performanceTable.unpause(performanceLabel)
            if not selection.any():
                continue

            segmentName = segment.get("id")
            # indexes[subIndex] is the dataTable row behind row subIndex of subTable
            indexes = NP("nonzero", selection)[0]

            subTable = dataTable.subTable(selection)
            subModel = segment.childOfClass(PmmlModel)

            performanceTable.pause(performanceLabel)
            subModel.calculate(subTable, functionTable, performanceTable)
            performanceTable.unpause(performanceLabel)

            if which is self.MEDIAN and subTable.score.fieldType.dataType in ("string", "boolean", "object"):
                raise defs.PmmlValidationError("Segmentation with multipleModelMethod=\"median\" cannot be applied to models that produce dataType \"%s\"" % subTable.score.fieldType.dataType)

            scoreData = subTable.score.data
            scoreMask = subTable.score.mask
            if which is self.SELECT_ALL:
                for subIndex, index in enumerate(indexes):
                    if scoreMask is None or scoreMask[subIndex] == defs.VALID:
                        scores[index].append(scoreData[subIndex])
                        segments[index].append(segmentName)

            elif which is self.MEDIAN:
                for subIndex, index in enumerate(indexes):
                    if scoreMask is None or scoreMask[subIndex] == defs.VALID:
                        scores[index].append(scoreData[subIndex])

            elif which in (self.MAJORITY_VOTE, self.WEIGHTED_MAJORITY_VOTE):
                if which is self.MAJORITY_VOTE:
                    weight = 1.0
                else:
                    weight = float(segment.get("weight", 1.0))
                for subIndex, index in enumerate(indexes):
                    if scoreMask is None or scoreMask[subIndex] == defs.VALID:
                        newValue = scoreData[subIndex]
                        score = scores[index]
                        # add the weight to this value's tally, creating the tally on first sight
                        found = False
                        for pair in score:
                            if pair[0] == newValue:
                                pair[1] += weight
                                found = True
                                break
                        if not found:
                            score.append([newValue, weight])

            if which is self.SELECT_ALL:
                for fieldName, dataColumn in subTable.output.items():
                    newData = newOutputData.get(fieldName)
                    if newData is None:
                        newData = [[] for x in xrange(len(dataTable))]
                        newOutputData[fieldName] = newData

                    dataColumnData = dataColumn.data
                    dataColumnMask = dataColumn.mask
                    for subIndex, index in enumerate(indexes):
                        # only rows with a valid score contribute, so these lists
                        # stay parallel to the per-row score lists
                        if scoreMask is None or scoreMask[subIndex] == defs.VALID:
                            if dataColumnMask is None or dataColumnMask[subIndex] == defs.VALID:
                                newData[index].append(dataColumnData[subIndex])
                            else:
                                newData[index].append(None)

        if which is self.SELECT_ALL:
            for fieldName, newData in newOutputData.items():
                finalNewData = NP("empty", len(dataTable), dtype=NP.dtype(object))
                for index, newDatum in enumerate(newData):
                    finalNewData[index] = tuple(newDatum)
                dataTable.output[fieldName] = DataColumn(self.scoreType, finalNewData, None)

            finalScoresData = NP("empty", len(dataTable), dtype=NP.dtype(object))
            for index, score in enumerate(scores):
                finalScoresData[index] = tuple(score)
            finalScores = DataColumn(self.scoreType, finalScoresData, None)

            if self.name is None:
                performanceTable.end(performanceLabel)
                return {None: finalScores}
            else:
                finalSegmentsData = NP("empty", len(dataTable), dtype=NP.dtype(object))
                for index, segment in enumerate(segments):
                    finalSegmentsData[index] = tuple(segment)

                performanceTable.end(performanceLabel)
                return {None: finalScores, "segment": DataColumn(self.scoreTypeSegment, finalSegmentsData, None)}

        elif which is self.MEDIAN:
            finalScoresData = NP("empty", len(dataTable), dtype=NP.dtype(object))
            # created uninitialized; every element is assigned in the loop below
            finalScoresMask = NP("empty", len(dataTable), dtype=defs.maskType)
            for index, score in enumerate(scores):
                if len(score) > 0:
                    finalScoresData[index] = NP("median", score)
                    finalScoresMask[index] = defs.VALID
                else:
                    # no segment produced a valid score for this row
                    finalScoresMask[index] = defs.INVALID

            # an all-zero mask is represented as None
            if not finalScoresMask.any():
                finalScoresMask = None
            finalScores = DataColumn(self.scoreType, finalScoresData, finalScoresMask)

            performanceTable.end(performanceLabel)
            return {None: finalScores}

        elif which in (self.MAJORITY_VOTE, self.WEIGHTED_MAJORITY_VOTE):
            finalScoresData = NP("empty", len(dataTable), dtype=NP.dtype(object))
            # created uninitialized; every element is assigned in the loop below
            finalScoresMask = NP("empty", len(dataTable), dtype=defs.maskType)
            cardinality = NP("empty", len(dataTable), dtype=self.scoreTypeCardinality.dtype)

            for index, score in enumerate(scores):
                # pick the value with the largest tally; the strict ">" keeps
                # the earliest-seen value when tallies tie
                bestN, bestValue = None, None
                for value, N in score:
                    if bestN is None or N > bestN:
                        bestN = N
                        bestValue = value
                if bestN is not None:
                    finalScoresData[index] = bestValue
                    finalScoresMask[index] = defs.VALID
                    cardinality[index] = bestN
                else:
                    # no segment produced a valid score for this row
                    finalScoresMask[index] = defs.INVALID
                    cardinality[index] = 0

            # an all-zero mask is represented as None
            if not finalScoresMask.any():
                finalScoresMask = None
            finalScores = DataColumn(self.scoreType, finalScoresData, finalScoresMask)

            if self.name is None:
                performanceTable.end(performanceLabel)
                return {None: finalScores}
            else:
                finalCardinality = DataColumn(self.scoreTypeCardinality, cardinality, None)

                performanceTable.end(performanceLabel)
                return {None: finalScores, "cardinality": finalCardinality}
コード例 #23
0
ファイル: Node.py プロジェクト: soedjais/augustus
    def applyScore(self, dataTable, functionTable, performanceTable, selection,
                   score, missingValueStrategy, missingValuePenalty,
                   noTrueChildStrategy):
        """Walk through the tree by one Node, splitting the DataTable
        on the way down and merging it on the way back up.

        Nothing is returned; the DataColumns in C{score} are updated
        in place for the selected rows.

        @type dataTable: DataTable
        @param dataTable: A DataTable containing all rows that match this node in the tree and those above it.
        @type functionTable: FunctionTable
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @type selection: 1d Numpy array of bool
        @param selection: The rows in this DataTable that match this Node.
        @type score: dict
        @param score: A dictionary that maps PMML score "features" to DataColumns.  The None key is "predictedValue" and is the only one guaranteed to exist.
        @type missingValueStrategy: singleton Python object, defined in the Node class
        @param missingValueStrategy: The tree's global missing value strategy.
        @type missingValuePenalty: number
        @param missingValuePenalty: The tree's global missing value penalty.
        @type noTrueChildStrategy: singleton Python object, defined in the Node class
        @param noTrueChildStrategy: The tree's global no-true-child strategy.
        @raise PmmlValidationError: If the strategy is defaultChild and this node has no defaultChild attribute, or no child Node with that id.
        @raise NotImplementedError: If the strategy is weightedConfidence or aggregateNodes and a child predicate is unknown for some row.
        """

        # no rows reach this node: nothing to score
        if not selection.any():
            return

        subNodes = self.childrenOfClass(Node)
        if len(subNodes) == 0:
            # leaf: score the selected rows directly
            self.applyScoreLeaf(selection, score, performanceTable)

        else:
            performanceTable.begin("split downward")

            # restrict the table and every score column to the selected rows;
            # child nodes work on these restricted columns, which are merged
            # back into score at the end
            subTable = dataTable.subTable(selection)
            subScore = {}
            for name, field in score.items():
                if field.mask is None:
                    subScore[name] = DataColumn(field.fieldType,
                                                field.data[selection], None)
                else:
                    subScore[name] = DataColumn(field.fieldType,
                                                field.data[selection],
                                                field.mask[selection])
                # presumably makes the new column writable for the children -- confirm
                subScore[name]._unlock()

            # rows of subTable not yet claimed by any child
            unset = NP("ones", len(subTable), dtype=NP.dtype(bool))

            performanceTable.end("split downward")

            for subNode in subNodes:
                subSelection, subUnknowns, subEncounteredUnknowns = subNode.evaluatePredicate(
                    subTable,
                    functionTable,
                    performanceTable,
                    returnUnknowns=True)

                performanceTable.begin("logical_and")
                # a child only receives rows that are still unset and whose
                # predicate is not unknown
                NP("logical_and", subSelection, unset, subSelection)
                NP("logical_and", subSelection, NP("logical_not", subUnknowns),
                   subSelection)
                # unknowns only matter for rows that are still unset
                NP("logical_and", subUnknowns, unset, subUnknowns)
                NP("logical_and", subEncounteredUnknowns, unset,
                   subEncounteredUnknowns)
                # rows claimed by this child are no longer unset
                NP("logical_and", unset, NP("logical_not", subSelection),
                   unset)
                performanceTable.end("logical_and")

                # recurse into the child with the rows it claimed
                subNode.applyScore(subTable, functionTable, performanceTable,
                                   subSelection, subScore,
                                   missingValueStrategy, missingValuePenalty,
                                   noTrueChildStrategy)

                if "penaltyProduct" in subScore:
                    subScore["penaltyProduct"].data[
                        subEncounteredUnknowns] *= missingValuePenalty

                # rows whose predicate was unknown are handled per the
                # tree's missing value strategy
                if subUnknowns.any():
                    if missingValueStrategy is self.LAST_PREDICTION:
                        # score those rows with this node, then mark them as set
                        self.applyScoreLeaf(subUnknowns, subScore,
                                            performanceTable)
                        NP("logical_and", unset, NP("logical_not",
                                                    subUnknowns), unset)

                    elif missingValueStrategy is self.NULL_PREDICTION:
                        # mark those rows as set without scoring them
                        NP("logical_and", unset, NP("logical_not",
                                                    subUnknowns), unset)

                    elif missingValueStrategy is self.DEFAULT_CHILD:
                        defaultChild = self.xpath("@defaultChild")
                        if len(defaultChild) == 0:
                            raise defs.PmmlValidationError(
                                "When missingValueStrategy is \"defaultChild\", every non-leaf node must have a defaultChild attribute"
                            )
                        defaultChild = defaultChild[0]

                        # look up the child Node of this node with that id
                        defaultNode = self.xpath("pmml:Node[@id='%s']" %
                                                 defaultChild)
                        if len(defaultNode) == 0:
                            raise defs.PmmlValidationError(
                                "The defaultChild \"%s\" is not found (no such id at this level)"
                                % defaultChild)
                        defaultNode = defaultNode[0]

                        # send those rows down the default child instead
                        NP("logical_and", unset, NP("logical_not",
                                                    subUnknowns), unset)
                        defaultNode.applyScore(subTable, functionTable,
                                               performanceTable, subUnknowns,
                                               subScore, missingValueStrategy,
                                               missingValuePenalty,
                                               noTrueChildStrategy)

                    elif missingValueStrategy is self.WEIGHTED_CONFIDENCE:
                        # this involves evaluating an ensemble of subtrees and choosing among them: too hard
                        raise NotImplementedError(
                            "missingValueStrategy=\"weightedConfidence\"")

                    elif missingValueStrategy is self.AGGREGATE_NODES:
                        # this involves evaluating an ensemble of subtrees and aggregating over them: too hard
                        raise NotImplementedError(
                            "missingValueStrategy=\"aggregateNodes\"")

                    elif missingValueStrategy is self.NONE:
                        # leave those rows unset so later siblings can still claim them
                        pass

                # every row has been handled: skip the remaining children
                if not unset.any():
                    break

            # rows that no child claimed are scored with this node itself
            if noTrueChildStrategy is self.RETURN_LAST_PREDICTION and unset.any(
            ):
                self.applyScoreLeaf(unset, subScore, performanceTable)

            performanceTable.begin("merge upward")

            # copy the children's results back into the caller's columns
            for name, field in score.items():
                field.data[selection] = subScore[name].data
                if field.mask is not None:
                    field.mask[selection] = subScore[name].mask

            performanceTable.end("merge upward")
コード例 #24
0
ファイル: Node.py プロジェクト: Huskyeder/augustus
    def applyScore(self, dataTable, functionTable, performanceTable, selection, score, missingValueStrategy, missingValuePenalty, noTrueChildStrategy):
        """Walk through the tree by one Node, splitting the DataTable
        on the way down and merging it on the way back up.

        A Node without child Nodes scores the selected rows with
        C{applyScoreLeaf}.  Otherwise the selected rows are copied into
        a sub-table and per-feature sub-score columns, each child Node
        is applied to the rows it claims (in the order returned by
        C{childrenOfClass}), and the sub-scores are written back into
        C{score} at the selected positions.  C{score} is updated in
        place and nothing is returned.

        @type dataTable: DataTable
        @param dataTable: A DataTable containing all rows that match this node in the tree and those above it.
        @type functionTable: FunctionTable
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @type selection: 1d Numpy array of bool
        @param selection: The rows in this DataTable that match this Node.
        @type score: dict
        @param score: A dictionary that maps PMML score "features" to DataColumns.  The None key is "predictedValue" and is the only one guaranteed to exist.
        @type missingValueStrategy: singleton Python object, defined in the Node class
        @param missingValueStrategy: The tree's global missing value strategy.
        @type missingValuePenalty: number
        @param missingValuePenalty: The tree's global missing value penalty.
        @type noTrueChildStrategy: singleton Python object, defined in the Node class
        @param noTrueChildStrategy: The tree's global no-true-child strategy.
        @raise PmmlValidationError: If the strategy is C{DEFAULT_CHILD} and rows with an unknown predicate result are found at a node that has no C{defaultChild} attribute, or whose C{defaultChild} id matches no child Node.
        @raise NotImplementedError: If rows with an unknown predicate result are found under the C{WEIGHTED_CONFIDENCE} or C{AGGREGATE_NODES} strategy.
        """

        # Nothing is selected at this node: leave score untouched.
        if not selection.any():
            return

        subNodes = self.childrenOfClass(Node)
        if len(subNodes) == 0:
            # Leaf: score the selected rows directly.
            self.applyScoreLeaf(selection, score, performanceTable)

        else:
            performanceTable.begin("split downward")

            # Restrict the table and every score column to the selected
            # rows; the children work on these reduced arrays.
            subTable = dataTable.subTable(selection)
            subScore = {}
            for name, field in score.items():
                if field.mask is None:
                    subScore[name] = DataColumn(field.fieldType, field.data[selection], None)
                else:
                    subScore[name] = DataColumn(field.fieldType, field.data[selection], field.mask[selection])
                # NOTE(review): _unlock is defined elsewhere; presumably it
                # makes the column writable for the children -- confirm.
                subScore[name]._unlock()

            # unset[i] stays True while row i of the sub-table has not yet
            # been claimed by a child or settled by a missing-value strategy.
            unset = NP("ones", len(subTable), dtype=NP.dtype(bool))

            performanceTable.end("split downward")

            for subNode in subNodes:
                subSelection, subUnknowns, subEncounteredUnknowns = subNode.evaluatePredicate(subTable, functionTable, performanceTable, returnUnknowns=True)

                performanceTable.begin("logical_and")
                # Each call repeats its first operand as the third argument,
                # which looks like NumPy's in-place "out" array
                # (NOTE(review): confirm that NP forwards it as such).
                # The statement order matters:
                #   subSelection           -> predicate true, row still unset, result not unknown
                #   subUnknowns            -> restricted to still-unset rows
                #   subEncounteredUnknowns -> restricted to still-unset rows
                #   unset                  -> drops the rows this child has just claimed
                NP("logical_and", subSelection, unset, subSelection)
                NP("logical_and", subSelection, NP("logical_not", subUnknowns), subSelection)
                NP("logical_and", subUnknowns, unset, subUnknowns)
                NP("logical_and", subEncounteredUnknowns, unset, subEncounteredUnknowns)
                NP("logical_and", unset, NP("logical_not", subSelection), unset)
                performanceTable.end("logical_and")

                # Recurse into the child for the rows it claimed (the child
                # returns immediately if it claimed none).
                subNode.applyScore(subTable, functionTable, performanceTable, subSelection, subScore, missingValueStrategy, missingValuePenalty, noTrueChildStrategy)

                # Scale the penalty for rows flagged in subEncounteredUnknowns,
                # but only if the model tracks a "penaltyProduct" feature.
                if "penaltyProduct" in subScore:
                    subScore["penaltyProduct"].data[subEncounteredUnknowns] *= missingValuePenalty
                
                if subUnknowns.any():
                    if missingValueStrategy is self.LAST_PREDICTION:
                        # Give the unknown rows this node's own score and
                        # mark them as settled.
                        self.applyScoreLeaf(subUnknowns, subScore, performanceTable)
                        NP("logical_and", unset, NP("logical_not", subUnknowns), unset)

                    elif missingValueStrategy is self.NULL_PREDICTION:
                        # Mark the unknown rows as settled without assigning
                        # a score here.
                        NP("logical_and", unset, NP("logical_not", subUnknowns), unset)

                    elif missingValueStrategy is self.DEFAULT_CHILD:
                        # Resolve the defaultChild attribute to the child
                        # Node carrying that id.
                        defaultChild = self.xpath("@defaultChild")
                        if len(defaultChild) == 0:
                            raise defs.PmmlValidationError("When missingValueStrategy is \"defaultChild\", every non-leaf node must have a defaultChild attribute")
                        defaultChild = defaultChild[0]

                        defaultNode = self.xpath("pmml:Node[@id='%s']" % defaultChild)
                        if len(defaultNode) == 0:
                            raise defs.PmmlValidationError("The defaultChild \"%s\" is not found (no such id at this level)" % defaultChild)
                        defaultNode = defaultNode[0]

                        # Mark the unknown rows as settled, then let the
                        # default child score them.
                        NP("logical_and", unset, NP("logical_not", subUnknowns), unset)
                        defaultNode.applyScore(subTable, functionTable, performanceTable, subUnknowns, subScore, missingValueStrategy, missingValuePenalty, noTrueChildStrategy)

                    elif missingValueStrategy is self.WEIGHTED_CONFIDENCE:
                        # this involves evaluating an ensemble of subtrees and choosing among them: too hard
                        raise NotImplementedError("missingValueStrategy=\"weightedConfidence\"")

                    elif missingValueStrategy is self.AGGREGATE_NODES:
                        # this involves evaluating an ensemble of subtrees and aggregating over them: too hard
                        raise NotImplementedError("missingValueStrategy=\"aggregateNodes\"")

                    elif missingValueStrategy is self.NONE:
                        # No special handling: the unknown rows stay unset
                        # and remain available to later children.
                        pass

                # Every row has been settled; the remaining children cannot
                # claim anything.
                if not unset.any():
                    break

            # Rows that no child claimed receive this node's own score when
            # the tree asks for the last prediction.
            if noTrueChildStrategy is self.RETURN_LAST_PREDICTION and unset.any():
                self.applyScoreLeaf(unset, subScore, performanceTable)

            performanceTable.begin("merge upward")

            # Write the sub-scores back into the caller's columns at the
            # selected positions; the mask is only written when the parent
            # column already has one.
            for name, field in score.items():
                field.data[selection] = subScore[name].data
                if field.mask is not None:
                    field.mask[selection] = subScore[name].mask

            performanceTable.end("merge upward")
コード例 #25
0
ファイル: OutputField.py プロジェクト: soedjais/augustus
    def format(self, subTable, functionTable, performanceTable, score):
        """Extract or post-process output for the output field of a DataTable.

        The C{feature} attribute of this OutputField selects the source
        of the result: an existing field of C{subTable} (no feature),
        an entry of C{score}, a string rendering of the predicted
        value, the result of a child expression, or the C{Decision}
        element matching the result of a child expression.  Unless the
        feature is "predictedDisplayValue" or "decision", the result is
        cast when this element's C{dataType}/C{optype} attributes
        differ from the column's own.  When a feature is given, the
        result is also stored in C{subTable.fields} under the
        C{displayName} (falling back to C{name}).

        @type subTable: DataTable
        @param subTable: The DataTable associated with this local lexical scope.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @type score: dict
        @param score: Dictionary mapping PMML score "feature" strings to DataColumns.  This dictionary always contains a None key, which is the basic feature ("predictedValue").
        @rtype: DataColumn
        @return: The output that would go into an output field of a DataTable.
        @raise PmmlValidationError: If a "transformedValue" or "decision" feature has no child expression, if a "decision" feature has no Decisions block, or if the feature is neither handled here nor present in C{score}.
        """

        performanceTable.begin("OutputField")

        feature = self.get("feature")
        if feature is None:
            # No feature: pass through an existing field of the table.
            dataColumn = subTable.fields[self["name"]]

        elif feature == "predictedValue":
            dataColumn = score[None]

        elif feature == "predictedDisplayValue":
            # Render every predicted value as a string; the result carries
            # no mask.
            original = score[None]
            toString = original.fieldType.valueToString
            data = NP("empty", len(subTable), dtype=NP.dtype(object))
            for i, x in enumerate(original.data):
                data[i] = toString(x)
            dataColumn = DataColumn(FakeFieldType("string", "continuous"),
                                    data, None)

        elif feature == "transformedValue":
            expression = self.childOfClass(PmmlExpression)
            if expression is None:
                raise defs.PmmlValidationError(
                    "OutputField with feature \"transformedValue\" requires an EXPRESSION"
                )

            # The expression does its own timing; pause ours meanwhile.
            performanceTable.pause("OutputField")
            dataColumn = expression.evaluate(subTable, functionTable,
                                             performanceTable)
            performanceTable.unpause("OutputField")

        elif feature == "decision":
            decisions = self.childOfTag("Decisions")
            if decisions is None:
                raise defs.PmmlValidationError(
                    "OutputField with feature \"decision\" requires a Decisions block"
                )

            # Same guard as for "transformedValue": without it a missing
            # expression would surface as an AttributeError on None.
            expression = self.childOfClass(PmmlExpression)
            if expression is None:
                raise defs.PmmlValidationError(
                    "OutputField with feature \"decision\" requires an EXPRESSION"
                )

            performanceTable.pause("OutputField")
            dataColumn = expression.evaluate(subTable, functionTable,
                                             performanceTable)
            performanceTable.unpause("OutputField")

            # Only rows whose expression result is VALID may match a Decision.
            if dataColumn.mask is None:
                valid = None
            else:
                valid = NP(dataColumn.mask == defs.VALID)

            # Every row starts as MISSING and becomes VALID once some
            # Decision matches it.
            fieldType = FakeFieldType("object", "any")
            data = NP("empty", len(subTable), dtype=fieldType.dtype)
            mask = NP(
                NP("ones", len(subTable), dtype=defs.maskType) * defs.MISSING)

            for decision in decisions.childrenOfTag("Decision"):
                value = dataColumn.fieldType.stringToValue(decision["value"])

                selection = NP(dataColumn.data == value)
                if valid is not None:
                    NP("logical_and", selection, valid, selection)

                # Elements are stored one by one rather than through a
                # boolean-index assignment.
                # NOTE(review): presumably this keeps NumPy from treating
                # the element as a sequence to broadcast -- confirm.
                for i in xrange(len(data)):
                    if selection[i]:
                        data[i] = decision

                mask[selection] = defs.VALID

            # A mask with no nonzero entry is replaced by None.
            if not mask.any():
                mask = None

            dataColumn = DataColumn(fieldType, data, mask)

        elif feature in score:
            # Any other feature the model happened to produce.
            dataColumn = score[feature]

        else:
            # Name the enclosing model (parent of the parent) in the error
            # message if there is one.
            model = self.getparent()
            if model is not None: model = model.getparent()

            if model is None:
                model = "(orphaned OutputField; no parent model)"
            else:
                model = model.t

            raise defs.PmmlValidationError(
                "Models of type %s do not produce \"%s\" features (or at least, it is not yet implemented by Augustus)"
                % (model, feature))

        # Cast to the declared dataType/optype if they differ from what was
        # computed; display values and decisions are never cast.
        dataType = self.get("dataType", dataColumn.fieldType.dataType)
        optype = self.get("optype", dataColumn.fieldType.optype)
        if (dataType != dataColumn.fieldType.dataType
                or optype != dataColumn.fieldType.optype) and feature not in (
                    "predictedDisplayValue", "decision"):
            dataColumn = FieldCastMethods.cast(FakeFieldType(dataType, optype),
                                               dataColumn)

        # Computed features are also published as a field of the table.
        if feature is not None:
            subTable.fields[self.get("displayName", self["name"])] = dataColumn

        performanceTable.end("OutputField")
        return dataColumn
コード例 #26
0
ファイル: ClusteringModel.py プロジェクト: soedjais/augustus
    def calculateScore(self, dataTable, functionTable, performanceTable):
        """Calculate the score of this model.

        This method is called by C{calculate} to separate operations
        that are performed by all models (in C{calculate}) from
        operations that are performed by specific models (in
        C{calculateScore}).

        For every C{Cluster}, a per-row value is computed with the
        model's comparison metric; each row is assigned to the cluster
        with the smallest value (the first such cluster on ties).

        @type dataTable: DataTable
        @param dataTable: The DataTable representing this model's lexical scope.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: dict
        @return: A dictionary mapping score features to DataColumns.  The None key holds the predicted cluster identifier; the other keys ("predictedDisplayValue", "entity", "clusterId", "entityId", "clusterAffinity", "affinity", "all.<id>") are filled in only when the corresponding entry of C{self.subFields} is true.
        @raise PmmlValidationError: If a fieldWeight is negative, a clustering field has dataType "string", a Cluster has no center array or one of the wrong length, or (for distribution-based models) a Cluster lacks a single numeric Covariances/Matrix.
        """

        performanceTable.begin("ClusteringModel")

        performanceTable.begin("set up")

        distributionBased = (self["modelClass"] == "distributionBased")
        # Only fields that take part in the cluster centers.
        clusteringFields = self.xpath(
            "pmml:ClusteringField[not(@isCenterField='false')]")
        fieldWeights = [
            clusteringField.get("fieldWeight",
                                defaultFromXsd=True,
                                convertType=True)
            for clusteringField in clusteringFields
        ]
        for fieldWeight in fieldWeights:
            if fieldWeight < 0.0:
                raise defs.PmmlValidationError(
                    "ClusteringField fieldWeights must all be non-negative (encountered %g)"
                    % fieldWeight)
        clusters = self.xpath("pmml:Cluster")
        comparisonMeasure = self.childOfClass(ComparisonMeasure)
        defaultCompareFunction = comparisonMeasure.get("compareFunction",
                                                       defaultFromXsd=True)
        metric = comparisonMeasure.childOfClass(PmmlClusteringMetric)
        metrictag = metric.t

        performanceTable.end("set up")

        for clusteringField in clusteringFields:
            dataType = dataTable.fields[
                clusteringField["field"]].fieldType.dataType
            if dataType == "string":
                raise defs.PmmlValidationError(
                    "ClusteringField \"%s\" has dataType \"%s\", which cannot be used for clustering"
                    % (clusteringField["field"], dataType))

        missingValueWeights = self.childOfTag("MissingValueWeights")
        if missingValueWeights is None:
            adjustM = None

        else:
            performanceTable.begin("MissingValueWeights")

            missingWeights = missingValueWeights.childOfClass(
                PmmlArray).values(convertType=True)

            # Each clustering field adds its contribution to sumNMqi in place.
            sumNMqi = NP("zeros", len(dataTable), dtype=NP.dtype(float))
            for clusteringField, missingWeight in zip(clusteringFields,
                                                      missingWeights):
                clusteringField.addToAdjustM(dataTable, functionTable,
                                             performanceTable, sumNMqi,
                                             missingWeight)

            # Rows whose denominator is zero get a neutral factor of 1.
            adjustM = NP(sum(missingWeights) / sumNMqi)
            adjustM[NP(sumNMqi == 0.0)] = 1.0

            performanceTable.end("MissingValueWeights")

        # Rows in which at least one clustering field is INVALID.
        anyInvalid = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
        for clusteringField in clusteringFields:
            mask = dataTable.fields[clusteringField["field"]].mask
            if mask is not None:
                NP("logical_or", anyInvalid, NP(mask == defs.INVALID),
                   anyInvalid)

        bestClusterId = None
        bestClusterAffinity = None
        allClusterAffinities = {}

        for index, cluster in enumerate(clusters):
            array = cluster.childOfClass(PmmlArray)
            if array is None:
                raise defs.PmmlValidationError(
                    "Cluster must have an array to designate its center")

            centerStrings = array.values(convertType=False)
            if len(centerStrings) != len(clusteringFields):
                raise defs.PmmlValidationError(
                    "Cluster array has %d components, but there are %d ClusteringFields with isCenterField=true"
                    % (len(centerStrings), len(clusteringFields)))

            performanceTable.begin(metrictag)

            if distributionBased:
                matrix = cluster.xpath("pmml:Covariances/pmml:Matrix")
                if len(matrix) != 1:
                    raise defs.PmmlValidationError(
                        "In distribution-based clustering, all clusters must have a Covariances/Matrix"
                    )
                try:
                    covarianceMatrix = NP("array",
                                          matrix[0].values(),
                                          dtype=NP.dtype(float))
                except ValueError:
                    raise defs.PmmlValidationError(
                        "Covariances/Matrix must contain real numbers for distribution-based clustering"
                    )

            else:
                covarianceMatrix = None

            # Fresh accumulator state for this cluster.
            state = self._State()
            metric.initialize(state, len(dataTable), len(clusteringFields),
                              distributionBased)

            for clusteringField, centerString, fieldWeight in zip(
                    clusteringFields, centerStrings, fieldWeights):
                if isinstance(metric, PmmlClusteringMetricBinary):
                    metric.accumulateBinary(
                        state, dataTable.fields[clusteringField["field"]],
                        centerString, distributionBased)
                else:
                    # The field comparison does its own timing.
                    performanceTable.pause(metrictag)
                    cxy = clusteringField.compare(dataTable, functionTable,
                                                  performanceTable,
                                                  centerString,
                                                  defaultCompareFunction,
                                                  anyInvalid)
                    performanceTable.unpause(metrictag)
                    metric.accumulate(state, cxy, fieldWeight,
                                      distributionBased)

            distance = metric.finalizeDistance(state, adjustM,
                                               distributionBased,
                                               covarianceMatrix)
            del state

            performanceTable.end(metrictag)

            if index == 0:
                bestClusterId = NP("ones", len(dataTable),
                                   dtype=NP.dtype(int))  # 1-based index
                # Copy: bestClusterAffinity is updated in place below, and
                # this cluster's own distance array is also stored in
                # allClusterAffinities.  Sharing the array would overwrite
                # the first cluster's "all.<id>" values with the running best.
                bestClusterAffinity = NP("copy", distance)

            # Rows for which this cluster is strictly closer than the best
            # one seen so far.
            better = NP(distance < bestClusterAffinity)
            bestClusterId[better] = index + 1  # 1-based index
            bestClusterAffinity[better] = distance[better]

            allClusterAffinities[cluster.get("id",
                                             "%d" % (index + 1))] = distance

        # All score columns share one mask: INVALID where any input was.
        if not anyInvalid.any():
            scoreMask = None
        else:
            scoreMask = NP(anyInvalid * defs.INVALID)

        performanceTable.begin("set scores")
        score = {}

        performanceTable.begin("predictedValue")
        fieldType = FakeFieldType("string", "categorical")
        clusterIdentifiers = NP("empty", len(dataTable), dtype=fieldType.dtype)
        for index, cluster in enumerate(clusters):
            # The cluster's id, or its 1-based position if it has none.
            value = fieldType.stringToValue(
                cluster.get("id", "%d" % (index + 1)))
            clusterIdentifiers[NP(bestClusterId == (index + 1))] = value
        score[None] = DataColumn(fieldType, clusterIdentifiers, scoreMask)
        performanceTable.end("predictedValue")

        if self.subFields["predictedDisplayValue"]:
            performanceTable.begin("predictedDisplayValue")
            fieldType = FakeFieldType("string", "categorical")
            clusterNames = NP("empty", len(dataTable), dtype=fieldType.dtype)
            for index, cluster in enumerate(clusters):
                value = fieldType.stringToValue(cluster.get("name", ""))
                clusterNames[NP(bestClusterId == (index + 1))] = value
            score["predictedDisplayValue"] = DataColumn(
                fieldType, clusterNames, scoreMask)
            performanceTable.end("predictedDisplayValue")

        if self.subFields["entity"]:
            performanceTable.begin("entity")
            fieldType = FakeFieldType("object", "any")
            entities = NP("empty", len(dataTable), dtype=fieldType.dtype)
            for index, cluster in enumerate(clusters):
                # NOTE(review): value is never used below; the call is kept
                # only in case stringToValue validates or raises -- confirm
                # and remove.
                value = fieldType.stringToValue(cluster.get("name", ""))
                indexPlusOne = index + 1
                # The Cluster element itself is stored, one row at a time.
                for i in xrange(len(entities)):
                    if bestClusterId[i] == indexPlusOne:
                        entities[i] = cluster
            score["entity"] = DataColumn(fieldType, entities, scoreMask)
            performanceTable.end("entity")

        if self.subFields["clusterId"]:
            performanceTable.begin("clusterId")
            fieldType = FakeFieldType("integer", "continuous")
            score["clusterId"] = DataColumn(fieldType, bestClusterId,
                                            scoreMask)
            performanceTable.end("clusterId")

        if self.subFields["entityId"]:
            performanceTable.begin("entityId")
            fieldType = FakeFieldType("integer", "continuous")
            score["entityId"] = DataColumn(fieldType, bestClusterId, scoreMask)
            performanceTable.end("entityId")

        if self.subFields["clusterAffinity"]:
            performanceTable.begin("clusterAffinity")
            fieldType = FakeFieldType("double", "continuous")
            score["clusterAffinity"] = DataColumn(fieldType,
                                                  bestClusterAffinity,
                                                  scoreMask)
            performanceTable.end("clusterAffinity")

        if self.subFields["affinity"]:
            performanceTable.begin("affinity")
            fieldType = FakeFieldType("double", "continuous")
            score["affinity"] = DataColumn(fieldType, bestClusterAffinity,
                                           scoreMask)
            performanceTable.end("affinity")

        if self.subFields["all"]:
            performanceTable.begin("all")
            fieldType = FakeFieldType("double", "continuous")
            # One column per cluster, keyed by the cluster identifier.
            for identifier, distance in allClusterAffinities.items():
                score["all.%s" % identifier] = DataColumn(
                    fieldType, distance, scoreMask)
            performanceTable.end("all")

        performanceTable.end("set scores")
        performanceTable.end("ClusteringModel")
        return score
コード例 #27
0
ファイル: ClusteringModel.py プロジェクト: Huskyeder/augustus
    def calculateScore(self, dataTable, functionTable, performanceTable):
        """Calculate the score of this model.

        This method is called by C{calculate} to separate operations
        that are performed by all models (in C{calculate}) from
        operations that are performed by specific models (in
        C{calculateScore}).

        For every C{Cluster}, a per-row value is computed with the
        model's comparison metric; each row is assigned to the cluster
        with the smallest value (the first such cluster on ties).

        @type dataTable: DataTable
        @param dataTable: The DataTable representing this model's lexical scope.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: dict
        @return: A dictionary mapping score features to DataColumns.  The None key holds the predicted cluster identifier; the other keys ("predictedDisplayValue", "entity", "clusterId", "entityId", "clusterAffinity", "affinity", "all.<id>") are filled in only when the corresponding entry of C{self.subFields} is true.
        @raise PmmlValidationError: If a fieldWeight is negative, a clustering field has dataType "string", a Cluster has no center array or one of the wrong length, or (for distribution-based models) a Cluster lacks a single numeric Covariances/Matrix.
        """

        performanceTable.begin("ClusteringModel")

        performanceTable.begin("set up")

        distributionBased = (self["modelClass"] == "distributionBased")
        # Only fields that take part in the cluster centers.
        clusteringFields = self.xpath("pmml:ClusteringField[not(@isCenterField='false')]")
        fieldWeights = [clusteringField.get("fieldWeight", defaultFromXsd=True, convertType=True) for clusteringField in clusteringFields]
        for fieldWeight in fieldWeights:
            if fieldWeight < 0.0:
                raise defs.PmmlValidationError("ClusteringField fieldWeights must all be non-negative (encountered %g)" % fieldWeight)
        clusters = self.xpath("pmml:Cluster")
        comparisonMeasure = self.childOfClass(ComparisonMeasure)
        defaultCompareFunction = comparisonMeasure.get("compareFunction", defaultFromXsd=True)
        metric = comparisonMeasure.childOfClass(PmmlClusteringMetric)
        metrictag = metric.t

        performanceTable.end("set up")

        for clusteringField in clusteringFields:
            dataType = dataTable.fields[clusteringField["field"]].fieldType.dataType
            if dataType == "string":
                raise defs.PmmlValidationError("ClusteringField \"%s\" has dataType \"%s\", which cannot be used for clustering" % (clusteringField["field"], dataType))

        missingValueWeights = self.childOfTag("MissingValueWeights")
        if missingValueWeights is None:
            adjustM = None

        else:
            performanceTable.begin("MissingValueWeights")

            missingWeights = missingValueWeights.childOfClass(PmmlArray).values(convertType=True)

            # Each clustering field adds its contribution to sumNMqi in place.
            sumNMqi = NP("zeros", len(dataTable), dtype=NP.dtype(float))
            for clusteringField, missingWeight in zip(clusteringFields, missingWeights):
                clusteringField.addToAdjustM(dataTable, functionTable, performanceTable, sumNMqi, missingWeight)

            # Rows whose denominator is zero get a neutral factor of 1.
            adjustM = NP(sum(missingWeights) / sumNMqi)
            adjustM[NP(sumNMqi == 0.0)] = 1.0

            performanceTable.end("MissingValueWeights")

        # Rows in which at least one clustering field is INVALID.
        anyInvalid = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
        for clusteringField in clusteringFields:
            mask = dataTable.fields[clusteringField["field"]].mask
            if mask is not None:
                NP("logical_or", anyInvalid, NP(mask == defs.INVALID), anyInvalid)

        bestClusterId = None
        bestClusterAffinity = None
        allClusterAffinities = {}

        for index, cluster in enumerate(clusters):
            array = cluster.childOfClass(PmmlArray)
            if array is None:
                raise defs.PmmlValidationError("Cluster must have an array to designate its center")

            centerStrings = array.values(convertType=False)
            if len(centerStrings) != len(clusteringFields):
                raise defs.PmmlValidationError("Cluster array has %d components, but there are %d ClusteringFields with isCenterField=true" % (len(centerStrings), len(clusteringFields)))

            performanceTable.begin(metrictag)

            if distributionBased:
                matrix = cluster.xpath("pmml:Covariances/pmml:Matrix")
                if len(matrix) != 1:
                    raise defs.PmmlValidationError("In distribution-based clustering, all clusters must have a Covariances/Matrix")
                try:
                    covarianceMatrix = NP("array", matrix[0].values(), dtype=NP.dtype(float))
                except ValueError:
                    raise defs.PmmlValidationError("Covariances/Matrix must contain real numbers for distribution-based clustering")

            else:
                covarianceMatrix = None

            # Fresh accumulator state for this cluster.
            state = self._State()
            metric.initialize(state, len(dataTable), len(clusteringFields), distributionBased)

            for clusteringField, centerString, fieldWeight in zip(clusteringFields, centerStrings, fieldWeights):
                if isinstance(metric, PmmlClusteringMetricBinary):
                    metric.accumulateBinary(state, dataTable.fields[clusteringField["field"]], centerString, distributionBased)
                else:
                    # The field comparison does its own timing.
                    performanceTable.pause(metrictag)
                    cxy = clusteringField.compare(dataTable, functionTable, performanceTable, centerString, defaultCompareFunction, anyInvalid)
                    performanceTable.unpause(metrictag)
                    metric.accumulate(state, cxy, fieldWeight, distributionBased)

            distance = metric.finalizeDistance(state, adjustM, distributionBased, covarianceMatrix)
            del state

            performanceTable.end(metrictag)

            if index == 0:
                bestClusterId = NP("ones", len(dataTable), dtype=NP.dtype(int))   # 1-based index
                # Copy: bestClusterAffinity is updated in place below, and
                # this cluster's own distance array is also stored in
                # allClusterAffinities.  Sharing the array would overwrite
                # the first cluster's "all.<id>" values with the running best.
                bestClusterAffinity = NP("copy", distance)

            # Rows for which this cluster is strictly closer than the best
            # one seen so far.
            better = NP(distance < bestClusterAffinity)
            bestClusterId[better] = index + 1   # 1-based index
            bestClusterAffinity[better] = distance[better]

            allClusterAffinities[cluster.get("id", "%d" % (index + 1))] = distance

        # All score columns share one mask: INVALID where any input was.
        if not anyInvalid.any():
            scoreMask = None
        else:
            scoreMask = NP(anyInvalid * defs.INVALID)

        performanceTable.begin("set scores")
        score = {}

        performanceTable.begin("predictedValue")
        fieldType = FakeFieldType("string", "categorical")
        clusterIdentifiers = NP("empty", len(dataTable), dtype=fieldType.dtype)
        for index, cluster in enumerate(clusters):
            # The cluster's id, or its 1-based position if it has none.
            value = fieldType.stringToValue(cluster.get("id", "%d" % (index + 1)))
            clusterIdentifiers[NP(bestClusterId == (index + 1))] = value
        score[None] = DataColumn(fieldType, clusterIdentifiers, scoreMask)
        performanceTable.end("predictedValue")

        if self.subFields["predictedDisplayValue"]:
            performanceTable.begin("predictedDisplayValue")
            fieldType = FakeFieldType("string", "categorical")
            clusterNames = NP("empty", len(dataTable), dtype=fieldType.dtype)
            for index, cluster in enumerate(clusters):
                value = fieldType.stringToValue(cluster.get("name", ""))
                clusterNames[NP(bestClusterId == (index + 1))] = value
            score["predictedDisplayValue"] = DataColumn(fieldType, clusterNames, scoreMask)
            performanceTable.end("predictedDisplayValue")

        if self.subFields["entity"]:
            performanceTable.begin("entity")
            fieldType = FakeFieldType("object", "any")
            entities = NP("empty", len(dataTable), dtype=fieldType.dtype)
            for index, cluster in enumerate(clusters):
                # NOTE(review): value is never used below; the call is kept
                # only in case stringToValue validates or raises -- confirm
                # and remove.
                value = fieldType.stringToValue(cluster.get("name", ""))
                indexPlusOne = index + 1
                # The Cluster element itself is stored, one row at a time.
                for i in xrange(len(entities)):
                    if bestClusterId[i] == indexPlusOne:
                        entities[i] = cluster
            score["entity"] = DataColumn(fieldType, entities, scoreMask)
            performanceTable.end("entity")

        if self.subFields["clusterId"]:
            performanceTable.begin("clusterId")
            fieldType = FakeFieldType("integer", "continuous")
            score["clusterId"] = DataColumn(fieldType, bestClusterId, scoreMask)
            performanceTable.end("clusterId")

        if self.subFields["entityId"]:
            performanceTable.begin("entityId")
            fieldType = FakeFieldType("integer", "continuous")
            score["entityId"] = DataColumn(fieldType, bestClusterId, scoreMask)
            performanceTable.end("entityId")

        if self.subFields["clusterAffinity"]:
            performanceTable.begin("clusterAffinity")
            fieldType = FakeFieldType("double", "continuous")
            score["clusterAffinity"] = DataColumn(fieldType, bestClusterAffinity, scoreMask)
            performanceTable.end("clusterAffinity")

        if self.subFields["affinity"]:
            performanceTable.begin("affinity")
            fieldType = FakeFieldType("double", "continuous")
            score["affinity"] = DataColumn(fieldType, bestClusterAffinity, scoreMask)
            performanceTable.end("affinity")

        if self.subFields["all"]:
            performanceTable.begin("all")
            fieldType = FakeFieldType("double", "continuous")
            # One column per cluster, keyed by the cluster identifier.
            for identifier, distance in allClusterAffinities.items():
                score["all.%s" % identifier] = DataColumn(fieldType, distance, scoreMask)
            performanceTable.end("all")

        performanceTable.end("set scores")
        performanceTable.end("ClusteringModel")
        return score
コード例 #28
0
ファイル: MiningModel.py プロジェクト: Huskyeder/augustus
    def _selectFirst(self, dataTable, functionTable, performanceTable, segmentation):
        """Used by C{calculateScore}.

        Walk the C{Segment} children of C{segmentation} in order and,
        for every row of C{dataTable}, take the score of the first
        segment whose predicate selects that row.  Rows already
        claimed by an earlier segment are removed from the selection
        of every later one, and the loop stops as soon as all rows
        have been claimed.

        Side effect: output fields produced by the sub-models are
        merged into C{dataTable.output}.  Fields that did not exist
        there before are created with a mask initialized to
        C{defs.MISSING}; fields that already exist are overwritten
        in place for the selected rows.

        @param dataTable: The data to score; must support C{len}, C{subTable(selection)}, and carry an C{output} mapping.
        @param functionTable: Passed through to predicate evaluation and sub-model calculation.
        @param performanceTable: Used to time this method; paused while predicates and sub-models run.
        @param segmentation: Element whose C{Segment} children each hold a predicate and a model.
        @rtype: dict
        @return: C{{None: scores}}; if C{self.name} is not None, the dictionary also has a C{"segment"} entry holding, per row, the C{id} of the segment that scored it.
        """

        performanceTable.begin("Segmentation selectFirst")

        # Per-row accumulators spanning the whole table.  The mask
        # starts at zero and is collapsed to None below if it is still
        # all zero after every segment has been processed.
        scoresData = NP("empty", len(dataTable), dtype=NP.dtype(object))
        scoresMask = NP("zeros", len(dataTable), dtype=defs.maskType)
        unfilled = NP("ones", len(dataTable), dtype=NP.dtype(bool))
        segments = NP("empty", len(dataTable), dtype=NP.dtype(object))

        # Output columns created by this call; they stay unlocked while
        # segments write into them and are locked again after the loop.
        newOutputData = []
        for segment in segmentation.childrenOfTag("Segment", iterator=True):
            performanceTable.pause("Segmentation selectFirst")
            selection = segment.childOfClass(PmmlPredicate).evaluate(dataTable, functionTable, performanceTable)
            performanceTable.unpause("Segmentation selectFirst")

            # In place (third argument is the output array): keep only
            # the rows that no earlier segment has claimed.
            NP("logical_and", selection, unfilled, selection)
            if not selection.any():
                continue

            subTable = dataTable.subTable(selection)
            subModel = segment.childOfClass(PmmlModel)
            performanceTable.pause("Segmentation selectFirst")

            subModel.calculate(subTable, functionTable, performanceTable)
            performanceTable.unpause("Segmentation selectFirst")

            scoresData[selection] = subTable.score.data
            if subTable.score.mask is not None:
                scoresMask[selection] = subTable.score.mask
            else:
                scoresMask[selection] = defs.VALID

            segmentName = segment.get("id")
            if segmentName is not None:
                segments[selection] = segmentName

            for fieldName, dataColumn in subTable.output.items():
                if fieldName not in dataTable.output:
                    # First segment to produce this field: allocate a
                    # full-length column in which every row outside the
                    # current selection is MISSING.
                    data = NP("empty", len(dataTable), dtype=dataColumn.fieldType.dtype)
                    data[selection] = dataColumn.data

                    mask = NP(NP("ones", len(dataTable), dtype=defs.maskType) * defs.MISSING)
                    if dataColumn.mask is None:
                        mask[selection] = defs.VALID
                    else:
                        mask[selection] = dataColumn.mask

                    newDataColumn = DataColumn(dataColumn.fieldType, data, mask)
                    newDataColumn._unlock()
                    dataTable.output[fieldName] = newDataColumn
                    newOutputData.append(newDataColumn)

                else:
                    # The field already exists: overwrite only the
                    # rows belonging to this segment.
                    newDataColumn = dataTable.output[fieldName]

                    newDataColumn.data[selection] = dataColumn.data
                    if dataColumn.mask is None:
                        newDataColumn.mask[selection] = defs.VALID
                    else:
                        newDataColumn.mask[selection] = dataColumn.mask

            # Mark the selected rows as filled.  Boolean arrays cannot
            # be subtracted with "-" in current NumPy (TypeError), so
            # use an in-place XOR instead: selection is a subset of
            # unfilled at this point, hence XOR clears exactly the
            # selected rows and leaves the others untouched.
            NP("logical_xor", unfilled, selection, unfilled)
            if not unfilled.any():
                break

        for newDataColumn in newOutputData:
            # A mask with no nonzero entry is replaced by None.
            if not newDataColumn.mask.any():
                newDataColumn._mask = None
            newDataColumn._lock()

        if not scoresMask.any():
            scoresMask = None

        scores = DataColumn(self.scoreType, scoresData, scoresMask)

        performanceTable.end("Segmentation selectFirst")
        if self.name is None:
            return {None: scores}
        return {None: scores, "segment": DataColumn(self.scoreTypeSegment, segments, None)}