コード例 #1
0
    def evaluate(self, dataTable, functionTable, performanceTable, text=None):
        """Evaluate the expression, using a DataTable as input.

        The formula text is parsed with C{Formula.parse} and the parsed
        object is evaluated.  The two stages are bracketed by the
        "Formula parse" and "Formula evaluate" timers.  If the result
        carries a mask, this element's C{invalidValueTreatment} and
        C{mapMissingTo} attributes are applied to it; a result without
        a mask is returned unchanged.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @type text: string or None
        @param text: If None, use the text of this Formula object; otherwise, use C{text} instead.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.
        """

        if text is None:
            text = self.text

        performanceTable.begin("Formula parse")
        parsed = Formula.parse(text)
        performanceTable.end("Formula parse")

        performanceTable.begin("Formula evaluate")
        dataColumn = parsed.evaluate(dataTable, functionTable, performanceTable)

        if dataColumn.mask is None:
            # nothing to post-process, but the timer must still be closed
            # on this path so that every begin() has a matching end()
            performanceTable.end("Formula evaluate")
            return dataColumn

        data = dataColumn.data
        mask = dataColumn.mask
        mask = FieldCastMethods.applyInvalidValueTreatment(mask, self.get("invalidValueTreatment"))
        data, mask = FieldCastMethods.applyMapMissingTo(dataColumn.fieldType, data, mask, self.get("mapMissingTo"))

        performanceTable.end("Formula evaluate")
        return DataColumn(dataColumn.fieldType, data, mask)
コード例 #2
0
ファイル: Apply.py プロジェクト: Huskyeder/augustus
    def evaluate(self, dataTable, functionTable, performanceTable):
        """Call the function named by the C{function} attribute on this element's child expressions.

        The function is looked up in the FunctionTable and invoked with
        the PmmlExpression children as its arguments.  The returned
        column is then passed through this element's
        C{invalidValueTreatment} and C{mapMissingTo} handling.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.
        @raise LookupError: If the FunctionTable has no entry for the C{function} attribute.
        """

        timerName = "Apply"
        performanceTable.begin(timerName)

        functionName = self.get("function")
        function = functionTable.get(functionName)
        if function is None:
            raise LookupError("Apply references function \"%s\", but it does not exist" % functionName)

        childExpressions = self.childrenOfClass(PmmlExpression)

        # this timer is paused for the duration of the call into the function
        performanceTable.pause(timerName)
        result = function.evaluate(dataTable, functionTable, performanceTable, childExpressions)
        performanceTable.unpause(timerName)

        resultType = result.fieldType
        treatedMask = FieldCastMethods.applyInvalidValueTreatment(result.mask, self.get("invalidValueTreatment"))
        outData, outMask = FieldCastMethods.applyMapMissingTo(resultType, result.data, treatedMask, self.get("mapMissingTo"))

        performanceTable.end(timerName)
        return DataColumn(resultType, outData, outMask)
コード例 #3
0
ファイル: NormContinuous.py プロジェクト: Huskyeder/augustus
    def evaluate(self, dataTable, functionTable, performanceTable):
        """Evaluate the expression, using a DataTable as input.

        The C{LinearNorm} children are sorted by C{orig}, and each
        segment between consecutive children is filled in by
        C{self.transformSelection}.  Values beyond the first or last
        C{LinearNorm} are handled according to the C{outliers}
        attribute: C{"asMissingValues"} flags them through
        C{FieldCastMethods.outliersAsMissing}, C{"asExtremeValues"}
        assigns the C{norm} of the nearest end, and any other setting
        extends the nearest segment with C{self.transformSelection}.

        As a side effect, each C{LinearNorm} child gets float-valued
        C{orig} and C{norm} attributes.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.
        @raise PmmlValidationError: If the input field's dataType is "object", "string", or "boolean".
        """

        performanceTable.begin("NormContinuous")
        
        dataColumn = dataTable.fields[self["field"]]
        if dataColumn.fieldType.dataType in ("object", "string", "boolean"):
            raise defs.PmmlValidationError("NormContinuous requires a numeric input field, but \"%s\" is" % dataColumn.fieldType.dataType)

        outliers = self.get("outliers")

        linearNorms = self.childrenOfTag("LinearNorm")
        for linearNorm in linearNorms:
            linearNorm.orig = float(linearNorm["orig"])
            linearNorm.norm = float(linearNorm["norm"])

        # technically, it's invalid if not already sorted
        # (key= gives the same stable ordering as the old cmp-style call)
        linearNorms.sort(key=lambda linearNorm: linearNorm.orig)

        first = linearNorms[0]
        last = linearNorms[-1]

        data = NP("empty", len(dataTable), self._fieldType.dtype)
        mask = dataColumn.mask

        # at or before the first LinearNorm
        selection = NP(dataColumn.data <= first.orig)
        if outliers == "asMissingValues":
            # Only values strictly below the first orig are outliers.  A
            # value exactly equal to it is inside the range, so it keeps
            # its validity and maps to the first norm.
            belowRange = NP(dataColumn.data < first.orig)
            mask = FieldCastMethods.outliersAsMissing(mask, dataColumn.mask, belowRange)
            data[NP(dataColumn.data == first.orig)] = first.norm
        elif outliers == "asExtremeValues":
            data[selection] = first.norm
        else:
            self.transformSelection(first, linearNorms[1], dataColumn.data, data, selection)

        # interior segments: (orig[i], orig[i+1]]
        for i in range(len(linearNorms) - 1):
            selection = NP(linearNorms[i].orig < dataColumn.data)
            NP("logical_and", selection, NP(dataColumn.data <= linearNorms[i+1].orig), selection)

            self.transformSelection(linearNorms[i], linearNorms[i+1], dataColumn.data, data, selection)

        # beyond the last LinearNorm
        selection = NP(last.orig < dataColumn.data)
        if outliers == "asMissingValues":
            mask = FieldCastMethods.outliersAsMissing(mask, dataColumn.mask, selection)
        elif outliers == "asExtremeValues":
            data[selection] = last.norm
        else:
            self.transformSelection(linearNorms[-2], last, dataColumn.data, data, selection)

        data, mask = FieldCastMethods.applyMapMissingTo(self._fieldType, data, mask, self.get("mapMissingTo"))

        performanceTable.end("NormContinuous")
        return DataColumn(self._fieldType, data, mask)
コード例 #4
0
    def evaluate(self, dataTable, functionTable, performanceTable):
        """Produce an indicator column marking where the input field equals this element's C{value}.

        The C{value} attribute is converted with the input field's own
        C{stringToValue}, compared element-wise against the field's
        data, and the comparison result is stored with this element's
        dtype.  The input mask is carried over through the
        C{mapMissingTo} handling.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.
        """

        performanceTable.begin("NormDiscrete")

        inputColumn = dataTable.fields[self["field"]]
        target = inputColumn.fieldType.stringToValue(self["value"])

        matches = NP(inputColumn.data == target)
        indicator = NP("array", matches, dtype=self._fieldType.dtype)

        mapMissingTo = self.get("mapMissingTo")
        indicator, outMask = FieldCastMethods.applyMapMissingTo(self._fieldType, indicator, inputColumn.mask, mapMissingTo)

        performanceTable.end("NormDiscrete")
        return DataColumn(self._fieldType, indicator, outMask)
コード例 #5
0
ファイル: DefineFunction.py プロジェクト: Huskyeder/augustus
    def evaluate(self, dataTable, functionTable, performanceTable, arguments):
        """Evaluate this user-defined function on the given argument expressions.

        Every argument is evaluated first.  The results are bound, in
        order, to the names of the C{ParameterField} children in a
        sub-table of the input DataTable, after being cast if a
        parameter declares a different dataType or optype.  The
        function's own expression is then evaluated against that
        sub-table, and its result is cast if this element declares a
        different dataType or optype.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @type arguments: sequence of expressions
        @param arguments: The argument expressions; each must provide C{evaluate(dataTable, functionTable, performanceTable)}.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.
        @raise PmmlValidationError: If the number of arguments differs from the number of ParameterField children.
        """

        evaluated = [expression.evaluate(dataTable, functionTable, performanceTable) for expression in arguments]

        timerName = "user-defined \"%s\"" % self.name
        performanceTable.begin(timerName)

        parameters = self.childrenOfTag("ParameterField")

        numArguments = len(evaluated)
        numParameters = len(parameters)
        if numArguments != numParameters:
            raise defs.PmmlValidationError("Apply function=\"%s\" has %d arguments but the corresponding DefineFunction has %d parameters" % (self.name, numArguments, numParameters))

        subTable = dataTable.subTable()

        # bind each evaluated argument to its parameter name, casting where the declared type differs
        for column, parameter in zip(evaluated, parameters):
            actualType = column.fieldType
            wantedDataType = parameter.get("dataType", actualType.dataType)
            wantedOptype = parameter.get("optype", actualType.optype)
            if wantedDataType != actualType.dataType or wantedOptype != actualType.optype:
                column = FieldCastMethods.cast(FakeFieldType(wantedDataType, wantedOptype), column)

            subTable.fields[parameter["name"]] = column

        # this timer is paused while the body expression is evaluated
        body = self.childOfClass(PmmlExpression)
        performanceTable.pause(timerName)
        result = body.evaluate(subTable, functionTable, performanceTable)
        performanceTable.unpause(timerName)

        resultType = result.fieldType
        wantedDataType = self.get("dataType", resultType.dataType)
        wantedOptype = self.get("optype", resultType.optype)
        if wantedDataType != resultType.dataType or wantedOptype != resultType.optype:
            result = FieldCastMethods.cast(FakeFieldType(wantedDataType, wantedOptype), result)

        performanceTable.end(timerName)
        return result
コード例 #6
0
ファイル: CastExpression.py プロジェクト: Huskyeder/augustus
    def evaluate(self, dataTable, functionTable, performanceTable):
        """Evaluate the child expression and cast its result to the type described by this element.

        The child is evaluated before the "CastExpression" timer
        starts.  The cast result is then passed through this element's
        C{invalidValueTreatment} and C{mapMissingTo} handling.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.
        """

        inner = self.childOfClass(PmmlExpression)
        evaluated = inner.evaluate(dataTable, functionTable, performanceTable)

        performanceTable.begin("CastExpression")

        casted = FieldCastMethods.cast(FieldType(self), evaluated)
        castedType = casted.fieldType

        treatedMask = FieldCastMethods.applyInvalidValueTreatment(casted.mask, self.get("invalidValueTreatment"))
        outData, outMask = FieldCastMethods.applyMapMissingTo(castedType, casted.data, treatedMask, self.get("mapMissingTo"))

        performanceTable.end("CastExpression")
        return DataColumn(castedType, outData, outMask)
コード例 #7
0
ファイル: Apply.py プロジェクト: soedjais/augustus
    def evaluate(self, dataTable, functionTable, performanceTable):
        """Apply the function named by the C{function} attribute to the child expressions.

        The function is fetched from the FunctionTable and called with
        this element's PmmlExpression children.  Its output then goes
        through the C{invalidValueTreatment} and C{mapMissingTo}
        handling configured on this element.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.
        @raise LookupError: If the named function is not present in the FunctionTable.
        """

        performanceTable.begin("Apply")

        name = self.get("function")
        function = functionTable.get(name)
        if function is None:
            message = "Apply references function \"%s\", but it does not exist" % name
            raise LookupError(message)

        operands = self.childrenOfClass(PmmlExpression)

        # pause this timer around the call into the function
        performanceTable.pause("Apply")
        output = function.evaluate(dataTable, functionTable, performanceTable, operands)
        performanceTable.unpause("Apply")

        invalidValueTreatment = self.get("invalidValueTreatment")
        mask = FieldCastMethods.applyInvalidValueTreatment(output.mask, invalidValueTreatment)

        mapMissingTo = self.get("mapMissingTo")
        data, mask = FieldCastMethods.applyMapMissingTo(output.fieldType, output.data, mask, mapMissingTo)

        performanceTable.end("Apply")
        return DataColumn(output.fieldType, data, mask)
コード例 #8
0
ファイル: DerivedField.py プロジェクト: Huskyeder/augustus
    def calculate(self, dataTable, functionTable=None, performanceTable=None):
        """Calculate a DerivedField and store it in the DataTable.

        This method modifies the input DataTable: the computed column
        is written to C{dataTable.fields} under this field's name.

        The EXPRESSION's result is stored as-is when its dataType and
        optype agree with the ones declared on the DerivedField and
        the DerivedField has no C{Value} children.  Otherwise the
        result is cast, which is a potentially expensive and often
        unwanted operation.  A cast is reported in the
        PerformanceTable under a label containing the DerivedField
        name, to help the user debug their PMML.

        @type dataTable: DataTable
        @param dataTable: The pre-built DataTable.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions.  If None, a new FunctionTable is created.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.  If None, a FakePerformanceTable is used.
        @rtype: DataColumn
        @return: The column that was stored in C{dataTable.fields} under this field's name.
        """

        if functionTable is None:
            functionTable = FunctionTable()
        if performanceTable is None:
            performanceTable = FakePerformanceTable()

        expression = self.childOfClass(PmmlExpression)
        evaluated = expression.evaluate(dataTable, functionTable, performanceTable)
        performanceTable.begin("DerivedField")

        resultDataType = evaluated.fieldType.dataType
        resultOptype = evaluated.fieldType.optype

        # a cast is needed as soon as any declared property disagrees with the expression's result
        needsCast = (self.get("dataType", resultDataType) != resultDataType
                     or self.get("optype", resultOptype) != resultOptype
                     or len(self.childrenOfTag("Value")) != 0)

        if needsCast:
            castTimer = "cast (\"%s\")" % self.name
            performanceTable.begin(castTimer)
            dataTable.fields[self.name] = FieldCastMethods.cast(FieldType(self), evaluated)
            performanceTable.end(castTimer)
        else:
            dataTable.fields[self.name] = evaluated

        performanceTable.end("DerivedField")

        return dataTable.fields[self.name]
コード例 #9
0
ファイル: FieldRef.py プロジェクト: Huskyeder/augustus
    def evaluate(self, dataTable, functionTable, performanceTable):
        """Fetch the referenced field from the DataTable and apply C{mapMissingTo} to it.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.
        """

        performanceTable.begin("FieldRef")

        referenced = dataTable.fields[self["field"]]
        referencedType = referenced.fieldType
        mapMissingTo = self.get("mapMissingTo")
        outData, outMask = FieldCastMethods.applyMapMissingTo(referencedType, referenced.data, referenced.mask, mapMissingTo)

        performanceTable.end("FieldRef")
        return DataColumn(referencedType, outData, outMask)
コード例 #10
0
ファイル: MapValues.py プロジェクト: soedjais/augustus
    def evaluate(self, dataTable, functionTable, performanceTable):
        """Evaluate the expression, using a DataTable as input.

        Each row of the child table assigns its C{outputColumn} value
        to every data row that matches all of the row's columns that
        are paired with input fields through C{FieldColumnPair}.  Data
        rows that match no table row keep the C{defaultValue} when one
        is given; without a C{defaultValue} they are flagged as
        missing in the output mask.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.
        @raise PmmlValidationError: If a row of the table has no entry for C{outputColumn}.
        """

        performanceTable.begin("MapValues")

        fieldType = FakeFieldType(self.get("dataType", "string"),
                                  self.get("optype", self._optype))
        fieldType._newValuesAllowed = True

        defaultValue = self.get("defaultValue")
        if defaultValue is not None:
            defaultValue = fieldType.stringToValue(defaultValue)

        data = NP("empty", len(dataTable), dtype=fieldType.dtype)
        if defaultValue is not None:
            data[:] = defaultValue

        # table column name -> input DataColumn it is compared against
        outputColumn = self["outputColumn"]
        columnNameToField = {}
        for fieldColumnPair in self.childrenOfTag("FieldColumnPair"):
            dataColumn = dataTable.fields[fieldColumnPair["field"]]
            columnNameToField[fieldColumnPair["column"]] = dataColumn

        # cache partial selections because they'll be used over and over in intersecting sets
        dataSelections = {}
        missingSelections = {}
        coverage = NP("zeros", len(dataTable), dtype=NP.dtype(bool))

        for index, row in enumerate(
                self.childOfClass(TableInterface).iterate()):
            outputValue = row.get(outputColumn)
            if outputValue is None:
                raise defs.PmmlValidationError(
                    "MapValues has outputColumn \"%s\" but a column with that name does not appear in row %d of the table"
                    % (outputColumn, index))
            # what remains in the row after this are the matching columns only
            del row[outputColumn]
            outputValue = fieldType.stringToValue(outputValue)

            # this is an intersection of all matching columns
            selection = NP("ones", len(dataTable), dtype=NP.dtype(bool))

            for columnName, columnValueString in row.items():
                dataColumn = columnNameToField.get(columnName)
                if dataColumn is not None:
                    columnValue = dataColumn.fieldType.stringToValue(
                        columnValueString)

                    # one cached data array per column (name, value) pair
                    if (columnName, columnValueString) not in dataSelections:
                        selectData = NP(dataColumn.data == columnValue)
                        if dataColumn.mask is not None:
                            NP("logical_and", selectData,
                               NP(dataColumn.mask == defs.VALID), selectData)
                        dataSelections[columnName,
                                       columnValueString] = selectData
                    NP("logical_and", selection,
                       dataSelections[columnName,
                                      columnValueString], selection)

                    # one cached mask array per column name ("missing" has only one possible value, though I consider any non-VALID "missing")
                    if columnName not in missingSelections and dataColumn.mask is not None:
                        missingSelections[columnName] = NP(
                            dataColumn.mask != defs.VALID)

            # set the intersection to the output value
            data[selection] = outputValue
            NP("logical_or", coverage, selection, coverage)

        # a data row is "missing" if any of the compared input fields is non-VALID there
        missing = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
        for missingSelection in missingSelections.values():
            NP("logical_or", missing, missingSelection, missing)

        # This used to be "coverage -= missing".  Subtracting boolean
        # arrays raises TypeError on NumPy >= 1.13; on older versions it
        # computed the element-wise XOR, so the XOR is spelled out here
        # to keep the results unchanged.
        # NOTE(review): XOR leaves a row that is both covered and missing
        # marked as uncovered -- confirm that this is the intended rule.
        NP("logical_xor", coverage, missing, coverage)

        mask = missing * defs.MISSING

        data, mask = FieldCastMethods.applyMapMissingTo(
            fieldType, data, mask, self.get("mapMissingTo"))

        # with no defaultValue, rows left out of the coverage become MISSING
        if defaultValue is None:
            NP("logical_not", coverage, coverage)
            if mask is None:
                mask = NP(coverage * defs.MISSING)
            else:
                mask[coverage] = defs.MISSING

        performanceTable.end("MapValues")
        return DataColumn(fieldType, data, mask)
コード例 #11
0
ファイル: OutputField.py プロジェクト: Huskyeder/augustus
    def format(self, subTable, functionTable, performanceTable, score):
        """Extract or post-process output for the output field of a DataTable.

        The C{feature} attribute selects the source of the output:

          - no feature: the field with this OutputField's name in C{subTable};
          - C{"predictedValue"}: C{score[None]};
          - C{"predictedDisplayValue"}: C{score[None]} converted to strings with its field type's C{valueToString};
          - C{"transformedValue"}: the result of the child EXPRESSION;
          - C{"decision"}: the child EXPRESSION's result mapped onto the matching C{Decision} elements;
          - anything else: C{score[feature]}, if present.

        Except for C{"predictedDisplayValue"} and C{"decision"}, the
        result is cast when this OutputField declares a different
        dataType or optype.  When a feature is given, the result is
        also stored in C{subTable.fields} under the C{displayName}
        (falling back to the C{name}).

        @type subTable: DataTable
        @param subTable: The DataTable associated with this local lexical scope.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions; it is only passed on to the child EXPRESSION.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.  It is used unconditionally, so it must not be None.
        @type score: dict
        @param score: Dictionary mapping PMML score "feature" strings to DataColumns.  This dictionary always contains a None key, which is the basic feature ("predictedValue").
        @rtype: DataColumn
        @return: The output that would go into an output field of a DataTable.
        @raise PmmlValidationError: If a required EXPRESSION or Decisions block is absent, or if the feature is not available in C{score}.
        """

        performanceTable.begin("OutputField")

        feature = self.get("feature")
        if feature is None:
            dataColumn = subTable.fields[self["name"]]

        elif feature == "predictedValue":
            dataColumn = score[None]

        elif feature == "predictedDisplayValue":
            original = score[None]
            toString = original.fieldType.valueToString
            data = NP("empty", len(subTable), dtype=NP.dtype(object))
            for i, x in enumerate(original.data):
                data[i] = toString(x)
            dataColumn = DataColumn(FakeFieldType("string", "continuous"), data, None)

        elif feature == "transformedValue":
            expression = self.childOfClass(PmmlExpression)
            if expression is None:
                raise defs.PmmlValidationError("OutputField with feature \"transformedValue\" requires an EXPRESSION")
            
            performanceTable.pause("OutputField")
            dataColumn = expression.evaluate(subTable, functionTable, performanceTable)
            performanceTable.unpause("OutputField")

        elif feature == "decision":
            decisions = self.childOfTag("Decisions")
            if decisions is None:
                raise defs.PmmlValidationError("OutputField with feature \"decision\" requires a Decisions block")

            # same validation as for "transformedValue": fail with a PMML
            # error instead of an AttributeError on None
            expression = self.childOfClass(PmmlExpression)
            if expression is None:
                raise defs.PmmlValidationError("OutputField with feature \"decision\" requires an EXPRESSION")

            performanceTable.pause("OutputField")
            dataColumn = expression.evaluate(subTable, functionTable, performanceTable)
            performanceTable.unpause("OutputField")

            if dataColumn.mask is None:
                valid = None
            else:
                valid = NP(dataColumn.mask == defs.VALID)

            # start with every row MISSING; rows that match a Decision become VALID
            fieldType = FakeFieldType("object", "any")
            data = NP("empty", len(subTable), dtype=fieldType.dtype)
            mask = NP(NP("ones", len(subTable), dtype=defs.maskType) * defs.MISSING)

            for decision in decisions.childrenOfTag("Decision"):
                value = dataColumn.fieldType.stringToValue(decision["value"])

                selection = NP(dataColumn.data == value)
                if valid is not None:
                    NP("logical_and", selection, valid, selection)

                # the Decision element itself is stored, one cell at a time
                for i in range(len(data)):
                    if selection[i]:
                        data[i] = decision

                mask[selection] = defs.VALID
            
            # a mask without any nonzero entry is dropped
            if not mask.any():
                mask = None

            dataColumn = DataColumn(fieldType, data, mask)

        elif feature in score:
            dataColumn = score[feature]

        else:
            # name the model two levels up for the error message
            model = self.getparent()
            if model is not None: model = model.getparent()

            if model is None:
                model = "(orphaned OutputField; no parent model)"
            else:
                model = model.t   # presumably the element's tag name -- TODO confirm

            raise defs.PmmlValidationError("Models of type %s do not produce \"%s\" features (or at least, it is not yet implemented by Augustus)" % (model, feature))

        dataType = self.get("dataType", dataColumn.fieldType.dataType)
        optype = self.get("optype", dataColumn.fieldType.optype)
        if (dataType != dataColumn.fieldType.dataType or optype != dataColumn.fieldType.optype) and feature not in ("predictedDisplayValue", "decision"):
            dataColumn = FieldCastMethods.cast(FakeFieldType(dataType, optype), dataColumn)

        if feature is not None:
            subTable.fields[self.get("displayName", self["name"])] = dataColumn

        performanceTable.end("OutputField")
        return dataColumn
コード例 #12
0
ファイル: Discretize.py プロジェクト: pradeep6kumar/augustus
    def evaluate(self, dataTable, functionTable, performanceTable):
        """Evaluate the expression, using a DataTable as input.

        Each C{DiscretizeBin} child assigns its C{binValue} to the
        valid input values that fall inside its C{Interval}.  Values
        that fall in no bin get the C{defaultValue}, or are marked
        MISSING when no C{defaultValue} is given.  MISSING and INVALID
        entries of the input mask are carried over to the output.

        An input column without a mask (C{mask is None}) is treated as
        entirely valid.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.
        @raise PmmlValidationError: If the input field is not numeric, or if a binValue, leftMargin, or rightMargin cannot be converted.
        """

        performanceTable.begin("Discretize")

        dataColumn = dataTable.fields[self["field"]]
        if dataColumn.fieldType.dataType in ("object", "string", "boolean"):
            raise defs.PmmlValidationError("Discretize requires a numeric input field, but \"%s\" is" % dataColumn.fieldType.dataType)

        fieldType = FakeFieldType(self.get("dataType", "string"), self.get("optype", self._optype))
        fieldType._newValuesAllowed = True

        defaultValue = self.get("defaultValue")
        if defaultValue is not None:
            defaultValue = fieldType.stringToValue(defaultValue)

        data = NP("empty", len(dataTable), dtype=fieldType.dtype)
        mask = NP("empty", len(dataTable), dtype=defs.maskType)
        if defaultValue is None:
            mask[:] = defs.MISSING
        else:
            data[:] = defaultValue
            mask[:] = defs.VALID

        # The input validity does not change between bins, so compute it
        # once.  With no input mask there is nothing to exclude; comparing
        # None against defs.VALID would give a scalar False and wipe out
        # every bin selection.
        inputMask = dataColumn.mask
        if inputMask is None:
            validInput = None
        else:
            validInput = NP(inputMask == defs.VALID)

        for discretizeBin in self.childrenOfTag("DiscretizeBin"):
            try:
                binValue = fieldType.stringToValue(discretizeBin["binValue"])
            except ValueError:
                raise defs.PmmlValidationError("Cannot cast DiscretizeBin binValue \"%s\" as %s %s" % (discretizeBin["binValue"], fieldType.optype, fieldType.dataType))

            fieldType.values.append(FakeFieldValue(value=binValue))

            interval = discretizeBin.childOfTag("Interval")

            closure = interval["closure"]
            leftMargin = interval.get("leftMargin")
            rightMargin = interval.get("rightMargin")
            selection = None

            if leftMargin is not None:
                try:
                    leftMargin = dataColumn.fieldType.stringToValue(leftMargin)
                except ValueError:
                    raise defs.PmmlValidationError("Improper value in Interval leftMargin specification: \"%s\"" % leftMargin)

                # the left margin is always the first constraint, so it starts the selection
                if closure in ("openClosed", "openOpen"):
                    selection = NP(leftMargin < dataColumn.data)
                elif closure in ("closedOpen", "closedClosed"):
                    selection = NP(leftMargin <= dataColumn.data)

            if rightMargin is not None:
                try:
                    rightMargin = dataColumn.fieldType.stringToValue(rightMargin)
                except ValueError:
                    raise defs.PmmlValidationError("Improper value in Interval rightMargin specification: \"%s\"" % rightMargin)

                if closure in ("openOpen", "closedOpen"):
                    rightSelection = NP(dataColumn.data < rightMargin)
                elif closure in ("openClosed", "closedClosed"):
                    rightSelection = NP(dataColumn.data <= rightMargin)
                else:
                    # unrecognized closure: no right-hand constraint is applied
                    rightSelection = None

                if rightSelection is not None:
                    if selection is None:
                        selection = rightSelection
                    else:
                        NP("logical_and", selection, rightSelection, selection)

            if selection is not None:
                if validInput is not None:
                    NP("logical_and", selection, validInput, selection)
                data[selection] = binValue
                mask[selection] = defs.VALID

        # carry the input's MISSING and INVALID flags over to the output
        if inputMask is not None:
            mask[NP(inputMask == defs.MISSING)] = defs.MISSING
            mask[NP(inputMask == defs.INVALID)] = defs.INVALID

        data, mask = FieldCastMethods.applyMapMissingTo(fieldType, data, mask, self.get("mapMissingTo"))
        
        performanceTable.end("Discretize")
        return DataColumn(fieldType, data, mask)
コード例 #13
0
ファイル: MiningField.py プロジェクト: Huskyeder/augustus
    def replaceField(self, dataTable, functionTable, performanceTable):
        """Replace a field in the DataTable for outlier removal,
        missing value handling, and invalid value treatment.

        Nothing happens if the DataTable has no field with this
        MiningField's name.  Otherwise the field is cast if this
        MiningField declares a different optype.  Values below
        C{lowValue} or above C{highValue} are handled according to
        C{outliers} ("asMissingValues" or "asExtremeValues").
        C{invalidValueTreatment} and C{missingValueReplacement} are
        then applied, and the field in the DataTable is replaced with
        the result.

        @type dataTable: DataTable
        @param dataTable: The pre-built DataTable.
        @type functionTable: FunctionTable
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        """

        dataColumn = dataTable.fields.get(self.name)
        if dataColumn is None:
            return

        performanceTable.begin("MiningField")

        optype = self.get("optype", dataColumn.fieldType.optype)
        if optype != dataColumn.fieldType.optype:
            dataColumn = FieldCastMethods.cast(FakeFieldType(dataColumn.fieldType.dataType, optype), dataColumn)

        data = dataColumn.data
        mask = dataColumn.mask

        outliers = self.get("outliers")
        
        lowValue = self.get("lowValue")
        if lowValue is not None:
            lowValue = dataColumn.fieldType.stringToValue(lowValue)

            if outliers == "asMissingValues":
                selection = NP(dataColumn.data < lowValue)
                mask = FieldCastMethods.outliersAsMissing(mask, dataColumn.mask, selection)

            elif outliers == "asExtremeValues":
                selection = NP(dataColumn.data < lowValue)
                # copy only once, so that the column's own array is never modified
                if data is dataColumn.data:
                    data = NP("copy", data)
                    data.setflags(write=True)
                data[selection] = lowValue

        highValue = self.get("highValue")
        if highValue is not None:
            highValue = dataColumn.fieldType.stringToValue(highValue)

            if outliers == "asMissingValues":
                selection = NP(dataColumn.data > highValue)
                mask = FieldCastMethods.outliersAsMissing(mask, dataColumn.mask, selection)

            elif outliers == "asExtremeValues":
                selection = NP(dataColumn.data > highValue)
                if data is dataColumn.data:
                    data = NP("copy", data)
                    data.setflags(write=True)
                # The clipping must happen whether or not a copy was just
                # made: when lowValue has already copied the array, the
                # identity test above is False.
                data[selection] = highValue

        mask = FieldCastMethods.applyInvalidValueTreatment(mask, self.get("invalidValueTreatment"))
        data, mask = FieldCastMethods.applyMapMissingTo(dataColumn.fieldType, data, mask, self.get("missingValueReplacement"))

        dataTable.fields.replaceField(self.name, DataColumn(dataColumn.fieldType, data, mask))
        performanceTable.end("MiningField")
コード例 #14
0
ファイル: MapValues.py プロジェクト: Huskyeder/augustus
    def evaluate(self, dataTable, functionTable, performanceTable):
        """Evaluate the expression, using a DataTable as input.

        Every row of the child table is compared against the input fields
        named by the FieldColumnPair children.  Where all of a row's
        recognized columns equal valid input values, the row's
        C{outputColumn} entry is written to the output.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.
        @raise PmmlValidationError: If a table row has no entry for the C{outputColumn}.
        """

        performanceTable.begin("MapValues")
        
        fieldType = FakeFieldType(self.get("dataType", "string"), self.get("optype", self._optype))
        fieldType._newValuesAllowed = True

        defaultValue = self.get("defaultValue")
        if defaultValue is not None:
            defaultValue = fieldType.stringToValue(defaultValue)

        # without a defaultValue the array starts uninitialized; uncovered entries are masked below
        data = NP("empty", len(dataTable), dtype=fieldType.dtype)
        if defaultValue is not None:
            data[:] = defaultValue

        outputColumn = self["outputColumn"]
        columnNameToField = {}
        for fieldColumnPair in self.childrenOfTag("FieldColumnPair"):
            dataColumn = dataTable.fields[fieldColumnPair["field"]]
            columnNameToField[fieldColumnPair["column"]] = dataColumn

        # cache partial selections because they'll be used over and over in intersecting sets
        dataSelections = {}
        missingSelections = {}
        coverage = NP("zeros", len(dataTable), dtype=NP.dtype(bool))

        for index, row in enumerate(self.childOfClass(TableInterface).iterate()):
            outputValue = row.get(outputColumn)
            if outputValue is None:
                raise defs.PmmlValidationError("MapValues has outputColumn \"%s\" but a column with that name does not appear in row %d of the table" % (outputColumn, index))
            del row[outputColumn]
            outputValue = fieldType.stringToValue(outputValue)

            # this is an intersection of all matching columns
            selection = NP("ones", len(dataTable), dtype=NP.dtype(bool))

            for columnName, columnValueString in row.items():
                dataColumn = columnNameToField.get(columnName)
                if dataColumn is not None:
                    columnValue = dataColumn.fieldType.stringToValue(columnValueString)

                    # one cached data array per column (name, value) pair
                    if (columnName, columnValueString) not in dataSelections:
                        selectData = NP(dataColumn.data == columnValue)
                        if dataColumn.mask is not None:
                            NP("logical_and", selectData, NP(dataColumn.mask == defs.VALID), selectData)
                        dataSelections[columnName, columnValueString] = selectData
                    NP("logical_and", selection, dataSelections[columnName, columnValueString], selection)

                    # one cached mask array per column name ("missing" has only one possible value, though I consider any non-VALID "missing")
                    if columnName not in missingSelections and dataColumn.mask is not None:
                        missingSelections[columnName] = NP(dataColumn.mask != defs.VALID)
                        
            # set the intersection to the output value
            data[selection] = outputValue
            NP("logical_or", coverage, selection, coverage)
        
        missing = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
        for missingSelection in missingSelections.values():
            NP("logical_or", missing, missingSelection, missing)

        # This used to be "coverage -= missing".  Legacy NumPy evaluated
        # subtraction of boolean arrays as an element-wise XOR, while current
        # NumPy raises TypeError for it.  Spelling the XOR out keeps the
        # historical results and works on both.
        NP("logical_xor", coverage, missing, coverage)

        mask = missing * defs.MISSING

        data, mask = FieldCastMethods.applyMapMissingTo(fieldType, data, mask, self.get("mapMissingTo"))

        if defaultValue is None:
            # invert in place: from here on, True marks entries to flag as MISSING
            NP("logical_not", coverage, coverage)
            if mask is None:
                mask = NP(coverage * defs.MISSING)
            else:
                mask[coverage] = defs.MISSING

        performanceTable.end("MapValues")
        return DataColumn(fieldType, data, mask)
コード例 #15
0
ファイル: OutputField.py プロジェクト: soedjais/augustus
    def format(self, subTable, functionTable, performanceTable, score):
        """Extract or post-process output for the output field of a DataTable.

        The C{feature} attribute selects where the output comes from: an
        existing field of C{subTable} (no feature), the model's score, a
        child expression, or a Decisions lookup.  Unless the feature is
        absent, the result is also stored in C{subTable.fields} under the
        C{displayName} (falling back to C{name}).

        @type subTable: DataTable
        @param subTable: The DataTable associated with this local lexical scope.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.  Its begin/end methods are called unconditionally, so it must not be None.
        @type score: dict
        @param score: Dictionary mapping PMML score "feature" strings to DataColumns.  This dictionary always contains a None key, which is the basic feature ("predictedValue").
        @rtype: DataColumn
        @return: The output that would go into an output field of a DataTable.
        @raise PmmlValidationError: If a required child (expression or Decisions block) is absent, or if the feature is not available in C{score}.
        """

        performanceTable.begin("OutputField")

        feature = self.get("feature")
        if feature is None:
            dataColumn = subTable.fields[self["name"]]

        elif feature == "predictedValue":
            dataColumn = score[None]

        elif feature == "predictedDisplayValue":
            original = score[None]
            toString = original.fieldType.valueToString
            data = NP("empty", len(subTable), dtype=NP.dtype(object))
            for i, x in enumerate(original.data):
                data[i] = toString(x)
            dataColumn = DataColumn(FakeFieldType("string", "continuous"),
                                    data, None)

        elif feature == "transformedValue":
            expression = self.childOfClass(PmmlExpression)
            if expression is None:
                raise defs.PmmlValidationError(
                    "OutputField with feature \"transformedValue\" requires an EXPRESSION"
                )

            # the expression keeps its own timers, so stop charging this one while it runs
            performanceTable.pause("OutputField")
            dataColumn = expression.evaluate(subTable, functionTable,
                                             performanceTable)
            performanceTable.unpause("OutputField")

        elif feature == "decision":
            decisions = self.childOfTag("Decisions")
            if decisions is None:
                raise defs.PmmlValidationError(
                    "OutputField with feature \"decision\" requires a Decisions block"
                )

            # same check as for "transformedValue": fail with a validation
            # error rather than an AttributeError on None
            expression = self.childOfClass(PmmlExpression)
            if expression is None:
                raise defs.PmmlValidationError(
                    "OutputField with feature \"decision\" requires an EXPRESSION"
                )

            performanceTable.pause("OutputField")
            dataColumn = expression.evaluate(subTable, functionTable,
                                             performanceTable)
            performanceTable.unpause("OutputField")

            if dataColumn.mask is None:
                valid = None
            else:
                valid = NP(dataColumn.mask == defs.VALID)

            fieldType = FakeFieldType("object", "any")
            data = NP("empty", len(subTable), dtype=fieldType.dtype)
            # start fully MISSING; entries matching a Decision are marked VALID below
            mask = NP(
                NP("ones", len(subTable), dtype=defs.maskType) * defs.MISSING)

            for decision in decisions.childrenOfTag("Decision"):
                value = dataColumn.fieldType.stringToValue(decision["value"])

                selection = NP(dataColumn.data == value)
                if valid is not None:
                    NP("logical_and", selection, valid, selection)

                # NOTE(review): assigned one element at a time, presumably so
                # that the Decision element itself is stored rather than being
                # unpacked as a sequence -- confirm before vectorizing
                for i in xrange(len(data)):
                    if selection[i]:
                        data[i] = decision

                mask[selection] = defs.VALID

            if not mask.any():
                mask = None

            dataColumn = DataColumn(fieldType, data, mask)

        elif feature in score:
            dataColumn = score[feature]

        else:
            model = self.getparent()
            if model is not None: model = model.getparent()

            if model is None:
                model = "(orphaned OutputField; no parent model)"
            else:
                model = model.t

            raise defs.PmmlValidationError(
                "Models of type %s do not produce \"%s\" features (or at least, it is not yet implemented by Augustus)"
                % (model, feature))

        # cast to the declared dataType/optype, except for the two features
        # whose output type is fixed by the branches above
        dataType = self.get("dataType", dataColumn.fieldType.dataType)
        optype = self.get("optype", dataColumn.fieldType.optype)
        if (dataType != dataColumn.fieldType.dataType
                or optype != dataColumn.fieldType.optype) and feature not in (
                    "predictedDisplayValue", "decision"):
            dataColumn = FieldCastMethods.cast(FakeFieldType(dataType, optype),
                                               dataColumn)

        if feature is not None:
            subTable.fields[self.get("displayName", self["name"])] = dataColumn

        performanceTable.end("OutputField")
        return dataColumn
コード例 #16
0
    def evaluate(self, dataTable, functionTable, performanceTable):
        """Evaluate the expression, using a DataTable as input.

        Each DiscretizeBin child assigns its C{binValue} to the valid input
        values that fall inside its Interval.  Entries that match no bin
        receive the C{defaultValue} if one is given and are MISSING
        otherwise.  MISSING and INVALID input entries keep that status in
        the output mask.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.
        @raise PmmlValidationError: If the input field is not numeric, or a binValue or Interval margin cannot be converted.
        """

        performanceTable.begin("Discretize")

        dataColumn = dataTable.fields[self["field"]]
        if dataColumn.fieldType.dataType in ("object", "string", "boolean"):
            raise defs.PmmlValidationError("Discretize requires a numeric input field, but \"%s\" is" % dataColumn.fieldType.dataType)

        fieldType = FakeFieldType(self.get("dataType", "string"), self.get("optype", self._optype))
        fieldType._newValuesAllowed = True

        defaultValue = self.get("defaultValue")
        if defaultValue is not None:
            defaultValue = fieldType.stringToValue(defaultValue)

        data = NP("empty", len(dataTable), dtype=fieldType.dtype)
        mask = NP("empty", len(dataTable), dtype=defs.maskType)
        if defaultValue is None:
            mask[:] = defs.MISSING
        else:
            data[:] = defaultValue
            mask[:] = defs.VALID

        # A mask of None means there is no per-entry mask to consult.
        # Comparing None with defs.VALID gives a scalar False, which would
        # wipe out every bin selection, so the mask is only used when present.
        inputMask = dataColumn.mask
        if inputMask is None:
            validInput = None
        else:
            validInput = NP(inputMask == defs.VALID)

        for discretizeBin in self.childrenOfTag("DiscretizeBin"):
            try:
                binValue = fieldType.stringToValue(discretizeBin["binValue"])
            except ValueError:
                raise defs.PmmlValidationError("Cannot cast DiscretizeBin binValue \"%s\" as %s %s" % (discretizeBin["binValue"], fieldType.optype, fieldType.dataType))

            fieldType.values.append(FakeFieldValue(value=binValue))

            interval = discretizeBin.childOfTag("Interval")

            closure = interval["closure"]
            leftMargin = interval.get("leftMargin")
            rightMargin = interval.get("rightMargin")
            selection = None

            if leftMargin is not None:
                try:
                    leftMargin = dataColumn.fieldType.stringToValue(leftMargin)
                except ValueError:
                    raise defs.PmmlValidationError("Improper value in Interval leftMargin specification: \"%s\"" % leftMargin)

                # the first half of the closure name describes the left edge
                if closure in ("openClosed", "openOpen"):
                    selection = NP(leftMargin < dataColumn.data)
                elif closure in ("closedOpen", "closedClosed"):
                    selection = NP(leftMargin <= dataColumn.data)

            if rightMargin is not None:
                try:
                    rightMargin = dataColumn.fieldType.stringToValue(rightMargin)
                except ValueError:
                    raise defs.PmmlValidationError("Improper value in Interval rightMargin specification: \"%s\"" % rightMargin)

                # the second half of the closure name describes the right edge
                belowRight = None
                if closure in ("openOpen", "closedOpen"):
                    belowRight = NP(dataColumn.data < rightMargin)
                elif closure in ("openClosed", "closedClosed"):
                    belowRight = NP(dataColumn.data <= rightMargin)

                if belowRight is not None:
                    if selection is None:
                        selection = belowRight
                    else:
                        NP("logical_and", selection, belowRight, selection)
                
            if selection is not None:
                if validInput is not None:
                    NP("logical_and", selection, validInput, selection)
                data[selection] = binValue
                mask[selection] = defs.VALID

        # carry the input's MISSING/INVALID status over to the output
        if inputMask is not None:
            mask[NP(inputMask == defs.MISSING)] = defs.MISSING
            mask[NP(inputMask == defs.INVALID)] = defs.INVALID

        data, mask = FieldCastMethods.applyMapMissingTo(fieldType, data, mask, self.get("mapMissingTo"))
        
        performanceTable.end("Discretize")
        return DataColumn(fieldType, data, mask)
コード例 #17
0
ファイル: MiningField.py プロジェクト: soedjais/augustus
    def replaceField(self, dataTable, functionTable, performanceTable):
        """Replace a field in the DataTable for outlier removal,
        missing value handling, and invalid value treatment.

        Does nothing if the DataTable has no field with this MiningField's
        name.  Otherwise the field is replaced in C{dataTable.fields} with a
        new DataColumn; nothing is returned.

        @type dataTable: DataTable
        @param dataTable: The pre-built DataTable.
        @type functionTable: FunctionTable
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        """

        dataColumn = dataTable.fields.get(self.name)
        if dataColumn is None:
            return

        performanceTable.begin("MiningField")

        optype = self.get("optype", dataColumn.fieldType.optype)
        if optype != dataColumn.fieldType.optype:
            dataColumn = FieldCastMethods.cast(
                FakeFieldType(dataColumn.fieldType.dataType, optype),
                dataColumn)

        data = dataColumn.data
        mask = dataColumn.mask

        outliers = self.get("outliers")

        lowValue = self.get("lowValue")
        if lowValue is not None:
            lowValue = dataColumn.fieldType.stringToValue(lowValue)

            if outliers == "asMissingValues":
                selection = NP(dataColumn.data < lowValue)
                mask = FieldCastMethods.outliersAsMissing(
                    mask, dataColumn.mask, selection)

            elif outliers == "asExtremeValues":
                selection = NP(dataColumn.data < lowValue)
                # copy once, on the first write, so the input array is never modified
                if data is dataColumn.data:
                    data = NP("copy", data)
                    data.setflags(write=True)
                data[selection] = lowValue

        highValue = self.get("highValue")
        if highValue is not None:
            highValue = dataColumn.fieldType.stringToValue(highValue)

            if outliers == "asMissingValues":
                selection = NP(dataColumn.data > highValue)
                mask = FieldCastMethods.outliersAsMissing(
                    mask, dataColumn.mask, selection)

            elif outliers == "asExtremeValues":
                selection = NP(dataColumn.data > highValue)
                if data is dataColumn.data:
                    data = NP("copy", data)
                    data.setflags(write=True)
                # The clamp must run even when the lowValue step has already
                # made the copy; it used to sit inside the "if" above and was
                # skipped whenever both lowValue and highValue were given.
                data[selection] = highValue

        mask = FieldCastMethods.applyInvalidValueTreatment(
            mask, self.get("invalidValueTreatment"))
        data, mask = FieldCastMethods.applyMapMissingTo(
            dataColumn.fieldType, data, mask,
            self.get("missingValueReplacement"))

        dataTable.fields.replaceField(
            self.name, DataColumn(dataColumn.fieldType, data, mask))
        performanceTable.end("MiningField")
コード例 #18
0
    def evaluate(self, dataTable, functionTable, performanceTable):
        """Evaluate the expression, using a DataTable as input.

        The LinearNorm children are sorted by C{orig} and applied as a
        piecewise transformation through C{transformSelection}.  Values
        below the first or above the last C{orig} are handled according to
        the C{outliers} attribute: marked missing ("asMissingValues"),
        clamped to the end C{norm} ("asExtremeValues"), or otherwise
        transformed with the nearest pair of LinearNorms.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.
        @raise PmmlValidationError: If the input field is not numeric.
        """

        performanceTable.begin("NormContinuous")

        dataColumn = dataTable.fields[self["field"]]
        if dataColumn.fieldType.dataType in ("object", "string", "boolean"):
            raise defs.PmmlValidationError(
                "NormContinuous requires a numeric input field, but \"%s\" is"
                % dataColumn.fieldType.dataType)

        outliers = self.get("outliers")

        linearNorms = self.childrenOfTag("LinearNorm")
        for linearNorm in linearNorms:
            linearNorm.orig = float(linearNorm["orig"])
            linearNorm.norm = float(linearNorm["norm"])

        # technically, it's invalid if not already sorted
        linearNorms.sort(key=lambda linearNorm: linearNorm.orig)

        data = NP("empty", len(dataTable), self._fieldType.dtype)
        mask = dataColumn.mask

        # at or before the first LinearNorm
        first = linearNorms[0]
        if outliers == "asMissingValues":
            # Only values strictly below the first orig are outliers.  A value
            # exactly on it is in range, and the intervals in the loop below
            # are open on the left, so it is given its norm here.
            mask = FieldCastMethods.outliersAsMissing(
                mask, dataColumn.mask, NP(dataColumn.data < first.orig))
            data[NP(dataColumn.data == first.orig)] = first.norm
        elif outliers == "asExtremeValues":
            data[NP(dataColumn.data <= first.orig)] = first.norm
        else:
            # extrapolate before the first
            self.transformSelection(first, linearNorms[1], dataColumn.data,
                                    data, NP(dataColumn.data <= first.orig))

        # each interval is open on the left and closed on the right
        for i in xrange(len(linearNorms) - 1):
            selection = NP(linearNorms[i].orig < dataColumn.data)
            NP("logical_and", selection,
               NP(dataColumn.data <= linearNorms[i + 1].orig), selection)

            self.transformSelection(linearNorms[i], linearNorms[i + 1],
                                    dataColumn.data, data, selection)

        # after the last LinearNorm
        selection = NP(linearNorms[-1].orig < dataColumn.data)
        if outliers == "asMissingValues":
            mask = FieldCastMethods.outliersAsMissing(mask, dataColumn.mask,
                                                      selection)
        elif outliers == "asExtremeValues":
            data[selection] = linearNorms[-1].norm
        else:
            self.transformSelection(linearNorms[-2], linearNorms[-1],
                                    dataColumn.data, data, selection)

        data, mask = FieldCastMethods.applyMapMissingTo(
            self._fieldType, data, mask, self.get("mapMissingTo"))

        performanceTable.end("NormContinuous")
        return DataColumn(self._fieldType, data, mask)