def evaluate(self, dataTable, functionTable, performanceTable):
    """Evaluate the expression, using a DataTable as input.

    The input field is mapped through the piecewise-linear function
    defined by this element's LinearNorm children (sorted by C{orig}).
    Values outside the range spanned by the first and last C{orig} are
    handled according to the C{outliers} attribute:

      - C{"asMissingValues"}: the out-of-range selection is handed to
        C{FieldCastMethods.outliersAsMissing}.
      - C{"asExtremeValues"}: the output is set to the C{norm} of the
        nearest end point.
      - anything else: the first/last segment is extrapolated.

    A value exactly equal to the first or last C{orig} is inside the
    range and is never treated as an outlier.

    @type dataTable: DataTable
    @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
    @type functionTable: FunctionTable
    @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @rtype: DataColumn
    @return: The result of the calculation as a DataColumn.
    @raise PmmlValidationError: If the input field is not numeric or if fewer than two LinearNorm elements are present.
    """

    performanceTable.begin("NormContinuous")

    dataColumn = dataTable.fields[self["field"]]
    if dataColumn.fieldType.dataType in ("object", "string", "boolean"):
        raise defs.PmmlValidationError("NormContinuous requires a numeric input field, but \"%s\" is" % dataColumn.fieldType.dataType)

    outliers = self.get("outliers")

    linearNorms = self.childrenOfTag("LinearNorm")
    if len(linearNorms) < 2:
        # Every branch below indexes two end points; fail with a clear
        # validation error rather than an IndexError.
        raise defs.PmmlValidationError("NormContinuous requires at least two LinearNorm elements, but %d found" % len(linearNorms))

    for linearNorm in linearNorms:
        linearNorm.orig = float(linearNorm["orig"])
        linearNorm.norm = float(linearNorm["norm"])

    # technically, it's invalid if not already sorted
    linearNorms.sort(key=lambda x: x.orig)

    first = linearNorms[0]
    last = linearNorms[-1]

    data = NP("empty", len(dataTable), self._fieldType.dtype)
    mask = dataColumn.mask

    # at or before the first LinearNorm
    selection = NP(dataColumn.data <= first.orig)
    if outliers == "asMissingValues":
        # The boundary value itself is in range: give it the first norm
        # and mark only the strictly smaller values as outliers.  (Rows
        # below the boundary also receive a value here, but they are
        # passed to outliersAsMissing immediately afterwards.)
        data[selection] = first.norm
        mask = FieldCastMethods.outliersAsMissing(mask, dataColumn.mask, NP(dataColumn.data < first.orig))
    elif outliers == "asExtremeValues":
        data[selection] = first.norm
    else:
        # extrapolate the first segment
        self.transformSelection(first, linearNorms[1], dataColumn.data, data, selection)

    # interpolate within each (lower.orig, upper.orig] interval
    for lower, upper in zip(linearNorms[:-1], linearNorms[1:]):
        selection = NP(lower.orig < dataColumn.data)
        NP("logical_and", selection, NP(dataColumn.data <= upper.orig), selection)
        self.transformSelection(lower, upper, dataColumn.data, data, selection)

    # strictly after the last LinearNorm
    selection = NP(last.orig < dataColumn.data)
    if outliers == "asMissingValues":
        mask = FieldCastMethods.outliersAsMissing(mask, dataColumn.mask, selection)
    elif outliers == "asExtremeValues":
        data[selection] = last.norm
    else:
        # extrapolate the last segment
        self.transformSelection(linearNorms[-2], last, dataColumn.data, data, selection)

    data, mask = FieldCastMethods.applyMapMissingTo(self._fieldType, data, mask, self.get("mapMissingTo"))

    performanceTable.end("NormContinuous")
    return DataColumn(self._fieldType, data, mask)
def replaceField(self, dataTable, functionTable, performanceTable):
    """Apply this MiningField's value treatments to its field in the DataTable.

    The field named C{self.name} is looked up in C{dataTable.fields};
    if it is absent, nothing is done.  Otherwise the column is
    optionally re-cast to this element's C{optype}, outliers beyond
    C{lowValue}/C{highValue} are handled per the C{outliers}
    attribute, the C{invalidValueTreatment} and
    C{missingValueReplacement} attributes are applied, and the
    resulting column replaces the original one in the DataTable.

    With C{outliers="asExtremeValues"} the clipping is done on a copy
    of the data array, not on the array held by the original column.

    @type dataTable: DataTable
    @param dataTable: The pre-built DataTable.
    @type functionTable: FunctionTable
    @param functionTable: A table of functions.
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    """

    column = dataTable.fields.get(self.name)
    if column is None:
        return

    performanceTable.begin("MiningField")

    requestedOptype = self.get("optype", column.fieldType.optype)
    if requestedOptype != column.fieldType.optype:
        column = FieldCastMethods.cast(FakeFieldType(column.fieldType.dataType, requestedOptype), column)

    values = column.data
    missing = column.mask
    treatment = self.get("outliers")

    # The low and high bounds are handled identically except for the
    # direction of the comparison; the low bound is processed first.
    for attribute, isLowerBound in (("lowValue", True), ("highValue", False)):
        bound = self.get(attribute)
        if bound is None:
            continue
        bound = column.fieldType.stringToValue(bound)

        if treatment != "asMissingValues" and treatment != "asExtremeValues":
            continue

        if isLowerBound:
            outside = NP(column.data < bound)
        else:
            outside = NP(column.data > bound)

        if treatment == "asMissingValues":
            missing = FieldCastMethods.outliersAsMissing(missing, column.mask, outside)
        else:
            # copy once, before the first write, so the column's own
            # array is left untouched
            if values is column.data:
                values = NP("copy", values)
                values.setflags(write=True)
            values[outside] = bound

    missing = FieldCastMethods.applyInvalidValueTreatment(missing, self.get("invalidValueTreatment"))
    values, missing = FieldCastMethods.applyMapMissingTo(column.fieldType, values, missing, self.get("missingValueReplacement"))

    dataTable.fields.replaceField(self.name, DataColumn(column.fieldType, values, missing))
    performanceTable.end("MiningField")
def replaceField(self, dataTable, functionTable, performanceTable):
    """Apply this MiningField's value treatments to its field in the DataTable.

    The field named C{self.name} is looked up in C{dataTable.fields};
    if it is absent, nothing is done.  Otherwise the column is
    optionally re-cast to this element's C{optype}, outliers beyond
    C{lowValue}/C{highValue} are handled per the C{outliers}
    attribute, the C{invalidValueTreatment} and
    C{missingValueReplacement} attributes are applied, and the
    resulting column replaces the original one in the DataTable.

    With C{outliers="asExtremeValues"} the clipping is done on a copy
    of the data array, not on the array held by the original column.

    @type dataTable: DataTable
    @param dataTable: The pre-built DataTable.
    @type functionTable: FunctionTable
    @param functionTable: A table of functions.
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    """

    column = dataTable.fields.get(self.name)
    if column is None:
        return

    performanceTable.begin("MiningField")

    requestedOptype = self.get("optype", column.fieldType.optype)
    if requestedOptype != column.fieldType.optype:
        column = FieldCastMethods.cast(
            FakeFieldType(column.fieldType.dataType, requestedOptype), column)

    values = column.data
    missing = column.mask
    treatment = self.get("outliers")

    # The low and high bounds are handled identically except for the
    # direction of the comparison; the low bound is processed first.
    for attribute, isLowerBound in (("lowValue", True), ("highValue", False)):
        bound = self.get(attribute)
        if bound is None:
            continue
        bound = column.fieldType.stringToValue(bound)

        if treatment != "asMissingValues" and treatment != "asExtremeValues":
            continue

        if isLowerBound:
            outside = NP(column.data < bound)
        else:
            outside = NP(column.data > bound)

        if treatment == "asMissingValues":
            missing = FieldCastMethods.outliersAsMissing(
                missing, column.mask, outside)
        else:
            # copy once, before the first write, so the column's own
            # array is left untouched
            if values is column.data:
                values = NP("copy", values)
                values.setflags(write=True)
            values[outside] = bound

    missing = FieldCastMethods.applyInvalidValueTreatment(
        missing, self.get("invalidValueTreatment"))
    values, missing = FieldCastMethods.applyMapMissingTo(
        column.fieldType, values, missing,
        self.get("missingValueReplacement"))

    dataTable.fields.replaceField(
        self.name, DataColumn(column.fieldType, values, missing))
    performanceTable.end("MiningField")
def evaluate(self, dataTable, functionTable, performanceTable):
    """Evaluate the expression, using a DataTable as input.

    The input field is mapped through the piecewise-linear function
    defined by this element's LinearNorm children (sorted by C{orig}).
    Values outside the range spanned by the first and last C{orig} are
    handled according to the C{outliers} attribute:

      - C{"asMissingValues"}: the out-of-range selection is handed to
        C{FieldCastMethods.outliersAsMissing}.
      - C{"asExtremeValues"}: the output is set to the C{norm} of the
        nearest end point.
      - anything else: the first/last segment is extrapolated.

    A value exactly equal to the first or last C{orig} is inside the
    range and is never treated as an outlier.

    @type dataTable: DataTable
    @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
    @type functionTable: FunctionTable
    @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @rtype: DataColumn
    @return: The result of the calculation as a DataColumn.
    @raise PmmlValidationError: If the input field is not numeric or if fewer than two LinearNorm elements are present.
    """

    performanceTable.begin("NormContinuous")

    dataColumn = dataTable.fields[self["field"]]
    if dataColumn.fieldType.dataType in ("object", "string", "boolean"):
        raise defs.PmmlValidationError(
            "NormContinuous requires a numeric input field, but \"%s\" is"
            % dataColumn.fieldType.dataType)

    outliers = self.get("outliers")

    linearNorms = self.childrenOfTag("LinearNorm")
    if len(linearNorms) < 2:
        # Every branch below indexes two end points; fail with a clear
        # validation error rather than an IndexError.
        raise defs.PmmlValidationError(
            "NormContinuous requires at least two LinearNorm elements, but %d found"
            % len(linearNorms))

    for linearNorm in linearNorms:
        linearNorm.orig = float(linearNorm["orig"])
        linearNorm.norm = float(linearNorm["norm"])

    # technically, it's invalid if not already sorted
    linearNorms.sort(key=lambda x: x.orig)

    first = linearNorms[0]
    last = linearNorms[-1]

    data = NP("empty", len(dataTable), self._fieldType.dtype)
    mask = dataColumn.mask

    # at or before the first LinearNorm
    selection = NP(dataColumn.data <= first.orig)
    if outliers == "asMissingValues":
        # The boundary value itself is in range: give it the first norm
        # and mark only the strictly smaller values as outliers.  (Rows
        # below the boundary also receive a value here, but they are
        # passed to outliersAsMissing immediately afterwards.)
        data[selection] = first.norm
        mask = FieldCastMethods.outliersAsMissing(
            mask, dataColumn.mask, NP(dataColumn.data < first.orig))
    elif outliers == "asExtremeValues":
        data[selection] = first.norm
    else:
        # extrapolate the first segment
        self.transformSelection(first, linearNorms[1], dataColumn.data,
                                data, selection)

    # interpolate within each (lower.orig, upper.orig] interval
    for lower, upper in zip(linearNorms[:-1], linearNorms[1:]):
        selection = NP(lower.orig < dataColumn.data)
        NP("logical_and", selection, NP(dataColumn.data <= upper.orig),
           selection)
        self.transformSelection(lower, upper, dataColumn.data, data,
                                selection)

    # strictly after the last LinearNorm
    selection = NP(last.orig < dataColumn.data)
    if outliers == "asMissingValues":
        mask = FieldCastMethods.outliersAsMissing(mask, dataColumn.mask,
                                                  selection)
    elif outliers == "asExtremeValues":
        data[selection] = last.norm
    else:
        # extrapolate the last segment
        self.transformSelection(linearNorms[-2], last, dataColumn.data,
                                data, selection)

    data, mask = FieldCastMethods.applyMapMissingTo(
        self._fieldType, data, mask, self.get("mapMissingTo"))

    performanceTable.end("NormContinuous")
    return DataColumn(self._fieldType, data, mask)