def applyInvalidValueTreatment(mask, invalidValueTreatment, overwrite=False):
    """Relabel INVALID mask entries as MISSING when the treatment is "asMissing".

    When there is no mask, or when the treatment is anything other than
    "asMissing", the input is returned untouched.  Otherwise the INVALID
    entries are rewritten to MISSING, either in a fresh copy (the default)
    or directly in the caller's mask (when C{overwrite} is True).

    Example use::

        mask = dataColumn.mask
        mask = FieldCastMethods.applyInvalidValueTreatment(mask, pmml.get("invalidValueTreatment"))
        return DataColumn(dataColumn.fieldType, dataColumn.data, mask)

    It can also be used in conjunction with other FieldCastMethods.

    @type mask: 1d Numpy array of dtype defs.maskType, or None
    @param mask: The mask.
    @type invalidValueTreatment: string
    @param invalidValueTreatment: One of "returnInvalid", "asIs", "asMissing"; only "asMissing" has an effect.
    @type overwrite: bool
    @param overwrite: If True, unlock the original mask and overwrite it in place; the write flag is not switched back off here.
    @rtype: 1d Numpy array of dtype defs.maskType, or None
    @return: The input itself when nothing is done or C{overwrite} is True; otherwise a modified copy.
    """
    nothingToDo = mask is None or invalidValueTreatment != "asMissing"
    if nothingToDo:
        return mask

    # Work on a private copy unless the caller explicitly asked for in-place edits.
    result = mask if overwrite else NP("copy", mask)
    result.setflags(write=True)

    invalidRows = NP(result == defs.INVALID)
    result[invalidRows] = defs.MISSING
    return result
def maskInvalid(self, data, mask):
    """Flag NaN and infinite results as INVALID after a risky numerical operation.

    Example::

        result = NP("log", dataColumn.data)    # log(0) = -inf, log(-x) = nan
        resultMask = self.maskInvalid(result, dataColumn.mask)
        return DataColumn(fakeFieldType, result, resultMask)

    C{data} is only read.  A mask that is not writeable is copied before it
    is changed; a mask that is already writeable is updated in place.  Only
    rows whose mask entry currently equals C{defs.VALID} are relabelled, so
    existing non-VALID labels are kept.  If the resulting mask has no
    nonzero entry at all, None is returned in its place.

    @type data: 1d Numpy array
    @param data: The dataset that may contain NaN and infinite values.
    @type mask: 1d Numpy array of C{defs.maskType}, or None
    @param mask: The original mask.
    @rtype: 1d Numpy array of C{defs.maskType}, or None
    @return: The new mask.
    """
    nonFinite = NP("logical_not", NP("isfinite", data))

    if nonFinite.any():
        if mask is not None:
            # Narrow the selection (in place, it is our own temporary) to
            # rows that are still VALID so other labels are not clobbered.
            NP("logical_and", nonFinite, NP(mask == defs.VALID), nonFinite)
            if not mask.flags.writeable:
                mask = NP("copy", mask)
                mask.setflags(write=True)
            mask[nonFinite] = defs.INVALID
        else:
            # No mask yet: build one straight from the boolean selection.
            mask = nonFinite * defs.INVALID

    # A mask without any nonzero entry carries no information; report None.
    if mask is None or mask.any():
        return mask
    return None
def outliersAsMissing(mask, originalMask, selection, overwrite=False):
    """Mark the selected rows as MISSING, leaving non-VALID rows as they are.

    Three situations are distinguished:

      - no mask yet: a new mask is built from C{selection} alone;
      - C{mask} is still the very same object as C{originalMask}: it is
        copied first (or unlocked and reused when C{overwrite} is True);
      - C{mask} is already a different object: it is written to directly
        (presumably a working copy made by an earlier call; it must be
        writeable).

    Whenever a mask is present, C{selection} is narrowed B{in place} to the
    rows whose mask entry equals C{defs.VALID} before being applied.

    Example use::

        mask = dataColumn.mask
        mask = FieldCastMethods.outliersAsMissing(mask, dataColumn.mask, dataColumn.data < MINIMUM_CUT)
        mask = FieldCastMethods.outliersAsMissing(mask, dataColumn.mask, dataColumn.data > MAXIMUM_CUT)
        return DataColumn(dataColumn.fieldType, dataColumn.data, mask)

    It can also be used in conjunction with other FieldCastMethods.

    @type mask: 1d Numpy array of type defs.maskType, or None
    @param mask: The mask to be updated.
    @type originalMask: 1d Numpy array of type defs.maskType, or None
    @param originalMask: The original mask.
    @type selection: 1d Numpy array of bool
    @param selection: The rows to label as MISSING.
    @type overwrite: bool
    @param overwrite: If True, unlock and overwrite the original mask; the write flag is not switched back off here.
    @rtype: 1d Numpy array of type defs.maskType
    @return: The new mask.
    """
    if mask is None:
        return selection * defs.MISSING

    # Keep only rows that are currently VALID; the result is stored back
    # into the caller's selection array.
    stillValid = NP(mask == defs.VALID)
    NP("logical_and", selection, stillValid, selection)

    if mask is originalMask:
        # Never touch the original unless explicitly permitted.
        if not overwrite:
            mask = NP("copy", mask)
        mask.setflags(write=True)

    mask[selection] = defs.MISSING
    return mask
def replaceField(self, dataTable, functionTable, performanceTable):
    """Replace a field in the DataTable for outlier removal, missing
    value handling, and invalid value treatment.

    The field is looked up by C{self.name}; if the DataTable has no such
    field the method returns immediately without touching the
    PerformanceTable.  Otherwise the column is optionally cast to this
    element's C{optype}, the C{lowValue}/C{highValue} outlier rules are
    applied, then the invalid-value treatment and the missing-value
    replacement, and the resulting column is stored back under the same
    name.

    @type dataTable: DataTable
    @param dataTable: The pre-built DataTable.
    @type functionTable: FunctionTable
    @param functionTable: A table of functions (not used by this method).
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    """
    dataColumn = dataTable.fields.get(self.name)
    if dataColumn is None:
        return

    performanceTable.begin("MiningField")

    requestedOptype = self.get("optype", dataColumn.fieldType.optype)
    if requestedOptype != dataColumn.fieldType.optype:
        castType = FakeFieldType(dataColumn.fieldType.dataType, requestedOptype)
        dataColumn = FieldCastMethods.cast(castType, dataColumn)

    data, mask = dataColumn.data, dataColumn.mask
    outliers = self.get("outliers")

    # The low and high cuts are handled identically apart from the
    # direction of the comparison.
    for attribute, isLowerBound in (("lowValue", True), ("highValue", False)):
        cut = self.get(attribute)
        if cut is None:
            continue
        cut = dataColumn.fieldType.stringToValue(cut)

        if outliers not in ("asMissingValues", "asExtremeValues"):
            continue

        if isLowerBound:
            selection = NP(dataColumn.data < cut)
        else:
            selection = NP(dataColumn.data > cut)

        if outliers == "asMissingValues":
            mask = FieldCastMethods.outliersAsMissing(mask, dataColumn.mask, selection)
        else:
            # Clamp to the cut value, copying the column's array the
            # first time so the original data is left alone.
            if data is dataColumn.data:
                data = NP("copy", data)
                data.setflags(write=True)
            data[selection] = cut

    mask = FieldCastMethods.applyInvalidValueTreatment(mask, self.get("invalidValueTreatment"))
    data, mask = FieldCastMethods.applyMapMissingTo(
        dataColumn.fieldType, data, mask, self.get("missingValueReplacement"))

    replacement = DataColumn(dataColumn.fieldType, data, mask)
    dataTable.fields.replaceField(self.name, replacement)

    performanceTable.end("MiningField")
def replaceField(self, dataTable, functionTable, performanceTable):
    """Replace a field in the DataTable for outlier removal, missing
    value handling, and invalid value treatment.

    Nothing happens (and the PerformanceTable is not touched) when the
    DataTable has no field called C{self.name}.  Otherwise the steps are,
    in order: optional cast to this element's C{optype}; C{lowValue} then
    C{highValue} outlier handling according to C{outliers}; invalid value
    treatment; missing value replacement; and finally the rebuilt column is
    written back to the DataTable under the same name.

    @type dataTable: DataTable
    @param dataTable: The pre-built DataTable.
    @type functionTable: FunctionTable
    @param functionTable: A table of functions (not used by this method).
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    """
    dataColumn = dataTable.fields.get(self.name)
    if dataColumn is None:
        return

    performanceTable.begin("MiningField")

    optype = self.get("optype", dataColumn.fieldType.optype)
    if optype != dataColumn.fieldType.optype:
        targetType = FakeFieldType(dataColumn.fieldType.dataType, optype)
        dataColumn = FieldCastMethods.cast(targetType, dataColumn)

    data = dataColumn.data
    mask = dataColumn.mask
    outliers = self.get("outliers")
    treatAsMissing = outliers == "asMissingValues"
    treatAsExtreme = (not treatAsMissing) and outliers == "asExtremeValues"

    # One entry per bound: the attribute holding the cut, and how to pick
    # the rows lying beyond it.
    bounds = (
        ("lowValue", lambda column, limit: NP(column < limit)),
        ("highValue", lambda column, limit: NP(column > limit)),
    )

    for attributeName, beyond in bounds:
        limit = self.get(attributeName)
        if limit is not None:
            limit = dataColumn.fieldType.stringToValue(limit)

            if treatAsMissing:
                outlierRows = beyond(dataColumn.data, limit)
                mask = FieldCastMethods.outliersAsMissing(
                    mask, dataColumn.mask, outlierRows)

            elif treatAsExtreme:
                outlierRows = beyond(dataColumn.data, limit)
                # Copy-on-first-write: the column's own array stays as is.
                if data is dataColumn.data:
                    data = NP("copy", data)
                    data.setflags(write=True)
                data[outlierRows] = limit

    mask = FieldCastMethods.applyInvalidValueTreatment(
        mask, self.get("invalidValueTreatment"))
    data, mask = FieldCastMethods.applyMapMissingTo(
        dataColumn.fieldType, data, mask,
        self.get("missingValueReplacement"))

    dataTable.fields.replaceField(
        self.name, DataColumn(dataColumn.fieldType, data, mask))

    performanceTable.end("MiningField")