def _lint(self, examples):
    """Returns the result of the `NonNormalNumericFeatureDetector` linter.

    Gathers the mean and std_dev of every numeric feature that is not
    ignored, summarizes them via `_get_trimmed_stats`, and flags each
    feature whose mean or std_dev z-score against that summary exceeds
    `self._max_deviance`.

    Args:
      examples: A `PTransform` that yields a `PCollection` of `tf.Example`s.
        Not read by this method; only `self._stats` is consulted.

    Returns:
      A `LintResult` of the format
        warnings: [`WARNING_FMT` filled with the feature name and the
                   comma-joined offending statistics ('mean', 'std_dev')]
        lint_sample: [
          stats: {mean, std_dev}  # for a "typical" numeric feature
          stats: {mean, std_dev, min, max}  # for each flagged feature
        ]
    """
    result = self._make_result()
    numeric_features = utils.get_numeric_features(self._stats)

    # The ignore list is matched against the lower-cased feature name, the
    # same way `should_run` matches it, so that e.g. a mixed-case name is
    # not skipped when deciding to run but then analysed here anyway.
    numeric_feature_stats = [
        feature_stats
        for feature_stats in self._stats.features
        if feature_stats.name in numeric_features
        and feature_stats.name.lower() not in self.IGNORE_FEATURE_NAMES
    ]
    feature_means = [fs.num_stats.mean for fs in numeric_feature_stats]
    feature_std_devs = [fs.num_stats.std_dev for fs in numeric_feature_stats]

    # Summary statistics over the population of per-feature means and
    # per-feature standard deviations.
    means_trimmed_mean, means_trimmed_std = self._get_trimmed_stats(
        feature_means)
    std_devs_trimmed_mean, std_devs_trimmed_std = self._get_trimmed_stats(
        feature_std_devs)

    # The first lint sample always describes the "typical" numeric feature.
    typical_stats = lint_result_pb2.Statistics(
        id=self._TYPICAL_STATS_ID,
        mean=means_trimmed_mean,
        std_dev=std_devs_trimmed_mean)
    result.lint_samples.add(stats=[typical_stats])

    for feature_stats in numeric_feature_stats:
        num_stats = feature_stats.num_stats
        mean_deviance = utils.get_zscore(
            num_stats.mean, means_trimmed_mean, means_trimmed_std)
        std_dev_deviance = utils.get_zscore(
            num_stats.std_dev, std_devs_trimmed_mean, std_devs_trimmed_std)

        warnings = []
        if mean_deviance > self._max_deviance:
            warnings.append('mean')
        if std_dev_deviance > self._max_deviance:
            warnings.append('std_dev')
        if not warnings:
            continue

        result.warnings.append(
            self.WARNING_FMT.format(feature_stats.name, ','.join(warnings)))
        result.lint_samples.add(stats=[lint_result_pb2.Statistics(
            id=feature_stats.name,
            mean=num_stats.mean,
            std_dev=num_stats.std_dev,
            min=num_stats.min,
            max=num_stats.max)])

    return result
def __init__(self, stats, enum_threshold=20, ignore_strings=True):
    """Constructs a `EnumDetector` linter.

    Args:
      stats: A `DatasetFeatureStatisticsList` proto describing the examples.
      enum_threshold: Number of unique values above which a feature will be
        regarded as real valued rather than as an enum.
      ignore_strings: Whether to assume that strings are already enums.
    """
    super(EnumDetector, self).__init__()
    # Configuration knobs.
    self._ignore_strings = ignore_strings
    self._enum_threshold = enum_threshold
    # Dataset statistics and the numeric-feature lookup derived from them.
    self._stats = stats
    self._numeric_features = utils.get_numeric_features(stats)
def _lint(self, examples):
    """Returns the result of the CircularDomainDetector linter.

    A feature is reported when it is numeric and `_name_is_suspicious`
    accepts its name.

    Args:
      examples: A `PTransform` that yields a `PCollection` of `tf.Example`s.
        Not read by this method.

    Returns:
      A `LintResult` of the format
        warnings: [feature names]
        lint_sample: None
    """
    result = self._make_result()
    numeric_names = utils.get_numeric_features(self._stats)
    flagged = [
        feature.name
        for feature in self._stats.features
        if feature.name in numeric_names
        and self._name_is_suspicious(feature.name)
    ]
    result.warnings.extend(flagged)
    return result
def _lint(self, examples):
    """Returns the result of the `ZipCodeAsNumberDetector` linter.

    A feature is reported when it is numeric and its lower-cased name
    matches `_ZIP_RE`.

    Args:
      examples: A `PTransform` that yields a `PCollection` of `tf.Example`s.
        Not read by this method.

    Returns:
      A `LintResult` of the format
        warnings: [feature names]
        lint_samples: None
    """
    result = self._make_result()
    numeric_names = utils.get_numeric_features(self._stats)
    for feature in self._stats.features:
        feature_name = feature.name
        if feature_name not in numeric_names:
            continue
        if not self._ZIP_RE.search(feature_name.lower()):
            continue
        result.warnings.append(feature_name)
    return result
def __init__(self, stats, z_score_threshold=0.5):
    """Constructs a TailedDistributionDetector linter.

    Args:
      stats: A `DatasetFeatureStatisticsList` proto describing the examples.
      z_score_threshold: The z-score of the min/max-trimmed mean (using the
        un-trimmed standard deviation) above which a feature's distribution
        will be considered tailed.
    """
    super(TailedDistributionDetector, self).__init__()
    self._z_score_threshold = z_score_threshold
    self._stats = stats

    numeric_names = utils.get_numeric_features(stats)
    self.numeric_features = numeric_names
    # Map each numeric feature's name to its `num_stats` for later lookup.
    self.feature_num_stats = {
        feature.name: feature.num_stats
        for feature in stats.features
        if feature.name in numeric_names
    }
def should_run(self):
    """Returns True iff the stats contain at least one numeric feature."""
    numeric_features = utils.get_numeric_features(self._stats)
    return bool(numeric_features)
def _counted_features(self):
    """Returns the numeric features of `self._stats`."""
    numeric_features = utils.get_numeric_features(self._stats)
    return numeric_features
def should_run(self):
    """Returns True iff some numeric feature is not in the ignore list.

    Feature names are lower-cased before being checked against
    `IGNORE_FEATURE_NAMES`.
    """
    for feature_name in utils.get_numeric_features(self._stats):
        if feature_name.lower() not in self.IGNORE_FEATURE_NAMES:
            return True
    return False