Esempio n. 1
0
  def _lint(self, examples):
    """Runs the `NonNormalNumericFeatureDetector` linter.

    Each numeric feature that is not listed in `IGNORE_FEATURE_NAMES` has the
    z-score of its mean and of its std_dev computed against the values that
    `_get_trimmed_stats` returns for all such features. A feature is flagged
    when either z-score exceeds `self._max_deviance`.

    Args:
      examples: A `PTransform` that yields a `PCollection` of `tf.Example`s.
        Not read here; only `self._stats` is consulted.

    Returns:
      A `LintResult` of the format
        warnings: [`WARNING_FMT` applied to (feature name, flagged stats)]
        lint_samples: [
          stats: {id, mean, std_dev}  # the "typical" numeric feature, first
          stats: {id, mean, std_dev, min, max}  # one per flagged feature
        ]
    """
    result = self._make_result()
    numeric_names = utils.get_numeric_features(self._stats)

    # NOTE(review): the ignore-list membership test below is case-sensitive;
    # confirm that this is intended for `IGNORE_FEATURE_NAMES`.
    candidates = [
        fs for fs in self._stats.features
        if fs.name in numeric_names
        and fs.name not in self.IGNORE_FEATURE_NAMES
    ]
    all_means = [fs.num_stats.mean for fs in candidates]
    all_std_devs = [fs.num_stats.std_dev for fs in candidates]

    # Each call yields a (center, spread) pair used as the z-score reference.
    mean_center, mean_spread = self._get_trimmed_stats(all_means)
    std_dev_center, std_dev_spread = self._get_trimmed_stats(all_std_devs)

    # The first lint sample always describes the "typical" feature.
    result.lint_samples.add(stats=[lint_result_pb2.Statistics(
        id=self._TYPICAL_STATS_ID,
        mean=mean_center, std_dev=std_dev_center)])

    for fs in candidates:
      ns = fs.num_stats
      deviances = (
          ('mean',
           utils.get_zscore(ns.mean, mean_center, mean_spread)),
          ('std_dev',
           utils.get_zscore(ns.std_dev, std_dev_center, std_dev_spread)),
      )
      flagged = [label for label, zscore in deviances
                 if zscore > self._max_deviance]
      if not flagged:
        continue
      result.warnings.append(
          self.WARNING_FMT.format(fs.name, ','.join(flagged)))
      outlier_stats = lint_result_pb2.Statistics(
          id=fs.name,
          mean=ns.mean, std_dev=ns.std_dev,
          min=ns.min, max=ns.max)
      result.lint_samples.add(stats=[outlier_stats])

    return result
Esempio n. 2
0
  def __init__(self, stats, enum_threshold=20, ignore_strings=True):
    """Builds an `EnumDetector` linter.

    Args:
      stats: A `DatasetFeatureStatisticsList` proto describing the examples.
      enum_threshold: Number of unique values above which a feature is
                      treated as real valued rather than as an enum.
      ignore_strings: Whether to assume that strings are already enums.
    """
    super(EnumDetector, self).__init__()
    # Plain configuration, stored as given.
    self._ignore_strings = ignore_strings
    self._enum_threshold = enum_threshold
    self._stats = stats
    # Computed once here so later code can reuse the numeric feature names.
    self._numeric_features = utils.get_numeric_features(stats)
Esempio n. 3
0
  def _lint(self, examples):
    """Runs the CircularDomainDetector linter.

    A feature is reported when it is numeric (per
    `utils.get_numeric_features`) and `_name_is_suspicious` accepts its name.

    Args:
      examples: A `PTransform` that yields a `PCollection` of `tf.Example`s.
        Not read here; only `self._stats` is consulted.

    Returns:
      A `LintResult` of the format
        warnings: [feature names]
        lint_sample: None
    """
    result = self._make_result()
    numeric_names = utils.get_numeric_features(self._stats)
    # Lazily pick out the offending names, preserving feature order.
    suspicious_names = (
        feature.name for feature in self._stats.features
        if feature.name in numeric_names
        and self._name_is_suspicious(feature.name))
    for suspicious_name in suspicious_names:
      result.warnings.append(suspicious_name)
    return result
Esempio n. 4
0
  def _lint(self, examples):
    """Runs the `ZipCodeAsNumberDetector` linter.

    A feature is reported when it is numeric (per
    `utils.get_numeric_features`) and `_ZIP_RE` matches somewhere in its
    lower-cased name.

    Args:
      examples: A `PTransform` that yields a `PCollection` of `tf.Example`s.
        Not read here; only `self._stats` is consulted.

    Returns:
      A `LintResult` of the format
        warnings: [feature names]
        lint_samples: None
    """
    result = self._make_result()
    numeric_names = utils.get_numeric_features(self._stats)
    for candidate in self._stats.features:
      feature_name = candidate.name
      if feature_name not in numeric_names:
        continue
      # The name is lower-cased before it is searched with the pattern.
      if self._ZIP_RE.search(feature_name.lower()):
        result.warnings.append(feature_name)
    return result
Esempio n. 5
0
  def __init__(self, stats, z_score_threshold=0.5):
    """Builds a TailedDistributionDetector linter.

    Besides storing its arguments, this records the numeric feature names in
    `self.numeric_features` and maps each numeric feature name to its
    `num_stats` in `self.feature_num_stats`.

    Args:
      stats: A `DatasetFeatureStatisticsList` proto describing the examples.
      z_score_threshold: The z-score of the min/max-trimmed mean (using the
                         un-trimmed standard deviation) above which a feature's
                         distribution will be considered tailed.
    """
    super(TailedDistributionDetector, self).__init__()
    self._z_score_threshold = z_score_threshold
    self._stats = stats

    self.numeric_features = utils.get_numeric_features(stats)
    # Only features reported as numeric get an entry in the lookup table.
    self.feature_num_stats = {
        entry.name: entry.num_stats
        for entry in stats.features
        if entry.name in self.numeric_features
    }
Esempio n. 6
0
 def should_run(self):
   """Returns True iff the stats contain at least one numeric feature."""
   numeric_names = utils.get_numeric_features(self._stats)
   return bool(numeric_names)
Esempio n. 7
0
 def _counted_features(self):
   """Returns the numeric features of `self._stats` as the counted set."""
   counted = utils.get_numeric_features(self._stats)
   return counted
Esempio n. 8
0
 def should_run(self):
   """Returns True iff some numeric feature is not on the ignore list.

   Names are lower-cased before being checked against
   `IGNORE_FEATURE_NAMES`.
   """
   for numeric_name in utils.get_numeric_features(self._stats):
     if numeric_name.lower() not in self.IGNORE_FEATURE_NAMES:
       return True
   return False