Ejemplo n.º 1
0
    def test_get_feature(self):
        """Looks up a top-level feature by name and checks object identity."""
        schema_text = """
        feature {
          name: "feature1"
        }
        feature {
          name: "feature2"
        }
        """
        schema = text_format.Parse(schema_text, schema_pb2.Schema())

        found = schema_util.get_feature(schema, 'feature2')
        self.assertEqual(found.name, 'feature2')
        # A repeated lookup must hand back the very same feature object, not
        # a copy of it.
        found_again = schema_util.get_feature(schema, 'feature2')
        self.assertIs(found, found_again)
    def __init__(self, label_feature: types.FeaturePath,
                 schema: schema_pb2.Schema, seed: int):
        """Initializes SkLearnMutualInformation.

        Args:
          label_feature: The key used to identify labels in the ExampleBatch.
          schema: The schema of the dataset.
          seed: An int value to seed the RNG used in MI computation.

        Raises:
          ValueError: If label_feature does not exist in the schema.
        """
        self._label_feature = label_feature
        self._schema = schema
        self._categorical_features = schema_util.get_categorical_features(
            schema)
        # Validate that the label exists in the schema. The lookup is called
        # directly rather than inside an `assert` so that the check is not
        # stripped when Python runs with -O.
        schema_util.get_feature(self._schema, self._label_feature)
        self._label_feature_is_categorical = (self._label_feature
                                              in self._categorical_features)
        self._seed = seed
        # Paths of all leaf features declared in the schema.
        self._schema_features = {
            feature_path
            for feature_path, _ in schema_util.get_all_leaf_features(schema)
        }

        # Seed the RNG used for shuffling and for MI computations. Note that
        # this seeds NumPy's global RNG.
        np.random.seed(seed)
    def _convert_categorical_features_to_numeric(self, df):
        """Replaces categorical columns of `df` with integer codes, in place.

        Whether a column is categorical is decided by looking the column up in
        the schema. Each categorical column is overwritten with the inverse
        indices produced by np.unique, i.e. every distinct value is mapped to
        an integer id. The returned mask lets scikit-learn know which columns
        hold discrete features.

        Args:
          df: A pd.DataFrame containing feature values where each column
            corresponds to a feature and each row corresponds to an example.
            Categorical columns are modified in place.

        Returns:
          A boolean list where the ith element is true iff the ith feature
          column in the input df is a categorical feature.
        """
        discrete_mask = []
        for column in df:
            feature = schema_util.get_feature(self._schema, column)
            if schema_util.is_categorical_feature(feature):
                # Only the inverse indices (second return value) are needed;
                # they are the numeric encoding of the column.
                _, codes = np.unique(df[column].values, return_inverse=True)
                df[column] = codes
                discrete_mask.append(True)
            else:
                discrete_mask.append(False)
        return discrete_mask
    def _impute(self, examples):
        """Imputes missing feature values.

        Replaces missing values with CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE
        for categorical features and 10*max(feature_values) for numeric
        features. We impute missing values with an extreme value that is far
        from observed values so it does not incorrectly impact KNN results.
        10*max(feature_values) is used instead of sys.max_float because
        max_float is large enough to cause unexpected float arithmetic errors.

        If a numeric feature has no observed (non-None) values at all, there
        is no maximum to scale, so 0 is used as the fill value instead of
        raising.

        Args:
          examples: A dict where the key is the feature name and the values
            are the feature values. The dict is updated in place.

        Returns:
          The same dict, where the key is the feature name and the values are
          the feature values with missing values imputed.
        """
        for feature, feature_values in examples.items():
            if schema_util.is_categorical_feature(
                    schema_util.get_feature(self._schema, feature)):
                imputation_fill_value = CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE
            else:
                # `default=0` keeps max() from raising on an empty sequence
                # when every value of this feature is missing.
                imputation_fill_value = max(
                    (value for value in feature_values if value is not None),
                    default=0) * 10
            # Rebinding an existing key does not change the dict's size, so it
            # is safe to do while iterating over items().
            examples[feature] = [
                value if value is not None else imputation_fill_value
                for value in feature_values
            ]
        return examples
Ejemplo n.º 5
0
    def __init__(self, schema: schema_pb2.Schema, y_path: types.FeaturePath,
                 x_paths: Optional[Iterable[types.FeaturePath]],
                 y_boundaries: Optional[Iterable[float]], min_x_count: int,
                 top_k_per_y: Optional[int], bottom_k_per_y: Optional[int],
                 name: Text) -> None:
        """Sets up a lift statistics generator.

        Lift is defined as: lift = P(Y=y|X=x) / P(Y=y).

        Args:
          schema: A required schema for the dataset.
          y_path: The path to use as Y in the lift expression.
          x_paths: An optional list of paths to use as X in the lift
            expression. If None, all categorical features in the schema,
            excluding the feature passed as y_path, will be used.
          y_boundaries: An optional list of boundaries to be used for binning
            y_path. If provided with b boundaries, the binned values will be
            treated as a categorical feature with b+1 different values. For
            example, the y_boundaries value [0.1, 0.8] would lead to three
            buckets: [-inf, 0.1), [0.1, 0.8) and [0.8, inf]. Duplicates are
            dropped and the boundaries are sorted.
          min_x_count: The minimum number of examples in which a specific x
            value must appear, in order for its lift to be output.
          top_k_per_y: Optionally, the number of top x values per y value,
            ordered by descending lift, for which to output lift. If both
            top_k_per_y and bottom_k_per_y are unset, all values will be
            output.
          bottom_k_per_y: Optionally, the number of bottom x values per y
            value, ordered by descending lift, for which to output lift. If
            both top_k_per_y and bottom_k_per_y are unset, all values will be
            output.
          name: A unique name associated with the statistics generator.

        Raises:
          ValueError: If boundaries are given for a categorical y_path, or are
            missing for a non-categorical y_path.
        """
        self._name = name
        self._schema = schema
        self._y_path = y_path
        self._min_x_count = min_x_count
        self._top_k_per_y = top_k_per_y
        self._bottom_k_per_y = bottom_k_per_y

        y_is_categorical = schema_util.is_categorical_feature(
            schema_util.get_feature(schema, y_path))
        # Boundaries and a categorical y are mutually exclusive: exactly one
        # of the two must hold.
        if y_boundaries:
            if y_is_categorical:
                raise ValueError(
                    'Boundaries cannot be applied to a categorical y_path')
            unique_boundaries = sorted(set(y_boundaries))
            self._y_boundaries = np.array(unique_boundaries)
        elif y_is_categorical:
            self._y_boundaries = y_boundaries
        else:
            raise ValueError(
                'Boundaries must be provided with a non-categorical '
                'y_path.')

        if x_paths is not None:
            self._x_paths = x_paths
        else:
            # Default to every categorical feature except y itself.
            candidate_paths = set(
                schema_util.get_categorical_features(schema))
            candidate_paths.discard(y_path)
            self._x_paths = candidate_paths
Ejemplo n.º 6
0
    def test_get_feature_not_present(self):
        """Checks that looking up a name absent from the schema raises."""
        schema = text_format.Parse(
            """
        feature {
          name: "feature1"
        }
        """, schema_pb2.Schema())

        # assertRaisesRegexp is a deprecated alias that was removed in
        # Python 3.12; assertRaisesRegex is the supported spelling.
        with self.assertRaisesRegex(ValueError,
                                    'Feature.*not found in the schema.*'):
            schema_util.get_feature(schema, 'feature2')
Ejemplo n.º 7
0
 def test_get_feature_internal_step_not_struct(self):
     """Checks that a path stepping through a non-STRUCT feature raises."""
     schema = text_format.Parse(
         """
     feature {
       name: "feature1"
     }
     """, schema_pb2.Schema())
     # assertRaisesRegexp is a deprecated alias that was removed in
     # Python 3.12; assertRaisesRegex is the supported spelling.
     with self.assertRaisesRegex(
             ValueError, 'does not refer to a valid STRUCT feature'):
         schema_util.get_feature(
             schema, types.FeaturePath(['feature1', 'sub_feature2']))
Ejemplo n.º 8
0
 def test_get_feature_using_path(self):
     """Resolves a nested feature through a two-step FeaturePath."""
     schema_text = """
     feature {
       name: "feature1"
       type: STRUCT
       struct_domain {
         feature {
           name: "sub_feature1"
         }
       }
     }
     """
     schema = text_format.Parse(schema_text, schema_pb2.Schema())
     nested_path = types.FeaturePath(['feature1', 'sub_feature1'])
     resolved = schema_util.get_feature(schema, nested_path)
     # The lookup must return the feature stored inside the parent's
     # struct_domain, not a copy of it.
     expected = schema.feature[0].struct_domain.feature[0]
     self.assertIs(resolved, expected)
Ejemplo n.º 9
0
 def test_get_feature_using_path_not_present(self):
     """Checks that a path to a missing sub-feature of a STRUCT raises."""
     schema = text_format.Parse(
         """
     feature {
       name: "feature1"
       type: STRUCT
       struct_domain {
         feature {
           name: "sub_feature1"
         }
       }
     }
     """, schema_pb2.Schema())
     # assertRaisesRegexp is a deprecated alias that was removed in
     # Python 3.12; assertRaisesRegex is the supported spelling.
     with self.assertRaisesRegex(ValueError,
                                 'Feature.*not found in the schema'):
         schema_util.get_feature(
             schema, types.FeaturePath(['feature1', 'sub_feature2']))
    def __init__(self, label_feature, schema, seed):
        """Sets up SkLearnMutualInformation.

        Args:
          label_feature: The key used to identify labels in the ExampleBatch.
          schema: The schema of the dataset.
          seed: An int value to seed the RNG used in MI computation.

        Raises:
          ValueError: If label_feature does not exist in the schema.
        """
        self._label_feature = label_feature
        self._schema = schema
        # Look the label up once and record whether the schema treats it as
        # categorical.
        label = schema_util.get_feature(self._schema, self._label_feature)
        self._label_feature_is_categorical = (
            schema_util.is_categorical_feature(label))
        self._seed = seed

        # Shuffling and the MI computations draw from NumPy's global RNG, so
        # seed it here.
        np.random.seed(seed)
Ejemplo n.º 11
0
    def __init__(self, y_path: types.FeaturePath,
                 schema: Optional[schema_pb2.Schema],
                 x_paths: Optional[Iterable[types.FeaturePath]],
                 y_boundaries: Optional[Sequence[float]], min_x_count: int,
                 top_k_per_y: Optional[int], bottom_k_per_y: Optional[int],
                 weight_column_name: Optional[Text], output_custom_stats: bool,
                 name: Text) -> None:
        """Sets up a lift statistics generator.

        Lift is defined as: lift = P(Y=y|X=x) / P(Y=y).

        Args:
          y_path: The path to use as Y in the lift expression.
          schema: An optional schema for the dataset. If not provided, x_paths
            must be specified. If x_paths are not specified, the schema is
            used to identify all categorical columns for which Lift should be
            computed.
          x_paths: An optional list of paths to use as X in the lift
            expression. If None, all categorical features in the schema,
            excluding the feature passed as y_path, will be used.
          y_boundaries: An optional list of boundaries to be used for binning
            y_path. If provided with b boundaries, the binned values will be
            treated as a categorical feature with b+1 different values. For
            example, the y_boundaries value [0.1, 0.8] would lead to three
            buckets: [-inf, 0.1), [0.1, 0.8) and [0.8, inf]. Duplicates are
            dropped and the boundaries are sorted; an empty or None value is
            stored as None.
          min_x_count: The minimum number of examples in which a specific x
            value must appear, in order for its lift to be output.
          top_k_per_y: Optionally, the number of top x values per y value,
            ordered by descending lift, for which to output lift. If both
            top_k_per_y and bottom_k_per_y are unset, all values will be
            output.
          bottom_k_per_y: Optionally, the number of bottom x values per y
            value, ordered by descending lift, for which to output lift. If
            both top_k_per_y and bottom_k_per_y are unset, all values will be
            output.
          weight_column_name: Optionally, a weight column to use for
            converting counts of x or y into weighted counts.
          output_custom_stats: Whether to output custom stats for use with
            Facets.
          name: A unique name associated with the statistics generator.

        Raises:
          ValueError: If a schema is given and boundaries are supplied for a
            categorical y_path or omitted for a non-categorical y_path, or if
            neither a schema nor x_paths is provided.
        """
        self._name = name
        self._schema = schema
        self._y_path = y_path
        self._min_x_count = min_x_count
        self._top_k_per_y = top_k_per_y
        self._bottom_k_per_y = bottom_k_per_y
        self._output_custom_stats = output_custom_stats
        if y_boundaries:
            self._y_boundaries = np.array(sorted(set(y_boundaries)))
        else:
            self._y_boundaries = None
        self._weight_column_name = weight_column_name

        # With a schema available, cross-check y_path against the boundaries:
        # binning only makes sense for a non-categorical y, and a
        # non-categorical y cannot be used without binning.
        if self._schema is not None:
            y_is_categorical = schema_util.is_categorical_feature(
                schema_util.get_feature(self._schema, y_path))
            has_boundaries = self._y_boundaries is not None
            if has_boundaries and y_is_categorical:
                raise ValueError(
                    'Boundaries cannot be applied to a categorical y_path')
            if not has_boundaries and not y_is_categorical:
                raise ValueError(
                    'Boundaries must be provided with a non-categorical '
                    'y_path.')

        if x_paths is not None:
            self._x_paths = x_paths
            return
        if self._schema is None:
            raise ValueError('Either a schema or x_paths must be provided.')
        # Default to every categorical feature except y itself.
        candidate_paths = set(schema_util.get_categorical_features(schema))
        candidate_paths.discard(y_path)
        self._x_paths = candidate_paths
Ejemplo n.º 12
0
 def test_get_feature_invalid_schema_input(self):
     """Checks that passing a non-Schema object as the schema raises."""
     # assertRaisesRegexp is a deprecated alias that was removed in
     # Python 3.12; assertRaisesRegex is the supported spelling.
     with self.assertRaisesRegex(TypeError,
                                 '.*should be a Schema proto.*'):
         schema_util.get_feature({}, 'feature')
Ejemplo n.º 13
0
    def __init__(self,
                 label_feature: types.FeaturePath,
                 schema: Optional[schema_pb2.Schema] = None,
                 max_encoding_length: int = 512,
                 seed: int = 12345,
                 multivalent_features: Optional[Set[types.FeaturePath]] = None,
                 categorical_features: Optional[Set[types.FeaturePath]] = None,
                 features_to_ignore: Optional[Set[types.FeaturePath]] = None,
                 normalize_by_max: bool = False,
                 allow_invalid_partitions: bool = False,
                 custom_stats_key: str = _ADJUSTED_MUTUAL_INFORMATION_KEY,
                 column_partitions: int = 1):
        """Initializes MutualInformation.

        Args:
          label_feature: The key used to identify labels in the ExampleBatch.
          schema: An optional schema describing the dataset. Either a schema
            or a list of categorical and multivalent features must be
            provided.
          max_encoding_length: An int value to specify the maximum length of
            encoding to represent a feature value.
          seed: An int value to seed the RNG used in MI computation.
          multivalent_features: An optional set of features that are
            multivalent.
          categorical_features: An optional set of the features that are
            categorical.
          features_to_ignore: An optional set of features that should be
            ignored by the mutual information calculation.
          normalize_by_max: If True, AMI values are normalized to a range 0 to
            1 by dividing by the maximum possible information AMI(Y, Y).
          allow_invalid_partitions: If True, generator tolerates input
            partitions that are invalid (e.g. size of partition is < the k for
            the KNN), where invalid partitions return no stats. The
            min_partitions_stat_presence arg to PartitionedStatisticsAnalyzer
            controls how many partitions may be invalid while still reporting
            the metric.
          custom_stats_key: A string that determines the key used in the
            custom statistic. This defaults to
            `_ADJUSTED_MUTUAL_INFORMATION_KEY`.
          column_partitions: If > 1, self.partitioner returns a PTransform
            that partitions input RecordBatches by column (feature), in
            addition to the normal row partitioning (by batch). The total
            number of effective partitions is column_partitions *
            row_partitions, where row_partitions is passed to
            self.partitioner.

        Raises:
          ValueError: If label_feature does not exist in the schema, or if
            neither a schema nor the corresponding explicit feature set
            (multivalent / categorical) is provided.
        """
        self._label_feature = label_feature
        self._schema = schema
        self._normalize_by_max = normalize_by_max
        # Explicitly supplied feature sets take precedence over the schema.
        if multivalent_features is not None:
            self._multivalent_features = multivalent_features
        elif self._schema is not None:
            self._multivalent_features = schema_util.get_multivalent_features(
                self._schema)
        else:
            raise ValueError(
                "Either multivalent feature set or schema must be provided")
        if categorical_features is not None:
            self._categorical_features = categorical_features
        elif self._schema is not None:
            self._categorical_features = schema_util.get_categorical_features(
                self._schema)
        else:
            raise ValueError(
                "Either categorical feature set or schema must be provided")
        if self._schema is not None:
            # Validate that the label exists in the schema. The lookup is
            # called directly rather than inside an `assert` so that the check
            # is not stripped when Python runs with -O.
            schema_util.get_feature(self._schema, self._label_feature)
        self._label_feature_is_categorical = (self._label_feature
                                              in self._categorical_features)
        self._max_encoding_length = max_encoding_length
        self._seed = seed
        self._features_to_ignore = features_to_ignore
        self._allow_invalid_partitions = allow_invalid_partitions
        self._custom_stats_key = custom_stats_key
        self._column_partitions = column_partitions