def test_get_feature(self):
    """Looks up a top-level feature by name and checks object identity."""
    schema_text = """ feature { name: "feature1" } feature { name: "feature2" } """
    schema = text_format.Parse(schema_text, schema_pb2.Schema())

    looked_up = schema_util.get_feature(schema, 'feature2')
    self.assertEqual(looked_up.name, 'feature2')

    # A repeated lookup must return the very same object, not a copy, so
    # that edits made through the returned feature are visible in the schema.
    looked_up_again = schema_util.get_feature(schema, 'feature2')
    self.assertIs(looked_up, looked_up_again)
def __init__(self, label_feature: types.FeaturePath,
             schema: schema_pb2.Schema, seed: int):
    """Initializes SkLearnMutualInformation.

    Args:
      label_feature: The key used to identify labels in the ExampleBatch.
      schema: The schema of the dataset.
      seed: An int value to seed the RNG used in MI computation.

    Raises:
      ValueError: If label_feature does not exist in the schema.
    """
    self._label_feature = label_feature
    self._schema = schema
    self._categorical_features = schema_util.get_categorical_features(schema)

    # Validate that the label exists in the schema. This is a plain call
    # rather than an `assert` so the check (and the ValueError raised by
    # get_feature for a missing feature) still happens under `python -O`,
    # which strips assert statements entirely.
    schema_util.get_feature(self._schema, self._label_feature)

    self._label_feature_is_categorical = (
        self._label_feature in self._categorical_features)
    self._seed = seed
    self._schema_features = {
        feature_path
        for feature_path, _ in schema_util.get_all_leaf_features(schema)
    }

    # Seed the RNG used for shuffling and for MI computations.
    np.random.seed(seed)
def _convert_categorical_features_to_numeric(self, df):
    """Replaces categorical columns of `df` with integer codes, in place.

    Whether a column is categorical is decided by looking the column up in
    the schema. Each categorical column is re-encoded with np.unique, which
    assigns every distinct value in the column an integer id. The returned
    mask tells scikit-learn which columns hold discrete features.

    Args:
      df: A pd.DataFrame containing feature values where each column
        corresponds to a feature and each row corresponds to an example.

    Returns:
      A boolean list where the ith element is true iff the ith feature column
      in the input df is a categorical feature.
    """
    categorical_mask = []
    for column_name in df:
        feature = schema_util.get_feature(self._schema, column_name)
        column_is_categorical = bool(
            schema_util.is_categorical_feature(feature))
        if column_is_categorical:
            # return_inverse yields, for every row, the index of its value
            # among the sorted unique values; that index is the encoding.
            _, codes = np.unique(df[column_name].values, return_inverse=True)
            df[column_name] = codes
        categorical_mask.append(column_is_categorical)
    return categorical_mask
def _impute(self, examples):
    """Fills in missing (None) feature values, modifying `examples` in place.

    Categorical features get CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE;
    numeric features get 10 * max(observed values). The fill value is chosen
    to be extreme, far from observed values, so that it does not incorrectly
    impact KNN results. 10 * max is used rather than the largest
    representable float because the latter is large enough to cause
    unexpected float arithmetic errors.

    Args:
      examples: A dict where the key is the feature name and the values are
        the feature values.

    Returns:
      The same dict, where the key is the feature name and the values are the
      feature values with missing values imputed.
    """
    for name, values in examples.items():
        feature = schema_util.get_feature(self._schema, name)
        if schema_util.is_categorical_feature(feature):
            fill = CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE
        else:
            observed = (v for v in values if v is not None)
            fill = max(observed) * 10
        # Only existing keys are reassigned, so mutating the dict while
        # iterating over it is safe here.
        examples[name] = [fill if v is None else v for v in values]
    return examples
def __init__(self, schema: schema_pb2.Schema, y_path: types.FeaturePath,
             x_paths: Optional[Iterable[types.FeaturePath]],
             y_boundaries: Optional[Iterable[float]], min_x_count: int,
             top_k_per_y: Optional[int], bottom_k_per_y: Optional[int],
             name: Text) -> None:
    """Initializes a lift statistics generator.

    Args:
      schema: A required schema for the dataset.
      y_path: The path to use as Y in the lift expression:
        lift = P(Y=y|X=x) / P(Y=y).
      x_paths: An optional list of paths to use as X in the lift expression:
        lift = P(Y=y|X=x) / P(Y=y). If None (default), all categorical
        features, excluding the feature passed as y_path, will be used.
      y_boundaries: An optional list of boundaries to be used for binning
        y_path. If provided with b boundaries, the binned values will be
        treated as a categorical feature with b+1 different values. For
        example, the y_boundaries value [0.1, 0.8] would lead to three
        buckets: [-inf, 0.1), [0.1, 0.8) and [0.8, inf].
      min_x_count: The minimum number of examples in which a specific x value
        must appear, in order for its lift to be output.
      top_k_per_y: Optionally, the number of top x values per y value, ordered
        by descending lift, for which to output lift. If both top_k_per_y and
        bottom_k_per_y are unset, all values will be output.
      bottom_k_per_y: Optionally, the number of bottom x values per y value,
        ordered by descending lift, for which to output lift. If both
        top_k_per_y and bottom_k_per_y are unset, all values will be output.
      name: An optional unique name associated with the statistics generator.

    Raises:
      ValueError: If boundaries are given for a categorical y_path, or are
        missing for a non-categorical y_path.
    """
    self._name = name
    self._schema = schema
    self._y_path = y_path
    self._min_x_count = min_x_count
    self._top_k_per_y = top_k_per_y
    self._bottom_k_per_y = bottom_k_per_y

    y_is_categorical = schema_util.is_categorical_feature(
        schema_util.get_feature(schema, y_path))
    has_boundaries = bool(y_boundaries)

    # Boundaries and a categorical y are mutually exclusive, and exactly one
    # of the two must hold.
    if has_boundaries and y_is_categorical:
        raise ValueError(
            'Boundaries cannot be applied to a categorical y_path')
    if not has_boundaries and not y_is_categorical:
        raise ValueError(
            'Boundaries must be provided with a non-categorical '
            'y_path.')

    if has_boundaries:
        # De-duplicate and sort so the boundaries form increasing bin edges.
        self._y_boundaries = np.array(sorted(set(y_boundaries)))
    else:
        self._y_boundaries = y_boundaries

    if x_paths is not None:
        self._x_paths = x_paths
    else:
        categorical_paths = set(schema_util.get_categorical_features(schema))
        self._x_paths = categorical_paths - {y_path}
def test_get_feature_not_present(self):
    """Checks that looking up an unknown feature name raises ValueError."""
    schema = text_format.Parse(
        """ feature { name: "feature1" } """, schema_pb2.Schema())
    # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
    # which no longer exists on unittest.TestCase in Python 3.12+.
    with self.assertRaisesRegex(ValueError,
                                'Feature.*not found in the schema.*'):
        _ = schema_util.get_feature(schema, 'feature2')
def test_get_feature_internal_step_not_struct(self):
    """Checks that a path stepping through a non-STRUCT feature is rejected."""
    schema = text_format.Parse(
        """ feature { name: "feature1" } """, schema_pb2.Schema())
    # 'feature1' has no STRUCT type, so it cannot be an intermediate step.
    # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
    # which no longer exists on unittest.TestCase in Python 3.12+.
    with self.assertRaisesRegex(ValueError,
                                'does not refer to a valid STRUCT feature'):
        _ = schema_util.get_feature(
            schema, types.FeaturePath(['feature1', 'sub_feature2']))
def test_get_feature_using_path(self):
    """Resolves a nested feature through a FeaturePath into a STRUCT."""
    schema_text = """ feature { name: "feature1" type: STRUCT struct_domain { feature { name: "sub_feature1" } } } """
    schema = text_format.Parse(schema_text, schema_pb2.Schema())

    nested_path = types.FeaturePath(['feature1', 'sub_feature1'])
    found = schema_util.get_feature(schema, nested_path)

    # The result must be the nested feature object inside the schema itself.
    self.assertIs(found, schema.feature[0].struct_domain.feature[0])
def test_get_feature_using_path_not_present(self):
    """Checks that a path to a missing nested feature raises ValueError."""
    schema = text_format.Parse(
        """ feature { name: "feature1" type: STRUCT struct_domain { feature { name: "sub_feature1" } } } """,
        schema_pb2.Schema())
    # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
    # which no longer exists on unittest.TestCase in Python 3.12+.
    with self.assertRaisesRegex(ValueError,
                                'Feature.*not found in the schema'):
        _ = schema_util.get_feature(
            schema, types.FeaturePath(['feature1', 'sub_feature2']))
def __init__(self, label_feature, schema, seed):
    """Initializes SkLearnMutualInformation.

    Args:
      label_feature: The key used to identify labels in the ExampleBatch.
      schema: The schema of the dataset.
      seed: An int value to seed the RNG used in MI computation.

    Raises:
      ValueError: If label_feature does not exist in the schema.
    """
    self._label_feature = label_feature
    self._schema = schema

    # Looking the label up in the schema doubles as validation that it
    # exists; the result is then classified as categorical or not.
    label = schema_util.get_feature(schema, label_feature)
    self._label_feature_is_categorical = (
        schema_util.is_categorical_feature(label))

    self._seed = seed
    # Seed the RNG used for shuffling and for MI computations.
    np.random.seed(seed)
def __init__(self, y_path: types.FeaturePath,
             schema: Optional[schema_pb2.Schema],
             x_paths: Optional[Iterable[types.FeaturePath]],
             y_boundaries: Optional[Sequence[float]], min_x_count: int,
             top_k_per_y: Optional[int], bottom_k_per_y: Optional[int],
             weight_column_name: Optional[Text], output_custom_stats: bool,
             name: Text) -> None:
    """Initializes a lift statistics generator.

    Args:
      y_path: The path to use as Y in the lift expression:
        lift = P(Y=y|X=x) / P(Y=y).
      schema: An optional schema for the dataset. If not provided, x_paths
        must be specified. If x_paths are not specified, the schema is used to
        identify all categorical columns for which Lift should be computed.
      x_paths: An optional list of paths to use as X in the lift expression:
        lift = P(Y=y|X=x) / P(Y=y). If None (default), all categorical
        features, excluding the feature passed as y_path, will be used.
      y_boundaries: An optional list of boundaries to be used for binning
        y_path. If provided with b boundaries, the binned values will be
        treated as a categorical feature with b+1 different values. For
        example, the y_boundaries value [0.1, 0.8] would lead to three
        buckets: [-inf, 0.1), [0.1, 0.8) and [0.8, inf].
      min_x_count: The minimum number of examples in which a specific x value
        must appear, in order for its lift to be output.
      top_k_per_y: Optionally, the number of top x values per y value, ordered
        by descending lift, for which to output lift. If both top_k_per_y and
        bottom_k_per_y are unset, all values will be output.
      bottom_k_per_y: Optionally, the number of bottom x values per y value,
        ordered by descending lift, for which to output lift. If both
        top_k_per_y and bottom_k_per_y are unset, all values will be output.
      weight_column_name: Optionally, a weight column to use for converting
        counts of x or y into weighted counts.
      output_custom_stats: Whether to output custom stats for use with Facets.
      name: An optional unique name associated with the statistics generator.

    Raises:
      ValueError: If a schema is given and boundaries are supplied for a
        categorical y_path or omitted for a non-categorical one, or if
        neither a schema nor x_paths is provided.
    """
    self._name = name
    self._schema = schema
    self._y_path = y_path
    self._min_x_count = min_x_count
    self._top_k_per_y = top_k_per_y
    self._bottom_k_per_y = bottom_k_per_y
    self._output_custom_stats = output_custom_stats
    if y_boundaries:
        # De-duplicate and sort so the boundaries form increasing bin edges.
        self._y_boundaries = np.array(sorted(set(y_boundaries)))
    else:
        self._y_boundaries = None
    self._weight_column_name = weight_column_name

    # If a schema is provided, we can do some additional validation of the
    # provided y_feature and boundaries.
    if schema is not None:
        y_is_categorical = schema_util.is_categorical_feature(
            schema_util.get_feature(schema, y_path))
        has_boundaries = self._y_boundaries is not None
        if has_boundaries and y_is_categorical:
            raise ValueError(
                'Boundaries cannot be applied to a categorical y_path')
        if not (has_boundaries or y_is_categorical):
            raise ValueError(
                'Boundaries must be provided with a non-categorical '
                'y_path.')

    if x_paths is not None:
        self._x_paths = x_paths
    elif schema is None:
        raise ValueError('Either a schema or x_paths must be provided.')
    else:
        categorical_paths = set(schema_util.get_categorical_features(schema))
        self._x_paths = categorical_paths - {y_path}
def test_get_feature_invalid_schema_input(self):
    """Checks that passing a non-Schema object as the schema raises TypeError."""
    # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
    # which no longer exists on unittest.TestCase in Python 3.12+.
    with self.assertRaisesRegex(TypeError, '.*should be a Schema proto.*'):
        _ = schema_util.get_feature({}, 'feature')
def __init__(self,
             label_feature: types.FeaturePath,
             schema: Optional[schema_pb2.Schema] = None,
             max_encoding_length: int = 512,
             seed: int = 12345,
             multivalent_features: Optional[Set[types.FeaturePath]] = None,
             categorical_features: Optional[Set[types.FeaturePath]] = None,
             features_to_ignore: Optional[Set[types.FeaturePath]] = None,
             normalize_by_max: bool = False,
             allow_invalid_partitions: bool = False,
             custom_stats_key: str = _ADJUSTED_MUTUAL_INFORMATION_KEY,
             column_partitions: int = 1):
    """Initializes MutualInformation.

    Args:
      label_feature: The key used to identify labels in the ExampleBatch.
      schema: An optional schema describing the dataset. Either a schema or a
        list of categorical and multivalent features must be provided.
      max_encoding_length: An int value to specify the maximum length of
        encoding to represent a feature value.
      seed: An int value to seed the RNG used in MI computation.
      multivalent_features: An optional set of features that are multivalent.
      categorical_features: An optional set of the features that are
        categorical.
      features_to_ignore: An optional set of features that should be ignored
        by the mutual information calculation.
      normalize_by_max: If True, AMI values are normalized to a range 0 to 1
        by dividing by the maximum possible information AMI(Y, Y).
      allow_invalid_partitions: If True, generator tolerates input partitions
        that are invalid (e.g. size of partition is < the k for the KNN),
        where invalid partitions return no stats. The
        min_partitions_stat_presence arg to PartitionedStatisticsAnalyzer
        controls how many partitions may be invalid while still reporting the
        metric.
      custom_stats_key: A string that determines the key used in the custom
        statistic. This defaults to `_ADJUSTED_MUTUAL_INFORMATION_KEY`.
      column_partitions: If > 1, self.partitioner returns a PTransform that
        partitions input RecordBatches by column (feature), in addition to the
        normal row partitioning (by batch). The total number of effective
        partitions is column_partitions * row_partitions, where row_partitions
        is passed to self.partitioner.

    Raises:
      ValueError: If label_feature does not exist in the schema, or if neither
        a schema nor the corresponding explicit feature set is provided.
    """
    self._label_feature = label_feature
    self._schema = schema
    self._normalize_by_max = normalize_by_max

    # An explicitly supplied feature set takes precedence over the schema.
    if multivalent_features is not None:
        self._multivalent_features = multivalent_features
    elif self._schema is not None:
        self._multivalent_features = schema_util.get_multivalent_features(
            self._schema)
    else:
        raise ValueError(
            "Either multivalent feature set or schema must be provided")

    if categorical_features is not None:
        self._categorical_features = categorical_features
    elif self._schema is not None:
        self._categorical_features = schema_util.get_categorical_features(
            self._schema)
    else:
        raise ValueError(
            "Either categorical feature set or schema must be provided")

    if schema:
        # Validate that the label exists in the schema. This is a plain call
        # rather than an `assert` so the check (and the ValueError raised by
        # get_feature for a missing feature) still happens under
        # `python -O`, which strips assert statements entirely.
        schema_util.get_feature(self._schema, self._label_feature)

    self._label_feature_is_categorical = (
        self._label_feature in self._categorical_features)
    self._max_encoding_length = max_encoding_length
    self._seed = seed
    self._features_to_ignore = features_to_ignore
    self._allow_invalid_partitions = allow_invalid_partitions
    self._custom_stats_key = custom_stats_key
    self._column_partitions = column_partitions