def Load_TFDV(df):
    """Label each column of ``df`` with a TFDV-derived feature-type code.

    Every column is profiled on its own: statistics are generated with
    semantic domain stats enabled, a schema is inferred from them, and the
    schema is inspected to choose a code.

    Codes (later rules override earlier ones):
        0: none of the rules below matched.
        1: ``get_categorical_features`` reported at least one feature.
        3: a schema feature carries a ``natural_language_domain``.
        2: a schema feature carries a ``time_domain``.

    The code chosen for each column is also printed to stdout.

    Args:
        df: A DataFrame-like object exposing ``columns`` and supporting
            ``df[[col]]`` selection.

    Returns:
        A list of int codes, one per entry of ``df.columns``, in order.
    """
    columns = list(df.columns)
    if not columns:
        return []

    # The options do not depend on the column, so build them once.
    stats_options = tfdv.StatsOptions(enable_semantic_domain_stats=True)

    labels = []
    for col in columns:
        stats = tfdv.generate_statistics_from_dataframe(
            df[[col]], stats_options=stats_options)
        schema = tfdv.infer_schema(statistics=stats)

        label = 0
        # Non-empty result means the column was inferred as categorical.
        if any(True for _ in get_categorical_features(schema)):
            label = 1

        for feature in schema.feature:
            # Query the proto fields directly. Searching the text dump for
            # the field names would also match a column whose *name* (or a
            # domain value) happens to contain those substrings.
            if feature.HasField('natural_language_domain'):
                label = 3
            if feature.HasField('time_domain'):
                label = 2

        print(label)
        labels.append(label)
    return labels
def test_get_categorical_features(self):
    """Checks which feature names are reported as categorical.

    The schema holds a categorical INT, a BYTES, a FLOAT and a plain INT
    feature; only 'fa' and 'fb' are expected in the result.
    """
    schema_text = """
        feature {
          name: "fa"
          type: INT
          int_domain {
            is_categorical: true
          }
        }
        feature {
          name: "fb"
          type: BYTES
        }
        feature {
          name: "fc"
          type: FLOAT
        }
        feature {
          name: "fd"
          type: INT
        }
        """
    schema = text_format.Parse(schema_text, schema_pb2.Schema())
    expected = {'fa', 'fb'}
    actual = schema_util.get_categorical_features(schema)
    self.assertEqual(actual, expected)
def __init__(self, label_feature: types.FeaturePath,
             schema: schema_pb2.Schema, seed: int):
    """Initializes SkLearnMutualInformation.

    Args:
      label_feature: The key used to identify labels in the ExampleBatch.
      schema: The schema of the dataset.
      seed: An int value to seed the RNG used in MI computation.

    Raises:
      ValueError: If label_feature does not exist in the schema.
    """
    self._label_feature = label_feature
    self._schema = schema
    self._categorical_features = schema_util.get_categorical_features(
        schema)

    # Validate with an explicit check: an `assert` statement is removed
    # entirely under `python -O`, which would silently skip this lookup.
    if not schema_util.get_feature(self._schema, self._label_feature):
        raise ValueError(
            'Label feature %s was not found in the schema.' %
            (self._label_feature,))

    self._label_feature_is_categorical = (
        self._label_feature in self._categorical_features)
    self._seed = seed
    self._schema_features = {
        feature_path
        for (feature_path, _) in schema_util.get_all_leaf_features(schema)
    }

    # Seed the RNG used for shuffling and for MI computations. Note that
    # this seeds NumPy's global RNG.
    np.random.seed(seed)
def __init__(self, schema: schema_pb2.Schema, y_path: types.FeaturePath,
             x_paths: Optional[Iterable[types.FeaturePath]],
             y_boundaries: Optional[Iterable[float]], min_x_count: int,
             top_k_per_y: Optional[int], bottom_k_per_y: Optional[int],
             name: Text) -> None:
    """Sets up a lift statistics generator.

    Lift is defined as P(Y=y|X=x) / P(Y=y).

    Args:
      schema: The (required) schema describing the dataset.
      y_path: Path of the feature used as Y in the lift expression.
      x_paths: Optional paths of the features used as X in the lift
        expression. When None, every categorical feature in the schema
        except y_path is used.
      y_boundaries: Optional bin boundaries for y_path. With b boundaries
        the binned values are treated as a categorical feature with b+1
        distinct values; e.g. [0.1, 0.8] yields the three buckets
        [-inf, 0.1), [0.1, 0.8) and [0.8, inf].
      min_x_count: Minimum number of examples an x value must appear in
        for its lift to be output.
      top_k_per_y: Optional number of top x values per y value, ordered by
        descending lift, to output. If both top_k_per_y and bottom_k_per_y
        are unset, all values are output.
      bottom_k_per_y: Optional number of bottom x values per y value,
        ordered by descending lift, to output. If both top_k_per_y and
        bottom_k_per_y are unset, all values are output.
      name: A unique name associated with the statistics generator.

    Raises:
      ValueError: If boundaries are given for a categorical y_path, or are
        missing for a non-categorical y_path.
    """
    self._name = name
    self._schema = schema
    self._y_path = y_path
    self._min_x_count = min_x_count
    self._top_k_per_y = top_k_per_y
    self._bottom_k_per_y = bottom_k_per_y

    y_is_categorical = schema_util.is_categorical_feature(
        schema_util.get_feature(schema, y_path))
    binning_requested = bool(y_boundaries)

    # Boundaries and a categorical y_path are mutually exclusive, and
    # exactly one of the two must be present.
    if binning_requested and y_is_categorical:
        raise ValueError(
            'Boundaries cannot be applied to a categorical y_path')
    if not binning_requested and not y_is_categorical:
        raise ValueError(
            'Boundaries must be provided with a non-categorical '
            'y_path.')

    if binning_requested:
        # De-duplicate and sort so the bins are well defined.
        self._y_boundaries = np.array(sorted(set(y_boundaries)))
    else:
        self._y_boundaries = y_boundaries

    if x_paths is not None:
        self._x_paths = x_paths
    else:
        categorical_paths = set(
            schema_util.get_categorical_features(schema))
        self._x_paths = categorical_paths - {y_path}
def test_get_categorical_features(self):
    """Checks the feature paths reported as categorical.

    The schema mixes top-level features with a STRUCT feature containing
    nested children; the expected result lists 'fa', 'fb', the nested
    'fd.fd_fa', and the categorical float 'fe'.
    """
    schema_text = """
        feature {
          name: "fa"
          type: INT
          int_domain {
            is_categorical: true
          }
        }
        feature {
          name: "fb"
          type: BYTES
        }
        feature {
          name: "fc"
          type: FLOAT
        }
        feature {
          name: "fd"
          type: INT
        }
        feature {
          name: "fd"
          type: STRUCT
          struct_domain {
            feature {
              name: "fd_fa"
              type: INT
              int_domain {
                is_categorical: true
              }
            }
            feature {
              name: "fd_fb"
            }
          }
        }
        feature {
          name: "fe"
          type: FLOAT
          float_domain {
            is_categorical: true
          }
        }
        """
    schema = text_format.Parse(schema_text, schema_pb2.Schema())
    expected = {
        types.FeaturePath(['fa']),
        types.FeaturePath(['fb']),
        types.FeaturePath(['fd', 'fd_fa']),
        types.FeaturePath(['fe']),
    }
    actual = schema_util.get_categorical_features(schema)
    self.assertEqual(actual, expected)
def __init__(self, y_path: types.FeaturePath,
             schema: Optional[schema_pb2.Schema],
             x_paths: Optional[Iterable[types.FeaturePath]],
             y_boundaries: Optional[Sequence[float]], min_x_count: int,
             top_k_per_y: Optional[int], bottom_k_per_y: Optional[int],
             weight_column_name: Optional[Text], output_custom_stats: bool,
             name: Text) -> None:
    """Initializes a lift statistics generator.

    Args:
      y_path: The path to use as Y in the lift expression: lift = P(Y=y|X=x) /
        P(Y=y).
      schema: An optional schema for the dataset. If not provided, x_paths
        must be specified. If x_paths are not specified, the schema is used to
        identify all categorical columns for which Lift should be computed.
      x_paths: An optional list of path to use as X in the lift expression:
        lift = P(Y=y|X=x) / P(Y=y). If None (default), all categorical
        features, excluding the feature passed as y_path, will be used.
      y_boundaries: An optional list of boundaries to be used for binning
        y_path. If provided with b boundaries, the binned values will be
        treated as a categorical feature with b+1 different values. For
        example, the y_boundaries value [0.1, 0.8] would lead to three
        buckets: [-inf, 0.1), [0.1, 0.8) and [0.8, inf]. NumPy arrays are
        accepted; None or an empty collection means "no binning".
      min_x_count: The minimum number of examples in which a specific x value
        must appear, in order for its lift to be output.
      top_k_per_y: Optionally, the number of top x values per y value, ordered
        by descending lift, for which to output lift. If both top_k_per_y and
        bottom_k_per_y are unset, all values will be output.
      bottom_k_per_y: Optionally, the number of bottom x values per y value,
        ordered by descending lift, for which to output lift. If both
        top_k_per_y and bottom_k_per_y are unset, all values will be output.
      weight_column_name: Optionally, a weight column to use for converting
        counts of x or y into weighted counts.
      output_custom_stats: Whether to output custom stats for use with Facets.
      name: An optional unique name associated with the statistics generator.

    Raises:
      ValueError: If a schema is provided and boundaries are given for a
        categorical y_path or are missing for a non-categorical y_path, or
        if neither a schema nor x_paths is provided.
    """
    self._name = name
    self._schema = schema
    self._y_path = y_path
    self._min_x_count = min_x_count
    self._top_k_per_y = top_k_per_y
    self._bottom_k_per_y = bottom_k_per_y
    self._output_custom_stats = output_custom_stats

    # Compare against None and decide emptiness only after de-duplicating:
    # a bare truthiness test on the argument raises for NumPy arrays with
    # more than one element.
    self._y_boundaries = None
    if y_boundaries is not None:
        unique_boundaries = sorted(set(y_boundaries))
        if unique_boundaries:
            self._y_boundaries = np.array(unique_boundaries)
    self._weight_column_name = weight_column_name

    # If a schema is provided, we can do some additional validation of the
    # provided y_feature and boundaries.
    if self._schema is not None:
        y_feature = schema_util.get_feature(self._schema, y_path)
        y_is_categorical = schema_util.is_categorical_feature(y_feature)
        if self._y_boundaries is not None:
            if y_is_categorical:
                raise ValueError(
                    'Boundaries cannot be applied to a categorical y_path')
        else:
            if not y_is_categorical:
                raise ValueError(
                    'Boundaries must be provided with a non-categorical '
                    'y_path.')

    if x_paths is not None:
        self._x_paths = x_paths
    elif self._schema is not None:
        # Default to every categorical feature except the label itself.
        self._x_paths = (
            set(schema_util.get_categorical_features(schema)) - {y_path})
    else:
        raise ValueError('Either a schema or x_paths must be provided.')
def __init__(self,
             label_feature: types.FeaturePath,
             schema: Optional[schema_pb2.Schema] = None,
             max_encoding_length: int = 512,
             seed: int = 12345,
             multivalent_features: Optional[Set[types.FeaturePath]] = None,
             categorical_features: Optional[Set[types.FeaturePath]] = None,
             features_to_ignore: Optional[Set[types.FeaturePath]] = None,
             normalize_by_max: bool = False,
             allow_invalid_partitions: bool = False,
             custom_stats_key: str = _ADJUSTED_MUTUAL_INFORMATION_KEY,
             column_partitions: int = 1):
    """Initializes MutualInformation.

    Args:
      label_feature: The key used to identify labels in the ExampleBatch.
      schema: An optional schema describing the dataset. Either a schema or
        a list of categorical and multivalent features must be provided.
      max_encoding_length: An int value to specify the maximum length of
        encoding to represent a feature value.
      seed: An int value to seed the RNG used in MI computation.
      multivalent_features: An optional set of features that are multivalent.
      categorical_features: An optional set of the features that are
        categorical.
      features_to_ignore: An optional set of features that should be ignored
        by the mutual information calculation.
      normalize_by_max: If True, AMI values are normalized to a range 0 to 1
        by dividing by the maximum possible information AMI(Y, Y).
      allow_invalid_partitions: If True, generator tolerates input partitions
        that are invalid (e.g. size of partition is < the k for the KNN),
        where invalid partitions return no stats. The
        min_partitions_stat_presence arg to PartitionedStatisticsAnalyzer
        controls how many partitions may be invalid while still reporting the
        metric.
      custom_stats_key: A string that determines the key used in the custom
        statistic. This defaults to `_ADJUSTED_MUTUAL_INFORMATION_KEY`.
      column_partitions: If > 1, self.partitioner returns a PTransform that
        partitions input RecordBatches by column (feature), in addition to
        the normal row partitioning (by batch). The total number of effective
        partitions is column_partitions * row_partitions, where
        row_partitions is passed to self.partitioner.

    Raises:
      ValueError: If label_feature does not exist in the schema, or if
        neither a schema nor the corresponding multivalent / categorical
        feature set is provided.
    """
    self._label_feature = label_feature
    self._schema = schema
    self._normalize_by_max = normalize_by_max

    # An explicitly supplied feature set wins; otherwise derive it from the
    # schema.
    if multivalent_features is not None:
        self._multivalent_features = multivalent_features
    elif self._schema is not None:
        self._multivalent_features = schema_util.get_multivalent_features(
            self._schema)
    else:
        raise ValueError(
            "Either multivalent feature set or schema must be provided")

    if categorical_features is not None:
        self._categorical_features = categorical_features
    elif self._schema is not None:
        self._categorical_features = schema_util.get_categorical_features(
            self._schema)
    else:
        raise ValueError(
            "Either categorical feature set or schema must be provided")

    if schema:
        # Validate with an explicit check: an `assert` statement is removed
        # entirely under `python -O`, which would silently skip this lookup.
        if not schema_util.get_feature(self._schema, self._label_feature):
            raise ValueError(
                "Label feature %s was not found in the schema." %
                (self._label_feature,))

    self._label_feature_is_categorical = (
        self._label_feature in self._categorical_features)
    self._max_encoding_length = max_encoding_length
    self._seed = seed
    self._features_to_ignore = features_to_ignore
    self._allow_invalid_partitions = allow_invalid_partitions
    self._custom_stats_key = custom_stats_key
    self._column_partitions = column_partitions