Example #1
def _sparse_feature_from_feature_spec(spec, name, domains):
    """Returns a representation of a SparseFeature from a feature spec."""
    if isinstance(spec.index_key, list):
        raise ValueError(
            'SparseFeature "{}" had index_key {}, but size and index_key '
            'fields should be single values'.format(name, spec.index_key))
    if isinstance(spec.size, list):
        raise ValueError(
            'SparseFeature "{}" had size {}, but size and index_key fields '
            'should be single values'.format(name, spec.size))

    # Create an index feature.
    index_feature = schema_pb2.Feature(name=spec.index_key,
                                       type=schema_pb2.INT,
                                       int_domain=schema_pb2.IntDomain(
                                           min=0, max=spec.size - 1))

    # Create a value feature.
    value_feature = schema_pb2.Feature(name=spec.value_key)
    _set_type(name, value_feature, spec.dtype)
    _set_domain(name, value_feature, domains.get(name))

    # Create a sparse feature which refers to the index and value features.
    index_feature_ref = schema_pb2.SparseFeature.IndexFeature(
        name=spec.index_key)
    value_feature_ref = schema_pb2.SparseFeature.ValueFeature(
        name=spec.value_key)
    sparse_feature = schema_pb2.SparseFeature(
        name=name,
        is_sorted=True if spec.already_sorted else None,
        index_feature=[index_feature_ref],
        value_feature=value_feature_ref)

    return (index_feature, value_feature, sparse_feature)
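A minimal usage sketch for the helper above, assuming it is in scope; the spec values and feature name are illustrative:

import tensorflow as tf

spec = tf.io.SparseFeature(
    index_key='idx', value_key='val', dtype=tf.float32, size=10)
index_feature, value_feature, sparse_feature = (
    _sparse_feature_from_feature_spec(spec, 'my_sparse_feature', domains={}))
# index_feature constrains 'idx' to the int domain [0, 9]; sparse_feature
# references 'idx' and 'val' under the name 'my_sparse_feature'.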
Example #2
  def test_look_up_feature(self):
    feature_1 = text_format.Parse("""name: "feature1" """, schema_pb2.Feature())
    feature_2 = text_format.Parse("""name: "feature2" """, schema_pb2.Feature())

    container = [feature_1, feature_2]
    self.assertEqual(
        schema_util.look_up_feature('feature1', container), feature_1)
    self.assertEqual(
        schema_util.look_up_feature('feature2', container), feature_2)
    self.assertEqual(schema_util.look_up_feature('feature3', container), None)
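The behavior this test exercises can be sketched as a linear scan; the actual schema_util.look_up_feature implementation may differ:

def look_up_feature(name, container):
    # Return the first Feature proto whose name matches, or None on a miss.
    for feature in container:
        if feature.name == name:
            return feature
    return None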
Example #3
def _sparse_feature_from_feature_spec(spec, name, domains):
    """Returns a representation of a SparseFeature from a feature spec."""
    if isinstance(spec.index_key, list):
        assert isinstance(spec.size,
                          (list, tuple, tf.TensorShape)), type(spec.size)
        assert len(spec.index_key) == len(spec.size), (spec.index_key,
                                                       spec.size)
        spec_size = [
            s.value if isinstance(s, tf.compat.v1.Dimension) else s
            for s in spec.size
        ]
        int_domains = [
            None if size is None else schema_pb2.IntDomain(min=0, max=size - 1)
            for size in spec_size
        ]
        index_feature = [
            schema_pb2.Feature(name=key,
                               type=schema_pb2.INT,
                               int_domain=int_domain)
            for (key, int_domain) in zip(spec.index_key, int_domains)
        ]
        index_feature_ref = [
            schema_pb2.SparseFeature.IndexFeature(name=key)
            for key in spec.index_key
        ]
    else:
        # Create an index feature.
        index_feature = [
            schema_pb2.Feature(
                name=spec.index_key,
                type=schema_pb2.INT,
                int_domain=schema_pb2.IntDomain(min=0, max=spec.size - 1))
        ]
        index_feature_ref = [
            schema_pb2.SparseFeature.IndexFeature(name=spec.index_key)
        ]

    # Create a value feature.
    value_feature = schema_pb2.Feature(name=spec.value_key)
    _set_type(name, value_feature, spec.dtype)
    _set_domain(name, value_feature, domains.get(name))

    # Create a sparse feature which refers to the index and value features.
    value_feature_ref = schema_pb2.SparseFeature.ValueFeature(
        name=spec.value_key)
    sparse_feature = schema_pb2.SparseFeature(
        name=name,
        is_sorted=True if spec.already_sorted else None,
        index_feature=index_feature_ref,
        value_feature=value_feature_ref)

    return (index_feature, value_feature, sparse_feature)
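A hedged sketch of calling the variant above with a multi-dimensional SparseFeature, which exercises the list branch; names and sizes are illustrative:

import tensorflow as tf

spec = tf.io.SparseFeature(
    index_key=['row', 'col'], value_key='val', dtype=tf.int64, size=[4, 5])
index_features, value_feature, sparse_feature = (
    _sparse_feature_from_feature_spec(spec, 'matrix', domains={}))
# index_features holds two INT features with domains [0, 3] and [0, 4].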
Example #4
    def test_map_prensor_to_prensor_with_schema(self):
        original = create_expression.create_expression_from_prensor(
            prensor_test_util.create_nested_prensor())

        def my_prensor_op(original_prensor):
            # Note that we are copying over the original root prensor node. The root
            # node is ignored in the result.
            return prensor.create_prensor_from_descendant_nodes({
                path.Path([]):
                original_prensor.node,
                path.Path(["bar2"]):
                original_prensor.get_child_or_error("bar").node,
                path.Path(["keep_me2"]):
                original_prensor.get_child_or_error("keep_me").node
            })

        bar2_feature = schema_pb2.Feature()
        bar2_feature.value_count.max = 7
        keep_me2_feature = schema_pb2.Feature()
        keep_me2_feature.value_count.max = 10

        # Since the top node is actually a child node, we use the child schema.
        my_output_schema = map_prensor_to_prensor.create_schema(
            is_repeated=True,
            children={
                "bar2": {
                    "is_repeated": True,
                    "dtype": tf.string,
                    "schema_feature": bar2_feature
                },
                "keep_me2": {
                    "is_repeated": False,
                    "dtype": tf.bool,
                    "schema_feature": keep_me2_feature
                }
            })

        result = map_prensor_to_prensor.map_prensor_to_prensor(
            root_expr=original,
            source=path.Path(["doc"]),
            paths_needed=[path.Path(["bar"]),
                          path.Path(["keep_me"])],
            prensor_op=my_prensor_op,
            output_schema=my_output_schema)

        doc_result = result.get_child_or_error("doc")
        bar2_result = doc_result.get_child_or_error("bar2")
        self.assertEqual(bar2_result.schema_feature.value_count.max, 7)

        keep_me2_result = doc_result.get_child_or_error("keep_me2")
        self.assertEqual(keep_me2_result.schema_feature.value_count.max, 10)
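For reference, the value_count constraints above can equally be built with text_format.Parse (a standard protobuf API):

from google.protobuf import text_format

bar2_feature = text_format.Parse('value_count { max: 7 }', schema_pb2.Feature())
keep_me2_feature = text_format.Parse('value_count { max: 10 }',
                                     schema_pb2.Feature())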
Example #5
def _ragged_tensor_representation_from_feature_spec(
    spec: common_types.RaggedFeature, name: str,
    domains: Dict[str, common_types.DomainType]
) -> Tuple[schema_pb2.Feature, List[schema_pb2.Feature],
           schema_pb2.TensorRepresentation]:
    """Returns representation of a RaggedTensor from a feature spec.

  Args:
    spec: A tf.io.RaggedFeature feature spec.
    name: Feature name.
    domains: A dict whose keys are feature names and values are one of
      schema_pb2.IntDomain, schema_pb2.StringDomain or schema_pb2.FloatDomain.

  Returns:
    A tuple (value_feature, partitions_features, ragged_tensor_rep),
      where value_feature represents RaggedTensor values, partitions_features
      represent row lengths partitions and ragged_tensor_rep - ragged
      TensorRepresentation.

  Raises:
    ValueError: If the feature spec contains partition types different from
      UniformRowLength and RowLengths.
  """
    value_feature = schema_pb2.Feature(name=spec.value_key or name)
    _set_type(name, value_feature, spec.dtype)
    _set_domain(name, value_feature, domains.get(name))

    ragged_tensor = schema_pb2.TensorRepresentation.RaggedTensor(
        feature_path=path_pb2.Path(step=[spec.value_key or name]))

    partitions_features = []
    for partition in spec.partitions:
        if isinstance(partition, tf.io.RaggedFeature.UniformRowLength):  # pytype: disable=attribute-error
            ragged_tensor.partition.append(
                schema_pb2.TensorRepresentation.RaggedTensor.Partition(
                    uniform_row_length=partition.length))
        elif isinstance(partition, tf.io.RaggedFeature.RowLengths):  # pytype: disable=attribute-error
            ragged_tensor.partition.append(
                schema_pb2.TensorRepresentation.RaggedTensor.Partition(
                    row_length=partition.key))
            partitions_features.append(
                schema_pb2.Feature(name=partition.key, type=schema_pb2.INT))
        else:
            raise ValueError(
                'RaggedFeature can only be created with UniformRowLength and '
                'RowLengths partitions.')

    return value_feature, partitions_features, schema_pb2.TensorRepresentation(
        ragged_tensor=ragged_tensor)
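A minimal usage sketch, assuming the helper is in scope; the keys are illustrative:

import tensorflow as tf

spec = tf.io.RaggedFeature(
    dtype=tf.float32,
    value_key='ragged$value',
    partitions=[
        tf.io.RaggedFeature.RowLengths('ragged$row_lengths_1'),
        tf.io.RaggedFeature.UniformRowLength(3),
    ])
value_feature, partitions_features, ragged_rep = (
    _ragged_tensor_representation_from_feature_spec(spec, 'ragged', {}))
# partitions_features holds one INT feature for 'ragged$row_lengths_1'; the
# uniform partition contributes no extra feature.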
Example #6
    def test_get_schema(self):
        foo_feature = schema_pb2.Feature()
        foo_feature.int_domain.max = 10
        foo = expression_test_util.MockExpression(is_repeated=False,
                                                  my_type=tf.int64,
                                                  schema_feature=foo_feature)
        foorepeated = expression_test_util.MockExpression(is_repeated=True,
                                                          my_type=tf.int64)
        bar_feature = schema_pb2.Feature()
        bar_feature.presence.min_count = 17
        bar = expression_test_util.MockExpression(is_repeated=True,
                                                  my_type=tf.string,
                                                  schema_feature=bar_feature)
        keep_me = expression_test_util.MockExpression(is_repeated=False,
                                                      my_type=tf.bool)

        doc = expression_test_util.MockExpression(is_repeated=True,
                                                  my_type=tf.int64,
                                                  children={
                                                      "bar": bar,
                                                      "keep_me": keep_me
                                                  })
        root = expression_test_util.MockExpression(is_repeated=True,
                                                   my_type=None,
                                                   children={
                                                       "foo": foo,
                                                       "foorepeated":
                                                       foorepeated,
                                                       "doc": doc
                                                   })

        schema_result = root.get_schema()
        feature_map = _features_as_map(schema_result.feature)
        self.assertIn("foo", feature_map)
        # Check the properties of a first-level feature.
        self.assertEqual(feature_map["foo"].int_domain.max, 10)
        self.assertIn("foorepeated", feature_map)

        self.assertEqual(feature_map["doc"].type,
                         schema_pb2.FeatureType.STRUCT)
        doc_feature_map = _features_as_map(
            feature_map["doc"].struct_domain.feature)
        # Test that second level features are correctly handled.
        self.assertIn("bar", doc_feature_map)
        # Test that a constraint specified at the schema level
        # (presence.min_count) is propagated correctly.
        self.assertEqual(doc_feature_map["bar"].presence.min_count, 17)
        self.assertIn("keep_me", doc_feature_map)
Example #7
    def test_stats_options_invalid_slicing_sql_query(self):
        schema = schema_pb2.Schema(feature=[
            schema_pb2.Feature(name='feat1', type=schema_pb2.BYTES),
            schema_pb2.Feature(name='feat3', type=schema_pb2.INT)
        ])
        experimental_slice_sqls = [
            """
            SELECT
              STRUCT(feat1, feat2)
            FROM
              example.feat1, example.feat2
            """
        ]
        with self.assertRaisesRegex(ValueError, 'One of the slice SQL query'):
            stats_options.StatsOptions(
                experimental_slice_sqls=experimental_slice_sqls, schema=schema)
Example #8
    def test_get_schema_missing_features(self):
        # The expr has a number of features: foo, foorepeated, doc, user.
        expr = create_expression.create_expression_from_prensor(
            prensor_test_util.create_big_prensor())
        # The schema has only a subset of the features on the expr.
        schema = schema_pb2.Schema()
        feature = schema.feature.add()
        feature.name = "foo"
        feature.type = schema_pb2.FeatureType.INT
        feature.value_count.min = 1
        feature.value_count.max = 1
        feature = schema.feature.add()
        feature.name = "foorepeated"
        feature.type = schema_pb2.FeatureType.INT
        feature.value_count.min = 0
        feature.value_count.max = 5
        feature = schema.feature.add()
        feature.name = "doc"
        feature.type = schema_pb2.FeatureType.STRUCT
        feature.struct_domain.feature.append(
            schema_pb2.Feature(name="keep_me",
                               type=schema_pb2.FeatureType.INT))

        # By default, the output schema has all features present in the expr.
        expr = expr.apply_schema(schema)
        output_schema = expr.get_schema()
        self.assertNotEqual(schema, output_schema)
        self.assertLen(schema.feature, 3)
        self.assertLen(output_schema.feature, 4)

        # With create_schema_features = False, only features on the original
        # schema propagate to the new schema.
        output_schema = expr.get_schema(create_schema_features=False)
        self.assertLen(output_schema.feature, 3)
Example #9
def _feature_from_feature_spec(spec, name, domains):
    """Returns a representation of a Feature from a feature spec."""
    if isinstance(spec, tf.io.FixedLenFeature):
        if spec.default_value is not None:
            raise ValueError(
                'feature "{}" had default_value {}, but FixedLenFeature must have '
                'default_value=None'.format(name, spec.default_value))
        dims = [schema_pb2.FixedShape.Dim(size=size) for size in spec.shape]
        feature = schema_pb2.Feature(
            name=name,
            presence=schema_pb2.FeaturePresence(min_fraction=1.0),
            shape=schema_pb2.FixedShape(dim=dims))
    elif isinstance(spec, tf.io.VarLenFeature):
        feature = schema_pb2.Feature(name=name)
    else:
        raise TypeError(
            'Spec for feature "{}" was {} of type {}, expected a '
            'FixedLenFeature, VarLenFeature or SparseFeature'.format(
                name, spec, type(spec)))

    _set_type(name, feature, spec.dtype)
    _set_domain(name, feature, domains.get(name))
    return feature
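A minimal invocation sketch with illustrative specs; note the FixedLenFeature must keep default_value=None, per the check above:

import tensorflow as tf

fixed = _feature_from_feature_spec(
    tf.io.FixedLenFeature(shape=[2], dtype=tf.int64), 'x', domains={})
var = _feature_from_feature_spec(
    tf.io.VarLenFeature(dtype=tf.string), 'y', domains={})
# fixed carries shape [2] and presence min_fraction=1.0; var carries neither.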
Example #10
    def _ProjectTfmdSchema(self,
                           tensor_names: List[Text]) -> schema_pb2.Schema:
        """Projects self._schema by the given tensor names."""
        tensor_representations = self.TensorRepresentations()
        tensor_names = set(tensor_names)
        if not tensor_names.issubset(tensor_representations):
            raise ValueError(
                "Unable to project {} because they were not in the original "
                "TensorRepresentations.".format(tensor_names -
                                                tensor_representations))
        used_paths = set()
        for tensor_name in tensor_names:
            tensor_rep = tensor_representations[tensor_name]
            used_paths.update(
                tensor_representation_util.GetSourceColumnsFromTensorRepresentation(
                    tensor_rep))
        result = schema_pb2.Schema()
        # Note: We only copy projected features into the new schema because the
        # coder and ArrowSchema() only care about Schema.feature. If they start
        # depending on other Schema fields, those fields must also be projected.
        for f in self._schema.feature:
            p = path.ColumnPath(f.name)
            if f.name == _SEQUENCE_COLUMN_NAME:
                if f.type != schema_pb2.STRUCT:
                    raise ValueError(
                        "Feature {} was expected to be of type STRUCT, but got {}"
                        .format(f.name, f))
                result_sequence_struct = schema_pb2.Feature()
                result_sequence_struct.CopyFrom(f)
                result_sequence_struct.ClearField("struct_domain")
                any_sequence_feature_projected = False
                for sf in f.struct_domain.feature:
                    sequence_feature_path = p.child(sf.name)
                    if sequence_feature_path in used_paths:
                        any_sequence_feature_projected = True
                        result_sequence_struct.struct_domain.feature.add(
                        ).CopyFrom(sf)
                if any_sequence_feature_projected:
                    result.feature.add().CopyFrom(result_sequence_struct)
            elif p in used_paths:
                result.feature.add().CopyFrom(f)

        tensor_representation_util.SetTensorRepresentationsInSchema(
            result, {
                k: v
                for k, v in tensor_representations.items() if k in tensor_names
            })

        return result
Example #11
def _clean_feature(feature: schema_pb2.Feature) -> schema_pb2.Feature:
    """Remove name and all children of a feature (if any exist), returning a copy.

  Args:
    feature: input feature

  Returns:
    cleaned feature
  """
    copy = schema_pb2.Feature()
    copy.CopyFrom(feature)
    copy.ClearField("name")
    if copy.HasField("struct_domain"):
        del copy.struct_domain.feature[:]
    return copy
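A quick usage sketch with illustrative values:

feature = schema_pb2.Feature(name='doc', type=schema_pb2.STRUCT)
feature.struct_domain.feature.add().name = 'child'

cleaned = _clean_feature(feature)
# cleaned keeps the STRUCT type but has no name, and its
# struct_domain.feature list is empty; the input feature is unchanged.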
Example #12
def create_protobuf_feature(column_schema):
    feature = schema_pb2.Feature()
    feature.name = column_schema.name
    feature = register_dtype(column_schema, feature)
    annotation = feature.annotation
    annotation.tag.extend([
        tag.value if hasattr(tag, "value") else tag
        for tag in column_schema.tags
    ])
    # Properties can be instantiated with no values; if so, there is nothing
    # to dump.
    if len(column_schema.properties) > 0:
        feature = register_extra_metadata(column_schema, feature)
    return feature
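An isolated sketch of the tag-copying rule above: enum-like tags expose .value, plain strings pass through. The Tag enum here is an assumption for illustration:

import enum

class Tag(enum.Enum):
    CATEGORICAL = "categorical"

feature = schema_pb2.Feature(name='f')
feature.annotation.tag.extend(
    t.value if hasattr(t, "value") else t for t in [Tag.CATEGORICAL, "custom"])
# feature.annotation.tag now contains ["categorical", "custom"].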
Example #13
def _get_promote_schema_feature(
        original: Optional[schema_pb2.Feature],
        parent: Optional[schema_pb2.Feature]) -> Optional[schema_pb2.Feature]:
    """Generate the schema feature for the field resulting from promote.

  Note that promote results in the exact same number of values.

  Note that min_count is never propagated.

  Args:
    original: the original feature
    parent: the parent feature

  Returns:
    the schema of the new field.
  """
    if original is None or parent is None:
        return None
    result = schema_pb2.Feature()
    result.lifecycle_stage = _min_lifecycle_stage(original.lifecycle_stage,
                                                  parent.lifecycle_stage)
    result.type = original.type
    if original.HasField("distribution_constraints"):
        result.distribution_constraints.CopyFrom(
            original.distribution_constraints)
    _copy_domain_info(original, result)

    if _feature_is_dense(parent):
        parent_size = parent.value_count.min
        if original.value_count.HasField("min"):
            result.value_count.min = parent_size * original.value_count.min
        if original.value_count.HasField("max"):
            result.value_count.max = parent_size * original.value_count.max
        if original.presence.HasField("min_fraction"):
            if original.presence.min_fraction == 1:
                result.presence.min_fraction = 1
            else:
                result.presence.min_fraction = (
                    original.presence.min_fraction / parent_size)
        if original.presence.HasField("min_count"):
            # If the parent is dense, the count is reduced by the parent size.
            # E.g. {{"a"},{"b"}},{{"c"},{"d"}},{{"e"},{"f"}} with a count of 6
            # and a parent size of 2 becomes {"a","b"}, {"c","d"}, {"e","f"},
            # which has a count of 3.
            result.presence.min_count = original.presence.min_count // parent_size
    return result
Example #14
    def test_stats_options_json_round_trip(self):
        generators = [
            lift_stats_generator.LiftStatsGenerator(
                schema=None,
                y_path=types.FeaturePath(['label']),
                x_paths=[types.FeaturePath(['feature'])])
        ]
        feature_whitelist = ['a']
        schema = schema_pb2.Schema(feature=[schema_pb2.Feature(name='f')])
        label_feature = 'label'
        weight_feature = 'weight'
        slice_functions = [slicing_util.get_feature_value_slicer({'b': None})]
        sample_rate = 0.01
        num_top_values = 21
        frequency_threshold = 2
        weighted_frequency_threshold = 2.0
        num_rank_histogram_buckets = 1001
        num_values_histogram_buckets = 11
        num_histogram_buckets = 11
        num_quantiles_histogram_buckets = 11
        epsilon = 0.02
        infer_type_from_schema = True
        desired_batch_size = 100
        enable_semantic_domain_stats = True
        semantic_domain_stats_sample_rate = 0.1

        options = stats_options.StatsOptions(
            generators=generators,
            feature_whitelist=feature_whitelist,
            schema=schema,
            label_feature=label_feature,
            weight_feature=weight_feature,
            slice_functions=slice_functions,
            sample_rate=sample_rate,
            num_top_values=num_top_values,
            frequency_threshold=frequency_threshold,
            weighted_frequency_threshold=weighted_frequency_threshold,
            num_rank_histogram_buckets=num_rank_histogram_buckets,
            num_values_histogram_buckets=num_values_histogram_buckets,
            num_histogram_buckets=num_histogram_buckets,
            num_quantiles_histogram_buckets=num_quantiles_histogram_buckets,
            epsilon=epsilon,
            infer_type_from_schema=infer_type_from_schema,
            desired_batch_size=desired_batch_size,
            enable_semantic_domain_stats=enable_semantic_domain_stats,
            semantic_domain_stats_sample_rate=semantic_domain_stats_sample_rate
        )

        options_json = options.to_json()
        options = stats_options.StatsOptions.from_json(options_json)

        self.assertIsNone(options.generators)
        self.assertEqual(feature_whitelist, options.feature_whitelist)
        compare.assertProtoEqual(self, schema, options.schema)
        self.assertEqual(label_feature, options.label_feature)
        self.assertEqual(weight_feature, options.weight_feature)
        self.assertIsNone(options.slice_functions)
        self.assertEqual(sample_rate, options.sample_rate)
        self.assertEqual(num_top_values, options.num_top_values)
        self.assertEqual(frequency_threshold, options.frequency_threshold)
        self.assertEqual(weighted_frequency_threshold,
                         options.weighted_frequency_threshold)
        self.assertEqual(num_rank_histogram_buckets,
                         options.num_rank_histogram_buckets)
        self.assertEqual(num_values_histogram_buckets,
                         options.num_values_histogram_buckets)
        self.assertEqual(num_histogram_buckets, options.num_histogram_buckets)
        self.assertEqual(num_quantiles_histogram_buckets,
                         options.num_quantiles_histogram_buckets)
        self.assertEqual(epsilon, options.epsilon)
        self.assertEqual(infer_type_from_schema,
                         options.infer_type_from_schema)
        self.assertEqual(desired_batch_size, options.desired_batch_size)
        self.assertEqual(enable_semantic_domain_stats,
                         options.enable_semantic_domain_stats)
        self.assertEqual(semantic_domain_stats_sample_rate,
                         options.semantic_domain_stats_sample_rate)
Example #15
            'x': tf.io.FixedLenSequenceFeature([], tf.int64)
        },
        'error_msg': r'Spec for feature "x" was .* of type .*, expected a '
        r'FixedLenFeature, VarLenFeature or SparseFeature',
        'error_class': TypeError
    },
]

_FEATURE_BY_NAME = {
    'x':
    text_format.Parse(
        """
        name: "x"
        type: INT
        int_domain { min: 0 max: 9 }
    """, schema_pb2.Feature()),
    'ragged$value':
    text_format.Parse(
        """
        name: "ragged$value"
        type: FLOAT
    """, schema_pb2.Feature()),
    'ragged$row_lengths_1':
    text_format.Parse(
        """
        name: "ragged$row_lengths_1"
        type: INT
    """, schema_pb2.Feature()),
    'ragged$row_lengths_2':
    text_format.Parse(
        """
Example #16
     'stats_options_kwargs': {
         'semantic_domain_stats_sample_rate': 2
     },
     'exception_type': ValueError,
     'error_message': 'Invalid semantic_domain_stats_sample_rate 2'
 },
 {
     'testcase_name':
     'categorical_float_without_sketch_generators',
     'stats_options_kwargs': {
         'experimental_use_sketch_based_topk_uniques':
         False,
         'schema':
         schema_pb2.Schema(feature=[
             schema_pb2.Feature(
                 name='f',
                 type=schema_pb2.FLOAT,
                 float_domain=schema_pb2.FloatDomain(is_categorical=True))
         ], ),
     },
     'exception_type':
     ValueError,
     'error_message': ('Categorical float features set in schema require '
                       'experimental_use_sketch_based_topk_uniques'),
 },
 {
     'testcase_name': 'both_slice_fns_and_slice_sqls_specified',
     'stats_options_kwargs': {
         'experimental_slice_functions': [lambda x: (None, x)],
         'experimental_slice_sqls': ['']
     },
     'exception_type': ValueError,
Example #17
    def test_valid_stats_options_json_round_trip(self):
        feature_allowlist = ['a']
        schema = schema_pb2.Schema(feature=[schema_pb2.Feature(name='f')])
        vocab_paths = {'a': '/path/to/a'}
        label_feature = 'label'
        weight_feature = 'weight'
        sample_rate = 0.01
        num_top_values = 21
        frequency_threshold = 2
        weighted_frequency_threshold = 2.0
        num_rank_histogram_buckets = 1001
        num_values_histogram_buckets = 11
        num_histogram_buckets = 11
        num_quantiles_histogram_buckets = 11
        epsilon = 0.02
        infer_type_from_schema = True
        desired_batch_size = 100
        enable_semantic_domain_stats = True
        semantic_domain_stats_sample_rate = 0.1
        per_feature_weight_override = {types.FeaturePath(['a']): 'w'}
        add_default_generators = True
        use_sketch_based_topk_uniques = True
        experimental_result_partitions = 3

        options = stats_options.StatsOptions(
            feature_allowlist=feature_allowlist,
            schema=schema,
            vocab_paths=vocab_paths,
            label_feature=label_feature,
            weight_feature=weight_feature,
            sample_rate=sample_rate,
            num_top_values=num_top_values,
            frequency_threshold=frequency_threshold,
            weighted_frequency_threshold=weighted_frequency_threshold,
            num_rank_histogram_buckets=num_rank_histogram_buckets,
            num_values_histogram_buckets=num_values_histogram_buckets,
            num_histogram_buckets=num_histogram_buckets,
            num_quantiles_histogram_buckets=num_quantiles_histogram_buckets,
            epsilon=epsilon,
            infer_type_from_schema=infer_type_from_schema,
            desired_batch_size=desired_batch_size,
            enable_semantic_domain_stats=enable_semantic_domain_stats,
            semantic_domain_stats_sample_rate=semantic_domain_stats_sample_rate,
            per_feature_weight_override=per_feature_weight_override,
            add_default_generators=add_default_generators,
            experimental_use_sketch_based_topk_uniques=
            use_sketch_based_topk_uniques,
            experimental_result_partitions=experimental_result_partitions,
        )

        options_json = options.to_json()
        options = stats_options.StatsOptions.from_json(options_json)

        self.assertEqual(feature_allowlist, options.feature_allowlist)
        compare.assertProtoEqual(self, schema, options.schema)
        self.assertEqual(vocab_paths, options.vocab_paths)
        self.assertEqual(label_feature, options.label_feature)
        self.assertEqual(weight_feature, options.weight_feature)
        self.assertEqual(sample_rate, options.sample_rate)
        self.assertEqual(num_top_values, options.num_top_values)
        self.assertEqual(frequency_threshold, options.frequency_threshold)
        self.assertEqual(weighted_frequency_threshold,
                         options.weighted_frequency_threshold)
        self.assertEqual(num_rank_histogram_buckets,
                         options.num_rank_histogram_buckets)
        self.assertEqual(num_values_histogram_buckets,
                         options.num_values_histogram_buckets)
        self.assertEqual(num_histogram_buckets, options.num_histogram_buckets)
        self.assertEqual(num_quantiles_histogram_buckets,
                         options.num_quantiles_histogram_buckets)
        self.assertEqual(epsilon, options.epsilon)
        self.assertEqual(infer_type_from_schema,
                         options.infer_type_from_schema)
        self.assertEqual(desired_batch_size, options.desired_batch_size)
        self.assertEqual(enable_semantic_domain_stats,
                         options.enable_semantic_domain_stats)
        self.assertEqual(semantic_domain_stats_sample_rate,
                         options.semantic_domain_stats_sample_rate)
        self.assertEqual(per_feature_weight_override,
                         options._per_feature_weight_override)
        self.assertEqual(add_default_generators,
                         options.add_default_generators)
        self.assertEqual(use_sketch_based_topk_uniques,
                         options.experimental_use_sketch_based_topk_uniques)
        self.assertEqual(experimental_result_partitions,
                         options.experimental_result_partitions)
Example #18
    def export_tfx_schema(self) -> schema_pb2.Schema:
        """
        Create a Tensorflow metadata schema from a FeatureSet.

        Returns:
            Tensorflow metadata schema.

        """
        schema = schema_pb2.Schema()

        # List of attributes to copy from fields in the FeatureSet to feature in
        # Tensorflow metadata schema where the attribute name is the same.
        attributes_to_copy_from_field_to_feature = [
            "name",
            "presence",
            "group_presence",
            "shape",
            "value_count",
            "domain",
            "int_domain",
            "float_domain",
            "string_domain",
            "bool_domain",
            "struct_domain",
            "_natural_language_domain",
            "image_domain",
            "mid_domain",
            "url_domain",
            "time_domain",
            "time_of_day_domain",
        ]

        for _, field in self._fields.items():
            if isinstance(field, Entity):
                continue
            feature = schema_pb2.Feature()
            for attr in attributes_to_copy_from_field_to_feature:
                if getattr(field, attr) is None:
                    # This corresponds to an unset member in the proto Oneof field.
                    continue
                if issubclass(type(getattr(feature, attr)), Message):
                    # Proto message field to copy is an "embedded" field, so MergeFrom()
                    # method must be used.
                    getattr(feature, attr).MergeFrom(getattr(field, attr))
                elif issubclass(type(getattr(feature, attr)),
                                (int, str, bool)):
                    # Proto message field is a simple Python type, so setattr()
                    # can be used.
                    setattr(feature, attr, getattr(field, attr))
                else:
                    warnings.warn(
                        f"Attribute '{attr}' cannot be copied from Field "
                        f"'{field.name}' in FeatureSet '{self.name}' to a "
                        f"Feature in the Tensorflow metadata schema, because "
                        f"the type is neither a Protobuf message nor a Python "
                        f"int, str, or bool")
            # "type" attr is handled separately because the attribute name is different
            # ("dtype" in field and "type" in Feature) and "type" in Feature is only
            # a subset of "dtype".
            feature.type = field.dtype.to_tfx_schema_feature_type()
            schema.feature.append(feature)

        return schema
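The MergeFrom-versus-setattr split above hinges on whether the target attribute is itself a protobuf message; an isolated sketch with illustrative values:

from google.protobuf.message import Message

feature = schema_pb2.Feature()
int_domain = schema_pb2.IntDomain(min=0, max=9)
if isinstance(getattr(feature, 'int_domain'), Message):
    # Embedded message fields cannot be assigned directly; merge instead.
    feature.int_domain.MergeFrom(int_domain)
setattr(feature, 'name', 'f')  # simple scalar fields assign directly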