def extract_output(self, accumulator: CrossFeatureStatsGeneratorAccumulator
                  ) -> statistics_pb2.DatasetFeatureStatistics:
  """Turns accumulated partial cross-feature sums into a statistics proto.

  One CrossFeatureStatistics entry is added per feature cross, carrying the
  two feature paths and the example count. When the count is positive, the
  covariance is derived from the running sums as E[xy] - E[x] * E[y]; the
  correlation is only filled in when both standard deviations are non-zero.

  Args:
    accumulator: Mapping from a feature cross (indexed with [0] and [1]) to
      partial stats exposing count, sum_x, sum_y, sum_xy, sum_square_x and
      sum_square_y.

  Returns:
    A DatasetFeatureStatistics proto with one cross_features entry per
    feature cross found in the accumulator.
  """
  dataset_stats = statistics_pb2.DatasetFeatureStatistics()

  for cross_key, partial in accumulator.items():
    entry = dataset_stats.cross_features.add()
    entry.path_x.CopyFrom(path_pb2.Path(step=[cross_key[0]]))
    entry.path_y.CopyFrom(path_pb2.Path(step=[cross_key[1]]))

    n = partial.count
    entry.count = n
    if n > 0:
      mean_x = partial.sum_x / n
      mean_y = partial.sum_y / n
      covariance = (partial.sum_xy / n) - mean_x * mean_y

      # Clamp at zero: rounding can push a tiny variance slightly negative,
      # which would make sqrt raise.
      variance_x = (partial.sum_square_x / n) - math.pow(mean_x, 2)
      variance_y = (partial.sum_square_y / n) - math.pow(mean_y, 2)
      std_dev_x = math.sqrt(max(0, variance_x))
      std_dev_y = math.sqrt(max(0, variance_y))

      numeric_stats = statistics_pb2.NumericCrossStatistics(
          covariance=covariance)
      # Correlation is undefined when either feature is constant, so the
      # field is left unset in that case.
      if std_dev_x != 0 and std_dev_y != 0:
        numeric_stats.correlation = covariance / (std_dev_x * std_dev_y)
      entry.num_cross_stats.CopyFrom(numeric_stats)

  return dataset_stats
def _ragged_tensor_representation_from_feature_spec(
    spec: common_types.RaggedFeature, name: str,
    domains: Dict[str, common_types.DomainType]
) -> Tuple[schema_pb2.Feature, List[schema_pb2.Feature],
           schema_pb2.TensorRepresentation]:
  """Builds the schema pieces describing a RaggedTensor feature spec.

  Args:
    spec: A tf.io.RaggedFeature feature spec.
    name: Feature name. Used as the value feature's name when the spec has no
      `value_key`, and as the lookup key into `domains`.
    domains: A dict whose keys are feature names and values are one of
      schema_pb2.IntDomain, schema_pb2.StringDomain or schema_pb2.FloatDomain.

  Returns:
    A tuple (value_feature, partitions_features, ragged_tensor_rep):
      value_feature: the feature holding the RaggedTensor values.
      partitions_features: one INT feature per RowLengths partition, in spec
        order.
      ragged_tensor_rep: a TensorRepresentation wrapping the ragged tensor.

  Raises:
    ValueError: If the feature spec contains partition types different from
      UniformRowLength and RowLengths.
  """
  values_name = spec.value_key or name
  value_feature = schema_pb2.Feature(name=values_name)
  _set_type(name, value_feature, spec.dtype)
  _set_domain(name, value_feature, domains.get(name))

  partition_cls = schema_pb2.TensorRepresentation.RaggedTensor.Partition
  ragged_tensor = schema_pb2.TensorRepresentation.RaggedTensor(
      feature_path=path_pb2.Path(step=[values_name]))

  partitions_features = []
  for partition in spec.partitions:
    if isinstance(partition, tf.io.RaggedFeature.UniformRowLength):  # pytype: disable=attribute-error
      # Uniform partitions carry their length inline; no extra feature needed.
      ragged_tensor.partition.append(
          partition_cls(uniform_row_length=partition.length))
      continue
    if isinstance(partition, tf.io.RaggedFeature.RowLengths):  # pytype: disable=attribute-error
      # Row-length partitions refer to another feature by key, so that
      # feature is emitted alongside the value feature.
      ragged_tensor.partition.append(partition_cls(row_length=partition.key))
      partitions_features.append(
          schema_pb2.Feature(name=partition.key, type=schema_pb2.INT))
      continue
    raise ValueError(
        'RaggedFeature can only be created with UniformRowLength and '
        'RowLengths partitions.')

  tensor_representation = schema_pb2.TensorRepresentation(
      ragged_tensor=ragged_tensor)
  return value_feature, partitions_features, tensor_representation
def as_proto(self):
  """Serializes this path as a Path proto.

  Each string element of `self.field_list` becomes one step of the proto,
  in order.

  Returns:
    A Path proto.

  Raises:
    ValueError: If the path contains an anonymous field, or an element that
      is neither a string nor an AnonymousId.
  """
  path_proto = tf_metadata_path_pb2.Path()
  for element in self.field_list:
    if isinstance(element, str):
      path_proto.step.append(element)
      continue
    # Anything that is not a plain string cannot be represented as a step.
    if isinstance(element, AnonymousId):
      raise ValueError("Cannot serialize a path with anonymous fields")
    raise ValueError("Unexpected path element type: %s" % type(element))
  return path_proto
def to_proto(self) -> path_pb2.Path:
  """Returns a `path_pb2.Path` whose steps are taken from `self._steps`."""
  steps = self._steps
  return path_pb2.Path(step=steps)
def to_proto(self) -> path_pb2.Path:
  """Creates a tensorflow_metadata path proto from this ColumnPath.

  Returns:
    A `path_pb2.Path` whose `step` field is populated from `self._steps`.
  """
  path_steps = self._steps
  return path_pb2.Path(step=path_steps)