def _to_partial_x_counts(
    sliced_record_batch: types.SlicedRecordBatch,
    x_paths: Iterable[types.FeaturePath],
    example_weight_map: ExampleWeightMap
) -> Iterator[Tuple[_SlicedXKey, _CountType]]:
  """Yields per-(slice, x_path, x) counts of the examples with x in x_path.

  Each path in `x_paths` is handed to `_to_partial_counts` (without bin
  boundaries), and every ((slice_key, x), count) item it produces is re-keyed
  so that the key also carries the steps of the originating path.

  Args:
    sliced_record_batch: The sliced record batch to count over; forwarded
      unchanged to `_to_partial_counts`.
    x_paths: The feature paths whose values should be counted.
    example_weight_map: Queried with `.get(x_path)` to obtain the weight column
      name passed along for each path.

  Yields:
    Tuples of the form (_SlicedXKey(slice_key, x_path.steps(), x), count).
  """
  for path in x_paths:
    counts_for_path = _to_partial_counts(
        sliced_record_batch,
        path,
        boundaries=None,
        weight_column_name=example_weight_map.get(path))
    for (slice_key, x_value), count in counts_for_path:
      sliced_key = _SlicedXKey(slice_key, path.steps(), x_value)
      yield sliced_key, count
def _to_partial_copresence_counts(
    sliced_record_batch: types.SlicedRecordBatch,
    y_path: types.FeaturePath,
    x_paths: Iterable[types.FeaturePath],
    y_boundaries: Optional[np.ndarray],
    example_weight_map: ExampleWeightMap,
    num_xy_pairs_batch_copresent: Optional[
        beam.metrics.metric.Metrics.DelegatingDistribution] = None
) -> Iterator[Tuple[_SlicedXYKey, _CountType]]:
  """Yields per-(slice, path_x, x, y) counts of examples with x and y.

  This method generates the number of times a given pair of y- and x-values
  appear in the same record, for a slice_key and x_path. Records in which
  either x or y is absent will be skipped.

  Args:
    sliced_record_batch: A tuple of (slice_key, record_batch) representing a
      slice of examples.
    y_path: The path to use as Y in the lift expression:
      lift = P(Y=y|X=x) / P(Y=y).
    x_paths: A set of x_paths for which to compute lift.
    y_boundaries: Optionally, a set of bin boundaries to use for binning y_path
      values.
    example_weight_map: An ExampleWeightMap that maps a FeaturePath to its
      corresponding weight column.
    num_xy_pairs_batch_copresent: A counter tracking the number of different xy
      pairs that are copresent within each batch. If the same pair of xy values
      are copresent in more than one batch, this counter will be incremented
      once for each batch in which they are copresent.

  Yields:
    Tuples of the form (_SlicedXYKey(slice_key, x_path, x, y), count) for each
    combination of x_path, x, and y in the input record batch. The count is the
    sum of the x-presence weights over the examples in which x and y co-occur.
  """
  slice_key, record_batch = sliced_record_batch
  y_presence = _get_example_value_presence(
      record_batch, y_path, y_boundaries, weight_column_name=None)
  if y_presence is None:
    # Nothing to pair the x values with, so there are no copresence counts.
    return

  # Group the y values by the index of the example they appear in, so that each
  # x value can be paired with the y values of the same example.
  ys_by_example = collections.defaultdict(list)
  for example_index, y in zip(y_presence.example_indices, y_presence.values):
    ys_by_example[example_index].append(y)

  for x_path in x_paths:
    weight_column_name = example_weight_map.get(x_path)
    x_presence = _get_example_value_presence(
        record_batch,
        x_path,
        boundaries=None,
        weight_column_name=weight_column_name)
    if x_presence is None:
      continue

    # Start from a float zero when a weight column is configured for this path
    # and from an int zero otherwise.
    if weight_column_name is not None:
      copresence_counts = collections.defaultdict(float)
    else:
      copresence_counts = collections.defaultdict(int)

    for example_index, x, weight in zip(x_presence.example_indices,
                                        x_presence.values, x_presence.weights):
      # Use .get() instead of indexing: indexing a defaultdict would insert an
      # empty list for every example that has x but no y, growing the map
      # needlessly on each x_path pass.
      for y in ys_by_example.get(example_index, ()):
        copresence_counts[(x, y)] += weight

    # The metric is optional; compare against None explicitly so that a
    # supplied metric object is always updated regardless of its truthiness.
    if num_xy_pairs_batch_copresent is not None:
      num_xy_pairs_batch_copresent.update(len(copresence_counts))

    for (x, y), count in copresence_counts.items():
      yield (_SlicedXYKey(
          slice_key=slice_key, x_path=x_path.steps(), x=x, y=y), count)