Ejemplo n.º 1
0
def _to_partial_x_counts(
    sliced_record_batch: types.SlicedRecordBatch,
    x_paths: Iterable[types.FeaturePath], example_weight_map: ExampleWeightMap
) -> Iterator[Tuple[_SlicedXKey, _CountType]]:
    """Generates partial counts of x values for every requested x_path.

    Each x_path is counted independently by delegating to `_to_partial_counts`
    with no bin boundaries and with whatever weight column name
    `example_weight_map` returns for that path.

    Args:
      sliced_record_batch: The sliced record batch forwarded unchanged to
        `_to_partial_counts`.
      x_paths: The feature paths whose values are counted.
      example_weight_map: Queried once per x_path (via `.get`) for the name of
        the weight column to use.

    Yields:
      `(_SlicedXKey(slice_key, x_path.steps(), x), count)` tuples, one for each
      `((slice_key, x), count)` item produced by `_to_partial_counts`.
    """
    for path in x_paths:
        weight_column = example_weight_map.get(path)
        partial_counts = _to_partial_counts(
            sliced_record_batch,
            path,
            boundaries=None,
            weight_column_name=weight_column)
        for key, count in partial_counts:
            slice_key, x_value = key
            yield _SlicedXKey(slice_key, path.steps(), x_value), count
Ejemplo n.º 2
0
def _to_partial_copresence_counts(
    sliced_record_batch: types.SlicedRecordBatch,
    y_path: types.FeaturePath,
    x_paths: Iterable[types.FeaturePath],
    y_boundaries: Optional[np.ndarray],
    example_weight_map: ExampleWeightMap,
    num_xy_pairs_batch_copresent: Optional[
        beam.metrics.metric.Metrics.DelegatingDistribution] = None
) -> Iterator[Tuple[_SlicedXYKey, _CountType]]:
    """Yields per-(slice, x_path, x, y) counts of examples with x and y.

    This method generates the number of times a given pair of y- and x-values
    appear in the same record, for a slice_key and x_path. Records in which
    either x or y is absent will be skipped. If no presence information can be
    obtained for y_path, nothing is yielded; x_paths without presence
    information are skipped individually.

    Args:
      sliced_record_batch: A tuple of (slice_key, record_batch) representing a
        slice of examples.
      y_path: The path to use as Y in the lift expression: lift = P(Y=y|X=x) /
        P(Y=y).
      x_paths: A set of x_paths for which to compute lift.
      y_boundaries: Optionally, a set of bin boundaries to use for binning
        y_path values.
      example_weight_map: An ExampleWeightMap that maps a FeaturePath to its
        corresponding weight column. The weight column is looked up by x_path;
        when one is present the counts are accumulated as floats, otherwise as
        ints.
      num_xy_pairs_batch_copresent: An optional distribution metric tracking
        the number of different xy pairs that are copresent within each batch.
        It is updated once per x_path with the number of distinct (x, y) pairs
        found, so a pair that is copresent in more than one batch is counted
        once for each batch in which it is copresent.

    Yields:
      Tuples of the form (_SlicedXYKey(slice_key, x_path, x, y), count) for
      each combination of x_path, x, and y in the input record batch.
    """
    slice_key, record_batch = sliced_record_batch
    # y is looked up unweighted; weights are taken from the x side only.
    y_presence = _get_example_value_presence(record_batch,
                                             y_path,
                                             y_boundaries,
                                             weight_column_name=None)
    if y_presence is None:
        return

    # Group the y values by the index of the example they occur in.
    ys_by_example = collections.defaultdict(list)
    for example_index, y in zip(y_presence.example_indices, y_presence.values):
        ys_by_example[example_index].append(y)

    for x_path in x_paths:
        weight_column_name = example_weight_map.get(x_path)
        x_presence = _get_example_value_presence(
            record_batch,
            x_path,
            boundaries=None,
            weight_column_name=weight_column_name)
        if x_presence is None:
            continue
        if weight_column_name is not None:
            copresence_counts = collections.defaultdict(float)
        else:
            copresence_counts = collections.defaultdict(int)

        for example_index, x, weight in zip(x_presence.example_indices,
                                            x_presence.values,
                                            x_presence.weights):
            # Use .get() instead of indexing: indexing a defaultdict inserts an
            # empty list for every example that has x but no y, which would
            # needlessly grow ys_by_example across all x_paths.
            for y in ys_by_example.get(example_index, ()):
                copresence_counts[(x, y)] += weight

        # Compare against None explicitly so the metric is updated whenever one
        # was supplied, independent of the metric object's truthiness.
        if num_xy_pairs_batch_copresent is not None:
            num_xy_pairs_batch_copresent.update(len(copresence_counts))
        for (x, y), count in copresence_counts.items():
            yield (_SlicedXYKey(slice_key=slice_key,
                                x_path=x_path.steps(),
                                x=x,
                                y=y), count)