Example #1
0
def _make_dataset_feature_stats_proto(
    lifts: Tuple[_SlicedFeatureKey, _LiftSeries], y_path: types.FeaturePath,
    y_boundaries: Optional[np.ndarray]
) -> Tuple[types.SliceKey, statistics_pb2.DatasetFeatureStatistics]:
    """Builds the lift cross-feature statistics for one (slice, x_path) key.

    Args:
        lifts: A `(key, lift_series)` pair produced by two successive group
            bys of lift values. `key` carries the slice key and the X feature
            path; `lift_series` holds one entry per y value for that key. The
            overall structure is:
                (slice, x_path), [(y, y_count, [(x, lift, xy_count, x_count)])]
        y_path: The path used as Y in the lift expression:
            lift = P(Y=y|X=x) / P(Y=y).
        y_boundaries: Bin boundaries used for binning y_path values, or None.
            When provided, every y is written as a bucket (low/high value)
            instead of as a string or int.

    Returns:
        A `(slice_key, stats)` tuple, where `stats` is a
        DatasetFeatureStatistics proto holding a single cross-feature entry
        for (x_path, y_path) with one lift series per y value.
    """
    sliced_key, all_series = lifts
    result = statistics_pb2.DatasetFeatureStatistics()
    cross_feature = result.cross_features.add(
        path_x=sliced_key.x_path.to_proto(), path_y=y_path.to_proto())

    def _lift_desc_then_x(entry):
        # Highest lift first; equal lifts are ordered by ascending x.
        return (-entry.lift, entry.x)

    for series in sorted(all_series):
        series_proto = (
            cross_feature.categorical_cross_stats.lift.lift_series.add(
                y_count=series.y_count))

        # Record y either as a bucket (binned) or as a raw string/int value.
        y_value = series.y
        if y_boundaries is None:
            if isinstance(y_value, six.string_types):
                series_proto.y_string = y_value
            else:
                series_proto.y_int = y_value
        else:
            bucket_low, bucket_high = bin_util.get_boundaries(
                y_value, y_boundaries)
            series_proto.y_bucket.low_value = bucket_low
            series_proto.y_bucket.high_value = bucket_high

        # The same x may be listed more than once (e.g. when top_k and
        # bottom_k selections overlap); keep only the last entry seen per x.
        unique_by_x = {}
        for entry in series.lift_values:
            unique_by_x[entry.x] = entry

        for entry in sorted(unique_by_x.values(), key=_lift_desc_then_x):
            value_proto = series_proto.lift_values.add(
                lift=entry.lift,
                x_count=entry.x_count,
                x_and_y_count=entry.xy_count)
            x_value = entry.x
            if isinstance(x_value, six.string_types):
                value_proto.x_string = x_value
            else:
                value_proto.x_int = x_value

    return sliced_key.slice_key, result
Example #2
0
def _make_dataset_feature_stats_proto(
    lifts: Tuple[_SlicedFeatureKey, Iterable[_LiftSeries]],
    y_path: types.FeaturePath, y_boundaries: Optional[np.ndarray],
    weighted_examples: bool, output_custom_stats: bool
) -> Tuple[types.SliceKey, statistics_pb2.DatasetFeatureStatistics]:
    """Builds the lift cross-feature statistics for one (slice, x_path) key.

    Args:
        lifts: A `(key, lift_series)` pair produced by two successive group
            bys of lift values. `key` carries the slice key and the X feature
            path; `lift_series` holds one entry per y value for that key. The
            overall structure is:
                (slice, x_path), [(y, y_count, [(x, lift, xy_count, x_count)])]
        y_path: The path used as Y in the lift expression:
            lift = P(Y=y|X=x) / P(Y=y).
        y_boundaries: Bin boundaries used for binning y_path values, or None.
            A y value is written as a bucket only when boundaries are given
            and y is an `int`; otherwise it is written as a string or int.
        weighted_examples: Whether lift is computed over weighted examples.
            When true, counts are written to the weighted_* proto fields
            instead of the plain count fields.
        output_custom_stats: Whether to additionally emit, for the X feature,
            one rank-histogram custom stat per lift series (named
            'Lift (Y=<y>)') whose buckets are labelled by x and carry the lift
            as sample_count. Intended for use with Facets.

    Returns:
        A `(slice_key, stats)` tuple, where `stats` is the populated
        DatasetFeatureStatistics proto.
    """
    sliced_key, all_series = lifts
    result = statistics_pb2.DatasetFeatureStatistics()
    cross_feature = result.cross_features.add(
        path_x=sliced_key.x_path.to_proto(), path_y=y_path.to_proto())
    if output_custom_stats:
        custom_feature = result.features.add(
            path=sliced_key.x_path.to_proto())

    def _lift_desc_then_x(entry):
        # Highest lift first; equal lifts are ordered by ascending x.
        return (-entry.lift, entry.x)

    for series in sorted(all_series):
        series_proto = (
            cross_feature.categorical_cross_stats.lift.lift_series.add())
        if not weighted_examples:
            series_proto.y_count = series.y_count
        else:
            series_proto.weighted_y_count = series.y_count

        # Record y in the proto and derive the label used for custom stats.
        y_value = series.y
        use_bucket = y_boundaries is not None and isinstance(y_value, int)
        if use_bucket:
            bucket_low, bucket_high = bin_util.get_boundaries(
                y_value, y_boundaries)
            series_proto.y_bucket.low_value = bucket_low
            series_proto.y_bucket.high_value = bucket_high
            # The last (unbounded) bucket is displayed as closed on the right.
            if bucket_high == float('inf'):
                y_label = '[{},{}]'.format(bucket_low, bucket_high)
            else:
                y_label = '[{},{})'.format(bucket_low, bucket_high)
        elif isinstance(y_value, six.text_type):
            series_proto.y_string = y_value
            y_label = y_value
        elif isinstance(y_value, six.binary_type):
            y_label = _get_unicode_value(y_value, y_path)
            series_proto.y_string = y_label
        else:
            series_proto.y_int = y_value
            y_label = str(y_value)

        if output_custom_stats:
            histogram = custom_feature.custom_stats.add(
                name='Lift (Y={})'.format(y_label)).rank_histogram

        # The same x may be listed more than once (e.g. when top_k and
        # bottom_k selections overlap); keep only the last entry seen per x.
        unique_by_x = {}
        for entry in series.lift_values:
            unique_by_x[entry.x] = entry

        for entry in sorted(unique_by_x.values(), key=_lift_desc_then_x):
            value_proto = series_proto.lift_values.add(lift=entry.lift)
            if not weighted_examples:
                value_proto.x_count = entry.x_count
                value_proto.x_and_y_count = entry.xy_count
            else:
                value_proto.weighted_x_count = entry.x_count
                value_proto.weighted_x_and_y_count = entry.xy_count

            x_value = entry.x
            if isinstance(x_value, six.text_type):
                value_proto.x_string = x_value
                x_label = x_value
            elif isinstance(x_value, six.binary_type):
                x_label = _get_unicode_value(x_value, sliced_key.x_path)
                value_proto.x_string = x_label
            else:
                value_proto.x_int = x_value
                x_label = str(x_value)

            if output_custom_stats:
                histogram.buckets.add(label=x_label, sample_count=entry.lift)

    return sliced_key.slice_key, result