Ejemplo n.º 1
0
def test_high_cardinality_not_discrete():
    """Track a 13-value sample once and expect a non-discrete summary."""
    tracker = NumberTracker()
    samples = [1, 2, 3] * 3 + [4.0, 6.0, 9.0, 9.0]
    for sample in samples:
        tracker.track(sample)
    assert not tracker.to_summary().is_discrete
Ejemplo n.º 2
0
def test_low_cardinality_is_discrete():
    """Track the 13-value base sample ten times over and expect a discrete summary."""
    base_sample = 3 * [1, 2, 3] + [4.0, 6.0, 9.0, 9.0]
    tracker = NumberTracker()
    # Ten passes over the base sample yield the same sequence as `base_sample * 10`.
    for _ in range(10):
        for value in base_sample:
            tracker.track(value)
    assert tracker.to_summary().is_discrete
Ejemplo n.º 3
0
def test_not_discrete_if_float_is_present():
    """Track a repeated sample that mixes ints and floats; expect a non-discrete summary."""
    mixed = 3 * [1, 2, 3] + [4.0, 6.0, 9.0, 9.0]
    tracker = NumberTracker()
    for value in 10 * mixed:
        tracker.track(value)
    result = tracker.to_summary()
    assert not result.is_discrete
Ejemplo n.º 4
0
    def from_protobuf(
        cls: "NLPMetrics",
        message: NLPMetricsMessage,
    ):
        """
        Rebuild NLP metrics from a protobuf message.

        Args:
            message (NLPMetricsMessage): message carrying the ``wer``, ``wil``
                and ``mer`` tracker sub-messages.

        Returns:
            NLPMetrics: a new instance whose three trackers are deserialized
                from the message.
        """
        restored = NLPMetrics()
        # Each metric is deserialized independently through NumberTracker.
        for metric_name in ("wer", "wil", "mer"):
            tracker = NumberTracker.from_protobuf(getattr(message, metric_name))
            setattr(restored, metric_name, tracker)
        return restored
Ejemplo n.º 5
0
def test_int_value_should_not_increase_float_count():
    """Track three ints and check counters, variance, cardinality and histogram."""
    tracker = NumberTracker()
    for number in (10, 11, 12):
        tracker.track(number)

    # Only the integer counter is expected to move.
    assert tracker.ints.count == 3
    assert tracker.floats.count == 0
    assert tracker.variance.stddev() == pytest.approx(1.0, 1e-3)

    distinct_estimate = tracker.theta_sketch.get_result().get_estimate()
    assert distinct_estimate == pytest.approx(3, 1e-4)

    sketch = tracker.histogram
    assert sketch.get_n() == 3
    assert sketch.get_max_value() == pytest.approx(12, 1e-4)
    assert sketch.get_min_value() == pytest.approx(10, 1e-4)
Ejemplo n.º 6
0
 def __init__(
     self,
     name: str,
     number_tracker: NumberTracker = None,
     string_tracker: StringTracker = None,
     schema_tracker: SchemaTracker = None,
     counters: CountersTracker = None,
     frequent_items: FrequentItemsSketch = None,
     cardinality_tracker: HllSketch = None,
 ):
     """
     Store the column name and its trackers, creating any that were omitted.

     Parameters
     ----------
     name : str
         Stored as ``column_name``.
     number_tracker, string_tracker, schema_tracker, counters,
     frequent_items, cardinality_tracker
         Optional pre-built components. Each one left as ``None`` is
         replaced by a default-constructed instance of its type.
     """
     # Resolve defaults first, in the same order the arguments are checked.
     counters = CountersTracker() if counters is None else counters
     number_tracker = NumberTracker() if number_tracker is None else number_tracker
     string_tracker = StringTracker() if string_tracker is None else string_tracker
     schema_tracker = SchemaTracker() if schema_tracker is None else schema_tracker
     frequent_items = FrequentItemsSketch() if frequent_items is None else frequent_items
     cardinality_tracker = HllSketch() if cardinality_tracker is None else cardinality_tracker

     # Bind everything onto the instance.
     self.column_name = name
     self.number_tracker = number_tracker
     self.string_tracker = string_tracker
     self.schema_tracker = schema_tracker
     self.counters = counters
     self.frequent_items = frequent_items
     self.cardinality_tracker = cardinality_tracker
Ejemplo n.º 7
0
    def to_protobuf(self):
        """
        Serialize the confusion matrix to a protobuf message.

        Returns:
            ScoreMatrixMessage: message holding the labels, the prediction,
                target and score field names, and one serialized tracker per
                cell of the flattened (``np.ravel``) confusion matrix.
        """
        serialized_cells = []
        for cell in np.ravel(self.confusion_matrix):
            # A falsy cell is serialized as a fresh, empty NumberTracker.
            tracker = cell if cell else NumberTracker()
            serialized_cells.append(tracker.to_protobuf())

        return ScoreMatrixMessage(
            labels=self.labels,
            prediction_field=self.prediction_field,
            target_field=self.target_field,
            score_field=self.score_field,
            scores=serialized_cells,
        )
Ejemplo n.º 8
0
def test_count_is_correct():
    """Check ``count`` starts at 0, ignores ``None`` and counts ints and floats."""
    tracker = NumberTracker()
    assert tracker.count == 0

    tracker.track(None)
    assert tracker.count == 0

    # Each batch is tracked in full before the running total is checked.
    for batch, expected_total in (([1, 2, 3], 3), ([1.0, 2.0], 5)):
        for item in batch:
            tracker.track(item)
        assert tracker.count == expected_total
Ejemplo n.º 9
0
def test_track_floats_ints_unique_in_cardinality_estimate():
    """Expect ints and their float counterparts to count as distinct values."""
    tracker = NumberTracker()
    integers = [1, 2, 3, 4]

    for number in integers:
        tracker.track(number)
    assert tracker.to_summary().unique_count.estimate == 4

    # Track the same values again, this time converted to float.
    for as_float in map(float, integers):
        tracker.track(as_float)
    assert tracker.to_summary().unique_count.estimate == 8
Ejemplo n.º 10
0
def test_merge():
    """Merge a three-int tracker with itself and check the doubled statistics."""
    tracker = NumberTracker()
    for number in (10, 11, 13):
        tracker.track(number)

    combined = tracker.merge(tracker)

    assert combined.ints.count == 6
    assert combined.floats.count == 0
    combined_hist = combined.histogram
    assert combined_hist.get_n() == 6
    assert combined_hist.get_max_value() == 13.0
    assert combined_hist.get_min_value() == 10.0

    # The merged tracker must survive serialization and deserialization.
    NumberTracker.from_protobuf(combined.to_protobuf())
Ejemplo n.º 11
0
def test_protobuf_roundtrip():
    """Serialize a tracker, load it back and compare counters and histogram stats."""
    source = NumberTracker()
    for number in (10, 11, 13):
        source.track(number)

    restored = NumberTracker.from_protobuf(source.to_protobuf())

    assert source.ints.count == restored.ints.count
    assert source.floats.count == restored.floats.count
    # Histogram size and extremes must match on both sides.
    for getter in ("get_n", "get_min_value", "get_max_value"):
        assert getattr(source.histogram, getter)() == getattr(restored.histogram, getter)()
Ejemplo n.º 12
0
    def from_protobuf(message):
        """
        Build a column profile from its protobuf message.

        Parameters
        ----------
        message
            Protobuf message exposing ``name``, ``counters``, ``schema``,
            ``numbers``, ``strings``, ``frequent_items`` and
            ``cardinality_tracker``.

        Returns
        -------
        column_profile : ColumnProfile
        """
        # Deserialize each component through its own from_protobuf.
        counters = CountersTracker.from_protobuf(message.counters)
        schema = SchemaTracker.from_protobuf(message.schema)
        numbers = NumberTracker.from_protobuf(message.numbers)
        strings = StringTracker.from_protobuf(message.strings)
        frequent = FrequentItemsSketch.from_protobuf(message.frequent_items)
        cardinality = HllSketch.from_protobuf(message.cardinality_tracker)

        return ColumnProfile(
            message.name,
            counters=counters,
            schema_tracker=schema,
            number_tracker=numbers,
            string_tracker=strings,
            frequent_items=frequent,
            cardinality_tracker=cardinality,
        )
Ejemplo n.º 13
0
 def __init__(self,
              labels: List[str] = None,
              prediction_field: str = None,
              target_field: str = None,
              score_field: str = None):
     """
     Set up the field names and, when labels are given, the tracker grid.

     Args:
         labels (List[str]): class labels; stored sorted. When empty or
             ``None``, both ``labels`` and ``confusion_matrix`` are ``None``.
         prediction_field (str): stored as-is.
         target_field (str): stored as-is.
         score_field (str): stored as-is.
     """
     self.prediction_field = prediction_field
     self.target_field = target_field
     self.score_field = score_field

     if not labels:
         self.labels = None
         self.confusion_matrix = None
         return

     self.labels = sorted(labels)
     side = len(self.labels)
     grid = np.empty([side, side], dtype=object)
     # Visit every (row, column) cell in row-major order, one tracker each.
     for cell in np.ndindex(side, side):
         grid[cell] = NumberTracker()
     self.confusion_matrix = grid
Ejemplo n.º 14
0
    def from_protobuf(
        cls,
        message: ScoreMatrixMessage,
    ):
        """
        Rebuild a confusion matrix from a protobuf message.

        Args:
            message (ScoreMatrixMessage): serialized score matrix.

        Returns:
            ConfusionMatrix or None: ``None`` when the message has zero byte
                size. Otherwise an instance whose ``confusion_matrix`` is the
                deserialized trackers reshaped to (labels x labels), or
                ``None`` when the message has no labels.
        """
        if message.ByteSize() == 0:
            return None

        labels = message.labels
        side = len(labels)
        if side > 0:
            trackers = [NumberTracker.from_protobuf(score)
                        for score in message.scores]
            matrix = np.array(trackers).reshape((side, side))
        else:
            matrix = None

        restored = ConfusionMatrix(
            labels=labels,
            prediction_field=message.prediction_field,
            target_field=message.target_field,
            score_field=message.score_field,
        )
        # Replace the grid built by the constructor with the deserialized one.
        restored.confusion_matrix = matrix
        return restored
Ejemplo n.º 15
0
    def __init__(
        self,
        labels: List[str] = None,
        prediction_field: str = None,
        target_field: str = None,
        score_field: str = None,
    ):
        """
        Set up the field names and, when labels are given, the tracker grid.

        Args:
            labels (List[str]): class labels; stored sorted. When empty or
                ``None``, both ``labels`` and ``confusion_matrix`` are ``None``.
            prediction_field (str): stored as-is.
            target_field (str): stored as-is.
            score_field (str): stored as-is.

        Raises:
            ValueError: if the number of labels exceeds
                ``MODEL_METRICS_MAX_LABELS``.
        """
        self.prediction_field = prediction_field
        self.target_field = target_field
        self.score_field = score_field
        if labels:
            labels_size = len(labels)
            # Warn above the soft threshold; reject above the hard limit.
            if labels_size > MODEL_METRICS_LABEL_SIZE_WARNING_THRESHOLD:
                _logger.warning(
                    f"The initialized confusion matrix has {labels_size} labels and the resulting"
                    " confusion matrix will be larger than is recommended with whylogs current"
                    " representation of the model metric for a confusion matrix of this size."
                )
            if labels_size > MODEL_METRICS_MAX_LABELS:
                # Every fragment containing a placeholder needs its own `f`
                # prefix; adjacent literals are concatenated after formatting.
                raise ValueError(
                    f"The initialized confusion matrix has {labels_size} labels and the resulting"
                    " confusion matrix will be larger than is supported by whylogs current"
                    " representation of the model metric for a confusion matrix of this size,"
                    " selectively log the most important labels or configure the threshold of"
                    f" {MODEL_METRICS_MAX_LABELS} higher by setting MODEL_METRICS_MAX_LABELS."
                )

            self.labels = sorted(labels)
            num_labels = len(self.labels)
            self.confusion_matrix = np.empty([num_labels, num_labels],
                                             dtype=object)
            # One independent tracker per (row, column) cell.
            for each_ind_i in range(num_labels):
                for each_ind_j in range(num_labels):
                    self.confusion_matrix[each_ind_i,
                                          each_ind_j] = NumberTracker()
        else:
            self.labels = None
            self.confusion_matrix = None
Ejemplo n.º 16
0
def test_merge():
    """Merge a three-int tracker with itself; check counts, histogram and frequent items."""
    tracked_values = (10, 11, 13)
    tracker = NumberTracker()
    for number in tracked_values:
        tracker.track(number)

    combined = tracker.merge(tracker)

    assert combined.ints.count == 6
    assert combined.floats.count == 0
    combined_hist = combined.histogram
    assert combined_hist.get_n() == 6
    assert combined_hist.get_max_value() == 13.0
    assert combined_hist.get_min_value() == 10.0

    # Every tracked value is expected with the same (2, 2, 2) triple.
    expected_items = [(number, 2, 2, 2) for number in tracked_values]
    compare_frequent_items(expected_items, combined.frequent_numbers.get_frequent_items())

    # The merged tracker must survive serialization and deserialization.
    NumberTracker.from_protobuf(combined.to_protobuf())
Ejemplo n.º 17
0
def test_float_after_int_resets_int_tracker():
    """Expect the int counter to drop to 0 and the float counter to hold all values once a float arrives."""
    tracker = NumberTracker()

    for number in (10, 11):
        tracker.track(number)
    assert tracker.ints.count == 2
    assert tracker.floats.count == 0

    # The first float is expected to move all three values to the float counter.
    tracker.track(12.0)

    assert tracker.ints.count == 0
    assert tracker.floats.count == 3
    assert tracker.variance.stddev() == pytest.approx(1.0, 1e-3)

    assert tracker.histogram.get_n() == 3
    distinct_estimate = tracker.theta_sketch.get_result().get_estimate()
    assert distinct_estimate == pytest.approx(3, 1e-4)
    assert tracker.histogram.get_max_value() == pytest.approx(12, 1e-4)
    assert tracker.histogram.get_min_value() == pytest.approx(10, 1e-4)
Ejemplo n.º 18
0
def test_one_value_not_discrete():
    """Track a single value and expect a non-discrete summary."""
    tracker = NumberTracker()
    tracker.track(1)
    result = tracker.to_summary()
    assert not result.is_discrete
Ejemplo n.º 19
0
 def __init__(self, prediction_field: str = None, target_field: str = None):
     """
     Store the field names and create one empty tracker per metric.

     Args:
         prediction_field (str): stored as-is.
         target_field (str): stored as-is.
     """
     self.prediction_field, self.target_field = prediction_field, target_field
     # Three independent trackers, created in the order mer, wer, wil.
     self.mer, self.wer, self.wil = NumberTracker(), NumberTracker(), NumberTracker()
Ejemplo n.º 20
0
class NLPMetrics:
    """
    Accumulates three jiwer measures (``mer``, ``wer``, ``wil``), each in its
    own ``NumberTracker``.
    """

    def __init__(self, prediction_field: str = None, target_field: str = None):
        """
        Store the field names and create one empty tracker per metric.

        Args:
            prediction_field (str): stored as-is.
            target_field (str): stored as-is.
        """
        self.prediction_field = prediction_field
        self.target_field = target_field
        self.mer = NumberTracker()
        self.wer = NumberTracker()
        self.wil = NumberTracker()

    def update(self, predictions: Union[List[str], str], targets: Union[List[str], str], transform=None) -> None:
        """
        Compute the jiwer measures for the given predictions and targets and
        track the resulting ``mer``, ``wer`` and ``wil`` values.

        Args:
            predictions (Union[str,List[str]]): passed to jiwer as the hypothesis.
            targets (Union[List[str],str]): passed to jiwer as the truth.
            transform: optional transform; when truthy it is applied to both
                the truth and the hypothesis.

        """
        if transform:
            mes = jiwer.compute_measures(truth=targets, hypothesis=predictions, truth_transform=transform, hypothesis_transform=transform)
        else:
            mes = jiwer.compute_measures(truth=targets, hypothesis=predictions)

        self.mer.track(mes["mer"])
        self.wer.track(mes["wer"])
        self.wil.track(mes["wil"])

    def merge(self, other: "NLPMetrics") -> "NLPMetrics":
        """
        Merge two separate nlp metrics

        Args:
              other : nlp metrics to merge with self
        Returns:
              NLPMetrics: merged nlp metrics; ``self`` is returned unchanged
                  when ``other`` is None. The merged object is created with
                  default field names.
        """
        if other is None:
            return self

        merged_nlp_metrics = NLPMetrics()
        merged_nlp_metrics.mer = self.mer.merge(other.mer)
        merged_nlp_metrics.wer = self.wer.merge(other.wer)
        merged_nlp_metrics.wil = self.wil.merge(other.wil)

        return merged_nlp_metrics

    def to_protobuf(
        self,
    ) -> NLPMetricsMessage:
        """
        Convert to protobuf

        Returns:
            NLPMetricsMessage: message holding the three serialized trackers.
        """

        return NLPMetricsMessage(
            mer=self.mer.to_protobuf(),
            wer=self.wer.to_protobuf(),
            wil=self.wil.to_protobuf(),
        )

    @classmethod
    def from_protobuf(
        cls,
        message: NLPMetricsMessage,
    ) -> "NLPMetrics":
        """
        Load from a protobuf message

        Args:
            message (NLPMetricsMessage): message carrying the ``wer``, ``wil``
                and ``mer`` tracker sub-messages.

        Returns:
            NLPMetrics: new instance with the three trackers deserialized.
        """
        # Build through `cls` so a subclass gets an instance of its own type.
        nlp_met = cls()
        nlp_met.wer = NumberTracker.from_protobuf(message.wer)
        nlp_met.wil = NumberTracker.from_protobuf(message.wil)
        nlp_met.mer = NumberTracker.from_protobuf(message.mer)

        return nlp_met
Ejemplo n.º 21
0
def test_empty_merge_succeeds():
    """Merging two untouched trackers must yield a NumberTracker."""
    left, right = NumberTracker(), NumberTracker()
    combined = left.merge(right)
    assert isinstance(combined, NumberTracker)