def test_high_cardinality_not_discrete():
    """A short mixed int/float sample must not be summarized as discrete."""
    samples = [1, 2, 3] * 3 + [4.0, 6.0, 9.0, 9.0]

    tracker = NumberTracker()
    for sample in samples:
        tracker.track(sample)

    assert not tracker.to_summary().is_discrete
def test_low_cardinality_is_discrete():
    """The base sample repeated ten times is expected to be summarized as discrete."""
    base_samples = [1, 2, 3] * 3 + [4.0, 6.0, 9.0, 9.0]
    samples = base_samples * 10

    tracker = NumberTracker()
    for sample in samples:
        tracker.track(sample)

    assert tracker.to_summary().is_discrete
def test_not_discrete_if_float_is_present():
    """A repeated sample that contains floats must not be summarized as discrete."""
    base_samples = [1, 2, 3] * 3 + [4.0, 6.0, 9.0, 9.0]
    samples = base_samples * 10

    tracker = NumberTracker()
    for sample in samples:
        tracker.track(sample)

    assert not tracker.to_summary().is_discrete
def from_protobuf(
    cls: "NLPMetrics",
    message: NLPMetricsMessage,
):
    """
    Build an NLPMetrics object from its protobuf message.

    Args:
        message (NLPMetricsMessage): message carrying the `wer`, `wil`
            and `mer` number-tracker messages.

    Returns:
        NLPMetrics: a new instance whose three trackers were loaded from
            the message.
    """
    restored = NLPMetrics()
    # Same load order as before: wer, wil, then mer.
    for metric_name in ("wer", "wil", "mer"):
        tracker = NumberTracker.from_protobuf(getattr(message, metric_name))
        setattr(restored, metric_name, tracker)
    return restored
def test_int_value_should_not_increase_float_count():
    """Tracking only ints must leave the float counter at zero."""
    tracker = NumberTracker()
    for value in (10, 11, 12):
        tracker.track(value)

    # Only the integer counter moves.
    assert tracker.ints.count == 3
    assert tracker.floats.count == 0

    # The other statistics still reflect the three tracked values.
    assert tracker.variance.stddev() == pytest.approx(1.0, rel=1e-3)
    unique_estimate = tracker.theta_sketch.get_result().get_estimate()
    assert unique_estimate == pytest.approx(3, rel=1e-4)

    histogram = tracker.histogram
    assert histogram.get_n() == 3
    assert histogram.get_max_value() == pytest.approx(12, rel=1e-4)
    assert histogram.get_min_value() == pytest.approx(10, rel=1e-4)
def __init__(
    self,
    name: str,
    number_tracker: NumberTracker = None,
    string_tracker: StringTracker = None,
    schema_tracker: SchemaTracker = None,
    counters: CountersTracker = None,
    frequent_items: FrequentItemsSketch = None,
    cardinality_tracker: HllSketch = None,
):
    """
    Store the column name and its trackers.

    Every tracker argument left as ``None`` is replaced by a freshly
    constructed default instance of the matching type.
    """
    self.column_name = name
    self.number_tracker = (
        NumberTracker() if number_tracker is None else number_tracker
    )
    self.string_tracker = (
        StringTracker() if string_tracker is None else string_tracker
    )
    self.schema_tracker = (
        SchemaTracker() if schema_tracker is None else schema_tracker
    )
    self.counters = CountersTracker() if counters is None else counters
    self.frequent_items = (
        FrequentItemsSketch() if frequent_items is None else frequent_items
    )
    self.cardinality_tracker = (
        HllSketch() if cardinality_tracker is None else cardinality_tracker
    )
def to_protobuf(self, ):
    """
    Convert this matrix to a protobuf message.

    The confusion matrix is flattened with ``np.ravel`` and every cell is
    serialized; a falsy cell is serialized as a freshly created, empty
    NumberTracker.

    Returns:
        ScoreMatrixMessage: message holding the labels, the three field
            names and the flattened list of serialized cells.
    """
    serialized_cells = []
    for cell in np.ravel(self.confusion_matrix):
        tracker = cell if cell else NumberTracker()
        serialized_cells.append(tracker.to_protobuf())

    return ScoreMatrixMessage(
        labels=self.labels,
        prediction_field=self.prediction_field,
        target_field=self.target_field,
        score_field=self.score_field,
        scores=serialized_cells,
    )
def test_count_is_correct():
    """`count` starts at zero, ignores None, and grows with each int or float."""
    tracker = NumberTracker()
    assert tracker.count == 0

    # Tracking None must not change the count.
    tracker.track(None)
    assert tracker.count == 0

    # Each batch is tracked in turn, then the running total is checked.
    batches = (
        ([1, 2, 3], 3),
        ([1.0, 2.0], 5),
    )
    for values, expected_total in batches:
        for value in values:
            tracker.track(value)
        assert tracker.count == expected_total
def test_track_floats_ints_unique_in_cardinality_estimate():
    """Ints and their float counterparts are expected to count as separate unique values."""
    numbers = [1, 2, 3, 4]
    tracker = NumberTracker()

    for number in numbers:
        tracker.track(number)
    assert tracker.to_summary().unique_count.estimate == 4

    # Track the same values again, this time as floats.
    for number in numbers:
        tracker.track(float(number))
    assert tracker.to_summary().unique_count.estimate == 8
def test_merge():
    """Merging a tracker with itself doubles the counts and keeps min/max."""
    tracker = NumberTracker()
    for value in (10, 11, 13):
        tracker.track(value)

    doubled = tracker.merge(tracker)

    assert doubled.ints.count == 6
    assert doubled.floats.count == 0
    assert doubled.histogram.get_n() == 6
    assert doubled.histogram.get_max_value() == 13.0
    assert doubled.histogram.get_min_value() == 10.0

    # The merged tracker must survive serialization and deserialization.
    serialized = doubled.to_protobuf()
    NumberTracker.from_protobuf(serialized)
def test_protobuf_roundtrip():
    """A tracker rebuilt from its protobuf message matches the original."""
    original = NumberTracker()
    for value in (10, 11, 13):
        original.track(value)

    restored = NumberTracker.from_protobuf(original.to_protobuf())

    # Counters survive the roundtrip.
    assert original.ints.count == restored.ints.count
    assert original.floats.count == restored.floats.count

    # So do the histogram statistics.
    assert original.histogram.get_n() == restored.histogram.get_n()
    assert (
        original.histogram.get_min_value() == restored.histogram.get_min_value()
    )
    assert (
        original.histogram.get_max_value() == restored.histogram.get_max_value()
    )
def from_protobuf(message):
    """
    Load a column profile from a protobuf message.

    Each tracker is rebuilt by the ``from_protobuf`` of its own type from
    the corresponding field of ``message``.

    Returns
    -------
    column_profile : ColumnProfile
    """
    counters = CountersTracker.from_protobuf(message.counters)
    schema_tracker = SchemaTracker.from_protobuf(message.schema)
    number_tracker = NumberTracker.from_protobuf(message.numbers)
    string_tracker = StringTracker.from_protobuf(message.strings)
    frequent_items = FrequentItemsSketch.from_protobuf(message.frequent_items)
    cardinality_tracker = HllSketch.from_protobuf(message.cardinality_tracker)

    return ColumnProfile(
        message.name,
        counters=counters,
        schema_tracker=schema_tracker,
        number_tracker=number_tracker,
        string_tracker=string_tracker,
        frequent_items=frequent_items,
        cardinality_tracker=cardinality_tracker,
    )
def __init__(self,
             labels: List[str] = None,
             prediction_field: str = None,
             target_field: str = None,
             score_field: str = None):
    """
    Set up the field names and, when labels are given, the score matrix.

    With a non-empty ``labels`` the labels are stored sorted and
    ``confusion_matrix`` becomes a square object array (one row and one
    column per label) whose every cell is its own new NumberTracker.
    Otherwise both ``labels`` and ``confusion_matrix`` are set to None.
    """
    self.prediction_field = prediction_field
    self.target_field = target_field
    self.score_field = score_field

    if not labels:
        self.labels = None
        self.confusion_matrix = None
        return

    self.labels = sorted(labels)
    side = len(self.labels)
    self.confusion_matrix = np.empty([side, side], dtype=object)
    # Visit every (row, column) index and give it a separate tracker.
    for cell_index in np.ndindex(side, side):
        self.confusion_matrix[cell_index] = NumberTracker()
def from_protobuf(
    cls,
    message: ScoreMatrixMessage,
):
    """
    Rebuild a ConfusionMatrix from its protobuf message.

    Returns None when the message has a byte size of zero. Otherwise the
    serialized scores are loaded as NumberTrackers and reshaped into a
    square matrix with one row and column per label; when the message has
    no labels the matrix is None.
    """
    if message.ByteSize() == 0:
        return None

    label_names = message.labels
    side = len(label_names)

    if side > 0:
        trackers = [
            NumberTracker.from_protobuf(score) for score in message.scores
        ]
        score_grid = np.array(trackers).reshape((side, side))
    else:
        score_grid = None

    restored = ConfusionMatrix(
        labels=label_names,
        prediction_field=message.prediction_field,
        target_field=message.target_field,
        score_field=message.score_field,
    )
    # Replace whatever the constructor built with the deserialized matrix.
    restored.confusion_matrix = score_grid
    return restored
def __init__(
    self,
    labels: List[str] = None,
    prediction_field: str = None,
    target_field: str = None,
    score_field: str = None,
):
    """
    Set up the field names and, when labels are given, the score matrix.

    With a non-empty ``labels`` the labels are stored sorted and
    ``confusion_matrix`` becomes a square object array (one row and one
    column per label) whose every cell is its own new NumberTracker.
    Otherwise both ``labels`` and ``confusion_matrix`` are set to None.

    Args:
        labels (List[str], optional): class labels of the matrix.
        prediction_field (str, optional): stored as ``prediction_field``.
        target_field (str, optional): stored as ``target_field``.
        score_field (str, optional): stored as ``score_field``.

    Raises:
        ValueError: if the number of labels exceeds
            ``MODEL_METRICS_MAX_LABELS``.
    """
    self.prediction_field = prediction_field
    self.target_field = target_field
    self.score_field = score_field
    if labels:
        labels_size = len(labels)
        # Above the soft threshold only warn; the matrix is still built.
        if labels_size > MODEL_METRICS_LABEL_SIZE_WARNING_THRESHOLD:
            _logger.warning(
                f"The initialized confusion matrix has {labels_size} labels and the resulting"
                " confusion matrix will be larger than is recommended with whylogs current"
                " representation of the model metric for a confusion matrix of this size."
            )
        # Above the hard limit refuse to build the matrix at all.
        if labels_size > MODEL_METRICS_MAX_LABELS:
            raise ValueError(
                f"The initialized confusion matrix has {labels_size} labels and the resulting"
                " confusion matrix will be larger than is supported by whylogs current"
                " representation of the model metric for a confusion matrix of this size,"
                " selectively log the most important labels or configure the threshold of"
                # This fragment needs its own f prefix, otherwise the
                # placeholder is printed literally instead of the limit.
                f" {MODEL_METRICS_MAX_LABELS} higher by setting MODEL_METRICS_MAX_LABELS."
            )

        self.labels = sorted(labels)
        num_labels = len(self.labels)
        self.confusion_matrix = np.empty([num_labels, num_labels], dtype=object)
        # Each cell gets a separate tracker instance.
        for each_ind_i in range(num_labels):
            for each_ind_j in range(num_labels):
                self.confusion_matrix[each_ind_i, each_ind_j] = NumberTracker()
    else:
        self.labels = None
        self.confusion_matrix = None
def test_merge():
    """Merging a tracker with itself doubles counts and frequent-item entries."""
    tracker = NumberTracker()
    for value in (10, 11, 13):
        tracker.track(value)

    doubled = tracker.merge(tracker)

    assert doubled.ints.count == 6
    assert doubled.floats.count == 0
    assert doubled.histogram.get_n() == 6
    assert doubled.histogram.get_max_value() == 13.0
    assert doubled.histogram.get_min_value() == 10.0

    # One expected row per tracked value; the column meaning is defined by
    # compare_frequent_items (presumably value, estimate and bounds - verify).
    expected_rows = [(value, 2, 2, 2) for value in (10, 11, 13)]
    actual_rows = doubled.frequent_numbers.get_frequent_items()
    compare_frequent_items(expected_rows, actual_rows)

    # The merged tracker must survive serialization and deserialization.
    serialized = doubled.to_protobuf()
    NumberTracker.from_protobuf(serialized)
def test_float_after_int_resets_int_tracker():
    """Once a float is tracked, the int count drops to zero and floats hold all values."""
    tracker = NumberTracker()
    for value in (10, 11):
        tracker.track(value)
    assert tracker.ints.count == 2
    assert tracker.floats.count == 0

    # The first float moves the whole count over to the float counter.
    tracker.track(12.0)
    assert tracker.ints.count == 0
    assert tracker.floats.count == 3

    # The remaining statistics cover all three values.
    assert tracker.variance.stddev() == pytest.approx(1.0, rel=1e-3)
    assert tracker.histogram.get_n() == 3
    unique_estimate = tracker.theta_sketch.get_result().get_estimate()
    assert unique_estimate == pytest.approx(3, rel=1e-4)
    assert tracker.histogram.get_max_value() == pytest.approx(12, rel=1e-4)
    assert tracker.histogram.get_min_value() == pytest.approx(10, rel=1e-4)
def test_one_value_not_discrete():
    """A tracker that has seen a single value must not be summarized as discrete."""
    tracker = NumberTracker()
    tracker.track(1)

    summary = tracker.to_summary()
    assert not summary.is_discrete
def __init__(self, prediction_field: str = None, target_field: str = None):
    """
    Store the field names and create one empty tracker per metric.

    The ``mer``, ``wer`` and ``wil`` attributes each get their own new
    NumberTracker.
    """
    self.prediction_field, self.target_field = prediction_field, target_field
    for metric_name in ("mer", "wer", "wil"):
        setattr(self, metric_name, NumberTracker())
class NLPMetrics:
    """
    Container for three NLP measures, each kept in its own NumberTracker.

    The measures are read from the ``"mer"``, ``"wer"`` and ``"wil"``
    entries of the result of ``jiwer.compute_measures``.
    """

    def __init__(self, prediction_field: str = None, target_field: str = None):
        """
        Store the field names and create one empty tracker per metric.

        Args:
            prediction_field (str, optional): stored as ``prediction_field``.
            target_field (str, optional): stored as ``target_field``.
        """
        self.prediction_field = prediction_field
        self.target_field = target_field

        self.mer = NumberTracker()
        self.wer = NumberTracker()
        self.wil = NumberTracker()

    def update(
        self,
        predictions: Union[List[str], str],
        targets: Union[List[str], str],
        transform=None,
    ) -> None:
        """
        Compute the measures for predictions against targets and track them.

        Args:
            predictions (Union[str, List[str]]): passed to jiwer as the hypothesis.
            targets (Union[List[str], str]): passed to jiwer as the truth.
            transform: optional transformation handed to jiwer as both the
                truth and the hypothesis transform; when falsy, jiwer is
                called without explicit transforms.
        """
        if transform:
            mes = jiwer.compute_measures(
                truth=targets,
                hypothesis=predictions,
                truth_transform=transform,
                hypothesis_transform=transform,
            )
        else:
            mes = jiwer.compute_measures(truth=targets, hypothesis=predictions)

        self.mer.track(mes["mer"])
        self.wer.track(mes["wer"])
        self.wil.track(mes["wil"])

    def merge(self, other: "NLPMetrics") -> "NLPMetrics":
        """
        Merge two separate nlp metrics.

        Args:
            other: nlp metrics to merge with self

        Returns:
            NLPMetrics: ``self`` when ``other`` is None, otherwise a new
                object whose trackers are the pairwise merges.
        """
        if other is None:
            return self

        merged_nlp_metrics = NLPMetrics()
        merged_nlp_metrics.mer = self.mer.merge(other.mer)
        merged_nlp_metrics.wer = self.wer.merge(other.wer)
        merged_nlp_metrics.wil = self.wil.merge(other.wil)

        return merged_nlp_metrics

    def to_protobuf(
        self,
    ) -> NLPMetricsMessage:
        """
        Convert to protobuf.

        Returns:
            NLPMetricsMessage: message holding the three serialized trackers.
        """
        return NLPMetricsMessage(
            mer=self.mer.to_protobuf(),
            wer=self.wer.to_protobuf(),
            wil=self.wil.to_protobuf(),
        )

    @classmethod
    def from_protobuf(
        cls,
        message: NLPMetricsMessage,
    ) -> "NLPMetrics":
        """
        Build an NLPMetrics object from its protobuf message.

        Args:
            message (NLPMetricsMessage): message carrying the `wer`, `wil`
                and `mer` number-tracker messages.

        Returns:
            NLPMetrics: a new instance whose three trackers were loaded
                from the message.
        """
        nlp_met = NLPMetrics()
        nlp_met.wer = NumberTracker.from_protobuf(message.wer)
        nlp_met.wil = NumberTracker.from_protobuf(message.wil)
        nlp_met.mer = NumberTracker.from_protobuf(message.mer)
        return nlp_met
def test_empty_merge_succeeds():
    """Merging two trackers that never saw a value still yields a NumberTracker."""
    left = NumberTracker()
    right = NumberTracker()

    merged = left.merge(right)

    assert isinstance(merged, NumberTracker)