# Collected excerpts from a forced-alignment evaluation project. The imports
# below are reconstructed; project-specific names that are not defined in this
# file (AdditionalData, AlignmentFile and the real Interval/Sentence classes)
# come from the surrounding lib.src package.
import unittest
from math import nan
from typing import Any, Callable, List, Tuple

from nltk.tokenize import sent_tokenize


class TestIntersectionOverUnion(unittest.TestCase):
    """
    Tests lib.src.measurement.intersection_over_union
    """

    intersection_over_union_data_provider = lambda: (
        (IntervalMock(1.0, 1.0), Interval(0.0, 0.0), 1.0),  # Sanity check
        (IntervalMock(3.0, 6.0), Interval(0.0, 0.0), 0.5),  # Sanity check
        (IntervalMock(1.0, 0.0), Interval(0.0, 0.0), 0.0),  # No division by 0
        (IntervalMock(0.0, 1.0), Interval(0.0, 0.0), 0.0),  # One 0 yields 0
    )

    @data_provider(intersection_over_union_data_provider)
    def test_intersection_over_union(self, a: IntervalMock, b: Interval,
                                     expected_value: float) -> None:
        """
        Tests intersection_over_union function's behaviour.
        :param a:              Mocked ground truth interval
        :param b:              Prediction interval
        :param expected_value: Expected IOU score
        :return: None
        """
        self.assertEqual(intersection_over_union(a, b), expected_value)
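

# A minimal sketch of the IntervalMock test double used above (an assumption,
# since the helper is not part of this excerpt): the IOU tests pin
# get_intersection/get_union to the two constructor values, the Sentence tests
# read start/end directly, and to_formatted is stubbed with a marker string.
class IntervalMock:
    def __init__(self, start: float, end: float) -> None:
        self.start = start
        self.end = end

    def get_intersection(self, other: Any) -> float:
        return self.start  # Fixed intersection value for the IOU tests

    def get_union(self, other: Any) -> float:
        return self.end  # Fixed union value for the IOU tests

    def to_formatted(self) -> str:
        return "mocked_interval"  # Matches the expected label strings below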


class TestInterval(unittest.TestCase):
    """
    Tests lib.src.measurement.interval.Interval
    """

    # Illustrative data providers: the original rows are not part of this
    # excerpt, so the values below are assumptions based on the tested methods.
    get_length_data_provider = lambda: (
        (0.0, 1.0, 1.0),  # Unit interval
        (0.5, 0.5, 0.0),  # Zero-length interval
    )

    @data_provider(get_length_data_provider)
    def test_get_length(self, start: Any, end: Any,
                        expected_length: float) -> None:
        """
        Tests get_length method's behaviour
        :param start:           Start of the interval
        :param end:             End of the interval
        :param expected_length: Expected calculated length
        :return: None
        """
        a = Interval(start, end)
        self.assertEqual(a.get_length(), expected_length)

    to_formatted_data_provider = lambda: (
        (0.0, 1.0, "0.0000\t1.0000"),  # Assumed "start\tend" output format
    )

    @data_provider(to_formatted_data_provider)
    def test_to_formatted(self, start: Any, end: Any,
                          expected_formatted: str) -> None:
        """
        Tests to_formatted method's behaviour.
        :param start:              Start of the interval
        :param end:                End of the interval
        :param expected_formatted: Expected formatted output
        :return: None
        """
        a = Interval(start, end)
        self.assertEqual(a.to_formatted(), expected_formatted)

    get_union_data_provider = lambda: (
        (0.0, 1.0, 1.0, 2.0, 2.0),  # Touching intervals
        (0.0, 2.0, 1.0, 3.0, 3.0),  # Overlapping intervals
    )

    @data_provider(get_union_data_provider)
    def test_get_union(self, start: Any, end: Any, other_start: Any,
                       other_end: Any, expected_union: float) -> None:
        """
        Tests the get_union method's behaviour
        :param start:          Start of interval a
        :param end:            End of interval a
        :param other_start:    Start of interval b
        :param other_end:      End of interval b
        :param expected_union: Value to check against
        :return: None
        """
        a = Interval(start, end)
        b = Interval(other_start, other_end)
        self.assertEqual(a.get_union(b), expected_union)

def sentence_from_string(string: str) -> Sentence:
    """
    Creates a Sentence object from a given single line of an alignment

    :param string: Input string to parse

    :return: Sentence
    """
    parts = string.split("\t")

    try:
        interval_start = float(parts[0])
    except ValueError:
        interval_start = parts[0]

    try:
        interval_end = float(parts[1])
    except ValueError:
        interval_end = parts[1]

    additional_data = None
    if len(parts) > 6:  # All four optional score columns must be present
        additional_data = AdditionalData(float(parts[3]), float(parts[4]),
                                         float(parts[5]), float(parts[6]))

    return Sentence(parts[2].strip(), Interval(interval_start, interval_end),
                    additional_data)
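
# Illustrative usage (values assumed): one label row with the four optional
# trailing score columns, tab-separated as start, end, sentence, scores.
#
#     s = sentence_from_string("0.5000\t1.2500\tfoo bar\t0.5\t0.4\t0.3\t0.2")
#     assert (s.interval.start, s.interval.end) == (0.5, 1.25)
#     assert s.sentence == "foo bar"
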
def intersection_over_union(ground_truth: Interval,
                            prediction: Interval) -> float:
    """
    Calculates the IOU score for a ground truth and a prediction interval

    :param ground_truth: Interval
    :param prediction:   Interval

    :return: IOU
    """
    intersection = ground_truth.get_intersection(prediction)
    union = ground_truth.get_union(prediction)

    if intersection == 0.0 or union == 0.0:
        return 0.0

    return intersection / union
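
# Illustrative usage: assuming Interval.get_intersection/get_union return the
# overlap length and the combined length, [0.0, 1.0] vs [0.5, 1.5] overlap by
# 0.5 over a combined span of 1.5, so the IOU is 0.5 / 1.5 = 1/3.
#
#     iou = intersection_over_union(Interval(0.0, 1.0), Interval(0.5, 1.5))
#     assert abs(iou - 1 / 3) < 1e-9
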
def transcript_to_sentences(transcript: str) -> List[Sentence]:
    """
    Creates a list of Sentence instances with empty intervals from a given text.

    :param transcript: String

    :return: List of Sentence instances
    """
    return [
        Sentence(sentence, Interval(None, None), None)
        for sentence in sent_tokenize(transcript, "german")
    ]
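
# Illustrative usage: requires NLTK's "punkt" sentence tokenizer data
# (nltk.download("punkt")). Each sentence comes back without timing yet.
#
#     sentences = transcript_to_sentences("Hallo Welt. Wie geht es dir?")
#     assert [s.sentence for s in sentences] == ["Hallo Welt.",
#                                                "Wie geht es dir?"]
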
    # Method of lib.src.model.Sentence (excerpted):
    def merge_with(self, other: "Sentence") -> "Sentence":
        """
        Merges two sentences

        :param other: Another sentence

        :return: Merged sentence
        """
        both_floats = (isinstance(self.interval.start, float)
                       and isinstance(other.interval.start, float))

        if not both_floats or self.interval.start < other.interval.start:
            sentence = (str(self.sentence).strip() + " "
                        + str(other.sentence).strip())
            start_time = self.interval.start
            end_time = other.interval.end
        else:
            sentence = (str(other.sentence).strip() + " "
                        + str(self.sentence).strip())
            start_time = other.interval.start
            end_time = self.interval.end

        return Sentence(sentence, Interval(start_time, end_time),
                        self.additional_data)
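
# Illustrative usage: with two float start times the sentences are joined in
# chronological order and the merged interval spans both of the originals.
#
#     a = Sentence("foo", Interval(0.0, 0.1), None)
#     b = Sentence("bar", Interval(0.1, 0.2), None)
#     merged = a.merge_with(b)  # sentence "foo bar", interval [0.0, 0.2]
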
def test_compare_alignment_data_provider() -> Tuple:
    """
    Data provider function
    :return: Tuple
    """
    sentence_1 = Sentence("foo", Interval(0.0, 0.1),
                          AdditionalData(0.5, 0.4, 0.3, 0.2))
    sentence_2 = Sentence("bar", Interval(0.0, 0.2),
                          AdditionalData(0.6, 0.5, 0.4, 0.3))
    sentence_3 = Sentence("baz", Interval(0.1, 0.2),
                          AdditionalData(0.7, 0.6, 0.5, 0.4))
    sentence_4 = Sentence("qux", Interval(0.0, 0.2),
                          AdditionalData(0.8, 0.7, 0.6, 0.5))
    sentence_5 = Sentence("lorem", Interval(0.0001, 0.0002),
                          AdditionalData(0.9, 0.8, 0.7, 0.6))  # Doesn't appear

    file_1_type1 = AlignmentFile("file_1_audacity_type1.txt",
                                 [sentence_1, sentence_2, sentence_5])
    file_2_type1 = AlignmentFile("file_2_audacity_type1.txt",
                                 [sentence_1, sentence_2, sentence_5])
    file_3_type1 = AlignmentFile("file_3_audacity_type1.txt", [sentence_1])
    file_4_type1 = AlignmentFile(
        "file_4_audacity_type1.txt",
        [sentence_1, sentence_2, sentence_3, sentence_4])
    file_1_type2 = AlignmentFile("file_1_audacity_type2.txt",
                                 [sentence_1, sentence_2, sentence_5])
    file_2_type2 = AlignmentFile("file_2_audacity_type2.txt",
                                 [sentence_1, sentence_2, sentence_5])
    file_3_type2 = AlignmentFile("file_3_audacity_type2.txt", [sentence_3])
    file_4_type2 = AlignmentFile(
        "file_4_audacity_type2.txt",
        [sentence_1, sentence_3, sentence_4, sentence_5])

    config = {
        "no_appearance": {
            "interval_length": 0.0001
        },
        "score_weights": {
            "gaps_google": 1,
            "gaps_transcript": 1,
            "alignment_score": -100,
            "google_confidence": 6.1249349
        }
    }

    return (
        # No data at all, should fallback to zeros etc., no warnings or errors
        ([], config, {
            "no_sentences": {
                "appearing": 0,
                "total": 0,
            },
            "ious": {
                "all": [],
                "all_only": [],
                "low": [],
                "mean": nan,
                "median": nan,
                "per_file": {}
            },
            "appearance": {
                "true_positives": 0,
                "false_positives": 0,
                "true_negatives": 0,
                "false_negatives": 0,
                "precision": 0.0,
                "recall": 0.0,
                "f1_score": 0.0
            },
            "scores": {
                "alignment_scores": {
                    "all": [],
                    "mean": nan,
                    "median": nan
                },
                "calculated": {
                    "all": []
                },
                "deviation": {
                    "all": [],
                    "mean": nan,
                    "median": nan
                },
                "google_confidence": {
                    "all": [],
                    "mean": nan,
                    "median": nan
                },
                "google_gaps": {
                    "all": [],
                    "mean": nan,
                    "median": nan
                },
                "transcript_gaps": {
                    "all": [],
                    "mean": nan,
                    "median": nan
                }
            }
        }),
        # Worst possible score with data
        ([file_3_type1, file_3_type2], config, {
            "no_sentences": {
                "appearing": 1,
                "total": 1
            },
            "ious": {
                "all_only": [0],
                "all": [(0, 0.1, 0.1, "foo", "\\file_3")],
                "low": ["\\file_3.wav"],
                "mean": 0.0,
                "median": 0.0,
                "per_file": {
                    "\\file_3": {
                        "mean": 0.0,
                        "median": 0.0,
                        "all": [(0, 0.1, 0.1, "foo", "\\file_3")]
                    }
                }
            },
            "appearance": {
                "true_positives": 1,
                "false_positives": 0,
                "true_negatives": 0,
                "false_negatives": 0,
                "precision": 1.0,
                "recall": 1.0,
                "f1_score": 1.0
            },
            "scores": {
                "alignment_scores": {
                    "all": [0.6],
                    "mean": 0.6,
                    "median": 0.6
                },
                "calculated": {
                    "all": [-65.42503905999999]
                },
                "deviation": {
                    "all": [0.2],
                    "mean": 0.2,
                    "median": 0.2
                },
                "google_confidence": {
                    "all": [0.7],
                    "mean": 0.7,
                    "median": 0.7
                },
                "google_gaps": {
                    "all": [0.4],
                    "mean": 0.4,
                    "median": 0.4
                },
                "transcript_gaps": {
                    "all": [0.5],
                    "mean": 0.5,
                    "median": 0.5
                }
            }
        }),
        # Perfect scores
        ([file_1_type1, file_2_type1, file_1_type2, file_2_type2], config, {
            "no_sentences": {
                "appearing": 4,
                "total": 6
            },
            "ious": {
                "all_only": [1.0, 1.0, 1.0, 1.0],
                "all": [(1.0, 0.1, 0.1, "foo", "\\file_1"),
                        (1.0, 0.2, 0.2, "bar", "\\file_1"),
                        (1.0, 0.1, 0.1, "foo", "\\file_2"),
                        (1.0, 0.2, 0.2, "bar", "\\file_2")],
                "low": [],
                "mean":
                1.0,
                "median":
                1.0,
                "per_file": {
                    "\\file_1": {
                        "mean":
                        1.0,
                        "median":
                        1.0,
                        "all": [(1.0, 0.1, 0.1, "foo", "\\file_1"),
                                (1.0, 0.2, 0.2, "bar", "\\file_1")]
                    },
                    "\\file_2": {
                        "mean":
                        1.0,
                        "median":
                        1.0,
                        "all": [(1.0, 0.1, 0.1, "foo", "\\file_2"),
                                (1.0, 0.2, 0.2, "bar", "\\file_2")]
                    }
                }
            },
            "appearance": {
                "true_positives": 4,
                "false_positives": 0,
                "true_negatives": 2,
                "false_negatives": 0,
                "precision": 1.0,
                "recall": 1.0,
                "f1_score": 1.0
            },
            "scores": {
                "alignment_scores": {
                    "all": [0.4, 0.5, 0.4, 0.5],
                    "mean": 0.45,
                    "median": 0.45
                },
                "calculated": {
                    "all":
                    [-47.05002604, -56.23753255, -47.05002604, -56.23753255]
                },
                "deviation": {
                    "all": [0.0, 0.0, 0.0, 0.0],
                    "mean": 0.0,
                    "median": 0.0
                },
                "google_confidence": {
                    "all": [0.5, 0.6, 0.5, 0.6],
                    "mean": 0.55,
                    "median": 0.55
                },
                "google_gaps": {
                    "all": [0.2, 0.3, 0.2, 0.3],
                    "mean": 0.25,
                    "median": 0.25
                },
                "transcript_gaps": {
                    "all": [0.3, 0.4, 0.3, 0.4],
                    "mean": 0.35,
                    "median": 0.35
                }
            }
        }),
        # Some intermediate score
        ([file_4_type1, file_4_type2], config, {
            "no_sentences": {
                "appearing": 3,
                "total": 4
            },
            "ious": {
                "all_only": [1.0, 0.5, 0.5],
                "all": [(1.0, 0.1, 0.1, "foo", "\\file_4"),
                        (0.5, 0.2, 0.1, "bar", "\\file_4"),
                        (0.5, 0.1, 0.2, "baz", "\\file_4")],
                "low": [],
                "mean":
                0.66666666666666663,
                "median":
                0.5,
                "per_file": {
                    "\\file_4": {
                        "mean":
                        0.66666666666666663,
                        "median":
                        0.5,
                        "all": [(1.0, 0.1, 0.1, "foo", "\\file_4"),
                                (0.5, 0.2, 0.1, "bar", "\\file_4"),
                                (0.5, 0.1, 0.2, "baz", "\\file_4")]
                    }
                }
            },
            "appearance": {
                "true_positives": 3,
                "false_positives": 0,
                "true_negatives": 0,
                "false_negatives": 1,
                "precision": 1.0,
                "recall": 0.75,
                "f1_score": 0.8571428571428571
            },
            "scores": {
                "alignment_scores": {
                    "all": [0.4, 0.6, 0.7],
                    "mean": 0.5666666666666667,
                    "median": 0.6
                },
                "calculated": {
                    "all":
                    [-47.05002604, -65.42503905999999, -74.61254557000001]
                },
                "deviation": {
                    "all": [0.0, 0.1, 0.1],
                    "mean": 0.06666666666666667,
                    "median": 0.1
                },
                "google_confidence": {
                    "all": [0.5, 0.7, 0.8],
                    "mean": 0.6666666666666666,
                    "median": 0.7
                },
                "google_gaps": {
                    "all": [0.2, 0.4, 0.5],
                    "mean": 0.3666666666666667,
                    "median": 0.4
                },
                "transcript_gaps": {
                    "all": [0.3, 0.5, 0.6],
                    "mean": 0.4666666666666666,
                    "median": 0.5
                }
            }
        }),
        # One file exists, the other one doesn't, hence no data
        ([file_1_type1], config, {
            "no_sentences": {
                "appearing": 0,
                "total": 0,
            },
            "ious": {
                "all_only": [],
                "all": [],
                "low": [],
                "mean": nan,
                "median": nan,
                "per_file": {}
            },
            "appearance": {
                "true_positives": 0,
                "false_positives": 0,
                "true_negatives": 0,
                "false_negatives": 0,
                "precision": 0.0,
                "recall": 0.0,
                "f1_score": 0.0
            },
            "scores": {
                "alignment_scores": {
                    "all": [],
                    "mean": nan,
                    "median": nan
                },
                "calculated": {
                    "all": []
                },
                "deviation": {
                    "all": [],
                    "mean": nan,
                    "median": nan
                },
                "google_confidence": {
                    "all": [],
                    "mean": nan,
                    "median": nan
                },
                "google_gaps": {
                    "all": [],
                    "mean": nan,
                    "median": nan
                },
                "transcript_gaps": {
                    "all": [],
                    "mean": nan,
                    "median": nan
                }
            }
        }),
    )
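
# A minimal sketch of the data_provider decorator these tests rely on (an
# assumption; the project likely uses the unittest-data-provider package):
# it re-runs the decorated test once per row yielded by the provider callable.
def data_provider(provider: Callable[[], Tuple]) -> Callable:
    def decorator(test_function: Callable) -> Callable:
        def wrapper(self) -> None:
            for row in provider():
                test_function(self, *row)
        return wrapper
    return decorator
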
class TestSentence(unittest.TestCase):
    """
    Test class lib.src.model.Sentence
    """

    audacity_label_format_data_provider = lambda: (
        ("Lorem ipsum dolor", IntervalMock(0.0, 0.0),
         "mocked_interval\tLorem ipsum dolor"),  # Sanity check
        (None, IntervalMock(0.0, 0.0),
         "mocked_interval\tNone"),  # Sentence is None
        ("", IntervalMock(0.0, 0.0), "mocked_interval\t"),  # Empty sentence
    )

    @data_provider(audacity_label_format_data_provider)
    def test_to_audacity_label_format(self, sentence: Any,
                                      interval: IntervalMock,
                                      expected_format: str) -> None:
        """
        Tests to_audacity_label_format method's behaviour
        :param sentence:        Inner sentence
        :param interval:        Mocked interval
        :param expected_format: Expected audacity label format
        :return: None
        """
        s = Sentence(sentence, interval, None)
        self.assertEqual(s.to_audacity_label_format(), expected_format)
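
    # A sketch of the method under test, inferred from the expected values
    # above (the real implementation lives in lib.src.model.Sentence):
    #
    #     def to_audacity_label_format(self) -> str:
    #         return self.interval.to_formatted() + "\t" + str(self.sentence)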

    merge_sentence_data_provider = lambda: (
        ("foo", IntervalMock(0.0, 0.1), "bar", IntervalMock(0.1, 0.2),
         Sentence("foo bar", Interval(0.0, 0.2), None)),  # Sanity check
        ("foo", IntervalMock(0.1, 0.2), "bar", IntervalMock(0.0, 0.1),
         Sentence("bar foo", Interval(0.0, 0.2), None)),  # Flipped intervals
        ("   foo   ", IntervalMock(0.0, 0.1), "   bar   ",
         IntervalMock(0.1, 0.2),
         Sentence("foo bar", Interval(0.0, 0.2), None)),  # Surrounding spaces
        (None, IntervalMock(0.0, 0.1), "bar", IntervalMock(0.1, 0.2),
         Sentence("None bar", Interval(0.0, 0.2), None)),  # One sentence is None
        ("foo", IntervalMock(0.0, 0.1), None, IntervalMock(0.1, 0.2),
         Sentence("foo None", Interval(0.0, 0.2), None)),  # Other sentence is None
        (None, IntervalMock(0.0, 0.1), None, IntervalMock(0.1, 0.2),
         Sentence("None None", Interval(0.0, 0.2), None)),  # Both sentences are None
    )

    @data_provider(merge_sentence_data_provider)
    def test_merge_with(self, sentence: str, interval: IntervalMock,
                        other_sentence: str, other_interval: IntervalMock,
                        expected_sentence: Sentence) -> None:
        """
        Tests the merge_with method's behaviour
        :param sentence:          First sentence as string
        :param interval:          First interval
        :param other_sentence:    Other sentence as string
        :param other_interval:    Other interval
        :param expected_sentence: Expected merged sentence
        :return: None
        """
        a = Sentence(sentence, interval, None)
        b = Sentence(other_sentence, other_interval, None)
        self.assertEqual(
            a.merge_with(b).to_audacity_label_format(),
            expected_sentence.to_audacity_label_format())

    from_string_data_provider = lambda: (
        ("0.0000\t1.0000\tfoo bar baz", float, 0.0, float, 1.0,
         "foo bar baz"),  # Sanity check
        ("-\t0.00\tfoo bar baz", str, "-", float, 0.0,
         "foo bar baz"),  # One interval part isn't float
        ("0.00\t-\tfoo bar baz", float, 0.0, str, "-",
         "foo bar baz"),  # Other interval part isn't float
        ("-\t-\tfoo bar baz", str, "-", str, "-",
         "foo bar baz"),  # Both interval parts aren't float
    )

    @data_provider(from_string_data_provider)
    def test_sentence_from_string(self, input_string: str,
                                  expected_start_type: Any,
                                  expected_start: Any, expected_end_type: Any,
                                  expected_end: Any,
                                  expected_sentence: str) -> None:
        """
        Tests sentence_from_string function's behaviour.
        :param input_string:        Formatted string to parse
        :param expected_start_type: Type of the inner Interval's start property
        :param expected_start:      Value of the inner Interval's start property
        :param expected_end_type:   Type of the inner Interval's end property
        :param expected_end:        Value of the inner Interval's end property
        :param expected_sentence:   Expected inner sentence as string
        :return: None
        """
        s = sentence_from_string(input_string)
        self.assertIsInstance(s.interval.start, expected_start_type)
        self.assertEqual(s.interval.start, expected_start)
        self.assertIsInstance(s.interval.end, expected_end_type)
        self.assertEqual(s.interval.end, expected_end)
        self.assertEqual(s.sentence, expected_sentence)
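

# Standard unittest entry point so the excerpted test classes can be run
# directly (assumes the reconstructed imports above resolve).
if __name__ == "__main__":
    unittest.main()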