Beispiel #1
0
 def test_punct_as_separate_words_2deletes(self):
     """Punctuation insertion when the hypothesis drops two ref words."""
     ref_tokens = [
         "it's", "cheaper", "to", "settle", "than", "to", "fight", "the",
         "lawsuit", "clearly"
     ]
     hyp_tokens = [
         "it's", "cheaper", "to", "settled", "than", "to", "fight", "", "",
         "clearly"
     ]
     labels = ["C", "C", "C", "S", "C", "C", "C", "D", "D", "C"]
     ref_map = list(range(10))
     hyp_map = [0, 1, 2, 3, 4, 5, 6, 9]
     # Hypothesis with the reference punctuation already attached.
     punct_hyp = [
         "it's", "cheaper", "to", "settled", "than", "to", "fight --", "",
         "", "clearly,"
     ]
     ref_punct = '''it's cheaper to settle than to fight the lawsuit -- clearly,'''
     error_alignment = ExpandedAlignment(ref_tokens, hyp_tokens, labels,
                                         ref_map, hyp_map, lowercase=True)
     expected = ExpandedAlignment(ref_tokens, punct_hyp, labels, ref_map,
                                  hyp_map, lowercase=True).s2
     actual = PunctInsertOracle.insertPunct(error_alignment, ref_punct).s2
     print(actual)
     self.maxDiff = None
     self.assertEqual(actual, expected)
Beispiel #2
0
 def test_punct_period_at_end_wer_2(self):
     """The sentence-final period should land on the last hyp token."""
     ref_punct = "So we call this the slicer mode."
     ref_tokens = ["So", "we", "call", "this", "the", "slicer", "mode", ""]
     hyp_tokens = ["so", "we", "hold", "this", "new", "slice", "them", "on"]
     labels = ["C", "C", "S", "C", "S", "S", "S", "I"]
     ref_map = list(range(7))
     hyp_map = list(range(8))
     # Same hypothesis, but with the trailing period attached to "on".
     punct_hyp = ["so", "we", "hold", "this", "new", "slice", "them", "on."]
     error_alignment = ExpandedAlignment(ref_tokens, hyp_tokens, labels,
                                         ref_map, hyp_map, lowercase=True)
     expected = ExpandedAlignment(ref_tokens, punct_hyp, labels, ref_map,
                                  hyp_map, lowercase=True).s2
     actual = PunctInsertOracle.insertPunct(error_alignment, ref_punct).s2
     print(actual)
     self.maxDiff = None
     self.assertEqual(actual, expected)
Beispiel #3
0
 def test_punct_as_separate_words(self):
     """Punctuation tokens ('--', ',', '.') attach to the right hyp words."""
     ref_tokens = [
         "The", "reason", "they", "settled", "out", "is", "because", "it's",
         "cheaper", "to", "settle", "than", "to", "fight", "the", "lawsuit",
         "clearly", "two", "million", "dollars", "cheaper", "in", "some",
         "cases", "and", "much", "worse", "if", "you", "actually", "lose"
     ]
     hyp_tokens = [
         "the", "reason", "they", "settle", "out", "is", "because", "it's",
         "cheaper", "to", "settle", "than", "to", "fight", "the", "lawsuit",
         "clearly", "two", "million", "dollars", "cheaper", "in", "some",
         "cases", "and", "much", "worse", "a", "few", "actually", "lose"
     ]
     # Substitutions at "settled"->"settle", "if you"->"a few".
     labels = ["C"] * 3 + ["S"] + ["C"] * 23 + ["S", "S"] + ["C"] * 2
     ref_map = list(range(31))
     hyp_map = list(range(31))
     punct_hyp = [
         "the", "reason", "they", "settle", "out", "is", "because", "it's",
         "cheaper", "to", "settle", "than", "to", "fight", "the",
         "lawsuit --", "clearly,", "two", "million", "dollars", "cheaper",
         "in", "some", "cases,", "and", "much", "worse", "a", "few",
         "actually", "lose."
     ]
     ref_punct = ("The reason they settled out is because it's cheaper to "
                  "settle than to fight the lawsuit -- clearly, two million "
                  "dollars cheaper in some cases, and much worse if you "
                  "actually lose.")
     error_alignment = ExpandedAlignment(ref_tokens, hyp_tokens, labels,
                                         ref_map, hyp_map, lowercase=True)
     expected = ExpandedAlignment(ref_tokens, punct_hyp, labels, ref_map,
                                  hyp_map, lowercase=True).s2
     actual = PunctInsertOracle.insertPunct(error_alignment, ref_punct).s2
     print(actual)
     self.maxDiff = None
     self.assertEqual(actual, expected)
Beispiel #4
0
 def test_normalizeAlignment_textnum_hyphen(self):
     """'fifty year old' should normalize back to the ref '50-year-old'."""
     ref_tokens = [
         u'A', u'50-year-old', u'business', u'man', u'lamented', u'to',
         u'me', u'that', u'he', u'feels', u'he', u"doesn't", u'have',
         u'colleagues', u'anymore', u'at', u'work'
     ]
     hyp_tokens = [
         u'', u'fifty year old', u'business', u'man', u'laments', u'to',
         u'me', u'that', u'he', u'feels', u'he', u"doesn't", u'have',
         u'colleagues', u'anymore', u'it', u'work'
     ]
     labels = [
         u'D', u'S', u'C', u'C', u'S', u'C', u'C', u'C', u'C', u'C', u'C',
         u'C', u'C', u'C', u'C', u'S', u'C'
     ]
     expected = ' '.join([
         u'', u'50-year-old', u'business', u'man', u'laments', u'to', u'me',
         u'that', u'he', u'feels', u'he', u"doesn't", u'have',
         u'colleagues', u'anymore', u'it', u'work'
     ])
     alignment = ExpandedAlignment(ref_tokens, hyp_tokens, labels,
                                   lowercase=True)
     self.assertEqual(HypothesisNormalizer.normalizeAligned(alignment),
                      expected)
Beispiel #5
0
 def test_punct_period_at_end_power(self):
     """The final period from ref_punct should attach to the last hyp token."""
     ref_tokens = [
         u'A', u'50-year-old', u'business', u'man', u'lamented', u'to',
         u'me', u'that', u'he', u'feels', u'he', u"doesn't", u'have',
         u'colleagues', u'anymore', u'at', u'work'
     ]
     hyp_tokens = [
         u'', u'fifty year old', u'business', u'man', u'laments', u'to',
         u'me', u'that', u'he', u'feels', u'he', u'does not', u'have',
         u'colleagues', u'any more', u'it', u'work'
     ]
     labels = [
         u'D', u'S', u'C', u'C', u'S', u'C', u'C', u'C', u'C', u'C', u'C',
         u'S', u'C', u'C', u'S', u'S', u'S'
     ]
     ref_map = list(range(17))
     # Multi-word hyp tokens repeat their source index in the map.
     hyp_map = [
         1, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16
     ]
     expand_align = ExpandedAlignment(ref_tokens, hyp_tokens, labels,
                                      ref_map, hyp_map, lowercase=True)
     expected = [
         u'', u'fifty year old', u'business', u'man', u'laments', u'to',
         u'me', u'that', u'he', u'feels', u'he', u'does not', u'have',
         u'colleagues', u'any more', u'it', u'work.'
     ]
     ref_punct = ("A 50-year-old business man lamented to me that he feels "
                  "he doesn't have colleagues anymore at work.")
     actual = PunctInsertOracle.insertPunct(expand_align, ref_punct).s2
     print(actual)
     self.maxDiff = None
     self.assertEqual(actual, expected)
Beispiel #6
0
 def test_normalizeAlignment_number(self):
     """The numeral '2000000' should normalize to the ref 'two million'."""
     ref_tokens = [
         u'You', u'need', u'to', u'know', u'that', u'the', u'average',
         u'patent', u'troll', u'defense', u'costs', u'two million',
         u'dollars', u'and', u'takes', u'18', u'months', u'when', u'you',
         u'win'
     ]
     hyp_tokens = [
         u'you', u'need', u'to', u'know', u'that', u'the', u'average',
         u'patent', u'troll', u'defense', u'cost', u'2000000', u'dollars',
         u'and', u'takes', u'18', u'months', u'when', u'you', u'win'
     ]
     # Only "costs"/"cost" and "two million"/"2000000" differ.
     labels = [u'C'] * 10 + [u'S', u'S'] + [u'C'] * 8
     expected = ' '.join([
         u'you', u'need', u'to', u'know', u'that', u'the', u'average',
         u'patent', u'troll', u'defense', u'cost', u'two million',
         u'dollars', u'and', u'takes', u'18', u'months', u'when', u'you',
         u'win'
     ])
     expand_align = ExpandedAlignment(ref_tokens, hyp_tokens, labels,
                                      lowercase=True)
     self.assertEqual(HypothesisNormalizer.normalizeAligned(expand_align),
                      expected)
Beispiel #7
0
 def test_normalize_alignment_year(self):
     """'twenty twelve' should normalize back to the reference '2012'."""
     refs = ["Now", "fast-forward", "to", "2012"]
     hyps = ["now", "fast-forward", "to", "twenty twelve"]
     labels = ["C", "C", "C", "S"]
     expand_align = ExpandedAlignment(refs, hyps, labels, lowercase=True)
     actual = HypothesisNormalizer.normalizeAligned(expand_align)
     self.assertEqual(actual, "now fast-forward to 2012")
Beispiel #8
0
    def test_read_alignments_911(self):
        """read_json on a record mixing C/S/I/D labels.

        Insertions ('I') produce empty reference slots and gaps in the
        reference map; deletions ('D') produce empty hypothesis slots and
        gaps in the hypothesis map.
        """
        # Raw JSON alignment record as emitted by the aligner (note the
        # embedded newline inside the triple-quoted string).
        jstr = """{"errRate": 0.32653061224489793, "alignments": [{"hyp": "Everyone", "align": "C", "ref": "Everyone"}, {"hyp": "who", "align": "C", "ref": "who"}, {"hyp": "knew", "align": "C", "ref": "knew"}, {"hyp": "me", "align": "C", "ref": "me"}, {"hyp": "before", "align": "C", "ref": "before"}, {"hyp": "nine", "align": "S", "ref": "9/11"}, {"hyp": "eleven", "align":  "I", "ref": ""}, {"hyp": "the", "align": "I", "ref": ""}, {"hyp": "believes", "align": "S", "ref": "believes"}, {"hyp": "line", "align": "S", "ref": "I'm"}, {"hyp": "", "align": "D", "ref": "dead"}, {"hyp": "i", "align": "C", "ref": "I"}, {"hyp": "used", "align": "C", "ref": "used"}, {"hyp": "to", "align": "C", "ref": "to"}, {"hyp": "work", "align": "C", "ref": "work"}, {"hyp": "with", "align": "C", "ref": "with"}, {"hyp": "a", "align": "C", "ref": "a"}, {"hyp": "bunch", "align": "C", "ref": "bunch"}, {"hyp": "of", "align": "C", "ref": "of"}, {"hyp": "uptight", "align": "C", "ref": "uptight"}, {"hyp": "religious", "align": "C", "ref": "religious"}, {"hyp": "people", "align": "C", "ref": "people"}, {"hyp": "so", "align": "C", "ref": "so"}, {"hyp": "sometimes", "align": "C", "ref": "sometimes"}, {"hyp": "i", "align": "C", "ref": "I"}, {"hyp": "didn't", "align": "C", "ref": "didn't"}, {"hyp": "wear", "align": "C", "ref": "wear"}, {"hyp": "panties", "align": "C", "ref": "panties"}, {"hyp": "is", "align": "S", "ref": "and"}, {"hyp": "that", "align": "S", "ref": "just"}, {"hyp": "", "align": "D", "ref": "had"}, {"hyp": "a", "align": "C", "ref": "a"}, {"hyp": "big", "align": "C", "ref": "big"}, {"hyp": "smile", "align": "C", "ref": "smile"}, {"hyp": "and", "align": "C", "ref": "and"}, {"hyp": "chuckle", "align": "S", "ref": "chuckled"}, {"hyp": "to", "align": "C", "ref": "to"}, {"hyp": "myself", "align": "C", "ref": "myself"}, {"hyp": "from", "align": "I", "ref": ""}, {"hyp": "this", "align": "C", "ref": "This"}, {"hyp": "next", "align": "C", "ref": "next"}, {"hyp": "one", "align": "C", "ref": "one"}, 
{"hyp": "takes", "align": "C", "ref": "takes"}, {"hyp": "", "align": "D", "ref": "a"}, {"hyp": "little", "align": "C", "ref": "little"}, {"hyp": "explanation", "align": "C", "ref": "explanation"}, {"hyp": "of", "align": "S", "ref": "before"}, {"hyp": "right", "align": "S", "ref": "I"}, {"hyp": "here", "align": "S", "ref": "share"}, {"hyp": "", "align": "D", "ref": "it"}, {"hyp": "with", "align": "C", "ref": "with"}, {"hyp": "you", "align": "C", "ref": "you"}], "id": 6, "errorTypes": {"C": 36, "D": 4, "I": 3, "S": 9, "refLength": 49}}"""
        # Expected reference tokens: '' where the hyp inserted extra words.
        exp_ref = [
            u'Everyone', u'who', u'knew', u'me', u'before', u'9/11', u'', u'',
            u'believes', u"I'm", u'dead', u'I', u'used', u'to', u'work',
            u'with', u'a', u'bunch', u'of', u'uptight', u'religious',
            u'people', u'so', u'sometimes', u'I', u"didn't", u'wear',
            u'panties', u'and', u'just', u'had', u'a', u'big', u'smile',
            u'and', u'chuckled', u'to', u'myself', u'', u'This', u'next',
            u'one', u'takes', u'a', u'little', u'explanation', u'before', u'I',
            u'share', u'it', u'with', u'you'
        ]
        # Reference map skips positions occupied by insertions (6, 7, 38).
        exp_ref_map = [
            0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
            21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
            39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51
        ]
        exp_align = [
            u'C', u'C', u'C', u'C', u'C', u'S', u'I', u'I', u'S', u'S', u'D',
            u'C', u'C', u'C', u'C', u'C', u'C', u'C', u'C', u'C', u'C', u'C',
            u'C', u'C', u'C', u'C', u'C', u'C', u'S', u'S', u'D', u'C', u'C',
            u'C', u'C', u'S', u'C', u'C', u'I', u'C', u'C', u'C', u'C', u'D',
            u'C', u'C', u'S', u'S', u'S', u'D', u'C', u'C'
        ]
        # Expected hypothesis tokens: '' where the hyp deleted a ref word.
        exp_hyp = [
            u'Everyone', u'who', u'knew', u'me', u'before', u'nine', u'eleven',
            u'the', u'believes', u'line', u'', u'i', u'used', u'to', u'work',
            u'with', u'a', u'bunch', u'of', u'uptight', u'religious',
            u'people', u'so', u'sometimes', u'i', u"didn't", u'wear',
            u'panties', u'is', u'that', u'', u'a', u'big', u'smile', u'and',
            u'chuckle', u'to', u'myself', u'from', u'this', u'next', u'one',
            u'takes', u'', u'little', u'explanation', u'of', u'right', u'here',
            u'', u'with', u'you'
        ]
        # Hypothesis map skips positions occupied by deletions (10, 30, 43, 49).
        exp_hyp_map = [
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19,
            20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 34, 35, 36, 37,
            38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 50, 51
        ]
        expected = ExpandedAlignment(exp_ref,
                                     exp_hyp,
                                     exp_align,
                                     exp_ref_map,
                                     exp_hyp_map,
                                     lowercase=True)

        actual = AlignmentReaderJson.read_json(jstr)
        self.maxDiff = None
        # Compare field-by-field for readable failure diffs.
        self.assertEqual(len(actual.s1), len(actual.s2))
        self.assertEqual(actual.s1, expected.s1)
        self.assertEqual(actual.s2, expected.s2)
        self.assertEqual(actual.align, expected.align)
        self.assertEqual(actual.s1_map, expected.s1_map)
        self.assertEqual(actual.s2_map, expected.s2_map)
Beispiel #9
0
 def test_normalize_alignment_any_more(self):
     """A lone 'any more' substitution collapses to the ref 'anymore'."""
     refs, hyps, labels = ["anymore"], ["any more"], ["S"]
     expand_align = ExpandedAlignment(refs, hyps, labels, lowercase=True)
     actual = HypothesisNormalizer.normalizeAligned(expand_align)
     self.assertEqual(actual, "anymore")
Beispiel #10
0
 def test_normalize_middle_age(self):
     """Ref casing is restored for the first token; other subs stay as-is."""
     ref_tokens = [
         "They're", "happier", "than", "middle-aged", "people", "and",
         "younger", "people", "certainly",
     ]
     hyp_tokens = [
         "they are", "happier", "the", "middle age", "people", "and",
         "younger", "people", "certainly",
     ]
     labels = ["S", "C", "S", "S", "C", "C", "C", "C", "C"]
     expected_tokens = [
         "They're", "happier", "the", "middle age", "people", "and",
         "younger", "people", "certainly",
     ]
     expected = ' '.join(tok for tok in expected_tokens if tok)
     expand_align = ExpandedAlignment(ref_tokens, hyp_tokens, labels,
                                      lowercase=True)
     actual = HypothesisNormalizer.normalizeAligned(expand_align)
     self.assertEqual(actual, expected)
Beispiel #11
0
    def align(self):
        """Refine the initial WER alignment into the POWER alignment.

        Splits the WER alignment into regions, re-aligns each error region
        at the phone level via PowerAligner.phoneAlignToWordAlign, merges
        the regions back together, and computes the resulting error rate.

        Side effects: sets self.split_regions, self.error_indexes,
        self.phonetic_alignments, self.power_alignment, self.power and
        self.power_components.
        """
        # Find the error regions that may need to be realigned
        self.split_regions, self.error_indexes = self.wer_alignment.split_error_regions(
        )
        # One slot per region; regions without errors keep None.
        self.phonetic_alignments = [None] * len(self.split_regions)

        for error_index in self.error_indexes:
            seg = self.split_regions[error_index]
            ref_words = seg.s1_tokens()
            hyp_words = seg.s2_tokens()
            # Pronounce both sides so the segment can be aligned by phones.
            ref_phones = self.pronouncer.pronounce(ref_words)
            hyp_phones = self.pronouncer.pronounce(hyp_words)

            power_seg_alignment, self.phonetic_alignments[
                error_index] = PowerAligner.phoneAlignToWordAlign(
                    ref_words, hyp_words, ref_phones, hyp_phones)

            # Replace the error region at the current index.
            self.split_regions[error_index] = power_seg_alignment

        # Merge the alignment segments back together.
        self.power_alignment = ExpandedAlignment(self.split_regions[0].s1,
                                                 self.split_regions[0].s2,
                                                 self.split_regions[0].align,
                                                 self.split_regions[0].s1_map,
                                                 self.split_regions[0].s2_map,
                                                 lowercase=self.lowercase)
        for i in range(1, len(self.split_regions)):
            self.power_alignment.append_alignment(self.split_regions[i])

        # Get the alignment score
        self.power, self.power_components = self.power_alignment.error_rate()

        # Sanity check: splitting and merging must not change either surface string.
        assert self.hypwords == self.power_alignment.s2_string(
        ), "hyp mismatch:\n{0}\n{1}".format(self.hypwords,
                                            self.power_alignment.s2_string())
        assert self.refwords == self.power_alignment.s1_string(
        ), "ref mismatch:\n{0}\n{1}".format(self.refwords,
                                            self.power_alignment.s1_string())
Beispiel #12
0
 def test_normalize_94(self):
     """Spelled-out numbers revert to the reference digits (18 and 94)."""
     ref_tokens = [
         "Originally", "the", "sample", "was", "aged", "18", "to", "94",
     ]
     hyp_tokens = [
         "originally", "the", "sample", "was", "aged", "eighteen", "to",
         "ninety four",
     ]
     labels = ["C", "C", "C", "C", "C", "S", "C", "S"]
     expected = ' '.join(
         ["originally", "the", "sample", "was", "aged", "18", "to", "94"])
     expand_align = ExpandedAlignment(ref_tokens, hyp_tokens, labels,
                                      lowercase=True)
     actual = HypothesisNormalizer.normalizeAligned(expand_align)
     self.assertEqual(actual, expected)
Beispiel #13
0
    def test_read_alignments_hyphen(self):
        """read_json on a record with only C/S labels (no insertions or
        deletions), including hyphenated tokens; both maps are therefore
        the full contiguous range 0..30.
        """
        # Raw JSON alignment record as emitted by the aligner.
        jstr = """{"errRate": 0.0967741935483871, "alignments": [{"hyp": "the", "align": "C", "ref": "The"}, {"hyp": "reason", "align": "C", "ref": "reason"}, {"hyp": "they", "align": "C", "ref": "they"}, {"hyp": "settle", "align": "S", "ref": "settled"}, {"hyp": "out", "align": "C", "ref": "out"}, {"hyp": "is", "align": "C", "ref": "is"}, {"hyp": "because", "align": "C", "ref": "because"}, {"hyp": "it's", "align": "C", "ref": "it's"}, {"hyp": "cheaper", "align": "C", "ref": "cheaper"}, {"hyp": "to", "align": "C", "ref": "to"}, {"hyp": "settle", "align": "C", "ref": "settle"}, {"hyp": "than", "align": "C", "ref": "than"}, {"hyp": "to", "align": "C", "ref": "to"}, {"hyp": "fight", "align": "C", "ref": "fight"}, {"hyp": "the", "align": "C", "ref": "the"}, {"hyp": "lawsuit", "align": "C", "ref": "lawsuit"}, {"hyp": "clearly", "align": "C", "ref": "clearly"}, {"hyp": "two", "align": "C", "ref": "two"}, {"hyp": "million", "align": "C", "ref": "million"}, {"hyp": "dollars", "align": "C", "ref": "dollars"}, {"hyp": "cheaper", "align": "C", "ref": "cheaper"}, {"hyp": "in", "align": "C", "ref": "in"}, {"hyp": "some", "align": "C", "ref": "some"}, {"hyp": "cases", "align": "C", "ref": "cases"}, {"hyp": "and", "align": "C", "ref": "and"}, {"hyp": "much", "align": "C", "ref": "much"}, {"hyp": "worse", "align": "C", "ref": "worse"}, {"hyp": "a", "align": "S", "ref": "if"}, {"hyp": "few", "align": "S", "ref": "you"}, {"hyp": "actually", "align": "C", "ref": "actually"}, {"hyp": "lose", "align": "C", "ref": "lose"}], "id": 9, "errorTypes": {"C": 28, "D": 0, "I": 0, "S": 3, "refLength": 31}}"""

        exp_ref = [
            "The", "reason", "they", "settled", "out", "is", "because", "it's",
            "cheaper", "to", "settle", "than", "to", "fight", "the", "lawsuit",
            "clearly", "two", "million", "dollars", "cheaper", "in", "some",
            "cases", "and", "much", "worse", "if", "you", "actually", "lose"
        ]
        # No insertions, so the ref map covers every position.
        exp_ref_map = [
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
            19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
        ]
        exp_align = [
            "C", "C", "C", "S", "C", "C", "C", "C", "C", "C", "C", "C", "C",
            "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C",
            "C", "S", "S", "C", "C"
        ]
        exp_hyp = [
            "the", "reason", "they", "settle", "out", "is", "because", "it's",
            "cheaper", "to", "settle", "than", "to", "fight", "the", "lawsuit",
            "clearly", "two", "million", "dollars", "cheaper", "in", "some",
            "cases", "and", "much", "worse", "a", "few", "actually", "lose"
        ]
        # No deletions, so the hyp map covers every position too.
        exp_hyp_map = [
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
            19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
        ]
        expected = ExpandedAlignment(exp_ref,
                                     exp_hyp,
                                     exp_align,
                                     exp_ref_map,
                                     exp_hyp_map,
                                     lowercase=True)

        actual = AlignmentReaderJson.read_json(jstr)
        self.maxDiff = None
        # Compare field-by-field for readable failure diffs.
        self.assertEqual(len(actual.s1), len(actual.s2))
        self.assertEqual(actual.s1, expected.s1)
        self.assertEqual(actual.s2, expected.s2)
        self.assertEqual(actual.align, expected.align)
        self.assertEqual(actual.s1_map, expected.s1_map)
        self.assertEqual(actual.s2_map, expected.s2_map)
Beispiel #14
0
    def read_json(jstr):
        """Parse a JSON alignment record into an ExpandedAlignment.

        Returns None when the document is empty/falsy. Each alignment
        entry contributes one ref/hyp/align slot; multi-word tokens add
        their slot index once per word to the corresponding map.
        """
        parsed = json.loads(jstr)
        if not parsed:
            return None

        ref, hyp, align = [], [], []
        ref_map, hyp_map = [], []

        for i, entry in enumerate(parsed['alignments']):
            ref.append(entry['ref'])
            hyp.append(entry['hyp'])
            align.append(entry['align'])
            # Empty tokens (insertions/deletions) get no map entry.
            if ref[-1]:
                ref_map.extend([i] * len(ref[-1].split()))
            if hyp[-1]:
                hyp_map.extend([i] * len(hyp[-1].split()))
        return ExpandedAlignment(ref, hyp, align, ref_map, hyp_map)
Beispiel #15
0
 def test_normalizeAlignment_text_hyphen(self):
     """'one to one' should normalize to the hyphenated ref 'one-to-one'."""
     ref_tokens = [
         u'Our', u'digital', u'body', u'is', u'', u'', u'', u'',
         u'one-to-one', u'life'
     ]
     hyp_tokens = [
         u'are', u'what', u'it', u'is', u'all', u'about', u'the', u'these',
         u'one to one', u'life'
     ]
     labels = [u'S', u'S', u'S', u'C', u'I', u'I', u'I', u'I', u'S', u'C']
     expected = ' '.join([
         u'are', u'what', u'it', u'is', u'all', u'about', u'the', u'these',
         u'one-to-one', u'life'
     ])
     expand_align = ExpandedAlignment(ref_tokens, hyp_tokens, labels,
                                      lowercase=True)
     self.assertEqual(HypothesisNormalizer.normalizeAligned(expand_align),
                      expected)
Beispiel #16
0
 def test_normalizeAlignment_911(self):
     """'nine 11' should normalize back to the reference '911'."""
     ref_tokens = [
         u'Everyone', u'who', u'knew', u'me', u'before', u'911', u'',
         u'believes', u"I'm", u'dead'
     ]
     hyp_tokens = [
         u'everyone', u'who', u'knew', u'me', u'before', u'nine 11', u'the',
         u'believes', u'line', u''
     ]
     labels = [u'C', u'C', u'C', u'C', u'C', u'S', u'I', u'S', u'S', u'D']
     # Trailing deletion leaves an empty slot, stripped after the join.
     expected = ' '.join([
         u'everyone', u'who', u'knew', u'me', u'before', u'911', u'the',
         u'believes', u'line', u''
     ]).strip()
     expand_align = ExpandedAlignment(ref_tokens, hyp_tokens, labels,
                                      lowercase=True)
     self.assertEqual(HypothesisNormalizer.normalizeAligned(expand_align),
                      expected)
Beispiel #17
0
 def test_normalize_alignment_theyre2(self):
     """An S+I pair ('they' + 'are') collapses back to the ref token."""
     ref_tokens = [
         "In", "our", "study", "they", "are", "more", "positive", "but",
         "they're", "", "also", "more", "likely", "than", "younger",
         "people", "to", "experience", "mixed", "emotions", "sadness", "at",
         "the", "same", "time", "you", "experience", "happiness", "you",
         "know", "that", "tear", "in", "the", "eye", "when", "you're",
         "smiling", "at", "a", "friend"
     ]
     hyp_tokens = [
         "in", "our", "study", "they", "are", "more", "positive", "but",
         "they", "are", "also", "more", "likely", "than", "younger",
         "people", "to", "experience", "mixed", "emotions", "sadness", "at",
         "the", "same", "time", "you", "experience", "happiness", "you",
         "know", "that", "tear", "in", "the", "eye", "when", "you're",
         "smiling", "at", "a", "friend"
     ]
     # Only positions 8 (S) and 9 (I) differ from a perfect match.
     labels = ["C"] * 8 + ["S", "I"] + ["C"] * 31
     expected = ' '.join([
         "in", "our", "study", "they", "are", "more", "positive", "but",
         "they're", "also", "more", "likely", "than", "younger", "people",
         "to", "experience", "mixed", "emotions", "sadness", "at", "the",
         "same", "time", "you", "experience", "happiness", "you", "know",
         "that", "tear", "in", "the", "eye", "when", "you're", "smiling",
         "at", "a", "friend"
     ])
     expand_align = ExpandedAlignment(ref_tokens, hyp_tokens, labels,
                                      lowercase=True)
     self.assertEqual(HypothesisNormalizer.normalizeAligned(expand_align),
                      expected)
Beispiel #18
0
 def test_normalize_alignment_ex1(self):
     """The normalized output equals the hypothesis joined as-is."""
     ref_tokens = [
         "Our", "digital", "body", "is", "one-to-one", "", "", "life",
         "size", "so", "this", "is", "exactly", "the", "way", "students",
         "will", "see", "the", "real", "anatomy",
     ]
     hyp_tokens = [
         "our", "peaceful", "body", "is", "one two", "one", "life", "life",
         "size", "so", "this", "is", "exactly", "the", "way", "students",
         "would", "see", "the", "real", "anatomy",
     ]
     labels = [
         "C", "S", "C", "C", "S", "I", "I", "C", "C", "C", "C", "C", "C",
         "C", "C", "C", "S", "C", "C", "C", "C",
     ]
     expected = ' '.join(hyp_tokens)
     expand_align = ExpandedAlignment(ref_tokens, hyp_tokens, labels,
                                      lowercase=True)
     actual = HypothesisNormalizer.normalizeAligned(expand_align)
     self.assertEqual(actual, expected)
Beispiel #19
0
 def test_normalize_abbrev_wrong(self):
     """With fix_casing=True only the ref casing of 'So' is restored."""
     ref_tokens = [
         "So", "we", "learned", "the", "majority", "of", "anatomic",
         "classes", "taught", "they", "do", "not", "have", "a", "cadaver",
         "dissection", "lab",
     ]
     hyp_tokens = [
         "so", "we", "learned", "the", "most jury p.", "o.", "anatomy",
         "class", "called", "they", "do", "not", "have", "", "had ever",
         "dissection", "lead",
     ]
     labels = [
         "C", "C", "C", "C", "S", "S", "S", "S", "S", "C", "C", "C", "C",
         "D", "S", "C", "S",
     ]
     expected_tokens = [
         "So", "we", "learned", "the", "most jury p.", "o.", "anatomy",
         "class", "called", "they", "do", "not", "have", "", "had ever",
         "dissection", "lead",
     ]
     # The empty slot from the deletion is dropped before joining.
     expected = ' '.join(tok for tok in expected_tokens if tok)
     expand_align = ExpandedAlignment(ref_tokens, hyp_tokens, labels,
                                      lowercase=True)
     actual = HypothesisNormalizer.normalizeAligned(expand_align,
                                                    fix_casing=True)
     self.assertEqual(actual, expected)
Beispiel #20
0
 def test_punct_tokens_at_front_and_end_short(self):
     """Opening and closing quotes attach to the first/last hyp words."""
     ref_tokens = [
         u'Everyone', u'who', u'knew', u'me', u'before', u'9/11', u'', u'',
         u'believes', u"I'm", u'dead'
     ]
     # Ref map skips the two insertion slots (positions 6 and 7).
     ref_map = [0, 1, 2, 3, 4, 5, 8, 9, 10]
     labels = [
         u'C', u'C', u'C', u'C', u'C', u'S', u'I', u'I', u'S', u'S', u'D'
     ]
     hyp_tokens = [
         u'Everyone', u'who', u'knew', u'me', u'before', u'nine', u'eleven',
         u'the', u'believes', u'line', u''
     ]
     hyp_map = list(range(10))
     punct_hyp = [
         u'"Everyone', u'who', u'knew', u'me', u'before', u'nine',
         u'eleven', u'the', u'believes', u'line."', u''
     ]
     error_alignment = ExpandedAlignment(ref_tokens, hyp_tokens, labels,
                                         ref_map, hyp_map, lowercase=True)
     ref_punct = u'''"Everyone who knew me before 9/11 believes I'm dead."'''
     expected = ExpandedAlignment(ref_tokens, punct_hyp, labels, ref_map,
                                  hyp_map, lowercase=True).s2
     actual = PunctInsertOracle.insertPunct(error_alignment, ref_punct).s2
     self.maxDiff = None
     self.assertEqual(actual, expected)
Beispiel #21
0
 def test_punct_tokens_at_front_and_end(self):
     """Multi-sentence variant: the oracle must re-attach quotes, commas,
     and sentence-final periods from the punctuated reference onto the
     hypothesis side of a long error alignment with I/D/S errors."""
     # Reference tokens; '' entries are placeholders for hyp insertions.
     sample_ref = [
         u'Everyone', u'who', u'knew', u'me', u'before', u'9/11', u'', u'',
         u'believes', u"I'm", u'dead', u'I', u'used', u'to', u'work',
         u'with', u'a', u'bunch', u'of', u'uptight', u'religious',
         u'people', u'so', u'sometimes', u'I', u"didn't", u'wear',
         u'panties', u'and', u'just', u'had', u'a', u'big', u'smile',
         u'and', u'chuckled', u'to', u'myself', u'', u'This', u'next',
         u'one', u'takes', u'a', u'little', u'explanation', u'before', u'I',
         u'share', u'it', u'with', u'you'
     ]
     # Maps non-empty ref slots back to positions in the aligned sequence.
     sample_ref_map = [
         0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
         21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
         39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51
     ]
     sample_align = [
         u'C', u'C', u'C', u'C', u'C', u'S', u'I', u'I', u'S', u'S', u'D',
         u'C', u'C', u'C', u'C', u'C', u'C', u'C', u'C', u'C', u'C', u'C',
         u'C', u'C', u'C', u'C', u'C', u'C', u'S', u'S', u'D', u'C', u'C',
         u'C', u'C', u'S', u'C', u'C', u'I', u'C', u'C', u'C', u'C', u'D',
         u'C', u'C', u'S', u'S', u'S', u'D', u'C', u'C'
     ]
     # Hypothesis tokens; '' entries are placeholders for ref deletions.
     sample_hyp = [
         u'Everyone', u'who', u'knew', u'me', u'before', u'nine', u'eleven',
         u'the', u'believes', u'line', u'', u'i', u'used', u'to', u'work',
         u'with', u'a', u'bunch', u'of', u'uptight', u'religious',
         u'people', u'so', u'sometimes', u'i', u"didn't", u'wear',
         u'panties', u'is', u'that', u'', u'a', u'big', u'smile', u'and',
         u'chuckle', u'to', u'myself', u'from', u'this', u'next', u'one',
         u'takes', u'', u'little', u'explanation', u'of', u'right', u'here',
         u'', u'with', u'you'
     ]
     sample_hyp_map = [
         0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19,
         20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 34, 35, 36, 37,
         38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 50, 51
     ]
     # Expected hypothesis after the oracle re-inserts punctuation from
     # ref_punct below.
     sample_hyp_punct = [
         u'"Everyone', u'who', u'knew', u'me', u'before', u'nine',
         u'eleven', u'the', u'believes', u'line."', u'', u'"i', u'used',
         u'to', u'work', u'with', u'a', u'bunch', u'of', u'uptight',
         u'religious', u'people,', u'so', u'sometimes', u'i', u"didn't",
         u'wear', u'panties,', u'is', u'that', u'', u'a', u'big', u'smile',
         u'and', u'chuckle', u'to', u'myself."', u'from', u'this', u'next',
         u'one', u'takes', u'', u'little', u'explanation', u'of', u'right',
         u'here', u'', u'with', u'you.'
     ]
     error_alignment = ExpandedAlignment(sample_ref,
                                         sample_hyp,
                                         sample_align,
                                         sample_ref_map,
                                         sample_hyp_map,
                                         lowercase=True)
     ref_punct = u'''"Everyone who knew me before 9/11 believes I'm dead." "I used to work with a bunch of uptight religious people, so sometimes I didn't wear panties, and just had a big smile and chuckled to myself."  This next one takes a little explanation before I share it with you.'''
     expected = ExpandedAlignment(sample_ref,
                                  sample_hyp_punct,
                                  sample_align,
                                  sample_ref_map,
                                  sample_hyp_map,
                                  lowercase=True).s2
     actual = PunctInsertOracle.insertPunct(error_alignment, ref_punct).s2
     #         print(actual)
     self.maxDiff = None
     self.assertEqual(actual, expected)
Beispiel #22
0
    def charAlignToWordAlign(self):
        """Collapse this object's character-level alignment into a
        word-level ExpandedAlignment.

        Scans ``self.char_align`` one position at a time, buffering ref and
        hyp characters into the current word on each side.  Space characters
        act as word boundaries and trigger a commit of the buffered words,
        labeled from their contents:

        * both sides non-empty -> C if equal else S
        * only ref non-empty   -> D
        * only hyp non-empty   -> I

        Returns:
            ExpandedAlignment: the word-level alignment; also cached on
            ``self.word_align``.

        Raises:
            Exception: if ``self.char_align`` is None.
        """
        if not self.char_align:
            raise Exception("char_align is None")

        ref_word_align = []
        hyp_word_align = []
        align_word = []

        tmp_ref_word = []
        tmp_hyp_word = []

        def _commit_pair():
            # Flush both word buffers as a single alignment column,
            # labeling it C/S/D/I based on which sides are non-empty.
            # No-op when both buffers are empty.
            nonlocal tmp_ref_word, tmp_hyp_word
            ref_word = ''.join(tmp_ref_word)
            hyp_word = ''.join(tmp_hyp_word)
            if not (ref_word or hyp_word):
                return
            ref_word_align.append(ref_word)
            hyp_word_align.append(hyp_word)
            tmp_ref_word = []
            tmp_hyp_word = []
            if ref_word and hyp_word:
                align_word.append(AlignLabels.correct if ref_word ==
                                  hyp_word else AlignLabels.substitution)
            elif ref_word:
                align_word.append(AlignLabels.deletion)
            else:
                align_word.append(AlignLabels.insertion)

        for i in range(len(self.char_align.align)):
            ref_char = self.char_align.s1[i]
            hyp_char = self.char_align.s2[i]
            align_char = self.char_align.align[i]

            # A space on the side the label refers to means both words are
            # complete: commit them together as one alignment column.
            if ((align_char == AlignLabels.correct and ref_char == ' ')
                    or (align_char == AlignLabels.deletion and ref_char == ' ')
                    or
                (align_char == AlignLabels.insertion and hyp_char == ' ')):
                _commit_pair()
                continue

            # Otherwise consume the current chars; a space on one side only
            # finishes that side's word as a lone D (ref) or I (hyp).
            if ref_char == ' ':
                # BUGFIX: was `len(tmp_ref_word) > 1`, which silently
                # skipped one-character words AND left their char in the
                # buffer, corrupting the next word; any non-empty buffer is
                # a finished word.
                if tmp_ref_word:
                    ref_word_align.append(''.join(tmp_ref_word))
                    hyp_word_align.append('')
                    tmp_ref_word = []
                    align_word.append(AlignLabels.deletion)
            else:
                tmp_ref_word.append(ref_char)

            if hyp_char == ' ':
                # BUGFIX: same off-by-one (`> 1`) as the ref side above.
                if tmp_hyp_word:
                    ref_word_align.append('')
                    hyp_word_align.append(''.join(tmp_hyp_word))
                    tmp_hyp_word = []
                    align_word.append(AlignLabels.insertion)
            else:
                tmp_hyp_word.append(hyp_char)

        # BUGFIX: flush any words still buffered when the alignment does
        # not end on an explicit boundary (previously they were dropped).
        _commit_pair()

        self.word_align = ExpandedAlignment(ref_word_align,
                                            hyp_word_align,
                                            align_word,
                                            lowercase=self.lowercase)
        return self.word_align
Beispiel #23
0
    def phoneAlignToWordAlign(cls,
                              ref_words,
                              hyp_words,
                              ref_phones,
                              hyp_phones,
                              break_on_syllables=True):
        """Derive a word-level alignment from a phoneme-level alignment.

        Phonetically aligns ``ref_phones`` against ``hyp_phones`` with
        Levenshtein.align, then walks the phone alignment mapping runs of
        phones back onto the surface words in ``ref_words``/``hyp_words``.
        Spans whose word counts disagree may be re-aligned; pending spans
        are processed via a worklist of
        (ref word span, hyp word span, phone alignment) items.

        Args:
            ref_words: reference surface words.
            hyp_words: hypothesis surface words.
            ref_phones: phone/annotation token sequence for the reference.
            hyp_phones: phone/annotation token sequence for the hypothesis.
            break_on_syllables: when True, a word contributing an extra
                syllable right after a word boundary is split off as a
                separate deletion/insertion instead of being folded into a
                substitution.

        Returns:
            tuple: (word-level ExpandedAlignment, concatenated phone-level
            ExpandedAlignment).
        """
        ref_word_span = (0, len(ref_words))
        hyp_word_span = (0, len(hyp_words))

        # Perform Levenshtein Alignment
        lev = Levenshtein.align(ref=ref_phones,
                                hyp=hyp_phones,
                                reserve_list=PowerAligner.reserve_list,
                                exclusive_sets=PowerAligner.exclusive_sets,
                                weights=Levenshtein.wordAlignWeights)  #,
        #dist_penalty=PowerAligner.phoneDistPenalty, dist_penalty_set=Levenshtein.wordAlignWeights)
        phone_align = lev.expandAlignCompact()

        worklist = list()
        worklist.append((ref_word_span, hyp_word_span, phone_align))

        # Accumulators for the committed (finalized) alignment columns.
        full_reference = list()
        full_hypothesis = list()
        full_alignment = list()
        full_phone_align = list()

        while worklist:
            # Take the next set of sequence boundaries off the worklist
            ref_word_span, hyp_word_span, phone_align = worklist.pop()
            ref_word_index, ref_word_limit = ref_word_span
            hyp_word_index, hyp_word_limit = hyp_word_span

            # TODO: Currently only checking in the forward direction
            ref_word_builder = []  # Temp storage of words in alignment span
            hyp_word_builder = []

            ref_word_iter = enumerate(
                ref_words[ref_word_span[0]:ref_word_span[1]]
            )  # Iterates through the surface words
            hyp_word_iter = enumerate(
                hyp_words[hyp_word_span[0]:hyp_word_span[1]])

            ref_aligned = []  # Finalized alignments
            hyp_aligned = []
            alignment = []  # Finalized alignment labels

            ref_extra_syllable_word_index = None  # Used for marking words mapping to extra syllables in alignment.
            hyp_extra_syllable_word_index = None
            ref_syllable_count = 0
            hyp_syllable_count = 0

            ref_word_started = False  # Indicates whether a word is already accounted for in the alignment when a phoneme is reached.
            hyp_word_started = False

            # NOTE(review): advance_worklist is assigned but never read in
            # this implementation.
            advance_worklist = False
            commit_alignment = False

            for i in range(len(phone_align.align)):
                ref_type = TokType.checkAnnotation(phone_align.s1[i])
                hyp_type = TokType.checkAnnotation(phone_align.s2[i])

                # Check if word boundaries are reached, both on ref an hyp -- or the case where no more symbols can be read.
                if (i == len(phone_align.align) -
                        1) or (ref_type == TokType.WordBoundary
                               and ref_type == hyp_type):
                    align_tok = None
                    # Only write outputs if either the ref or the hyp has scanned some words.
                    if ref_word_builder:
                        if hyp_word_builder:
                            align_tok = AlignLabels.substitution if ref_word_builder != hyp_word_builder else AlignLabels.correct
                        else:
                            align_tok = AlignLabels.deletion
                    elif hyp_word_builder:
                        align_tok = AlignLabels.insertion

                    if align_tok:
                        # Add the remainder to the worklist
                        ref_word_span_next = (ref_word_index +
                                              len(ref_word_builder),
                                              ref_word_limit)
                        hyp_word_span_next = (hyp_word_index +
                                              len(hyp_word_builder),
                                              hyp_word_limit)
                        phone_align_next = phone_align.subsequence(
                            i, phone_align.length(), preserve_index=False)
                        worklist.append((ref_word_span_next,
                                         hyp_word_span_next, phone_align_next))

                        # "Commit" the current alignment
                        if align_tok in (AlignLabels.correct,
                                         AlignLabels.substitution):
                            alignment.append(align_tok)

                            # Check for syllable conflicts
                            if not break_on_syllables or not ref_extra_syllable_word_index:
                                ref_aligned.append(' '.join(ref_word_builder))
                                ref_syllable_count = 0
                                hyp_syllable_count = 0
                            else:
                                ref_aligned.append(' '.join(ref_word_builder[
                                    0:ref_extra_syllable_word_index]))
                                # The remaining words are deletions
                                for word in ref_word_builder[
                                        ref_extra_syllable_word_index:]:
                                    alignment.append(AlignLabels.deletion)
                                    ref_aligned.append(word)
                                    hyp_aligned.append('')
                                ref_syllable_count = 0

                            if not break_on_syllables or not hyp_extra_syllable_word_index:
                                hyp_aligned.append(' '.join(hyp_word_builder))
                                ref_syllable_count = 0
                                hyp_syllable_count = 0
                            else:
                                hyp_aligned.append(' '.join(hyp_word_builder[
                                    0:hyp_extra_syllable_word_index]))
                                # The remaining words are insertions
                                for word in hyp_word_builder[
                                        hyp_extra_syllable_word_index:]:
                                    alignment.append(AlignLabels.insertion)
                                    ref_aligned.append('')
                                    hyp_aligned.append(word)
                                    # NOTE(review): this reset sits inside
                                    # the for-loop, unlike the mirrored ref
                                    # branch above which resets after its
                                    # loop — possible indentation slip;
                                    # confirm intended.
                                    hyp_syllable_count = 0

                            if align_tok == AlignLabels.substitution:
                                # Check if you need to rework this alignment.
                                if len(ref_word_builder) != len(
                                        hyp_word_builder):
                                    # Word count mismatch in the alignment span. Is there a possibility that we need to re-align this segment?
                                    ref_word_span_curr = (
                                        ref_word_index,
                                        ref_word_index + len(ref_word_builder))
                                    hyp_word_span_curr = (
                                        hyp_word_index,
                                        hyp_word_index + len(hyp_word_builder))
                                    phone_align_curr = phone_align.subsequence(
                                        0, i + 1, preserve_index=False)

                                    lev = Levenshtein.align(
                                        ref=phone_align_curr.s1_tokens(),
                                        hyp=phone_align_curr.s2_tokens(),
                                        reserve_list=PowerAligner.reserve_list,
                                        exclusive_sets=PowerAligner.
                                        exclusive_sets,
                                        weights=Levenshtein.wordAlignWeights
                                    )  #,
                                    #dist_penalty=PowerAligner.phoneDistPenalty, dist_penalty_set=Levenshtein.wordAlignWeights)

                                    phone_align_adjusted = lev.expandAlignCompact(
                                    )

                                    if phone_align_curr.align != phone_align_adjusted.align:
                                        # Looks like we need to redo the phone-to-word alignment.
                                        worklist.append((ref_word_span_curr,
                                                         hyp_word_span_curr,
                                                         phone_align_adjusted))
                                    else:
                                        commit_alignment = True
                                else:
                                    commit_alignment = True

                        elif align_tok == AlignLabels.deletion:
                            for word in ref_word_builder:
                                alignment.append(align_tok)
                                ref_aligned.append(word)
                                hyp_aligned.append('')

                            commit_alignment = True
                            ref_syllable_count = 0

                        elif align_tok == AlignLabels.insertion:
                            for word in hyp_word_builder:
                                alignment.append(align_tok)
                                ref_aligned.append('')
                                hyp_aligned.append(word)

                            commit_alignment = True
                            hyp_syllable_count = 0

                        if commit_alignment:
                            # Commit the alignment.
                            full_reference.extend(ref_aligned)
                            full_hypothesis.extend(hyp_aligned)
                            full_alignment.extend(alignment)
                            full_phone_align.append(
                                phone_align.subsequence(0,
                                                        i,
                                                        preserve_index=False))
                            ref_aligned = []
                            hyp_aligned = []
                            alignment = []
                        break

                # Add words if word boundaries are reached.
                else:
                    if ref_type == TokType.WordBoundary:
                        ref_word_started = False
                        if hyp_type != TokType.WordBoundary and ref_word_builder and not hyp_word_builder:
                            # DELETION
                            # Ref word ended, but no hyp words have been added. Mark the current ref word(s) in the span as deletion errors.
                            # TODO: Dedupe this logic
                            for word in ref_word_builder:
                                alignment.append(AlignLabels.deletion)
                                ref_aligned.append(word)
                                hyp_aligned.append('')
                            ref_syllable_count = 0

                            # Commit the alignment.
                            full_reference.extend(ref_aligned)
                            full_hypothesis.extend(hyp_aligned)
                            full_alignment.extend(alignment)
                            full_phone_align.append(
                                phone_align.subsequence(0,
                                                        i,
                                                        preserve_index=False))

                            # Add the remainder to the worklist
                            ref_word_span_next = (ref_word_index +
                                                  len(ref_word_builder),
                                                  ref_word_limit)
                            hyp_word_span_next = (hyp_word_index +
                                                  len(hyp_word_builder),
                                                  hyp_word_limit)
                            # NOTE(review): ref restarts from phone i but
                            # hyp re-aligns its FULL phone sequence —
                            # asymmetric with the insertion branch below;
                            # confirm this is intentional.
                            lev = Levenshtein.align(
                                ref=[x for x in phone_align.s1[i:] if x],
                                hyp=[x for x in phone_align.s2 if x],
                                reserve_list=PowerAligner.reserve_list,
                                exclusive_sets=PowerAligner.exclusive_sets,
                                weights=Levenshtein.wordAlignWeights)  #,
                            #dist_penalty=PowerAligner.phoneDistPenalty, dist_penalty_set=Levenshtein.wordAlignWeights)
                            phone_align_next = lev.expandAlignCompact()

                            worklist.append(
                                (ref_word_span_next, hyp_word_span_next,
                                 phone_align_next))
                            break
                    elif ref_type == TokType.Phoneme and not ref_word_started:
                        ref_word_started = True
                        try:
                            ref_word_item = ref_word_iter.__next__()
                            ref_word_builder.append(ref_word_item[1])
                        except StopIteration:
                            pass

                    if hyp_type == TokType.WordBoundary:
                        hyp_word_started = False
                        if ref_type != TokType.WordBoundary and hyp_word_builder and not ref_word_builder:
                            # INSERTION
                            # Hyp word ended, but no ref words have been added. Mark the current hyp word(s) in the span as insertion errors.
                            # TODO: Dedupe this logic
                            for word in hyp_word_builder:
                                alignment.append(AlignLabels.insertion)
                                ref_aligned.append('')
                                hyp_aligned.append(word)
                            hyp_syllable_count = 0

                            # Commit the alignment.
                            full_reference.extend(ref_aligned)
                            full_hypothesis.extend(hyp_aligned)
                            full_alignment.extend(alignment)
                            full_phone_align.append(
                                phone_align.subsequence(0,
                                                        i,
                                                        preserve_index=False))

                            # Add the remainder to the worklist
                            ref_word_span_next = (ref_word_index +
                                                  len(ref_word_builder),
                                                  ref_word_limit)
                            hyp_word_span_next = (hyp_word_index +
                                                  len(hyp_word_builder),
                                                  hyp_word_limit)
                            lev = Levenshtein.align(
                                ref=[x for x in phone_align.s1 if x],
                                hyp=[x for x in phone_align.s2[i:] if x],
                                reserve_list=PowerAligner.reserve_list,
                                exclusive_sets=PowerAligner.exclusive_sets,
                                weights=Levenshtein.wordAlignWeights)  #,
                            #dist_penalty=PowerAligner.phoneDistPenalty, dist_penalty_set=Levenshtein.wordAlignWeights)
                            phone_align_next = lev.expandAlignCompact()

                            worklist.append(
                                (ref_word_span_next, hyp_word_span_next,
                                 phone_align_next))
                            break
                    elif hyp_type == TokType.Phoneme and not hyp_word_started:
                        hyp_word_started = True
                        try:
                            hyp_word_item = hyp_word_iter.__next__()
                            hyp_word_builder.append(hyp_word_item[1])
                        except StopIteration:
                            pass

                # Check for syllable mismatches
                if ref_type == TokType.SyllableBoundary:
                    ref_syllable_count += 1
                if hyp_type == TokType.SyllableBoundary:
                    hyp_syllable_count += 1

                if (ref_type == TokType.SyllableBoundary == hyp_type
                        or ref_syllable_count == hyp_syllable_count):
                    # No syllable conflicts here!
                    ref_extra_syllable_word_index = None
                    hyp_extra_syllable_word_index = None
                elif (ref_type == TokType.SyllableBoundary
                      and not ref_extra_syllable_word_index
                      and TokType.checkAnnotation(
                          phone_align.s2[i - 1]) == TokType.WordBoundary):
                    # Extra syllable in hypothesis. We only care if the syllable immediately follows a word boundary.
                    # This is because this indicates that a new word is being formed, which may likely be an insertion in hyp.
                    ref_extra_syllable_word_index = len(ref_word_builder) - 1
                    # print ref_word_builder
                    # print 'Syllable/word mismatch at', i
                    # print 'Extra hyp word:', ref_word_builder[ref_extra_syllable_word_index]
                elif (hyp_type == TokType.SyllableBoundary
                      and not hyp_extra_syllable_word_index
                      and TokType.checkAnnotation(
                          phone_align.s2[i - 1]) == TokType.WordBoundary):
                    # This time there's an extra syllable in the ref, corresponding to a new ref word.
                    # NOTE(review): this branch also inspects s2[i - 1],
                    # same as the hyp-extra branch above; given the comment
                    # says the extra syllable is on the ref side, s1[i - 1]
                    # may have been intended — confirm.
                    hyp_extra_syllable_word_index = len(hyp_word_builder) - 1
                    # print hyp_word_builder
                    # print 'Syllable/word mismatch at', i
                    # print 'Extra ref word:', hyp_word_builder[hyp_extra_syllable_word_index]
        # Concatenate all phoneme alignments
        # NOTE(review): assumes at least one alignment was committed;
        # full_phone_align[0] raises IndexError otherwise — confirm callers
        # never pass fully-empty inputs.
        fp_align = full_phone_align[0]
        for expand_align in full_phone_align[1:]:
            fp_align.append_alignment(expand_align)

        return ExpandedAlignment(full_reference, full_hypothesis,
                                 full_alignment), fp_align
Beispiel #24
0
class PowerAligner:
    """Word-level aligner refined by phoneme-level (phonetic) alignment.

    First computes a standard Levenshtein word alignment between a reference
    and a hypothesis transcript; then (via ``align()``) splits out contiguous
    error regions and re-aligns each one at the phoneme level, converting the
    phone alignment back into word-level C/S/I/D labels for a more
    phonetically plausible alignment and error rate.
    """

    # Exclusive tokens that can only align to themselves; not other members in this set.
    reserve_list = set(['|', '#'])

    # R-sounds: 'r' plus r-colored vowels are grouped into one exclusive class.
    r_set = set.union(set('r'), Phonemes.r_vowels)
    # Phoneme classes whose members may only align within their own class.
    exclusive_sets = [Phonemes.vowels, Phonemes.consonants, r_set]

    phoneDistPenalty = 0.25
    # NOTE(review): identifier looks garbled (stray "16" mid-name); presumably
    # meant "phoneDistPenaltySet". It is only referenced from the commented-out
    # dist_penalty_set arguments below -- confirm before renaming.
    phoneDistPenalt16ySet = set(['|'])

    def __init__(self,
                 ref,
                 hyp,
                 lowercase=False,
                 verbose=False,
                 pronounce_type=PronouncerType.Lexicon,
                 lexicon=None,
                 word_align_weights=Levenshtein.wordAlignWeights):
        """Tokenize the transcripts and run the initial word-level alignment.

        POWER refinement itself is deferred to ``align()``; this constructor
        only computes the plain WER alignment and initializes result slots.

        Args:
            ref: Reference transcript as a whitespace-separated string.
            hyp: Hypothesis transcript as a whitespace-separated string.
            lowercase: If True, words are compared case-insensitively.
            verbose: Verbosity flag (stored; not consulted in this class's
                visible code).
            pronounce_type: Selects a lexicon-backed or base pronouncer.
            lexicon: Pronunciation lexicon, used when ``pronounce_type`` is
                ``PronouncerType.Lexicon``.
            word_align_weights: Edit-operation weights for the word-level
                Levenshtein alignment.

        Raises:
            Exception: If ``ref`` is empty or falsy.
        """
        if not ref:
            raise Exception("No reference file.\nref: {0}\nhyp: {1}".format(
                ref, hyp))

        if pronounce_type == PronouncerType.Lexicon:
            self.pronouncer = PronouncerLex(lexicon)
        else:
            self.pronouncer = PronouncerBase()

        # Split on whitespace and drop empty tokens.
        self.ref = [x for x in ref.strip().split() if x]
        self.hyp = [x for x in hyp.strip().split() if x]
        self.refwords = ' '.join(self.ref)
        self.hypwords = ' '.join(self.hyp)

        self.lowercase = lowercase
        self.verbose = verbose

        # Perform word alignment
        lev = Levenshtein.align(self.ref,
                                self.hyp,
                                lowercase=self.lowercase,
                                weights=word_align_weights)
        lev.editops()
        self.wer_alignment = lev.expandAlign()
        self.wer, self.wer_components = self.wer_alignment.error_rate()

        # Used for POWER alignment
        self.power_alignment = None
        self.power = None
        self.power_components = None

        # Used to find potential error regions
        self.split_regions = None
        self.error_indexes = None
        self.phonetic_alignments = None
        self.phonetic_lev = None

    def align(self):
        """Run POWER refinement over the initial word alignment.

        Splits the WER alignment into regions, phonetically re-aligns every
        error region via ``phoneAlignToWordAlign``, stitches the regions back
        into ``self.power_alignment``, and recomputes the error rate into
        ``self.power`` / ``self.power_components``.

        Raises:
            AssertionError: If the stitched alignment no longer reproduces
                the original ref/hyp word sequences.
        """
        # Find the error regions that may need to be realigned
        self.split_regions, self.error_indexes = self.wer_alignment.split_error_regions(
        )
        self.phonetic_alignments = [None] * len(self.split_regions)

        for error_index in self.error_indexes:
            seg = self.split_regions[error_index]
            ref_words = seg.s1_tokens()
            hyp_words = seg.s2_tokens()
            # Pronounce each side of the error region into phoneme sequences.
            ref_phones = self.pronouncer.pronounce(ref_words)
            hyp_phones = self.pronouncer.pronounce(hyp_words)

            power_seg_alignment, self.phonetic_alignments[
                error_index] = PowerAligner.phoneAlignToWordAlign(
                    ref_words, hyp_words, ref_phones, hyp_phones)

            # Replace the error region at the current index.
            self.split_regions[error_index] = power_seg_alignment

        # Merge the alignment segments back together.
        self.power_alignment = ExpandedAlignment(self.split_regions[0].s1,
                                                 self.split_regions[0].s2,
                                                 self.split_regions[0].align,
                                                 self.split_regions[0].s1_map,
                                                 self.split_regions[0].s2_map,
                                                 lowercase=self.lowercase)
        for i in range(1, len(self.split_regions)):
            self.power_alignment.append_alignment(self.split_regions[i])

        # Get the alignment score
        self.power, self.power_components = self.power_alignment.error_rate()

        # Sanity check: region splitting + merging must be lossless.
        assert self.hypwords == self.power_alignment.s2_string(
        ), "hyp mismatch:\n{0}\n{1}".format(self.hypwords,
                                            self.power_alignment.s2_string())
        assert self.refwords == self.power_alignment.s1_string(
        ), "ref mismatch:\n{0}\n{1}".format(self.refwords,
                                            self.power_alignment.s1_string())

    # TODO: Make this simpler (and maybe recursive)
    @classmethod
    def phoneAlignToWordAlign(cls,
                              ref_words,
                              hyp_words,
                              ref_phones,
                              hyp_phones,
                              break_on_syllables=True):
        """Derive a word-level alignment from a phoneme-level alignment.

        Levenshtein-aligns the two phoneme sequences, then walks the phone
        alignment left to right, mapping phoneme spans back onto surface
        words and emitting word-level C/S/I/D labels. Whenever a span is
        committed (or looks like it needs re-alignment, e.g. a substitution
        with mismatched word counts), the unprocessed remainder is pushed
        onto a worklist and handled in a later iteration.

        Args:
            ref_words: Surface words of the reference side of the region.
            hyp_words: Surface words of the hypothesis side.
            ref_phones: Phoneme sequence for ``ref_words``, containing word
                and syllable boundary markers (see ``TokType``).
            hyp_phones: Phoneme sequence for ``hyp_words``.
            break_on_syllables: If True, an extra syllable that starts right
                after a word boundary may split trailing words out of a
                C/S span as separate insertions/deletions.

        Returns:
            Tuple of (word_alignment, phone_alignment): an
            ``ExpandedAlignment`` over words and the concatenation of the
            committed phoneme-level ``ExpandedAlignment`` segments.
        """
        ref_word_span = (0, len(ref_words))
        hyp_word_span = (0, len(hyp_words))

        # Perform Levenshtein Alignment
        lev = Levenshtein.align(ref=ref_phones,
                                hyp=hyp_phones,
                                reserve_list=PowerAligner.reserve_list,
                                exclusive_sets=PowerAligner.exclusive_sets,
                                weights=Levenshtein.wordAlignWeights)  #,
        #dist_penalty=PowerAligner.phoneDistPenalty, dist_penalty_set=Levenshtein.wordAlignWeights)
        phone_align = lev.expandAlignCompact()

        # Worklist of (ref word span, hyp word span, phone alignment) still to process.
        worklist = list()
        worklist.append((ref_word_span, hyp_word_span, phone_align))

        # Accumulators for the committed word-level alignment.
        full_reference = list()
        full_hypothesis = list()
        full_alignment = list()
        full_phone_align = list()

        while worklist:
            # Take the next set of sequence boundaries off the worklist
            ref_word_span, hyp_word_span, phone_align = worklist.pop()
            ref_word_index, ref_word_limit = ref_word_span
            hyp_word_index, hyp_word_limit = hyp_word_span

            # TODO: Currently only checking in the forward direction
            ref_word_builder = []  # Temp storage of words in alignment span
            hyp_word_builder = []

            ref_word_iter = enumerate(
                ref_words[ref_word_span[0]:ref_word_span[1]]
            )  # Iterates through the surface words
            hyp_word_iter = enumerate(
                hyp_words[hyp_word_span[0]:hyp_word_span[1]])

            ref_aligned = []  # Finalized alignments
            hyp_aligned = []
            alignment = []  # Finalized alignment labels

            ref_extra_syllable_word_index = None  # Used for marking words mapping to extra syllables in alignment.
            hyp_extra_syllable_word_index = None
            ref_syllable_count = 0
            hyp_syllable_count = 0

            ref_word_started = False  # Indicates whether a word is already accounted for in the alignment when a phoneme is reached.
            hyp_word_started = False

            advance_worklist = False
            commit_alignment = False

            for i in range(len(phone_align.align)):
                ref_type = TokType.checkAnnotation(phone_align.s1[i])
                hyp_type = TokType.checkAnnotation(phone_align.s2[i])

                # Check if word boundaries are reached, both on ref an hyp -- or the case where no more symbols can be read.
                if (i == len(phone_align.align) -
                        1) or (ref_type == TokType.WordBoundary
                               and ref_type == hyp_type):
                    align_tok = None
                    # Only write outputs if either the ref or the hyp has scanned some words.
                    if ref_word_builder:
                        if hyp_word_builder:
                            align_tok = AlignLabels.substitution if ref_word_builder != hyp_word_builder else AlignLabels.correct
                        else:
                            align_tok = AlignLabels.deletion
                    elif hyp_word_builder:
                        align_tok = AlignLabels.insertion

                    if align_tok:
                        # Add the remainder to the worklist
                        ref_word_span_next = (ref_word_index +
                                              len(ref_word_builder),
                                              ref_word_limit)
                        hyp_word_span_next = (hyp_word_index +
                                              len(hyp_word_builder),
                                              hyp_word_limit)
                        phone_align_next = phone_align.subsequence(
                            i, phone_align.length(), preserve_index=False)
                        worklist.append((ref_word_span_next,
                                         hyp_word_span_next, phone_align_next))

                        # "Commit" the current alignment
                        if align_tok in (AlignLabels.correct,
                                         AlignLabels.substitution):
                            alignment.append(align_tok)

                            # Check for syllable conflicts
                            # NOTE(review): `not ref_extra_syllable_word_index`
                            # also treats index 0 as "unset" -- confirm that an
                            # extra syllable on the first builder word is meant
                            # to be ignored here (same applies to the hyp case).
                            if not break_on_syllables or not ref_extra_syllable_word_index:
                                ref_aligned.append(' '.join(ref_word_builder))
                                ref_syllable_count = 0
                                hyp_syllable_count = 0
                            else:
                                ref_aligned.append(' '.join(ref_word_builder[
                                    0:ref_extra_syllable_word_index]))
                                # The remaining words are deletions
                                for word in ref_word_builder[
                                        ref_extra_syllable_word_index:]:
                                    alignment.append(AlignLabels.deletion)
                                    ref_aligned.append(word)
                                    hyp_aligned.append('')
                                ref_syllable_count = 0

                            if not break_on_syllables or not hyp_extra_syllable_word_index:
                                hyp_aligned.append(' '.join(hyp_word_builder))
                                ref_syllable_count = 0
                                hyp_syllable_count = 0
                            else:
                                hyp_aligned.append(' '.join(hyp_word_builder[
                                    0:hyp_extra_syllable_word_index]))
                                # The remaining words are insertions
                                # NOTE(review): unlike the deletion case above,
                                # hyp_syllable_count is reset INSIDE this loop
                                # rather than after it -- equivalent when the
                                # loop runs, but the asymmetry looks accidental.
                                for word in hyp_word_builder[
                                        hyp_extra_syllable_word_index:]:
                                    alignment.append(AlignLabels.insertion)
                                    ref_aligned.append('')
                                    hyp_aligned.append(word)
                                    hyp_syllable_count = 0

                            if align_tok == AlignLabels.substitution:
                                # Check if you need to rework this alignment.
                                if len(ref_word_builder) != len(
                                        hyp_word_builder):
                                    # Word count mismatch in the alignment span. Is there a possibility that we need to re-align this segment?
                                    ref_word_span_curr = (
                                        ref_word_index,
                                        ref_word_index + len(ref_word_builder))
                                    hyp_word_span_curr = (
                                        hyp_word_index,
                                        hyp_word_index + len(hyp_word_builder))
                                    phone_align_curr = phone_align.subsequence(
                                        0, i + 1, preserve_index=False)

                                    # Re-align just this span's phones to see
                                    # whether a tighter alignment exists.
                                    lev = Levenshtein.align(
                                        ref=phone_align_curr.s1_tokens(),
                                        hyp=phone_align_curr.s2_tokens(),
                                        reserve_list=PowerAligner.reserve_list,
                                        exclusive_sets=PowerAligner.
                                        exclusive_sets,
                                        weights=Levenshtein.wordAlignWeights
                                    )  #,
                                    #dist_penalty=PowerAligner.phoneDistPenalty, dist_penalty_set=Levenshtein.wordAlignWeights)

                                    phone_align_adjusted = lev.expandAlignCompact(
                                    )

                                    if phone_align_curr.align != phone_align_adjusted.align:
                                        # Looks like we need to redo the phone-to-word alignment.
                                        worklist.append((ref_word_span_curr,
                                                         hyp_word_span_curr,
                                                         phone_align_adjusted))
                                    else:
                                        commit_alignment = True
                                else:
                                    commit_alignment = True

                        elif align_tok == AlignLabels.deletion:
                            for word in ref_word_builder:
                                alignment.append(align_tok)
                                ref_aligned.append(word)
                                hyp_aligned.append('')

                            commit_alignment = True
                            ref_syllable_count = 0

                        elif align_tok == AlignLabels.insertion:
                            for word in hyp_word_builder:
                                alignment.append(align_tok)
                                ref_aligned.append('')
                                hyp_aligned.append(word)

                            commit_alignment = True
                            hyp_syllable_count = 0

                        if commit_alignment:
                            # Commit the alignment.
                            full_reference.extend(ref_aligned)
                            full_hypothesis.extend(hyp_aligned)
                            full_alignment.extend(alignment)
                            full_phone_align.append(
                                phone_align.subsequence(0,
                                                        i,
                                                        preserve_index=False))
                            ref_aligned = []
                            hyp_aligned = []
                            alignment = []
                        # Remainder was pushed to the worklist; restart scan.
                        break

                # Add words if word boundaries are reached.
                else:
                    if ref_type == TokType.WordBoundary:
                        ref_word_started = False
                        if hyp_type != TokType.WordBoundary and ref_word_builder and not hyp_word_builder:
                            # DELETION
                            # Ref word ended, but no hyp words have been added. Mark the current ref word(s) in the span as deletion errors.
                            # TODO: Dedupe this logic
                            for word in ref_word_builder:
                                alignment.append(AlignLabels.deletion)
                                ref_aligned.append(word)
                                hyp_aligned.append('')
                            ref_syllable_count = 0

                            # Commit the alignment.
                            full_reference.extend(ref_aligned)
                            full_hypothesis.extend(hyp_aligned)
                            full_alignment.extend(alignment)
                            full_phone_align.append(
                                phone_align.subsequence(0,
                                                        i,
                                                        preserve_index=False))

                            # Add the remainder to the worklist
                            ref_word_span_next = (ref_word_index +
                                                  len(ref_word_builder),
                                                  ref_word_limit)
                            hyp_word_span_next = (hyp_word_index +
                                                  len(hyp_word_builder),
                                                  hyp_word_limit)
                            # Re-align remaining ref phones against ALL hyp
                            # phones (hyp side consumed nothing yet).
                            lev = Levenshtein.align(
                                ref=[x for x in phone_align.s1[i:] if x],
                                hyp=[x for x in phone_align.s2 if x],
                                reserve_list=PowerAligner.reserve_list,
                                exclusive_sets=PowerAligner.exclusive_sets,
                                weights=Levenshtein.wordAlignWeights)  #,
                            #dist_penalty=PowerAligner.phoneDistPenalty, dist_penalty_set=Levenshtein.wordAlignWeights)
                            phone_align_next = lev.expandAlignCompact()

                            worklist.append(
                                (ref_word_span_next, hyp_word_span_next,
                                 phone_align_next))
                            break
                    elif ref_type == TokType.Phoneme and not ref_word_started:
                        # First phoneme of a new ref word: pull the next
                        # surface word into the span.
                        ref_word_started = True
                        try:
                            ref_word_item = ref_word_iter.__next__()
                            ref_word_builder.append(ref_word_item[1])
                        except StopIteration:
                            pass

                    if hyp_type == TokType.WordBoundary:
                        hyp_word_started = False
                        if ref_type != TokType.WordBoundary and hyp_word_builder and not ref_word_builder:
                            # INSERTION
                            # Hyp word ended, but no ref words have been added. Mark the current hyp word(s) in the span as insertion errors.
                            # TODO: Dedupe this logic
                            for word in hyp_word_builder:
                                alignment.append(AlignLabels.insertion)
                                ref_aligned.append('')
                                hyp_aligned.append(word)
                            hyp_syllable_count = 0

                            # Commit the alignment.
                            full_reference.extend(ref_aligned)
                            full_hypothesis.extend(hyp_aligned)
                            full_alignment.extend(alignment)
                            full_phone_align.append(
                                phone_align.subsequence(0,
                                                        i,
                                                        preserve_index=False))

                            # Add the remainder to the worklist
                            ref_word_span_next = (ref_word_index +
                                                  len(ref_word_builder),
                                                  ref_word_limit)
                            hyp_word_span_next = (hyp_word_index +
                                                  len(hyp_word_builder),
                                                  hyp_word_limit)
                            lev = Levenshtein.align(
                                ref=[x for x in phone_align.s1 if x],
                                hyp=[x for x in phone_align.s2[i:] if x],
                                reserve_list=PowerAligner.reserve_list,
                                exclusive_sets=PowerAligner.exclusive_sets,
                                weights=Levenshtein.wordAlignWeights)  #,
                            #dist_penalty=PowerAligner.phoneDistPenalty, dist_penalty_set=Levenshtein.wordAlignWeights)
                            phone_align_next = lev.expandAlignCompact()

                            worklist.append(
                                (ref_word_span_next, hyp_word_span_next,
                                 phone_align_next))
                            break
                    elif hyp_type == TokType.Phoneme and not hyp_word_started:
                        hyp_word_started = True
                        try:
                            hyp_word_item = hyp_word_iter.__next__()
                            hyp_word_builder.append(hyp_word_item[1])
                        except StopIteration:
                            pass

                # Check for syllable mismatches
                if ref_type == TokType.SyllableBoundary:
                    ref_syllable_count += 1
                if hyp_type == TokType.SyllableBoundary:
                    hyp_syllable_count += 1

                if (ref_type == TokType.SyllableBoundary == hyp_type
                        or ref_syllable_count == hyp_syllable_count):
                    # No syllable conflicts here!
                    ref_extra_syllable_word_index = None
                    hyp_extra_syllable_word_index = None
                elif (ref_type == TokType.SyllableBoundary
                      and not ref_extra_syllable_word_index
                      and TokType.checkAnnotation(
                          phone_align.s2[i - 1]) == TokType.WordBoundary):
                    # Extra syllable in hypothesis. We only care if the syllable immediately follows a word boundary.
                    # This is because this indicates that a new word is being formed, which may likely be an insertion in hyp.
                    ref_extra_syllable_word_index = len(ref_word_builder) - 1
                    # print ref_word_builder
                    # print 'Syllable/word mismatch at', i
                    # print 'Extra hyp word:', ref_word_builder[ref_extra_syllable_word_index]
                elif (hyp_type == TokType.SyllableBoundary
                      and not hyp_extra_syllable_word_index
                      and TokType.checkAnnotation(
                          phone_align.s2[i - 1]) == TokType.WordBoundary):
                    # This time there's an extra syllable in the ref, corresponding to a new ref word.
                    # NOTE(review): this branch mirrors the ref case above but
                    # still inspects phone_align.s2[i - 1]; the s1 side may have
                    # been intended here -- verify before changing.
                    hyp_extra_syllable_word_index = len(hyp_word_builder) - 1
                    # print hyp_word_builder
                    # print 'Syllable/word mismatch at', i
                    # print 'Extra ref word:', hyp_word_builder[hyp_extra_syllable_word_index]
        # Concatenate all phoneme alignments
        fp_align = full_phone_align[0]
        for expand_align in full_phone_align[1:]:
            fp_align.append_alignment(expand_align)

        return ExpandedAlignment(full_reference, full_hypothesis,
                                 full_alignment), fp_align