Example #1
0
    def test_matching(self):
        """
        Check n-gram alignments on the aligned words of "track_000000".

        The 3-gram matchings between the reference and the hypothesis are
        used to restrict both sequences to the span they cover. Alignments
        are then searched inside this span only, and shifted back to the
        indexes of the full sequences before being compared to the
        expected result.

        """
        aligned = self._alignerio.read_aligned(os.path.join(DATA, "track_000000"))
        wordalign = aligned[1]
        self.assertEqual(21, len(wordalign))

        ref = [u"好", u"感", u"啦 @", u"好似", u"梁", u"安", u"琪", u"咁", u"係", u"啦", u"係", u"我", u"以前", u"都", u"聽", u"過", u"佢", u"", u"節目", u"喀", u"覺得", u"佢", u"講", u"", u"非常之", u"膚", u"淺", u"及", u"幼稚", u"呀", u"哦", u"即", u"khut6", u"嘩", u"咁"]
        # Only the token and its score are kept from each aligned word.
        hyp = [(token, score) for (_start, _end, token, score) in wordalign]

        trigrams = Patterns()
        trigrams.set_ngram(3)
        matchings = trigrams.ngram_matchings(ref, hyp)

        # Lowest/highest matched indexes, in ref then in hyp: they delimit
        # the sub-sequences in which the alignments are searched.
        ref_indexes = [pair[0] for pair in matchings]
        hyp_indexes = [pair[1] for pair in matchings]
        minr = min(ref_indexes)
        maxr = max(ref_indexes)
        minh = min(hyp_indexes)
        maxh = max(hyp_indexes)
        newref = ref[minr:maxr + 1]
        newhyp = hyp[minh:maxh + 1]

        # 3-gram alignments inside the span; the indexes are relative to the
        # sub-sequences, so the offsets are added to get the full-sequence ones.
        trigrams = Patterns()
        trigrams.set_ngram(3)
        shifted3 = []
        for pair in trigrams.ngram_alignments(newref, newhyp):
            shifted3.append((pair[0] + minr, pair[1] + minh))

        self.assertEqual(shifted3, [(5, 5), (6, 6), (7, 7), (17, 16), (18, 17), (19, 18)])

        # Same search with 1-grams and a score threshold of 0.6.
        unigrams = Patterns()
        unigrams.set_score(0.6)
        unigrams.set_ngram(1)
        shifted1 = []
        for pair in unigrams.ngram_alignments(newref, newhyp):
            shifted1.append((pair[0] + minr, pair[1] + minh))

        self.assertEqual(shifted1, [(6, 6)])
Example #2
0
    def _fix_matchings_list(self, ref, hyp, N ):
        """
        Create the list of matches between ref and hyp.

        N-gram matchings are searched first. If an hypothesis index is
        matched several times, the search is done again with (N+2)-grams;
        that second result is kept only if each hypothesis index is matched
        at most once, otherwise it is discarded.
        If hyp has less than N items, 1-gram alignments (score 0.9, gap 1)
        found inside the span covered by the N-gram matchings are added.

        @param ref: the reference sequence, as given to Patterns.ngram_matchings
        @param hyp: the hypothesis sequence, as given to Patterns.ngram_matchings
        @param N: the n value of the n-grams of the first search
        @return: sorted list of (ref index, hyp index) tuples, without
        duplicates; an empty list if there is no N-gram matching.

        """
        pattern = Patterns()
        pattern.set_ngram( N )
        m3 = pattern.ngram_matchings( ref,hyp )

        if not m3:
            return []

        # Search for the lowest/highest values in ref/hyp:
        listref = [ v[0] for v in m3 ]
        listhyp = [ v[1] for v in m3 ]
        minr = min( listref )
        maxr = max( listref )
        newref = ref[minr:maxr+1]
        minh = min( listhyp )
        maxh = max( listhyp )
        newhyp = hyp[minh:maxh+1]

        # Do some hypothesis were found several times in the reference?
        if len(listhyp) > len(set(listhyp)):

            # Try again with longer n-grams, which are less ambiguous.
            pattern.set_ngram( N+2 )
            newm3 = pattern.ngram_matchings( ref,hyp )
            # The check must be done on the NEW matchings: doing it on m3
            # would repeat the test above and always discard the result.
            newlisthyp = [ v[1] for v in newm3 ]
            if len(newlisthyp) > len(set(newlisthyp)):
                newm3 = []

        else:
            newm3 = m3

        m1 = []
        if len(hyp) < N:
            pattern = Patterns()
            pattern.set_score(0.9)
            pattern.set_ngram(1)
            pattern.set_gap(1)
            # The alignments are indexes in newref/newhyp: shift them back
            # to indexes in ref/hyp, like the ones of newm3, before merging.
            m1 = [ (v[0]+minr, v[1]+minh) for v in pattern.ngram_alignments( newref,newhyp ) ]

        return sorted(set(m1+newm3))
Example #3
0
class TestPatterns(unittest.TestCase):
    """
    Unit tests of Patterns: accessors of the score, n-gram and gap options,
    and results of the n-gram alignments/matchings searches.

    """

    def setUp(self):
        # A new instance for each test, so that each one starts from the
        # default option values.
        self._patterns = Patterns()

    def test_set_score(self):
        """ The score is 1. by default, can be set, and rejects -1. and 2. """
        self.assertEqual( self._patterns.get_score(), 1. )
        self._patterns.set_score(0.9)
        self.assertEqual( self._patterns.get_score(), 0.9 )
        # One assertRaises block per call: in a shared block, the statements
        # after the first one that raises are never executed.
        for invalid in (-1., 2.):
            with self.assertRaises(ValueError):
                self._patterns.set_score(invalid)

    def test_set_ngram(self):
        """ The n of n-grams is 3 by default, can be set, and rejects 0. """
        self.assertEqual( self._patterns.get_ngram(), 3 )
        self._patterns.set_ngram(2)
        self.assertEqual( self._patterns.get_ngram(), 2 )
        # The setter under test is set_ngram (set_score was called here).
        with self.assertRaises(ValueError):
            self._patterns.set_ngram(0)
        # NOTE(review): an upper-bound check is missing. The highest value
        # accepted by set_ngram is not visible from this file - confirm it
        # against Patterns, then assert that the first invalid value raises.

    def test_set_gap(self):
        """ The gap is 2 by default, can be set, and rejects -1 and 5. """
        self.assertEqual( self._patterns.get_gap(), 2 )
        self._patterns.set_gap(1)
        self.assertEqual( self._patterns.get_gap(), 1 )
        # One assertRaises block per call, so that both values are checked.
        for invalid in (-1, 5):
            with self.assertRaises(ValueError):
                self._patterns.set_gap(invalid)

    def test_ngram_alignments(self):
        """ Alignments of a fixed ref/hyp for several ngram, gap and score values. """
        ref = [ "w0", "w1",  "w2",  "w3",  "w4",  "w5",  "w6",  "w7",  "w8",  "w9",  "w10",  "w11",  "w12" ]
        hyp = [ ("w0",0.8), ("w1",1),  ("w2",0.7),  ("wX",0.9),  ("w3",1),  ("w5",0.4),  ("w6",0.95),  ("wX",1),  ("w9",1) ]

        # 3-grams, then 2-grams, with the default score.
        self._patterns.set_ngram(3)
        self._patterns.set_gap(1)
        self.assertEqual([ (0,0), (1,1), (2,2) ], self._patterns.ngram_alignments(ref,hyp))

        self._patterns.set_ngram(2)
        self._patterns.set_gap(1)
        self.assertEqual([(0,0), (1,1), (2,2), (5,5), (6,6)], self._patterns.ngram_alignments(ref,hyp))

        # 1-grams with a score of 0.9, for each gap value from 0 to 2.
        self._patterns.set_ngram(1)
        self._patterns.set_score(0.9)

        self._patterns.set_gap(0)
        self.assertEqual([(1,1), (6,6)], self._patterns.ngram_alignments(ref,hyp))
        self._patterns.set_gap(1)
        self.assertEqual([(1,1), (3,4), (6,6), (9,8)], self._patterns.ngram_alignments(ref,hyp))
        self._patterns.set_gap(2)
        self.assertEqual([(1,1), (3,4), (6,6), (9,8)], self._patterns.ngram_alignments(ref,hyp))

        # 1-grams with a lower score of 0.5.
        self._patterns.set_score(0.5)
        self._patterns.set_gap(1)
        self.assertEqual([(0,0), (1,1), (2,2), (3,4), (6,6), (9,8)], self._patterns.ngram_alignments(ref,hyp))


    def test_ngram_alignments_repeats(self):
        """ Data with repeated tokens; only the option setters are exercised. """
        ref = [ "喀", "早晨", "係", "係", "係", "喀", "我" ]
        hyp = [ ("兩", 0.207), ("兩", 0.369), ("兩", 0.536), ("係", 0.165), ("係", 0.201), ("係", 0.193), ("係", 0.172), ("係", 0.182)]

        # TODO: no expectation is asserted yet. Call ngram_alignments(ref,hyp)
        # after each set_ngram below and compare to the expected alignments.
        self._patterns.set_gap(2)
        self._patterns.set_ngram(3)
        self._patterns.set_ngram(2)


    def test_ngram_matchings(self):
        """ Matchings of a fixed ref/hyp with the default options. """
        ref = [ "wa", "wb", "w0", "wa", "wX", "w0", "w1",  "w2",  "w3",  "w4",  "w5",  "w6",  "w7",  "w8",  "w9",  "w10",  "w11",  "w12", "wX", "wX" ]
        hyp = [ ("w0",0.8), ("w1",1),  ("w2",0.7),  ("wX",0.9),  ("w3",1),  ("w5",0.4),  ("w6",0.95),  ("wX",1),  ("w9",1) ]

        self.assertEqual([(5, 0), (6, 1), (7, 2)] , self._patterns.ngram_matchings(ref,hyp))