def test_matching(self):
    """Check n-gram anchoring of a reference on the aligned "track_000000".

    The 3-gram matchings between the whole reference and the hypothesis
    delimit a window in both sequences; alignments are then searched
    inside that window only and shifted back to absolute indexes before
    being compared with the expected pairs.
    """
    aligned_words = self._alignerio.read_aligned(
        os.path.join(DATA, "track_000000"))[1]
    self.assertEqual(len(aligned_words), 21)

    ref = [
        u"好", u"感", u"啦 @", u"好似", u"梁", u"安", u"琪", u"咁", u"係",
        u"啦", u"係", u"我", u"以前", u"都", u"聽", u"過", u"佢", u"",
        u"節目", u"喀", u"覺得", u"佢", u"講", u"", u"非常之", u"膚", u"淺",
        u"及", u"幼稚", u"呀", u"哦", u"即", u"khut6", u"嘩", u"咁",
    ]
    # The hypothesis keeps only the token and its score.
    hyp = [(tok, conf) for (_begin, _end, tok, conf) in aligned_words]

    # Anchors found with 3-grams over the complete sequences.
    trigram = Patterns()
    trigram.set_ngram(3)
    anchors = trigram.ngram_matchings(ref, hyp)

    # Window of the reference covered by the anchors.
    ref_lo = min([pair[0] for pair in anchors])
    ref_hi = max([pair[0] for pair in anchors])
    ref_window = ref[ref_lo:ref_hi + 1]

    # Window of the hypothesis covered by the anchors.
    hyp_lo = min([pair[1] for pair in anchors])
    hyp_hi = max([pair[1] for pair in anchors])
    hyp_window = hyp[hyp_lo:hyp_hi + 1]

    def to_absolute(pairs):
        # Indexes found inside the windows are relative to their starts.
        return [(pair[0] + ref_lo, pair[1] + hyp_lo) for pair in pairs]

    # 3-gram alignments restricted to the windows.
    trigram = Patterns()
    trigram.set_ngram(3)
    in_window = trigram.ngram_alignments(ref_window, hyp_window)
    self.assertEqual(
        to_absolute(in_window),
        [(5, 5), (6, 6), (7, 7), (17, 16), (18, 17), (19, 18)])

    # 1-gram alignments restricted to the windows, score set to 0.6.
    unigram = Patterns()
    unigram.set_score(0.6)
    unigram.set_ngram(1)
    in_window = unigram.ngram_alignments(ref_window, hyp_window)
    self.assertEqual(to_absolute(in_window), [(6, 6)])
class TestPatterns(unittest.TestCase):
    """Unit tests of Patterns: setters, n-gram alignments and matchings."""

    def setUp(self):
        """Create a fresh Patterns instance before each test."""
        self._patterns = Patterns()

    def test_set_score(self):
        """Check the default score, a valid update and invalid values."""
        self.assertEqual(self._patterns.get_score(), 1.)
        self._patterns.set_score(0.9)
        self.assertEqual(self._patterns.get_score(), 0.9)
        # One context per call: the first exception leaves the "with"
        # block, so a second call placed in the same block never runs.
        with self.assertRaises(ValueError):
            self._patterns.set_score(-1.)
        with self.assertRaises(ValueError):
            self._patterns.set_score(2.)

    def test_set_ngram(self):
        """Check the default n-gram size, a valid update and an invalid one."""
        self.assertEqual(self._patterns.get_ngram(), 3)
        self._patterns.set_ngram(2)
        self.assertEqual(self._patterns.get_ngram(), 2)
        # The setter under test is set_ngram, not set_score.
        with self.assertRaises(ValueError):
            self._patterns.set_ngram(0)
        # NOTE(review): the largest accepted n is not visible from this
        # file, so no upper-bound check is asserted here - add one once
        # the limit is confirmed against Patterns.set_ngram.

    def test_set_gap(self):
        """Check the default gap, a valid update and invalid values."""
        self.assertEqual(self._patterns.get_gap(), 2)
        self._patterns.set_gap(1)
        self.assertEqual(self._patterns.get_gap(), 1)
        # One context per call, see test_set_score.
        with self.assertRaises(ValueError):
            self._patterns.set_gap(-1)
        with self.assertRaises(ValueError):
            self._patterns.set_gap(5)

    def test_ngram_alignments(self):
        """Check alignments for several n-gram sizes, scores and gaps."""
        ref = ["w0", "w1", "w2", "w3", "w4", "w5", "w6", "w7", "w8", "w9",
               "w10", "w11", "w12"]
        hyp = [("w0", 0.8), ("w1", 1), ("w2", 0.7), ("wX", 0.9), ("w3", 1),
               ("w5", 0.4), ("w6", 0.95), ("wX", 1), ("w9", 1)]

        self._patterns.set_ngram(3)
        self._patterns.set_gap(1)
        self.assertEqual([(0, 0), (1, 1), (2, 2)],
                         self._patterns.ngram_alignments(ref, hyp))

        self._patterns.set_ngram(2)
        self._patterns.set_gap(1)
        self.assertEqual([(0, 0), (1, 1), (2, 2), (5, 5), (6, 6)],
                         self._patterns.ngram_alignments(ref, hyp))

        self._patterns.set_ngram(1)
        self._patterns.set_score(0.9)
        self._patterns.set_gap(0)
        self.assertEqual([(1, 1), (6, 6)],
                         self._patterns.ngram_alignments(ref, hyp))

        self._patterns.set_gap(1)
        self.assertEqual([(1, 1), (3, 4), (6, 6), (9, 8)],
                         self._patterns.ngram_alignments(ref, hyp))

        self._patterns.set_gap(2)
        self.assertEqual([(1, 1), (3, 4), (6, 6), (9, 8)],
                         self._patterns.ngram_alignments(ref, hyp))

        self._patterns.set_score(0.5)
        self._patterns.set_gap(1)
        self.assertEqual([(0, 0), (1, 1), (2, 2), (3, 4), (6, 6), (9, 8)],
                         self._patterns.ngram_alignments(ref, hyp))

    def test_ngram_alignments_repeats(self):
        """Prepare a case where the same token is repeated many times."""
        ref = ["喀", "早晨", "係", "係", "係", "喀", "我"]
        hyp = [("兩", 0.207), ("兩", 0.369), ("兩", 0.536), ("係", 0.165),
               ("係", 0.201), ("係", 0.193), ("係", 0.172), ("係", 0.182)]
        # NOTE(review): nothing is asserted yet. The alignments of ref/hyp
        # were only printed (for n=3 and n=2) in a former version; the
        # expected values still have to be pinned down.
        self._patterns.set_gap(2)
        self._patterns.set_ngram(3)
        self._patterns.set_ngram(2)

    def test_ngram_matchings(self):
        """Check the matchings found with the default settings."""
        ref = ["wa", "wb", "w0", "wa", "wX", "w0", "w1", "w2", "w3", "w4",
               "w5", "w6", "w7", "w8", "w9", "w10", "w11", "w12", "wX", "wX"]
        hyp = [("w0", 0.8), ("w1", 1), ("w2", 0.7), ("wX", 0.9), ("w3", 1),
               ("w5", 0.4), ("w6", 0.95), ("wX", 1), ("w9", 1)]
        self.assertEqual([(5, 0), (6, 1), (7, 2)],
                         self._patterns.ngram_matchings(ref, hyp))
def setUp(self):
    """Give each test its own Patterns instance, stored in self._patterns."""
    self._patterns = Patterns()
def _fix_matchings_list(self, ref, hyp, N):
    """Create the list of matches between ref and hyp.

    N-gram matchings are searched first. If a hypothesis index is matched
    at several reference positions, the search is done again with
    (N+2)-grams and that result is kept only if it is unambiguous.
    When hyp holds fewer than N items, 1-gram alignments (score 0.9,
    gap 1) found inside the window delimited by the N-gram matchings are
    added.

    :param ref: reference sequence, as accepted by Patterns.ngram_matchings
    :param hyp: hypothesis sequence, as accepted by Patterns.ngram_matchings
    :param N: size of the n-grams of the first search
    :returns: sorted list of unique (ref index, hyp index) tuples, or an
        empty list if no N-gram matching was found

    """
    pattern = Patterns()
    pattern.set_ngram(N)
    matches = pattern.ngram_matchings(ref, hyp)
    if not matches:
        return []

    # Lowest/highest matched indexes in ref and in hyp.
    ref_idx = [m[0] for m in matches]
    hyp_idx = [m[1] for m in matches]
    min_r, max_r = min(ref_idx), max(ref_idx)
    min_h, max_h = min(hyp_idx), max(hyp_idx)

    # Were some hypothesis items found several times in the reference?
    if len(hyp_idx) > len(set(hyp_idx)):
        pattern.set_ngram(N + 2)
        kept = pattern.ngram_matchings(ref, hyp)
        # The ambiguity test must look at the NEW matchings: testing the
        # first ones again would always discard the stricter result.
        strict_hyp_idx = [m[1] for m in kept]
        if len(strict_hyp_idx) > len(set(strict_hyp_idx)):
            kept = []
    else:
        kept = matches

    unigrams = []
    if len(hyp) < N:
        ref_window = ref[min_r:max_r + 1]
        hyp_window = hyp[min_h:max_h + 1]
        pattern = Patterns()
        pattern.set_score(0.9)
        pattern.set_ngram(1)
        pattern.set_gap(1)
        in_window = pattern.ngram_alignments(ref_window, hyp_window)
        # Indexes are relative to the windows: shift them back so they
        # are comparable with the absolute indexes of the matchings.
        unigrams = [(m[0] + min_r, m[1] + min_h) for m in in_window]

    return sorted(set(unigrams + kept))