def test_acceptor_wo_arcs_from_openfst(self):
    """Parse OpenFst text containing only final-state lines (no arcs)."""
    # Completely empty input -> empty FSA.
    s1 = '''
    '''
    # Final state 0 with infinite weight (i.e. not final), state 1 with weight 0.1.
    s2 = '''
    0 Inf
    1 0.1
    '''
    s3 = '''
    0 Inf
    1 0.1
    2 0.2
    '''
    # NOTE(review): `device` is never used in the loop body; presumably the
    # FSAs were meant to be moved to each device -- confirm.
    for device in self.devices:
        fsa1 = k2.Fsa.from_openfst(s1)
        print("fsa1 = ", k2.to_str(fsa1))
        self.assertEqual('', k2.to_str(fsa1))

        fsa2 = k2.Fsa.from_openfst(s2)
        # OpenFst weights are negated on import; a super-final state is added.
        self.assertEqual(_remove_leading_spaces(k2.to_str(fsa2)),
                         "1 2 -1 -0.1\n2")
        arcs2 = fsa2.arcs.values()[:, :-1]
        assert torch.all(
            torch.eq(arcs2, torch.tensor([[1, 2, -1]], dtype=torch.int32)))

        fsa3 = k2.Fsa.from_openfst(s3)
        self.assertEqual(fsa3.arcs.dim0(), 4)
        self.assertEqual(_remove_leading_spaces(k2.to_str(fsa3)),
                         "1 3 -1 -0.1\n2 3 -1 -0.2\n3")
def test_transducer2_ragged2_from_str(self):
    """Parse a transducer with two aux-label columns and two ragged-label columns."""
    # Columns: src dst label aux aux2 [ragged1] [ragged2] score
    s = '''
    0 1 2 22 101 [] [] -1.2
    0 2 10 100 102 [] [] -2.2
    1 6 -1 16 103 [20 30] [40] -4.2
    1 3 3 33 104 [] [] -3.2
    2 6 -1 26 105 [] [] -5.2
    2 4 2 22 106 [] [] -6.2
    3 6 -1 36 107 [] [] -7.2
    5 0 1 50 108 [] [] -8.2
    6
    '''
    fsa = k2.Fsa.from_str(s,
                          aux_label_names=['aux_labels', 'aux_labels2'],
                          ragged_label_names=['ragged1', 'ragged2'])
    assert fsa.aux_labels.dtype == torch.int32
    assert fsa.aux_labels.device.type == 'cpu'
    assert isinstance(fsa.ragged1, k2.RaggedTensor)
    assert isinstance(fsa.ragged2, k2.RaggedTensor)
    assert torch.all(
        torch.eq(
            fsa.aux_labels,
            torch.tensor([22, 100, 16, 33, 26, 22, 36, 50],
                         dtype=torch.int32)))
    assert torch.all(
        torch.eq(
            fsa.aux_labels2,
            torch.tensor([101, 102, 103, 104, 105, 106, 107, 108],
                         dtype=torch.int32)))
    assert torch.allclose(
        fsa.scores,
        torch.tensor([-1.2, -2.2, -4.2, -3.2, -5.2, -6.2, -7.2, -8.2],
                     dtype=torch.float32))
    print("fsa.ragged1 = ", fsa.ragged1)
    print("fsa.ragged2 = ", fsa.ragged2)
    assert fsa.ragged1 == k2.RaggedTensor(
        '[ [] [] [20 30] [] [] [] [] [] ]')
    assert fsa.ragged2 == k2.RaggedTensor('[ [] [] [40] [] [] [] [] [] ]')
    # only aux_labels will be printed right now..
    expected_str = '''
    0 1 2 22 101 [ ] [ ] -1.2
    0 2 10 100 102 [ ] [ ] -2.2
    1 6 -1 16 103 [ 20 30 ] [ 40 ] -4.2
    1 3 3 33 104 [ ] [ ] -3.2
    2 6 -1 26 105 [ ] [ ] -5.2
    2 4 2 22 106 [ ] [ ] -6.2
    3 6 -1 36 107 [ ] [ ] -7.2
    5 0 1 50 108 [ ] [ ] -8.2
    6
    '''
    print("fsa = ", _remove_leading_spaces(k2.to_str(fsa)))
    assert _remove_leading_spaces(expected_str) == \
            _remove_leading_spaces(k2.to_str(fsa))
def test_acceptor_from_openfst(self):
    """Four equivalent ways to parse an OpenFst acceptor; weights are negated."""
    # Final states: 7 (weight 0) and 6 (weight -9.2).
    s = '''
    0 1 2 -1.2
    0 2 10 -2.2
    1 6 1 -3.2
    1 3 3 -4.2
    2 6 2 -5.2
    2 4 2 -6.2
    3 6 3 -7.2
    5 0 1 -8.2
    7
    6 -9.2
    '''
    for i in range(4):
        # All four call forms must be equivalent for an acceptor.
        if i == 0:
            fsa = k2.Fsa.from_openfst(s)
        elif i == 1:
            fsa = k2.Fsa.from_openfst(s, acceptor=True)
        elif i == 2:
            fsa = k2.Fsa.from_openfst(s, num_aux_labels=0)
        else:
            fsa = k2.Fsa.from_openfst(s, aux_label_names=[])

        # A super-final state 8 is added; -1 arcs lead to it.
        expected_str = '''
        0 1 2 -1.2
        0 2 10 -2.2
        1 6 1 -3.2
        1 3 3 -4.2
        2 6 2 -5.2
        2 4 2 -6.2
        3 6 3 -7.2
        5 0 1 -8.2
        6 8 -1 -9.2
        7 8 -1 0
        8
        '''
        assert _remove_leading_spaces(expected_str) == \
                _remove_leading_spaces(k2.to_str(fsa, openfst=True))

        arcs = fsa.arcs.values()[:, :-1]
        assert isinstance(arcs, torch.Tensor)
        assert arcs.dtype == torch.int32
        assert arcs.device.type == 'cpu'
        assert arcs.shape == (10, 3), 'there should be 10 arcs'
        assert torch.all(
            torch.eq(arcs[0], torch.tensor([0, 1, 2], dtype=torch.int32)))

        # Scores are the negated OpenFst weights.
        assert torch.allclose(
            fsa.scores,
            torch.tensor([1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2, 8.2, 9.2, 0],
                         dtype=torch.float32))

        fsa.scores *= -1

        assert torch.allclose(
            fsa.scores,
            torch.tensor(
                [-1.2, -2.2, -3.2, -4.2, -5.2, -6.2, -7.2, -8.2, -9.2, 0],
                dtype=torch.float32))
def test_acceptor_from_openfst_ragged1(self):
    """Parse an OpenFst acceptor that carries one ragged-label column."""
    s = '''
    0 1 2 [] -1.2
    0 2 10 [10] -2.2
    1 6 1 [] -3.2
    1 3 3 [11 12] -4.2
    2 6 2 [] -5.2
    2 4 2 [] -6.2
    3 6 3 [] -7.2
    5 0 1 [13] -8.2
    7
    6 -9.2
    '''
    fsa = k2.Fsa.from_openfst(s,
                              num_aux_labels=0,
                              ragged_label_names=['ragged'])
    # Super-final state 8 is added; new -1 arcs get empty ragged labels.
    expected_str = '''
    0 1 2 [ ] -1.2
    0 2 10 [ 10 ] -2.2
    1 6 1 [ ] -3.2
    1 3 3 [ 11 12 ] -4.2
    2 6 2 [ ] -5.2
    2 4 2 [ ] -6.2
    3 6 3 [ ] -7.2
    5 0 1 [ 13 ] -8.2
    6 8 -1 [ ] -9.2
    7 8 -1 [ ] 0
    8
    '''
    string = _remove_leading_spaces(k2.to_str(fsa, openfst=True))
    print("fsa=", string)
    assert _remove_leading_spaces(expected_str) == string
def transform(args):
    """Convert every lpz matrix listed in the scp file into an emission FST.

    Reads ``lpz.scp`` (or ``lpz_norm.scp`` when ``args.normalise`` is set),
    converts each utterance's numpy matrix to an FST via ``transform_utt``,
    writes one FST text file per utterance under ``efst``/``efst_norm``, and
    records an ``uttid -> fst-path`` scp file.
    """
    if args.normalise:
        normalise(args)
    symbols = symboletable(args)
    with open(os.path.join(args.data_directory, 'emission_symbols'),
              'w') as f:
        f.write(symbols)

    # All derived paths follow the same pattern: plain vs. `_norm` variants.
    lpzscp = os.path.join(
        args.data_directory,
        'lpz.scp') if not args.normalise else os.path.join(
            args.data_directory, 'lpz_norm.scp')
    fstpath = os.path.join(
        args.data_directory,
        'efst') if not args.normalise else os.path.join(
            args.data_directory, 'efst_norm')
    os.makedirs(fstpath, exist_ok=True)

    # Map uttid -> path of its saved lpz matrix.
    uttid2datafile = dict()
    with open(lpzscp) as f:
        for line in f:
            lc = line.strip().split()
            uttid2datafile[lc[0]] = lc[1]

    # BUG FIX: the normalised branch used to write 'efst_norm.fst', breaking
    # the `X` / `X_norm.scp` naming convention used everywhere else.
    fst_scp = os.path.join(
        args.data_directory,
        'efst.scp') if not args.normalise else os.path.join(
            args.data_directory, 'efst_norm.scp')
    with open(fst_scp, 'w') as f:
        fc = ''
        for uttid in uttid2datafile.keys():
            datafile = uttid2datafile[uttid]
            data = np.load(datafile)
            efst = transform_utt(data)
            efst_str = k2.to_str(efst)
            efst_file = os.path.join(fstpath, uttid)
            with open(efst_file, 'w') as g:
                g.write(efst_str)
            fc += '%s %s\n' % (uttid, efst_file)
        f.write(fc)
def lexicon_fst(args):
    """Build the lexicon FST that compresses repeated chars in the emission FST.

    Creates one small 3-state FST per non-epsilon symbol (consume the symbol
    once, absorb repetitions, then emit the final arc), takes the union of all
    of them and closes it. Writes ``lexicon.fst.pdf`` and ``lexicon.fst.txt``
    into ``args.data_directory`` and returns nothing.
    """
    symbols_str = symboletable(args)
    symbol_pairs = symbols_str.split('\n')
    num_noneps = len(symbol_pairs) - 1

    # Per-symbol FST: state 0 --sym/sym--> 1, self-loop 1 --sym/eps--> 1
    # (compression), then 1 --(-1)/(-1)--> 2 (final).
    template = '''
    0 1 %d %d 0.0
    1 1 %d 0 0.0
    1 2 -1 -1 0.0
    2
    '''
    # Index 0 (<eps>) deliberately has no FST.
    symbol2fst = [None] + [
        k2.Fsa.from_str(template % (i, i, i), acceptor=False)
        for i in range(1, num_noneps + 1)
    ]

    fst_vec = k2.create_fsa_vec(symbol2fst[1:])
    lexicon = k2.closure(k2.union(fst_vec))

    lexicon.draw(os.path.join(args.data_directory, 'lexicon.fst.pdf'),
                 title='lexicon')
    with open(os.path.join(args.data_directory, 'lexicon.fst.txt'),
              'w') as f:
        f.write(k2.to_str(lexicon))
def test_transducer_from_str(self):
    """Round-trip a transducer through from_str/to_str with default options."""
    # NOTE(review): another method with this exact name appears later in this
    # file; if both live in the same TestCase the later one shadows this one
    # -- confirm they belong to different classes.
    s = '''
    0 1 2 22 -1.2
    0 2 10 100 -2.2
    1 6 -1 16 -4.2
    1 3 3 33 -3.2
    2 6 -1 26 -5.2
    2 4 2 22 -6.2
    3 6 -1 36 -7.2
    5 0 1 50 -8.2
    6
    '''
    fsa = k2.Fsa.from_str(_remove_leading_spaces(s))
    assert fsa.aux_labels.dtype == torch.int32
    assert fsa.aux_labels.device.type == 'cpu'
    assert torch.allclose(
        fsa.aux_labels,
        torch.tensor([22, 100, 16, 33, 26, 22, 36, 50],
                     dtype=torch.int32))
    expected_str = '''
    0 1 2 22 -1.2
    0 2 10 100 -2.2
    1 6 -1 16 -4.2
    1 3 3 33 -3.2
    2 6 -1 26 -5.2
    2 4 2 22 -6.2
    3 6 -1 36 -7.2
    5 0 1 50 -8.2
    6
    '''
    assert _remove_leading_spaces(expected_str) == _remove_leading_spaces(
        k2.to_str(fsa))
def test(self):
    """Intersect two acceptors; check the result string and both gradients."""
    # For the symbol table (labels used below):
    #   <eps> 0, a 1, b 2, c 3
    # an FSA that recognizes a+(b|c)
    s = '''
    0 1 1 0.1
    1 1 1 0.2
    1 2 2 0.3
    1 3 3 0.4
    2 4 -1 0.5
    3 4 -1 0.6
    5
    '''
    a_fsa = k2.Fsa.from_str(s)
    a_fsa.requires_grad_(True)

    # an FSA that recognizes ab
    s = '''
    0 1 1 10
    1 2 2 20
    2 3 -1 30
    3
    '''
    b_fsa = k2.Fsa.from_str(s)
    b_fsa.requires_grad_(True)

    fsa = k2.intersect(a_fsa, b_fsa)
    assert len(fsa.shape) == 2

    # Intersection sums the matching arc scores.
    actual_str = k2.to_str(fsa)
    expected_str = '\n'.join(
        ['0 1 1 10.1', '1 2 2 20.3', '2 3 -1 30.5', '3'])
    assert actual_str.strip() == expected_str

    loss = fsa.scores.sum()
    loss.backward()
    # arc 0, 2, and 4 of a_fsa are kept in the final intersected FSA
    assert torch.allclose(
        a_fsa.scores.grad,
        torch.tensor([1, 0, 1, 0, 1, 0], dtype=torch.float32))
    assert torch.allclose(b_fsa.scores.grad,
                          torch.tensor([1, 1, 1], dtype=torch.float32))

    # if any of the input FSA is an FsaVec,
    # the outupt FSA is also an FsaVec.
    a_fsa.scores.grad = None
    b_fsa.scores.grad = None
    a_fsa = k2.create_fsa_vec([a_fsa])
    fsa = k2.intersect(a_fsa, b_fsa)
    assert len(fsa.shape) == 3
def test_treat_epsilon_specially_true(self):
    """Intersect with epsilon treated specially (CPU-only, arc-sorted inputs)."""
    # this version works only on CPU and requires
    # arc-sorted inputs
    # a_fsa recognizes `(1|3)?2*`
    s1 = '''
    0 1 3 0.0
    0 1 1 0.2
    0 1 0 0.1
    1 1 2 0.3
    1 2 -1 0.4
    2
    '''
    a_fsa = k2.Fsa.from_str(s1)
    a_fsa.requires_grad_(True)

    # b_fsa recognizes `1|2|5`
    s2 = '''
    0 1 5 0
    0 1 1 1
    0 1 2 2
    1 2 -1 3
    2
    '''
    b_fsa = k2.Fsa.from_str(s2)
    b_fsa.requires_grad_(True)

    # fsa recognizes 1|2
    fsa = k2.intersect(k2.arc_sort(a_fsa), k2.arc_sort(b_fsa))
    assert len(fsa.shape) == 2
    actual_str = k2.to_str(fsa)
    # The epsilon arc (label 0) of a_fsa is traversed "for free".
    expected_str = '\n'.join(
        ['0 1 0 0.1', '0 2 1 1.2', '1 2 2 2.3', '2 3 -1 3.4', '3'])
    assert actual_str.strip() == expected_str

    loss = fsa.scores.sum()
    (-loss).backward()
    # arc 1, 2, 3, and 4 of a_fsa are kept in the final intersected FSA
    assert torch.allclose(a_fsa.grad,
                          torch.tensor([0, -1, -1, -1, -1]).to(a_fsa.grad))
    # arc 1, 2, and 3 of b_fsa are kept in the final intersected FSA
    assert torch.allclose(b_fsa.grad,
                          torch.tensor([0, -1, -1, -1]).to(b_fsa.grad))

    # if any of the input FSA is an FsaVec,
    # the outupt FSA is also an FsaVec.
    a_fsa.scores.grad = None
    b_fsa.scores.grad = None
    a_fsa = k2.create_fsa_vec([a_fsa])
    fsa = k2.intersect(k2.arc_sort(a_fsa), k2.arc_sort(b_fsa))
    assert len(fsa.shape) == 3
def test_treat_epsilon_specially_false(self):
    """Intersect with epsilon as an ordinary symbol, on CPU and (if available) CUDA."""
    devices = [torch.device('cpu')]
    if torch.cuda.is_available():
        devices.append(torch.device('cuda'))
    for device in devices:
        # a_fsa recognizes `(0|1)2*`
        s1 = '''
        0 1 0 0.1
        0 1 1 0.2
        1 1 2 0.3
        1 2 -1 0.4
        2
        '''
        a_fsa = k2.Fsa.from_str(s1).to(device)
        a_fsa.requires_grad_(True)

        # b_fsa recognizes `1|2`
        s2 = '''
        0 1 1 1
        0 1 2 2
        1 2 -1 3
        2
        '''
        b_fsa = k2.Fsa.from_str(s2).to(device)
        b_fsa.requires_grad_(True)

        # fsa recognizes `1`
        fsa = k2.intersect(a_fsa, b_fsa, treat_epsilons_specially=False)

        assert len(fsa.shape) == 2
        actual_str = k2.to_str(fsa)
        expected_str = '\n'.join(['0 1 1 1.2', '1 2 -1 3.4', '2'])
        assert actual_str.strip() == expected_str

        loss = fsa.scores.sum()
        (-loss).backward()
        # arc 1 and 3 of a_fsa are kept in the final intersected FSA
        assert torch.allclose(a_fsa.grad,
                              torch.tensor([0, -1, 0, -1]).to(a_fsa.grad))

        # arc 0 and 2 of b_fsa are kept in the final intersected FSA
        assert torch.allclose(b_fsa.grad,
                              torch.tensor([-1, 0, -1]).to(b_fsa.grad))

        # if any of the input FSA is an FsaVec,
        # the outupt FSA is also an FsaVec.
        a_fsa.scores.grad = None
        b_fsa.scores.grad = None
        a_fsa = k2.create_fsa_vec([a_fsa])
        fsa = k2.intersect(a_fsa, b_fsa, treat_epsilons_specially=False)
        assert len(fsa.shape) == 3
def test_transducer2_from_str(self):
    """Parse a transducer with two aux-label columns, via two equivalent APIs."""
    # Columns: src dst label aux aux2 score
    s = '''
    0 1 2 22 101 -1.2
    0 2 10 100 102 -2.2
    1 6 -1 16 103 -4.2
    1 3 3 33 104 -3.2
    2 6 -1 26 105 -5.2
    2 4 2 22 106 -6.2
    3 6 -1 36 107 -7.2
    5 0 1 50 108 -8.2
    6
    '''
    for i in range(2):
        if i == 0:
            fsa = k2.Fsa.from_str(s, num_aux_labels=2)
        else:
            fsa = k2.Fsa.from_str(
                s, aux_label_names=['aux_labels', 'aux_labels2'])
        assert fsa.aux_labels.dtype == torch.int32
        assert fsa.aux_labels.device.type == 'cpu'
        assert torch.all(
            torch.eq(
                fsa.aux_labels,
                torch.tensor([22, 100, 16, 33, 26, 22, 36, 50],
                             dtype=torch.int32)))
        assert torch.all(
            torch.eq(
                fsa.aux_labels2,
                torch.tensor([101, 102, 103, 104, 105, 106, 107, 108],
                             dtype=torch.int32)))
        assert torch.allclose(
            fsa.scores,
            torch.tensor([-1.2, -2.2, -4.2, -3.2, -5.2, -6.2, -7.2, -8.2],
                         dtype=torch.float32))
        # only aux_labels will be printed right now..
        expected_str = '''
        0 1 2 22 -1.2
        0 2 10 100 -2.2
        1 6 -1 16 -4.2
        1 3 3 33 -3.2
        2 6 -1 26 -5.2
        2 4 2 22 -6.2
        3 6 -1 36 -7.2
        5 0 1 50 -8.2
        6
        '''
        assert _remove_leading_spaces(expected_str) == \
                _remove_leading_spaces(k2.to_str(fsa))
def test_transducer_from_openfst(self):
    """Parse an OpenFst transducer via three equivalent call forms."""
    # Final states: 7 (weight -9.2) and 6 (weight 0).
    s = '''
    0 1 2 22 -1.2
    0 2 10 100 -2.2
    1 6 1 16 -4.2
    1 3 3 33 -3.2
    2 6 2 26 -5.2
    2 4 2 22 -6.2
    3 6 3 36 -7.2
    5 0 1 50 -8.2
    7 -9.2
    6
    '''
    for i in range(3):
        if i == 0:
            fsa = k2.Fsa.from_openfst(s, acceptor=False)
        elif i == 1:
            fsa = k2.Fsa.from_openfst(s, num_aux_labels=1)
        else:
            fsa = k2.Fsa.from_openfst(s, aux_label_names=['aux_labels'])
        assert fsa.aux_labels.dtype == torch.int32
        assert fsa.aux_labels.device.type == 'cpu'
        # Arcs to the added super-final state get aux_label -1.
        assert torch.all(
            torch.eq(
                fsa.aux_labels,
                torch.tensor([22, 100, 16, 33, 26, 22, 36, 50, -1, -1],
                             dtype=torch.int32)))
        # Scores are the negated OpenFst weights.
        assert torch.allclose(
            fsa.scores,
            torch.tensor([1.2, 2.2, 4.2, 3.2, 5.2, 6.2, 7.2, 8.2, 0, 9.2],
                         dtype=torch.float32))
        expected_str = '''
        0 1 2 22 -1.2
        0 2 10 100 -2.2
        1 6 1 16 -4.2
        1 3 3 33 -3.2
        2 6 2 26 -5.2
        2 4 2 22 -6.2
        3 6 3 36 -7.2
        5 0 1 50 -8.2
        6 8 -1 -1 0
        7 8 -1 -1 -9.2
        8
        '''
        assert _remove_leading_spaces(expected_str) == \
                _remove_leading_spaces(k2.to_str(fsa, openfst=True))
def test_acceptor_from_str(self):
    """Round-trip an acceptor through from_str/to_str and check arcs/scores."""
    s = '''
    0 1 2 -1.2
    0 2 10 -2.2
    1 6 -1 -3.2
    1 3 3 -4.2
    2 6 -1 -5.2
    2 4 2 -6.2
    3 6 -1 -7.2
    5 0 1 -8.2
    6
    '''
    fsa = k2.Fsa.from_str(_remove_leading_spaces(s))
    expected_str = '''
    0 1 2 -1.2
    0 2 10 -2.2
    1 6 -1 -3.2
    1 3 3 -4.2
    2 6 -1 -5.2
    2 4 2 -6.2
    3 6 -1 -7.2
    5 0 1 -8.2
    6
    '''
    assert _remove_leading_spaces(expected_str) == _remove_leading_spaces(
        k2.to_str(fsa))
    # Drop the last (score) column; the rest is src/dst/label.
    arcs = fsa.arcs.values()[:, :-1]
    assert isinstance(arcs, torch.Tensor)
    assert arcs.dtype == torch.int32
    assert arcs.device.type == 'cpu'
    assert arcs.shape == (8, 3), 'there should be 8 arcs'
    assert torch.allclose(arcs[0],
                          torch.tensor([0, 1, 2], dtype=torch.int32))

    assert torch.allclose(
        fsa.scores,
        torch.tensor([-1.2, -2.2, -3.2, -4.2, -5.2, -6.2, -7.2, -8.2],
                     dtype=torch.float32))

    # Scores are a live view: scaling in place must be reflected.
    fsa.scores *= -1

    assert torch.allclose(
        fsa.scores,
        torch.tensor([1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2, 8.2],
                     dtype=torch.float32))
def gfst(args):
    '''Build a hard-coded demo grammar FST ("yyn") for debugging.

    This programme is for debugging only. Usually, for a different task we
    need a different G FST. Imagine we only have the symbols
        <eps> 0, <blank> 1, <unk> 2, n 3, y 4, <eos> 5
    and try to generate yyn, ynn, and 3-gram G FSTs.

    Writes ``G/yyn.pdf`` and ``G/yyn.fst.txt`` under ``args.data_directory``.
    '''
    symbols = symboletable(args)
    # Hand-written transducer accepting the "yyn"-style sequences; state 0
    # fans out via epsilon, state 7 leads to the final state 8.
    yyn = '''
    0 1 0 0 0.0
    0 2 0 0 0.0
    0 3 0 0 0.0
    0 4 0 0 0.0
    0 5 0 0 0.0
    0 6 0 0 0.0
    1 2 4 4 0.0
    1 6 1 0 0.0
    1 7 5 5 0.0
    2 3 1 0 0.0
    2 7 5 5 0.0
    3 4 4 4 0.0
    3 7 5 5 0.0
    4 5 1 0 0.0
    4 1 3 3 0.0
    4 7 5 5 0.0
    5 1 3 3 0.0
    5 7 5 5 0.0
    6 2 4 4 0.0
    6 7 5 5 0.0
    7 8 -1 -1 0.0
    8
    '''
    yyn_fst = k2.Fsa.from_str(yyn, acceptor=False)
    # yyn_fst.symbols = k2.SymbolTable.from_str(symbols)
    # yyn_fst.aux_symbols = k2.SymbolTable.from_str(symbols)
    gfst_dir = os.path.join(args.data_directory, 'G')
    os.makedirs(gfst_dir, exist_ok=True)
    yyn_fst.draw(os.path.join(gfst_dir, 'yyn.pdf'), 'yyn')
    with open(os.path.join(gfst_dir, 'yyn.fst.txt'), 'w') as f:
        f.write(k2.to_str(yyn_fst))
def test_acceptor_from_tensor(self):
    """Build an acceptor directly from an int32 arcs tensor."""
    # NOTE(review): this block uses `_k2._float_as_int` and `fsa.score`
    # while sibling tests use `_k2.float_as_int` and `fsa.scores` -- looks
    # like a different k2 API vintage; confirm against the installed version.
    # Scores are bit-cast to int32 so the whole arc fits in one int tensor.
    fsa_tensor = torch.tensor(
        [[0, 1, 2, _k2._float_as_int(-1.2)],
         [0, 2, 10, _k2._float_as_int(-2.2)],
         [1, 6, -1, _k2._float_as_int(-3.2)],
         [1, 3, 3, _k2._float_as_int(-4.2)],
         [2, 6, -1, _k2._float_as_int(-5.2)],
         [2, 4, 2, _k2._float_as_int(-6.2)],
         [3, 6, -1, _k2._float_as_int(-7.2)],
         [5, 0, 1, _k2._float_as_int(-8.2)]],
        dtype=torch.int32)
    fsa = k2.Fsa(fsa_tensor)
    expected_str = '''
    0 1 2 -1.2
    0 2 10 -2.2
    1 6 -1 -3.2
    1 3 3 -4.2
    2 6 -1 -5.2
    2 4 2 -6.2
    3 6 -1 -7.2
    5 0 1 -8.2
    6
    '''
    assert _remove_leading_spaces(expected_str) == _remove_leading_spaces(
        k2.to_str(fsa))
    arcs = fsa.arcs.values()[:, :-1]
    assert isinstance(arcs, torch.Tensor)
    assert arcs.dtype == torch.int32
    assert arcs.device.type == 'cpu'
    assert arcs.shape == (8, 3), 'there should be 8 arcs'
    assert torch.allclose(arcs[0],
                          torch.tensor([0, 1, 2], dtype=torch.int32))

    assert torch.allclose(
        fsa.score,
        torch.tensor([-1.2, -2.2, -3.2, -4.2, -5.2, -6.2, -7.2, -8.2],
                     dtype=torch.float32))

    # In-place scaling must be reflected by the live `score` view.
    fsa.score *= -1

    assert torch.allclose(
        fsa.score,
        torch.tensor([1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2, 8.2],
                     dtype=torch.float32))
def test_transducer_from_str(self):
    """Parse a one-aux-label transducer via three equivalent call forms."""
    # NOTE(review): a method with this exact name appears earlier in this
    # file; if both are in one TestCase this definition shadows the other
    # -- confirm they belong to different classes.
    s = '''
    0 1 2 22 -1.2
    0 2 10 100 -2.2
    1 6 -1 16 -4.2
    1 3 3 33 -3.2
    2 6 -1 26 -5.2
    2 4 2 22 -6.2
    3 6 -1 36 -7.2
    5 0 1 50 -8.2
    6
    '''
    for i in range(3):
        if i == 0:
            fsa = k2.Fsa.from_str(s, num_aux_labels=1)
        elif i == 1:
            fsa = k2.Fsa.from_str(s, acceptor=False)
        else:
            fsa = k2.Fsa.from_str(s, aux_label_names=['aux_labels'])
        assert fsa.aux_labels.dtype == torch.int32
        assert fsa.aux_labels.device.type == 'cpu'
        assert torch.all(
            torch.eq(
                fsa.aux_labels,
                torch.tensor([22, 100, 16, 33, 26, 22, 36, 50],
                             dtype=torch.int32)))
        assert torch.allclose(
            fsa.scores,
            torch.tensor([-1.2, -2.2, -4.2, -3.2, -5.2, -6.2, -7.2, -8.2],
                         dtype=torch.float32))
        expected_str = '''
        0 1 2 22 -1.2
        0 2 10 100 -2.2
        1 6 -1 16 -4.2
        1 3 3 33 -3.2
        2 6 -1 26 -5.2
        2 4 2 22 -6.2
        3 6 -1 36 -7.2
        5 0 1 50 -8.2
        6
        '''
        assert _remove_leading_spaces(expected_str) == \
                _remove_leading_spaces(k2.to_str(fsa))
def test(self):
    """Check that arc_sort orders arcs by label and keeps gradients flowing."""
    s = '''
    0 1 2 0.1
    0 1 1 0.2
    1 2 -1 0.3
    2
    '''
    src = k2.Fsa.from_str(s)
    src.requires_grad_(True)

    result = k2.arc_sort(src)
    # Arcs leaving state 0 must now appear in label order (1 before 2).
    expected_str = '\n'.join(['0 1 1 0.2', '0 1 2 0.1', '1 2 -1 0.3', '2'])
    assert k2.to_str(result).strip() == expected_str

    # Average of two sorted-arc scores; each contributes grad 0.5 to the
    # corresponding arc of the *unsorted* input.
    loss = (result.scores[1] + result.scores[2]) / 2
    loss.backward()
    assert torch.allclose(src.scores.grad,
                          torch.tensor([0.5, 0, 0.5], dtype=torch.float32))
class TestConnect(unittest.TestCase):
    """Tests for k2.connect (removal of non-accessible/non-coaccessible states)."""

    # BUG FIX: this code used to sit directly in the class body, so it ran at
    # import time and was never collected/reported as a unittest test.
    def test(self):
        """Connect an FSA with a dead state and check output and gradients."""
        # State 2 (reached via arc 1) cannot reach the final state and state 3
        # is unreachable, so arcs 1 and 3 must be dropped by connect().
        s = '''
        0 1 1 0.1
        0 2 2 0.2
        1 4 -1 0.3
        3 4 -1 0.4
        4
        '''
        fsa = k2.Fsa.from_str(s)
        fsa.requires_grad_(True)

        expected_str = '\n'.join(['0 1 1 0.1', '1 2 -1 0.3', '2'])
        connected_fsa = k2.connect(fsa)
        actual_str = k2.to_str(connected_fsa)
        assert actual_str.strip() == expected_str

        # Only the surviving arcs (0 and 2) receive gradient.
        loss = connected_fsa.scores.sum()
        loss.backward()
        assert torch.allclose(fsa.scores.grad,
                              torch.tensor([1, 0, 1, 0],
                                           dtype=torch.float32))
def test_transducer_from_tensor(self):
    """Build a transducer from arcs + aux_labels tensors on each device."""
    for device in self.devices:
        # Scores are bit-cast to int32 so the arc tensor stays integral.
        fsa_tensor = torch.tensor(
            [[0, 1, 2, _k2.float_as_int(-1.2)],
             [0, 2, 10, _k2.float_as_int(-2.2)],
             [1, 6, -1, _k2.float_as_int(-4.2)],
             [1, 3, 3, _k2.float_as_int(-3.2)],
             [2, 6, -1, _k2.float_as_int(-5.2)],
             [2, 4, 2, _k2.float_as_int(-6.2)],
             [3, 6, -1, _k2.float_as_int(-7.2)],
             [5, 0, 1, _k2.float_as_int(-8.2)]],
            dtype=torch.int32).to(device)
        aux_labels_tensor = torch.tensor([22, 100, 16, 33, 26, 22, 36, 50],
                                         dtype=torch.int32).to(device)
        fsa = k2.Fsa(fsa_tensor, aux_labels_tensor)
        assert fsa.aux_labels.dtype == torch.int32
        assert fsa.aux_labels.device.type == device.type
        assert torch.all(
            torch.eq(
                fsa.aux_labels,
                torch.tensor([22, 100, 16, 33, 26, 22, 36, 50],
                             dtype=torch.int32).to(device)))
        assert torch.allclose(
            fsa.scores,
            torch.tensor([-1.2, -2.2, -4.2, -3.2, -5.2, -6.2, -7.2, -8.2],
                         dtype=torch.float32,
                         device=device))
        expected_str = '''
        0 1 2 22 -1.2
        0 2 10 100 -2.2
        1 6 -1 16 -4.2
        1 3 3 33 -3.2
        2 6 -1 26 -5.2
        2 4 2 22 -6.2
        3 6 -1 36 -7.2
        5 0 1 50 -8.2
        6
        '''
        assert _remove_leading_spaces(expected_str) == \
                _remove_leading_spaces(k2.to_str(fsa))
def test_transducer_from_tensor(self):
    """Build a transducer from arcs + aux_labels tensors (local device list)."""
    # NOTE(review): near-duplicate of another test_transducer_from_tensor in
    # this file; if both are in one TestCase the later one shadows the other
    # -- confirm they belong to different classes.
    devices = [torch.device('cpu')]
    if torch.cuda.is_available():
        devices.append(torch.device('cuda', 0))
    for device in devices:
        fsa_tensor = torch.tensor(
            [[0, 1, 2, _k2.float_as_int(-1.2)],
             [0, 2, 10, _k2.float_as_int(-2.2)],
             [1, 6, -1, _k2.float_as_int(-4.2)],
             [1, 3, 3, _k2.float_as_int(-3.2)],
             [2, 6, -1, _k2.float_as_int(-5.2)],
             [2, 4, 2, _k2.float_as_int(-6.2)],
             [3, 6, -1, _k2.float_as_int(-7.2)],
             [5, 0, 1, _k2.float_as_int(-8.2)]],
            dtype=torch.int32).to(device)
        aux_labels_tensor = torch.tensor([22, 100, 16, 33, 26, 22, 36, 50],
                                         dtype=torch.int32).to(device)
        fsa = k2.Fsa(fsa_tensor, aux_labels_tensor)
        assert fsa.aux_labels.dtype == torch.int32
        assert fsa.aux_labels.device.type == device.type
        assert torch.allclose(
            fsa.aux_labels,
            torch.tensor([22, 100, 16, 33, 26, 22, 36, 50],
                         dtype=torch.int32).to(device))
        expected_str = '''
        0 1 2 22 -1.2
        0 2 10 100 -2.2
        1 6 -1 16 -4.2
        1 3 3 33 -3.2
        2 6 -1 26 -5.2
        2 4 2 22 -6.2
        3 6 -1 36 -7.2
        5 0 1 50 -8.2
        6
        '''
        assert _remove_leading_spaces(
            expected_str) == _remove_leading_spaces(k2.to_str(fsa))
def test_acceptor_wo_arcs_from_str(self):
    """from_str on inputs with no arcs: empty, malformed, final-state-only."""
    # Empty input -> empty FSA.
    s1 = '''
    '''
    # A two-field line is neither an arc (4 fields) nor a final state
    # (1 field) -> must raise.
    s2 = '''
    0 1
    '''
    # A lone final-state line -> zero-state/zero-arc FSA.
    s3 = '''
    1
    '''
    # NOTE(review): `device` is never used in the loop body -- confirm
    # whether the parsed FSAs were meant to be moved to each device.
    for device in self.devices:
        fsa1 = k2.Fsa.from_str(s1)
        self.assertEqual(k2.to_str(fsa1), '')
        with self.assertRaises(ValueError):
            _ = k2.Fsa.from_str(s2)
        fsa3 = k2.Fsa.from_str(s3)
        self.assertEqual(fsa3.arcs.dim0(), 0)
def test_transducer_from_openfst(self):
    """Parse an OpenFst transducer with acceptor=False; aux labels of added arcs are 0."""
    # Final states: 7 (weight -9.2) and 6 (weight 0).
    s = '''
    0 1 2 22 -1.2
    0 2 10 100 -2.2
    1 6 1 16 -4.2
    1 3 3 33 -3.2
    2 6 2 26 -5.2
    2 4 2 22 -6.2
    3 6 3 36 -7.2
    5 0 1 50 -8.2
    7 -9.2
    6
    '''
    fsa = k2.Fsa.from_openfst(_remove_leading_spaces(s), acceptor=False)
    assert fsa.aux_labels.dtype == torch.int32
    assert fsa.aux_labels.device.type == 'cpu'
    # In this API version, arcs to the super-final state get aux_label 0.
    assert torch.allclose(
        fsa.aux_labels,
        torch.tensor([22, 100, 16, 33, 26, 22, 36, 50, 0, 0],
                     dtype=torch.int32))
    # The `-0` below is intentional: negating the 0-weight final state.
    expected_str = '''
    0 1 2 22 -1.2
    0 2 10 100 -2.2
    1 6 1 16 -4.2
    1 3 3 33 -3.2
    2 6 2 26 -5.2
    2 4 2 22 -6.2
    3 6 3 36 -7.2
    5 0 1 50 -8.2
    6 8 -1 0 -0
    7 8 -1 0 -9.2
    8
    '''
    assert _remove_leading_spaces(expected_str) == _remove_leading_spaces(
        k2.to_str(fsa, openfst=True))
def test(self):
    """intersect_device on an FsaVec with and without an identity b_to_a_map."""
    devices = [torch.device('cpu')]
    if torch.cuda.is_available():
        devices.append(torch.device('cuda'))
    for device in devices:
        # Cover all four combinations of map shape x sorted matching.
        for use_identity_map, sorted_match_a in [(True, True),
                                                 (False, True),
                                                 (True, False),
                                                 (False, False)]:
            # recognizes (0|1)(0|2)
            s1 = '''
            0 1 0 0.1
            0 1 1 0.2
            1 2 0 0.4
            1 2 2 0.3
            2 3 -1 0.5
            3
            '''
            # recognizes 02*
            s2 = '''
            0 1 0 1
            1 1 2 2
            1 2 -1 3
            2
            '''
            # recognizes 1*0
            s3 = '''
            0 0 1 10
            0 1 0 20
            1 2 -1 30
            2
            '''
            a_fsa = k2.Fsa.from_str(s1).to(device)
            b_fsa_1 = k2.Fsa.from_str(s2).to(device)
            b_fsa_2 = k2.Fsa.from_str(s3).to(device)
            a_fsa.requires_grad_(True)
            b_fsa_1.requires_grad_(True)
            b_fsa_2.requires_grad_(True)
            b_fsas = k2.create_fsa_vec([b_fsa_1, b_fsa_2])
            if use_identity_map:
                a_fsas = k2.create_fsa_vec([a_fsa, a_fsa])
                b_to_a_map = torch.tensor([0, 1],
                                          dtype=torch.int32).to(device)
            else:
                a_fsas = k2.create_fsa_vec([a_fsa])
                b_to_a_map = torch.tensor([0, 0],
                                          dtype=torch.int32).to(device)
            c_fsas = k2.intersect_device(a_fsas, b_fsas, b_to_a_map,
                                         sorted_match_a)
            assert c_fsas.shape == (2, None, None)
            c_fsas = k2.connect(c_fsas.to('cpu'))
            # c_fsas[0] recognizes: 02
            # c_fsas[1] recognizes: 10
            actual_str_0 = k2.to_str(c_fsas[0])
            expected_str_0 = '\n'.join(
                ['0 1 0 1.1', '1 2 2 2.3', '2 3 -1 3.5', '3'])
            assert actual_str_0.strip() == expected_str_0
            actual_str_1 = k2.to_str(c_fsas[1])
            expected_str_1 = '\n'.join(
                ['0 1 1 10.2', '1 2 0 20.4', '2 3 -1 30.5', '3'])
            assert actual_str_1.strip() == expected_str_1

            loss = c_fsas.scores.sum()
            (-loss).backward()
            # Arc 4 of a_fsa participates in both result FSAs -> grad -2.
            assert torch.allclose(
                a_fsa.grad,
                torch.tensor([-1, -1, -1, -1, -2]).to(a_fsa.grad))
            assert torch.allclose(
                b_fsa_1.grad,
                torch.tensor([-1, -1, -1]).to(b_fsa_1.grad))
            assert torch.allclose(
                b_fsa_2.grad,
                torch.tensor([-1, -1, -1]).to(b_fsa_2.grad))
def lexicon_fst_whole(args):
    """Build the whole-lexicon FST (single FST, not a union of closures).

    For each non-epsilon symbol i: an arc 0->i consuming/emitting i, a
    self-loop i->i absorbing repeats, an arc i->(num_noneps+1) emitting the
    final -1, and an epsilon arc i->0 to restart. Lines are sorted by source
    state as k2.Fsa.from_str requires. Writes ``lexicon.newfst.pdf`` and
    ``lexicon.newfst.txt`` into ``args.data_directory``.

    By lexicon fst, we compress the repeated chars in emission fst.
    """
    symbols_str = symboletable(args)
    symbol_pairs = symbols_str.split('\n')
    num_noneps = len(symbol_pairs) - 1
    # (Removed an unused `count` local that was never read.)
    s = ''
    for i in range(1, num_noneps + 1):
        s += '''
        0 %d %d %d 0.0
        %d %d %d 0 0.0
        %d %d -1 -1 0.0
        %d 0 0 0 0.0''' % (i, i, i, i, i, i, i, num_noneps + 1, i)
    slines = s.strip().split('\n')

    def extract_first_index(line):
        # Source state of an arc line; used as the sort key.
        line_content = line.strip().split()
        return int(line_content[0])

    # Stable sort groups arcs by source state while preserving per-state order.
    slines = sorted(slines, key=extract_first_index)
    s = '\n'.join(slines)
    # Append the final-state line.
    s += '\n%d\n' % (num_noneps + 1)
    # Debug artifact: dump the raw FST text in the working directory.
    with open('lex.txt', 'w') as f:
        f.write(s)
    g = k2.Fsa.from_str(s, acceptor=False)
    g.draw(os.path.join(args.data_directory, 'lexicon.newfst.pdf'),
           title='lexicon')
    with open(os.path.join(args.data_directory, 'lexicon.newfst.txt'),
              'w') as f:
        f.write(k2.to_str(g))
def test_transducer3_from_openfst(self):
    """Parse an OpenFst transducer with three aux-label columns."""
    # Columns: src dst label aux aux2 aux3 weight.
    # Final states: 7 (weight -9.2) and 6 (weight 0).
    s = '''
    0 1 2 22 33 44 -1.2
    0 2 10 100 101 102 -2.2
    1 6 1 16 17 18 -4.2
    1 3 3 33 34 35 -3.2
    2 6 2 26 27 28 -5.2
    2 4 2 22 23 24 -6.2
    3 6 3 36 37 38 -7.2
    5 0 1 50 51 52 -8.2
    7 -9.2
    6
    '''
    for i in range(2):
        if i == 0:
            fsa = k2.Fsa.from_openfst(s, num_aux_labels=3)
        else:
            fsa = k2.Fsa.from_openfst(s,
                                      aux_label_names=[
                                          'aux_labels', 'aux_labels2',
                                          'aux_labels3'
                                      ])
        assert fsa.aux_labels.dtype == torch.int32
        assert fsa.aux_labels.device.type == 'cpu'
        # Arcs to the added super-final state get aux labels -1.
        assert torch.all(
            torch.eq(
                fsa.aux_labels,
                torch.tensor([22, 100, 16, 33, 26, 22, 36, 50, -1, -1],
                             dtype=torch.int32)))
        assert fsa.aux_labels2.dtype == torch.int32
        assert fsa.aux_labels2.device.type == 'cpu'
        assert torch.all(
            torch.eq(
                fsa.aux_labels2,
                torch.tensor([33, 101, 17, 34, 27, 23, 37, 51, -1, -1],
                             dtype=torch.int32)))
        assert fsa.aux_labels3.dtype == torch.int32
        assert fsa.aux_labels3.device.type == 'cpu'
        assert torch.all(
            torch.eq(
                fsa.aux_labels3,
                torch.tensor([44, 102, 18, 35, 28, 24, 38, 52, -1, -1],
                             dtype=torch.int32)))
        # Scores are the negated OpenFst weights.
        assert torch.allclose(
            fsa.scores,
            torch.tensor([1.2, 2.2, 4.2, 3.2, 5.2, 6.2, 7.2, 8.2, 0, 9.2],
                         dtype=torch.float32))
        # only aux_labels will be printed right now..
        expected_str = '''
        0 1 2 22 -1.2
        0 2 10 100 -2.2
        1 6 1 16 -4.2
        1 3 3 33 -3.2
        2 6 2 26 -5.2
        2 4 2 22 -6.2
        3 6 3 36 -7.2
        5 0 1 50 -8.2
        6 8 -1 -1 0
        7 8 -1 -1 -9.2
        8
        '''
        assert _remove_leading_spaces(expected_str) == \
                _remove_leading_spaces(k2.to_str(fsa, openfst=True))