forked from peterdrysdale/bellbird_extras
/
decodelex.py
executable file
·133 lines (105 loc) · 3.86 KB
/
decodelex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/env python3
# (c) The contributors to Bellbird
# This code is licensed GPLv2+
# This script decodes a Bellbird lexicon file
# Usage: decodelex.py fname_compressed_lex fname_cmu_lex_entries.c
# The script writes the uncompressed dictionary to a file named 'dict'
import sys # for argv
import re # for regex
import symbols # module for symbol table functions
def find_sym_freq(lex):
    """Count how often each symbol value occurs in the lexicon lines.

    lex is an iterable of lines, each line being an iterable of values
    that int() accepts (for example the decimal strings produced by
    get_lex_as_lists).

    Returns a list of 256 counters where element n is the number of
    times the value n was seen.  A value outside 0..255 raises
    IndexError (negative values down to -256 wrap around, as with any
    list index).
    """
    freq_bins = [0] * 256
    for line in lex:
        # Iterate over the symbols themselves; wrapping the line in
        # enumerate() would hand (index, symbol) tuples to int().
        for elmnt in line:
            freq_bins[int(elmnt)] += 1
    return freq_bins
def get_entries_only(lex):
    """Extract the entry part of every lexicon line.

    lex is a list of lines, each a list of decimal strings.  The first
    '255' in a line is treated as the separator; everything after it is
    the entry part (everything before it is what get_phonemes_only
    returns).

    Returns a list, parallel to lex, of the symbols following the
    separator.  If a line has no '255' a diagnostic is written to
    stderr and the program exits with status 1.
    """
    entries = []
    for i, line in enumerate(lex):
        try:
            start = line.index('255')
        except ValueError:
            # Report which record is malformed instead of exiting silently.
            print("decodelex: lexicon record {} has no 255 separator".format(i),
                  file=sys.stderr)
            sys.exit(1)
        entries.append(line[start + 1:])
    return entries
def get_phonemes_only(lex):
    """Extract the phoneme part of every lexicon line.

    lex is a list of lines, each a list of decimal strings.  The first
    '255' in a line is treated as the separator; everything before it
    is the phoneme part (everything after it is what get_entries_only
    returns).

    Returns a list, parallel to lex, of the symbols preceding the
    separator.  If a line has no '255' a diagnostic is written to
    stderr and the program exits with status 1.
    """
    phonemes = []
    for i, line in enumerate(lex):
        try:
            end = line.index('255')
        except ValueError:
            # Report which record is malformed instead of exiting silently.
            print("decodelex: lexicon record {} has no 255 separator".format(i),
                  file=sys.stderr)
            sys.exit(1)
        phonemes.append(line[:end])
    return phonemes
def get_lex_as_lists(fname):
    """Read the compressed dictionary file into lists of decimal strings.

    The file is read as bytes and split into lines.  The first line
    (comment header) and the last line (closing comment) are dropped.
    Every remaining line is decoded as UTF-8, stripped of a C comment
    and of all spaces, and split on commas; the final field (the text
    after the last comma) is discarded.

    Returns one list of strings per remaining line.
    """
    with open(fname, 'rb') as fpin:
        raw_lines = fpin.read().splitlines(True)
    raw_lines.pop(0)  # comment header
    raw_lines.pop()   # comment on the last line

    records = []
    for raw in raw_lines:
        text = raw.decode("utf_8")
        # drop the C comment, then every remaining space
        text = re.sub(r" /\*.*\*/ ", "", text)
        text = re.sub(" *", "", text)
        fields = text.replace('\n', '').split(',')
        # split() always yields at least one field; the last one is
        # whatever follows the final comma and is not a symbol
        records.append(fields[:-1])
    return records
def decode_entries_dict(entries, symdata):
    """Turn encoded entry lines into dictionary words (bytes objects).

    Each line of entries is a sequence of decimal strings.  A symbol
    with value 1 is an escape: the symbol that follows it is emitted as
    a single raw byte.  Any other symbol is looked up in symdata and
    the resulting string is emitted ASCII-encoded.

    Returns a list with one bytes object per input line.
    """
    words = []
    for row in entries:
        pieces = []
        total = len(row)
        pos = 0
        while pos < total:
            code = int(row[pos])
            if code == 1:
                # escape marker: next symbol is a literal byte value
                pieces.append(bytes((int(row[pos + 1]),)))
                pos += 2
            else:
                pieces.append(symdata[code].encode('ascii'))
                pos += 1
        words.append(b''.join(pieces))
    return words
def decode_phonemes_dict(phonemes, symdata, rep_table):
    """Turn encoded phoneme lines into space-separated phone strings.

    Each line of phonemes is a sequence of decimal strings stored in
    reverse order, so it is walked from the last symbol to the first.
    Every symbol is looked up in symdata; each element of the result is
    mapped through rep_table, UTF-8 encoded and followed by one space.
    A line with no symbols decodes to a single space.

    Returns a list with one bytes object per input line.
    """
    phones = []
    for row in phonemes:
        if not row:
            phones.append(b' ')
            continue
        chunks = []
        for token in reversed(row):
            sym = symdata[int(token)]
            for unit in sym:
                chunks.append(rep_table[unit].encode('utf-8') + b' ')
        phones.append(b''.join(chunks))
    return phones
def main():
    """Decode the lexicon named on the command line into the file 'dict'.

    sys.argv[1] is the compressed lexicon file and sys.argv[2] is the
    file handed to the symbols module to obtain the phoneme
    representation table and the two symbol tables.  One line is
    written per lexicon record in the form  word:phones  followed by a
    newline.
    """
    symbol_source = sys.argv[2]
    rep_table = symbols.get_phonemes_rep_table(symbol_source)
    phonemes_symdata = symbols.get_phonemes_symdata(symbol_source)
    entries_symdata = symbols.get_entries_symdata(symbol_source)

    lex = get_lex_as_lists(sys.argv[1])
    entries = get_entries_only(lex)
    phonemes = get_phonemes_only(lex)

    phones = decode_phonemes_dict(phonemes, phonemes_symdata, rep_table)
    words = decode_entries_dict(entries, entries_symdata)
    # freq_bins = find_sym_freq(phonemes)
    # print(freq_bins)

    with open('dict', 'wb') as fpout:
        # words and phones both hold one item per lexicon record
        for word, phone in zip(words, phones):
            fpout.write(word + b':' + phone + b'\n')
# Run the decoder only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()