def main():
    """Compress a ``word:phonemes`` lexicon into the file ``compressed-lex``.

    ``sys.argv[1]`` is passed to the ``symbols`` helpers to obtain the entry
    symbol data, the phoneme replacement table and the phoneme symbol data.
    ``sys.argv[2]`` names the input lexicon; it is read as bytes, split into
    lines and decoded as UTF-8.  Every line is split on ``:`` -- field 0 is
    the word and field 1 is its phoneme list, separated by single spaces.

    After a header comment, one output line is written per entry::

         <encoded phonemes, reversed> 255, /* <word> */ <encoded word>0,

    The file ends with a ``/* num_bytes = N */`` line.  ``N`` starts at 1
    and grows by the number of commas in the encoded parts of each line
    (commas inside the word itself are not counted).
    """
    symbol_source = sys.argv[1]
    entries_symdata = symbols.get_entries_symdata(symbol_source)
    rep_table = symbols.get_phonemes_rep_table(symbol_source)
    phonemes_symdata = symbols.get_phonemes_symdata(symbol_source)

    with open(sys.argv[2], 'rb') as lexicon_file:
        raw_lines = lexicon_file.read().splitlines(True)

    # Split every "word:phonemes" line into two parallel lists.
    words = []
    phone_fields = []
    for raw_line in raw_lines:
        fields = raw_line.decode('utf_8').split(':')
        words.append(fields[0])
        phone_fields.append(fields[1])

    # NOTE(review): the count starts at 1 rather than 0 -- presumably to
    # account for a byte emitted elsewhere; confirm against the consumer.
    byte_total = 1
    with open('compressed-lex', 'wb') as outfp:
        outfp.write('/* index to compressed data */\n'.encode('utf-8'))
        for word, phone_field in zip(words, phone_fields):
            word = word.replace('\n', '')
            word_codes = encode_word(word, entries_symdata)
            line_tail = ' */ ' + ''.join(word_codes) + '0,\n'

            phones = phone_field.replace('\n', '').split(' ')
            # A trailing space in the input leaves an empty last item.
            if phones[-1] == '':
                phones.pop()
            phone_codes = encode_phonelist(phones, phonemes_symdata, rep_table)
            line_head = ' ' + ''.join(reversed(phone_codes)) + ' 255, /* '

            byte_total += line_head.count(',')
            outfp.write((line_head + word + line_tail).encode('utf-8'))
            byte_total += line_tail.count(',')
        trailer = '/* num_bytes = ' + repr(byte_total) + ' */\n'
        outfp.write(trailer.encode('utf-8'))
def main():
    """Decode a compressed lexicon and write it to the file ``dict``.

    ``sys.argv[2]`` is passed to the ``symbols`` helpers to obtain the
    phoneme replacement table, the phoneme symbol data and the entry symbol
    data.  ``sys.argv[1]`` is passed to ``get_lex_as_lists``; its result is
    split into entries and phonemes, which are decoded separately.

    One record is written per decoded phoneme item, in the form
    ``<word>:<phonemes>`` followed by a newline.  The decoded items are
    written as-is to a binary file, so they are expected to be bytes-like.
    """
    symbol_source = sys.argv[2]
    rep_table = symbols.get_phonemes_rep_table(symbol_source)
    phonemes_symdata = symbols.get_phonemes_symdata(symbol_source)
    entries_symdata = symbols.get_entries_symdata(symbol_source)

    lex = get_lex_as_lists(sys.argv[1])
    entries = get_entries_only(lex)
    phonemes = get_phonemes_only(lex)

    phones = decode_phonemes_dict(phonemes, phonemes_symdata, rep_table)
    words = decode_entries_dict(entries, entries_symdata)

    # Disabled debugging aid kept from the original:
    #     freq_bins = find_sym_freq(phonemes)
    #     print(freq_bins)

    separator = ':'.encode('utf-8')
    newline = '\n'.encode('utf-8')
    with open('dict', 'wb') as fpout:
        emit = fpout.write
        # Index-based on purpose: the record count is taken from ``phones``
        # and ``words`` is looked up by position.
        for idx in range(len(phones)):
            emit(words[idx])
            emit(separator)
            emit(phones[idx])
            emit(newline)