Beispiel #1
0
def main():
    """Compress a ``word:phonemes`` lexicon into the file ``compressed-lex``.

    Command-line arguments:
        sys.argv[1]: path handed to the ``symbols`` helpers to load the entry
            symbol data, the phoneme symbol data and the phoneme replacement
            table.
        sys.argv[2]: UTF-8 lexicon with one ``word:phoneme phoneme ...``
            record per line (LF, CRLF or CR line endings).

    The output starts with an ``/* index to compressed data */`` comment,
    followed by one line per record of the form::

           <encoded phonemes, reversed> 255, /* word */ <encoded word>0,

    and ends with a ``/* num_bytes = N */`` comment, where N is 1 plus the
    number of commas emitted in the encoded parts of the record lines.

    Raises:
        IndexError: if a command-line argument is missing or a lexicon line
            contains no ``:`` separator.
    """
    symbols_path, lexicon_path = sys.argv[1], sys.argv[2]

    entries_symdata = symbols.get_entries_symdata(symbols_path)
    rep_table = symbols.get_phonemes_rep_table(symbols_path)
    phonemes_symdata = symbols.get_phonemes_symdata(symbols_path)

    with open(lexicon_path, 'rb') as fpin:
        rawdata = fpin.read()

    # Parse every record before the output file is opened, so a malformed
    # line aborts the run without truncating an existing 'compressed-lex'.
    # splitlines() without keepends drops '\n', '\r\n' and '\r' alike; the
    # previous code only removed '\n', which left a stray '\r' on the last
    # phoneme of CRLF-terminated lexicons.
    records = []
    for raw_line in rawdata.splitlines():
        fields = raw_line.decode('utf_8').split(':')
        records.append((fields[0], fields[1]))

    # The counter starts at 1 and then grows by one per comma written;
    # presumably each comma terminates one emitted byte value - TODO confirm.
    bytecount = 1
    with open('compressed-lex', 'wb') as outfp:
        outfp.write('/* index to compressed data */\n'.encode('utf-8'))
        for word, phones in records:
            encoded_word = encode_word(word, entries_symdata)
            line_end = ' */ ' + ''.join(encoded_word) + '0,\n'

            phonelist = phones.split(' ')
            # A trailing space in the record yields one empty final token.
            if phonelist[-1] == '':
                phonelist.pop()
            encoded_phones = encode_phonelist(phonelist, phonemes_symdata, rep_table)
            # The encoded phonemes are written in reverse order.
            line_start = '   ' + ''.join(reversed(encoded_phones)) + ' 255, /* '

            outfp.write((line_start + word + line_end).encode('utf-8'))
            # Commas inside the word comment itself are deliberately not
            # counted: only the encoded start and end parts contribute.
            bytecount += line_start.count(',') + line_end.count(',')

        footer = '/* num_bytes = ' + repr(bytecount) + ' */\n'
        outfp.write(footer.encode('utf-8'))
Beispiel #2
0
def main():
    """Decode a lexicon and write it to the file ``dict``.

    Command-line arguments:
        sys.argv[1]: passed to ``get_lex_as_lists`` to load the lexicon.
        sys.argv[2]: passed to the ``symbols`` helpers to load the phoneme
            replacement table, the phoneme symbol data and the entry symbol
            data.

    The entries and phonemes extracted from the lexicon are run through
    ``decode_entries_dict`` and ``decode_phonemes_dict``; each decoded pair
    is written in binary mode as ``<word>:<phonemes>`` followed by a newline.
    The number of records written equals the number of decoded phoneme items.
    """
    symbols_source = sys.argv[2]
    replacements = symbols.get_phonemes_rep_table(symbols_source)
    phoneme_symbols = symbols.get_phonemes_symdata(symbols_source)
    entry_symbols = symbols.get_entries_symdata(symbols_source)

    lexicon = get_lex_as_lists(sys.argv[1])
    raw_entries = get_entries_only(lexicon)
    raw_phonemes = get_phonemes_only(lexicon)

    decoded_phones = decode_phonemes_dict(raw_phonemes, phoneme_symbols, replacements)
    decoded_words = decode_entries_dict(raw_entries, entry_symbols)

    # Disabled debugging aid, kept from the original:
    #   freq_bins = find_sym_freq(raw_phonemes)
    #   print(freq_bins)

    separator = b':'
    terminator = b'\n'
    with open('dict', 'wb') as fpout:
        # Indexed by the phoneme list so that a shorter word list still
        # raises IndexError instead of being silently truncated.
        for position in range(len(decoded_phones)):
            fpout.writelines((
                decoded_words[position],
                separator,
                decoded_phones[position],
                terminator,
            ))