def build_dictionary(filenames, tokenize=word_tokenize):
    """Build a token Dictionary from one or more plain-text files.

    Each line of each file is stripped and tokenized; every token is
    added to the dictionary, and the dictionary's end-of-sentence
    marker is appended once per line.

    Args:
        filenames: iterable of paths to plain-text files.
        tokenize: callable mapping a line (str) to an iterable of
            tokens; defaults to ``word_tokenize``.

    Returns:
        The populated ``Dictionary`` instance.
    """
    dictionary = Dictionary()
    for filename in filenames:
        with open(filename, 'r') as file:
            for line in file:
                # Bug fix: honor the `tokenize` parameter — the original
                # hardcoded word_tokenize, silently ignoring any custom
                # tokenizer passed by the caller.
                for symbol in tokenize(line.strip()):
                    dictionary.add_word(symbol)
                dictionary.add_word(dictionary.eos_word)
    return dictionary
# Beispiel #2 (scraped example marker — kept as a comment so the file stays valid Python)
# 0
def build_dictionary(filenames, tokenize=word_tokenize):
    """Build a Dictionary with subword (BPE-style) merges.

    Variant of ``build_dictionary`` enabling subword units following
    Sennrich et al. (2016): every token is suffixed with ``'$'`` as an
    end-of-word marker, then the most frequent symbol pairs are merged
    ``num_merges`` times.

    Args:
        filenames: iterable of paths to plain-text files.
        tokenize: callable mapping a line (str) to an iterable of
            tokens; defaults to ``word_tokenize``.

    Returns:
        The populated ``Dictionary`` after the BPE merges.
    """
    dictionary = Dictionary()
    for filename in filenames:
        with open(filename, 'r') as file:
            for line in file:
                # Bug fix: honor the `tokenize` parameter — the original
                # hardcoded word_tokenize, ignoring any custom tokenizer.
                for symbol in tokenize(line.strip()):
                    # '$' marks the end of a word for the BPE procedure.
                    dictionary.add_word(symbol + '$')
                dictionary.add_word(dictionary.eos_word)

        # NOTE(review): this merge loop runs once per *file*, so the
        # num_merges merges are re-applied after every file. If a single
        # global merge pass was intended, it should sit after the outer
        # loop — TODO confirm against the BPE training recipe.
        num_merges = 10
        for _ in range(num_merges):
            pairs = get_stats(dictionary)
            # Merge the currently most frequent symbol pair.
            best = max(pairs, key=pairs.get)
            dictionary = merge_vocab(best, dictionary)
            print(best)  # debug trace of each merged pair (kept: observable output)

    return dictionary