def build_dictionary(filenames, tokenize=word_tokenize):
    """Build a Dictionary from the tokens of the given text files.

    Args:
        filenames: iterable of paths to plain-text files, read line by line.
        tokenize: callable mapping a stripped line of text to a sequence of
            symbols (defaults to word_tokenize).

    Returns:
        Dictionary containing every symbol seen, plus one EOS entry counted
        per input line.
    """
    dictionary = Dictionary()
    for filename in filenames:
        with open(filename, 'r') as file:
            for line in file:
                # Bug fix: use the injected `tokenize` callable; the original
                # ignored the parameter and always called word_tokenize.
                for symbol in tokenize(line.strip()):
                    dictionary.add_word(symbol)
                dictionary.add_word(dictionary.eos_word)
    return dictionary
def build_dictionary(filenames, tokenize=word_tokenize, num_merges=10):
    """Build a Dictionary with subword units via byte-pair encoding.

    Follows Sennrich et al. (2016): every token is suffixed with '$' as an
    end-of-word marker, then the most frequent adjacent symbol pair is merged
    `num_merges` times.

    Args:
        filenames: iterable of paths to plain-text files, read line by line.
        tokenize: callable mapping a stripped line of text to a sequence of
            symbols (defaults to word_tokenize).
        num_merges: number of BPE merge operations to apply (default 10,
            matching the previous hard-coded value).

    Returns:
        Dictionary after the requested number of BPE merges, with one EOS
        entry counted per input line.
    """
    dictionary = Dictionary()
    for filename in filenames:
        with open(filename, 'r') as file:
            for line in file:
                # Bug fix: use the injected `tokenize` callable; the original
                # ignored the parameter and always called word_tokenize.
                for symbol in tokenize(line.strip()):
                    # '$' marks the end-of-word boundary for the BPE merges.
                    dictionary.add_word(symbol + '$')
                dictionary.add_word(dictionary.eos_word)
    # Iteratively merge the most frequent adjacent symbol pair.
    # NOTE(review): max() raises ValueError if get_stats returns no pairs
    # (e.g. empty corpus) — confirm callers never pass empty input.
    for _ in range(num_merges):
        pairs = get_stats(dictionary)
        best = max(pairs, key=pairs.get)
        dictionary = merge_vocab(best, dictionary)
        print(best)  # debug trace of merge order (kept from original)
    return dictionary