def write_vocabulary(source_path, vocab_path_base):
    """Write the vocabulary of *source_path* to a sibling of *vocab_path_base*.

    The output file is ``vocab_path_base`` plus the source file's extension,
    containing each unique token once, one per line, in order of first
    appearance.  Output uses UTF-8 and LF line endings regardless of platform.
    """
    lang_ext = os.path.splitext(source_path)[1]
    vocab_location = os.path.dirname(vocab_path_base)
    # Guard against an empty dirname (base path in the current directory):
    # os.makedirs('') raises FileNotFoundError.  exist_ok=True also removes
    # the race between an exists() check and the makedirs() call.
    if vocab_location:
        os.makedirs(vocab_location, exist_ok=True)
    with open(source_path, encoding='utf-8') as source, \
            open(vocab_path_base + lang_ext, 'w', encoding='utf-8',
                 newline='\n') as vocab:
        # Only membership matters for de-duplication; the original kept
        # per-word counts in a dict that were never read.
        seen = set()
        for line in source:
            for word in tokenize(line.strip()):
                if word not in seen:
                    vocab.write(word + '\n')
                    seen.add(word)
def t(data, expected=None):
    """Assert that tokenizing then parsing *data* yields *expected*.

    Fixes the misspelled parameter name ``exptected`` so keyword callers
    and the sibling method-based helper use the same spelling.
    """
    eq_(parse(tokenize(data)), expected)
def t(self, data, expected=None):
    """Check that parsing the tokenized *data* produces *expected*."""
    actual = parse(tokenize(data))
    self.assertEqual(actual, expected)