Esempio n. 1
0
def write_vocabulary(source_path, vocab_path_base):
    """Write every distinct token of a source file to a vocabulary file.

    The source file is read line by line; each stripped line is passed to
    ``tokenize`` and every token not seen before is written on its own line,
    so the output lists tokens in order of first appearance.

    Args:
        source_path: Path of the UTF-8 text file to read. Its extension is
            reused as the extension of the vocabulary file.
        vocab_path_base: Path of the vocabulary file without its extension.
            The containing directory is created when it does not exist.

    Raises:
        OSError: If the source file cannot be read, or the output directory
            or vocabulary file cannot be created.
    """
    lang_ext = os.path.splitext(source_path)[1]
    vocab_location = os.path.dirname(vocab_path_base)

    # An empty dirname means "current directory"; os.makedirs('') would raise.
    # exist_ok avoids the check-then-create race of a separate exists() test.
    if vocab_location:
        os.makedirs(vocab_location, exist_ok=True)

    with open(source_path, encoding='utf-8') as source, \
            open(vocab_path_base + lang_ext, 'w', encoding='utf-8', newline='\n') as vocab:
        # Only membership matters here, so a set replaces the unused counts.
        seen = set()

        for line in source:
            for word in tokenize(line.strip()):
                if word not in seen:
                    vocab.write(word + '\n')
                    seen.add(word)
Esempio n. 2
0
def t(data, exptected=None):
    """Tokenize and parse ``data``, then check the result against ``exptected``.

    The comparison is delegated to ``eq_``. The parameter name keeps its
    original spelling so that keyword callers keep working.
    """
    tokens = tokenize(data)
    parsed = parse(tokens)
    eq_(parsed, exptected)
Esempio n. 3
0
 def t(self, data, expected=None):
     """Tokenize and parse ``data``, then assert the result equals ``expected``."""
     tokens = tokenize(data)
     parsed = parse(tokens)
     self.assertEqual(parsed, expected)
Esempio n. 4
0
 def t(self, data, expected=None):
     """Assert that parsing the tokens of ``data`` produces ``expected``."""
     token_stream = tokenize(data)
     result = parse(token_stream)
     self.assertEqual(result, expected)
Esempio n. 5
0
def t(data, exptected=None):
    """Check via ``eq_`` that parsing the tokens of ``data`` gives ``exptected``.

    The misspelled parameter name is preserved for compatibility with
    existing keyword callers.
    """
    token_stream = tokenize(data)
    result = parse(token_stream)
    eq_(result, exptected)