from os import path
import codecs
import io

from preshed.counter import PreshCounter
from spacy.en import English
from spacy.vocab import Vocab
from spacy.tokenizer import Tokenizer
from spacy.attrs import ORTH


# Count token frequencies over a stream of JSON comments and write a
# tab-separated "<freq>\t<token>" file. iter_comments is assumed to be
# defined elsewhere in the script (it yields parsed comment dicts).
def count_freqs(input_loc, output_loc):
    print(output_loc)
    vocab = English.default_vocab(get_lex_attr=None)
    tokenizer = Tokenizer.from_dir(
        vocab, path.join(English.default_data_dir(), 'tokenizer'))
    counts = PreshCounter()
    for json_comment in iter_comments(input_loc):
        doc = tokenizer(json_comment['body'])
        doc.count_by(ORTH, counts=counts)
    with codecs.open(output_loc, 'w', 'utf8') as file_:
        for orth, freq in counts:
            string = tokenizer.vocab.strings[orth]
            if not string.isspace():
                file_.write('%d\t%s\n' % (freq, string))
# Equivalent variant using io.open instead of codecs.open. Note that
# io.open's third positional argument is buffering, not encoding, so the
# encoding must be passed by keyword (a positional 'utf8' would raise a
# TypeError).
def count_freqs(input_loc, output_loc):
    print(output_loc)
    vocab = English.default_vocab(get_lex_attr=None)
    tokenizer = Tokenizer.from_dir(
        vocab, path.join(English.default_data_dir(), 'tokenizer'))
    counts = PreshCounter()
    for json_comment in iter_comments(input_loc):
        doc = tokenizer(json_comment['body'])
        doc.count_by(ORTH, counts=counts)
    with io.open(output_loc, 'w', encoding='utf8') as file_:
        for orth, freq in counts:
            string = tokenizer.vocab.strings[orth]
            if not string.isspace():
                file_.write('%d\t%s\n' % (freq, string))
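# A minimal sketch of driving count_freqs from the command line; the
# sys.argv handling here is an assumption for illustration, not the
# original script's CLI wrapper.
if __name__ == '__main__':
    import sys
    count_freqs(sys.argv[1], sys.argv[2])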
def test_load(self):
    data_dir = English.default_data_dir()
    vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
    tokenizer = Tokenizer.from_dir(vocab, path.join(data_dir, 'tokenizer'))
# Guarded variant: only attempt the load when the vocab data is actually
# installed, so the test does not error on machines without the model.
def test_load(self):
    data_dir = English.default_data_dir()
    if path.exists(path.join(data_dir, 'vocab')):
        vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
        tokenizer = Tokenizer.from_dir(vocab, path.join(data_dir, 'tokenizer'))
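# The existence check above makes the test silently pass when the model
# data is absent. A hedged alternative sketch: report the situation as an
# explicit skip via pytest. This assumes pytest is in use; the function
# name below is hypothetical, not part of the original suite.
import pytest

def test_load_or_skip(self):
    data_dir = English.default_data_dir()
    if not path.exists(path.join(data_dir, 'vocab')):
        pytest.skip('model data not installed')
    vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
    tokenizer = Tokenizer.from_dir(vocab, path.join(data_dir, 'tokenizer'))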