from spacy.en import English


def vocab():
    vocab = English.default_vocab()
    # Look up a few lexemes so they are cached in the vocab before use.
    lex = vocab['dog']
    assert vocab[vocab.strings['dog']].orth_ == 'dog'
    lex = vocab['the']
    lex = vocab['quick']
    lex = vocab['jumped']
    return vocab
import os

from spacy import util
from spacy.en import English


def vocab():
    # Prefer an explicit data directory from the environment; otherwise
    # fall back to the installed 'en' package.
    data_dir = os.environ.get('SPACY_DATA')
    if data_dir is None:
        package = util.get_package_by_name('en')
    else:
        package = util.get_package(data_dir)
    vocab = English.default_vocab(package=package)
    lex = vocab['dog']
    assert vocab[vocab.strings['dog']].orth_ == 'dog'
    lex = vocab['the']
    lex = vocab['quick']
    lex = vocab['jumped']
    return vocab
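# A minimal sketch of how this fixture might be consumed from a pytest
# suite. The @pytest.fixture registration, the fixture name 'en_vocab', and
# the test below are assumptions for illustration, not part of the original.
import pytest


@pytest.fixture
def en_vocab():
    return vocab()


def test_orth_roundtrip(en_vocab):
    # A lexeme's orth ID should round-trip through the string store.
    lex = en_vocab['quick']
    assert en_vocab.strings[lex.orth] == 'quick'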
import codecs
from os import path

from preshed.counter import PreshCounter

from spacy.en import English
from spacy.tokenizer import Tokenizer
from spacy.attrs import ORTH


def count_freqs(input_loc, output_loc):
    print(output_loc)
    vocab = English.default_vocab(get_lex_attr=None)
    tokenizer = Tokenizer.from_dir(
        vocab, path.join(English.default_data_dir(), 'tokenizer'))
    counts = PreshCounter()
    # iter_comments is assumed to be defined elsewhere in this script; it
    # should yield one JSON-decoded comment dict per record in the input.
    for json_comment in iter_comments(input_loc):
        doc = tokenizer(json_comment['body'])
        doc.count_by(ORTH, counts=counts)
    with codecs.open(output_loc, 'w', 'utf8') as file_:
        for orth, freq in counts:
            string = tokenizer.vocab.strings[orth]
            if not string.isspace():
                file_.write('%d\t%s\n' % (freq, string))
import io
from os import path

from preshed.counter import PreshCounter

from spacy.en import English
from spacy.tokenizer import Tokenizer
from spacy.attrs import ORTH


def count_freqs(input_loc, output_loc):
    print(output_loc)
    vocab = English.default_vocab(get_lex_attr=None)
    tokenizer = Tokenizer.from_dir(
        vocab, path.join(English.default_data_dir(), 'tokenizer'))
    counts = PreshCounter()
    for json_comment in iter_comments(input_loc):
        doc = tokenizer(json_comment['body'])
        doc.count_by(ORTH, counts=counts)
    # io.open takes the encoding as a keyword argument; passing 'utf8'
    # positionally would be interpreted as the buffering parameter.
    with io.open(output_loc, 'w', encoding='utf8') as file_:
        for orth, freq in counts:
            string = tokenizer.vocab.strings[orth]
            if not string.isspace():
                file_.write('%d\t%s\n' % (freq, string))
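# A hypothetical entry point showing how count_freqs could be invoked from
# the command line. The argv handling here is an assumption for
# illustration; the original script's argument parsing is not shown.
if __name__ == '__main__':
    import sys

    input_loc, output_loc = sys.argv[1], sys.argv[2]
    count_freqs(input_loc, output_loc)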