def build_vocab(filepath, tokenizer, max_size=10000, min_freq=2):
    """Build a Vocab from a plain-text file, one example per line.

    Args:
        filepath: Path to a UTF-8 text file; each line is tokenized as-is
            (no lowercasing or stripping — note the trailing newline is
            passed to the tokenizer; presumably the tokenizer handles
            whitespace — TODO confirm).
        tokenizer: Callable mapping a string to a list of tokens.
        max_size: Maximum vocabulary size (default 10000, the original
            hard-coded value).
        min_freq: Minimum token frequency to be included (default 2).

    Returns:
        A Vocab with '<unk>', '<pad>', '<sos>', '<eos>' specials and the
        init/eos/pad token attributes set.
    """
    counter = Counter()
    with io.open(filepath, encoding="utf8") as f:
        for string_ in f:
            counter.update(tokenizer(string_))
    vocab = Vocab(counter,
                  specials=['<unk>', '<pad>', '<sos>', '<eos>'],
                  max_size=max_size,
                  min_freq=min_freq)
    vocab.init_token = '<sos>'
    vocab.eos_token = '<eos>'
    # '<pad>' is in specials, so also expose it as an attribute — keeps
    # this function consistent with build_vocab_tsv elsewhere in the file.
    vocab.pad_token = '<pad>'
    return vocab
def build_vocab_tsv(filepath, tokenizer, column=0, max_size=40000, min_freq=2):
    """Build a Vocab from one column of a tab-separated file.

    Each line is split on tabs; the selected column is lowercased and
    tokenized before being counted.

    Args:
        filepath: Path to a UTF-8 TSV file.
        tokenizer: Callable mapping a string to a list of tokens.
        column: Zero-based index of the TSV column to read (default 0).
        max_size: Maximum vocabulary size (default 40000, the original
            hard-coded value).
        min_freq: Minimum token frequency to be included (default 2).

    Returns:
        A Vocab with '<unk>', '<pad>', '<sos>', '<eos>' specials and the
        init/eos/pad token attributes set.

    Raises:
        IndexError: If a line has fewer than ``column + 1`` tab-separated
            fields (behavior inherited from the original indexing).
    """
    counter = Counter()
    with io.open(filepath, encoding="utf8") as f:
        for line in f:
            # Strip the trailing newline (and other trailing whitespace)
            # before splitting so the last field is clean.
            fields = line.rstrip().split("\t")
            counter.update(tokenizer(fields[column].lower()))
    vocab = Vocab(counter,
                  specials=['<unk>', '<pad>', '<sos>', '<eos>'],
                  max_size=max_size,
                  min_freq=min_freq)
    vocab.init_token = '<sos>'
    vocab.eos_token = '<eos>'
    vocab.pad_token = '<pad>'
    return vocab