def get_vocab(args):
    """Build the vocabulary for the selected model.

    For BERT-based models ("bert", "mmbt", "concatbert") the vocabulary is
    taken directly from the pretrained BERT tokenizer; otherwise it is built
    from the GloVe word list at ``args.glove_path``.
    """
    vocab = Vocab()
    if args.model in ("bert", "mmbt", "concatbert"):
        # Reuse the pretrained tokenizer's mappings verbatim.
        tokenizer = BertTokenizer.from_pretrained(
            args.bert_model, do_lower_case=True
        )
        vocab.stoi = tokenizer.vocab
        vocab.itos = tokenizer.ids_to_tokens
        vocab.vocab_sz = len(vocab.itos)
    else:
        # Non-BERT models: populate from GloVe vocabulary words.
        vocab.add(get_glove_words(args.glove_path))
    return vocab
# Script section: read the dataset, build word/char vocabularies, then filter
# rare entries and report how many were removed.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("brc")
logger.setLevel(logging.INFO)

brc_data = DatasetReader(
    test_file=args.input,
    bert_dir='/home/wujindou/chinese_L-12_H-768_A-12',
)

from data.vocab import Vocab
vocab = Vocab(lower=True)
import sys

# Accumulate both word-level and character-level vocabularies in one pass.
for word in brc_data.word_iter(None):
    vocab.add(word)
    for char in word:
        vocab.add_char(char)

logger.info(' char size {}'.format(vocab.get_char_vocab_size()))
logger.info(' vocab size {} '.format(vocab.get_word_vocab()))

# BUG FIX: the word-size snapshot was commented out while still being used
# below, which raised NameError on `unfiltered_vocab_size`. Both snapshots
# must be taken BEFORE filtering so the removed-count is meaningful.
unfiltered_vocab_size = vocab.size()
unfiltered_char_size = vocab.get_char_vocab_size()

# Drop tokens/chars seen fewer than twice.
vocab.filter_tokens_by_cnt(min_cnt=2)
vocab.filter_chars_by_cnt(min_cnt=2)

filtered_num = unfiltered_vocab_size - vocab.size()
logger.info('After filter {} tokens, the final vocab size is {}'.format(
    filtered_num, vocab.size()))
# NOTE(review): the char filtered count is computed but never logged —
# presumably a logger.info for characters was intended; confirm before adding.
filtered_num = unfiltered_char_size - vocab.get_char_vocab_size()