def test_itf():
    tokenizer = Tokenizer.from_pretrained(Config.model_name)
    # Load cached training data if a pickle is available; otherwise build it from raw text.
    if Config.use_pickle:
        with open(f'{Config.pickle_path}', 'rb') as f:
            train_data = pickle.load(f)
    else:
        train_data = make_train_data_from_txt(Config, tokenizer)
    counter, itf = make_itf(train_data, Config.vocab_size, tokenizer)
    # Optional min-max normalization and per-index inspection, left disabled:
    # itf = (itf - itf.min()) / (itf.max() - itf.min())
    # for i in range(itf.size(0)):
    #     print(i, itf[i])
    # itf[itf == 0] += 1e-6
    # Print every token with its raw frequency, most common first.
    for k, v in counter.most_common(len(counter)):
        print(tokenizer.decode([k]), v)
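# `make_itf` itself is not shown in this snippet. Below is a minimal sketch of
# what it plausibly computes, judging from how test_itf uses its results: a
# Counter over token ids plus an inverse-term-frequency tensor of length
# vocab_size. The body is an assumption, not the repository's implementation,
# hence the _sketch suffix; `tokenizer` is kept only for signature parity.
from collections import Counter

import torch


def make_itf_sketch(train_data, vocab_size, tokenizer):
    counter = Counter()
    for ids in train_data:  # assumes each example is a flat list of token ids;
        counter.update(ids)  # adjust if examples are (input, response) pairs
    freq = torch.zeros(vocab_size)
    for token_id, count in counter.items():
        freq[token_id] = count
    itf = torch.zeros(vocab_size)
    itf[freq > 0] = 1.0 / freq[freq > 0]  # unseen tokens keep weight 0
    return counter, itf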
def preprocess_bert(config):
    opt = config['opt']

    from transformers import (BertTokenizer, DistilBertTokenizer, AlbertTokenizer,
                              RobertaTokenizer, BartTokenizer, ElectraTokenizer)
    TOKENIZER_CLASSES = {
        "bert": BertTokenizer,
        "distilbert": DistilBertTokenizer,
        "albert": AlbertTokenizer,
        "roberta": RobertaTokenizer,
        "bart": BartTokenizer,
        "electra": ElectraTokenizer,
    }
    Tokenizer = TOKENIZER_CLASSES[config['emb_class']]
    tokenizer = Tokenizer.from_pretrained(opt.bert_model_name_or_path,
                                          do_lower_case=opt.bert_do_lower_case)

    # build poss, chars, labels
    path = os.path.join(opt.data_dir, _TRAIN_FILE)
    poss, chars, labels = build_dict(path, config)

    # build features
    path = os.path.join(opt.data_dir, _TRAIN_FILE)
    train_features = build_features(path, tokenizer, poss, labels, config, mode='train')
    path = os.path.join(opt.data_dir, _VALID_FILE)
    valid_features = build_features(path, tokenizer, poss, labels, config, mode='valid')
    path = os.path.join(opt.data_dir, _TEST_FILE)
    test_features = build_features(path, tokenizer, poss, labels, config, mode='test')

    # write features
    path = os.path.join(opt.data_dir, _TRAIN_FILE + _FSUFFIX)
    write_features(train_features, path)
    path = os.path.join(opt.data_dir, _VALID_FILE + _FSUFFIX)
    write_features(valid_features, path)
    path = os.path.join(opt.data_dir, _TEST_FILE + _FSUFFIX)
    write_features(test_features, path)

    # write poss, labels
    path = os.path.join(opt.data_dir, _POS_FILE)
    write_dict(poss, path)
    path = os.path.join(opt.data_dir, _LABEL_FILE)
    write_dict(labels, path)
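# A hedged usage sketch for preprocess_bert. The function expects a dict with
# the parsed CLI options under 'opt' and the tokenizer family under 'emb_class'.
# The argparse flags below mirror the attributes the function actually reads
# (data_dir, bert_model_name_or_path, bert_do_lower_case); the defaults are
# illustrative assumptions, not the project's real entry point.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--data_dir', default='data')
parser.add_argument('--bert_model_name_or_path', default='bert-base-cased')
parser.add_argument('--bert_do_lower_case', action='store_true')
opt = parser.parse_args()
preprocess_bert({'opt': opt, 'emb_class': 'bert'})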
logging.basicConfig(level=logging.INFO)

if __name__ == '__main__':
    logging.info('*** Initializing ***')
    if not os.path.isdir(Config.data_dir):
        os.mkdir(Config.data_dir)
    seed_everything(Config.seed)
    device = torch.device(Config.device)
    start_epoch = 0

    logging.info('Define Models')
    model = build_model(Config).to(device)
    tokenizer = Tokenizer.from_pretrained(Config.model_name)

    logging.info('Define Loss and Optimizer')
    criterion = LabelSmoothing(tokenizer.vocab_size,
                               pad_id=tokenizer.pad_token_id,
                               smoothing=Config.smoothing)
    # lr=0 is intentional: the warmup wrapper returned by get_optimizer drives
    # the learning rate via its factor and warmup parameters.
    _opt = optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
    optimizer = get_optimizer(_opt, factor=Config.factor, warmup=Config.warmup)

    logging.info('Preparing training data')
    if Config.use_pickle:
        with open(f'{Config.pickle_path}', 'rb') as f:
            train_data = pickle.load(f)
    else:
        train_data = make_train_data_from_txt(Config, tokenizer)
    dataset = DialogDataset(train_data, tokenizer)
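    # --- Hypothetical continuation of the __main__ block above: batch the
    # dataset and run a plain training loop. Config.batch_size, Config.num_epochs,
    # the model's forward signature, the (inputs, targets) batch layout, and the
    # criterion accepting flattened (N, vocab) scores with (N,) targets are all
    # assumptions; only the names defined above are reused.
    from torch.utils.data import DataLoader

    loader = DataLoader(dataset, batch_size=Config.batch_size, shuffle=True)
    model.train()
    for epoch in range(start_epoch, Config.num_epochs):
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)
            _opt.zero_grad()        # clear gradients on the wrapped Adam
            logits = model(inputs)  # assumes a (batch, seq, vocab) output
            loss = criterion(logits.reshape(-1, logits.size(-1)),
                             targets.reshape(-1))
            loss.backward()
            optimizer.step()        # warmup wrapper sets the lr, then steps Adam
        logging.info(f'epoch {epoch}: loss {loss.item():.4f}')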