from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast


def load_tokenizer(tknzr_file, flag_tknzr_fast, pad_token=None, mask_token=None):
    """
    HuggingFace does not let you instantiate the base tokenizer class directly
    from a tokenizer file, so we wrap the trained tokenizer in
    PreTrainedTokenizerFast instead. Backed by Rust, it is faster than the
    Python base class and is directly callable, e.g. tknzr('text to be tokenized').

    Input
        tknzr_file (str)       : .json file of the tokenizer trained previously
        flag_tknzr_fast (bool) : if True, load the Rust-backed fast tokenizer
        *_token (str)          : special tokens to register on the tokenizer
                                 (some are not implemented yet)
    Output
        tknzr : tokenizer as PreTrainedTokenizerFast class to be passed on
    """
    if flag_tknzr_fast:
        tknzr = PreTrainedTokenizerFast(tokenizer_file=tknzr_file)
    else:
        # Note: PreTrainedTokenizer is an abstract base class and cannot load
        # a tokenizer.json file; this branch is kept only as a placeholder.
        tknzr = PreTrainedTokenizer(tokenizer_file=tknzr_file)
    tknzr.pad_token = pad_token
    tknzr.mask_token = mask_token
    return tknzr
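# Usage sketch (assumption): the file name and special tokens below are
# illustrative, assuming a tokenizer.json trained earlier with the
# `tokenizers` library.
tknzr = load_tokenizer('tokenizer.json', flag_tknzr_fast=True,
                       pad_token='[PAD]', mask_token='[MASK]')
enc = tknzr('text to be tokenized')  # fast tokenizer is directly callable
print(enc['input_ids'])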
import numpy as np
from tqdm import tqdm
from transformers import PreTrainedTokenizerFast


def preprocess(texts, tokenizer_path, max_len=32):
    """Tokenize `texts` into fixed-length id and attention-mask arrays."""
    input_ids, input_masks = [], []
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path)
    tokenizer.mask_token = '[MASK]'
    tokenizer.pad_token = '[PAD]'
    tokenizer.sep_token = '[SEP]'
    tokenizer.cls_token = '[CLS]'
    tokenizer.unk_token = '[UNK]'
    for text in tqdm(texts):
        encoded = tokenizer.encode_plus(text,
                                        max_length=max_len,
                                        padding='max_length',  # pad_to_max_length is deprecated
                                        truncation=True)
        input_ids.append(encoded['input_ids'])
        input_masks.append(encoded['attention_mask'])
    return [np.array(input_ids), np.array(input_masks)]
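# Usage sketch (assumption): the tokenizer file and sample strings are
# illustrative, not from the original script.
ids, masks = preprocess(['first item name', 'second item name'],
                        'tokenizer.json', max_len=32)
print(ids.shape, masks.shape)  # (2, 32) (2, 32)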
from transformers import PreTrainedTokenizerFast, RobertaConfig
from tokenizers.processors import BertProcessing

# `dataset_path` (pathlib.Path) and `tokenizer` are assumed to be defined earlier.
tokenizer_path = dataset_path / 'tokenizer1'
tokenizer_path.mkdir(parents=True, exist_ok=True)
tokenizer.save(str(tokenizer_path / "tokenizer.json"))

# Re-create as a RoBERTa-compatible tokenizer
print(tokenizer_path)
tokenizer2 = PreTrainedTokenizerFast(tokenizer_file=str(tokenizer_path / "tokenizer.json"))
tokenizer2._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer2._tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer2._tokenizer.token_to_id("<s>")),
)
tokenizer2._tokenizer.enable_truncation(max_length=128)  # 512
tokenizer2.mask_token = "<mask>"
tokenizer2.pad_token = "<pad>"

# 3. Train a language model
config = RobertaConfig(
    vocab_size=tokenizer2._tokenizer.get_vocab_size(),
    hidden_size=240,
    intermediate_size=2048,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
    bos_token_id=tokenizer2._tokenizer.token_to_id("<s>"),
    eos_token_id=tokenizer2._tokenizer.token_to_id("</s>"),
    pad_token_id=tokenizer2._tokenizer.token_to_id("<pad>"),
    # attention_probs_dropout_prob=0.0,
)
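# Sketch (assumption): instantiating the masked-LM model from the config
# above; RobertaForMaskedLM is the standard class for a RobertaConfig, though
# the original snippet stops before model creation.
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config=config)
print(f'{model.num_parameters():,} parameters')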
import argparse
import os

from omegaconf import OmegaConf
from transformers import (DataCollatorForLanguageModeling, DistilBertConfig,
                          DistilBertForMaskedLM, LineByLineTextDataset,
                          PreTrainedTokenizerFast)

DATA_PATH = 'data/item_name.txt'

parser = argparse.ArgumentParser(description='Training language model')
parser.add_argument('--config_path', type=str,
                    default='src/configs/train_lm1.yaml',
                    help='path to config file')
args = parser.parse_args()
config = OmegaConf.load(args.config_path)
print(OmegaConf.to_yaml(config))
os.environ['WANDB_DISABLED'] = 'true'

tokenizer = PreTrainedTokenizerFast(tokenizer_file=config.tokenizer_path)
tokenizer.mask_token = '[MASK]'
tokenizer.pad_token = '[PAD]'
tokenizer.sep_token = '[SEP]'
tokenizer.cls_token = '[CLS]'
tokenizer.unk_token = '[UNK]'

distilbert_config = DistilBertConfig(vocab_size=config.vocab_size,
                                     n_heads=8, dim=512, hidden_dim=2048)
model = DistilBertForMaskedLM(distilbert_config)

dataset = LineByLineTextDataset(tokenizer=tokenizer,
                                file_path=DATA_PATH,
                                block_size=64)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,              # mask tokens for the MLM objective
    mlm_probability=0.15,  # standard BERT masking rate (assumed; the
                           # original snippet is truncated at this call)
)
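# Sketch (assumption): the truncated script presumably hands the dataset and
# collator to a Trainer; the output directory and hyperparameter values below
# are illustrative, not from the original config.
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='output/lm1',  # hypothetical output directory
    num_train_epochs=3,
    per_device_train_batch_size=64,
    save_steps=10_000,
)
trainer = Trainer(model=model,
                  args=training_args,
                  data_collator=data_collator,
                  train_dataset=dataset)
trainer.train()
trainer.save_model('output/lm1')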