def roberta_build(self, sparse=False, base_model=None, density=1.0, eval=True):
    if base_model is None:
        config = RobertaConfig(
            vocab_size=52_000,
            max_position_embeddings=514,
            num_attention_heads=12,
            num_hidden_layers=6,
            type_vocab_size=1,
        )
        model = RobertaForMaskedLM(config=config).cuda()
    else:
        model = base_model
    if sparse:
        mp = BlockSparseModelPatcher()
        mp.add_pattern(r"roberta\.encoder\.layer\.[0-9]+\.intermediate\.dense",
                       {"density": density})
        mp.add_pattern(r"roberta\.encoder\.layer\.[0-9]+\.output\.dense",
                       {"density": density})
        mp.patch_model(model)
    if eval:
        model.eval()
    return model, model.num_parameters()
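# A hedged usage sketch: in this excerpt roberta_build is a plain function whose
# `self` argument is unused, so it is called with None here; in the original it
# presumably sits on a model-builder class. BlockSparseModelPatcher is assumed
# to come from the pytorch_block_sparse package.
from pytorch_block_sparse import BlockSparseModelPatcher  # assumed import
from transformers import RobertaConfig, RobertaForMaskedLM

dense_model, n_dense = roberta_build(None, sparse=False)
sparse_model, n_sparse = roberta_build(None, sparse=True, density=0.5)
print(f"dense params: {n_dense}, block-sparse patched params: {n_sparse}")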
def train_MLM(vocf, outmodel, data_df):
    bs = 8
    # tokenizer = BertWordPieceTokenizer(vocf)  # input vocab.txt
    ttk = BertTokenizer.from_pretrained(vocf)  # input vocab.txt
    with open(vocf) as fvoc:
        vlen = len(fvoc.readlines())
    config = RobertaConfig(
        vocab_size=vlen,
        max_position_embeddings=12,  # positions start at padding_idx + 1 in RoBERTa, so inputs can be at most 10 tokens long
        num_attention_heads=12,
        num_hidden_layers=6,
        type_vocab_size=1,
        hidden_size=768,
    )
    model = RobertaForMaskedLM(config=config)
    model.num_parameters()
    dataset = tokDataset(data_df, ttk)
    # Data = DataLoader(dataset, batch_size=bs, shuffle=True, drop_last=False,
    #                   num_workers=0, collate_fn=collate_fn)
    # data_collator = DataCollatorForLanguageModeling(
    #     tokenizer=ttk, mlm=True, mlm_probability=0.15
    # )
    data_collator = collate_fn(tokenizer=ttk, mlm=True, mlm_probability=0.15)
    training_args = TrainingArguments(
        output_dir=outmodel,  # embedding model path
        overwrite_output_dir=True,
        num_train_epochs=2,
        per_device_train_batch_size=bs,
        save_steps=10_000,
        save_total_limit=2,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator,
        prediction_loss_only=True,
    )
    trainer.train()
    trainer.save_model(outmodel)
    print('LM train done')
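# The helpers tokDataset and collate_fn are referenced above but not defined in
# this excerpt. A minimal sketch of what they might look like, assuming data_df
# holds one text per row in a column named "text" (the column name and the
# wrapper behaviour are assumptions):
import torch
from torch.utils.data import Dataset
from transformers import DataCollatorForLanguageModeling

class tokDataset(Dataset):
    """Tokenizes one DataFrame row per item."""
    def __init__(self, data_df, tokenizer, max_length=10):
        # keep max_length <= max_position_embeddings - 2 (10 for the config above)
        self.texts = data_df["text"].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(self.texts[idx], truncation=True,
                             max_length=self.max_length)
        return {"input_ids": torch.tensor(enc["input_ids"])}

def collate_fn(tokenizer, mlm=True, mlm_probability=0.15):
    # Thin wrapper so the call above behaves like the commented-out
    # DataCollatorForLanguageModeling construction.
    return DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=mlm, mlm_probability=mlm_probability)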
def build(config):
    tokenizer = RobertaTokenizerFast.from_pretrained(
        os.path.join(config.save_directory), max_len=config.max_length
    )
    model_config = RobertaConfig(
        vocab_size=config.vocab_size,
        max_position_embeddings=config.max_length,
        num_attention_heads=config.num_attention_heads,
        num_hidden_layers=config.num_hidden_layers,
        type_vocab_size=1,
    )
    model = RobertaForMaskedLM(config=model_config)
    print("the number of parameters of model: ", model.num_parameters())
    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=config.files,
        block_size=32,
    )
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=config.mlm_probability
    )
    training_args = TrainingArguments(
        output_dir=os.path.join(config.save_directory),
        overwrite_output_dir=config.overwrite_output_dir,
        num_train_epochs=config.num_train_epochs,
        per_gpu_train_batch_size=config.per_gpu_train_batch_size,
        save_steps=config.save_steps,
        save_total_limit=config.save_total_limit,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
        prediction_loss_only=config.prediction_loss_only,
    )
    return trainer
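# A hedged usage sketch of build(): the `config` argument is assumed to be a
# simple namespace carrying the attributes referenced in the function body
# (attribute names come from the body above; the values here are illustrative only).
from types import SimpleNamespace

config = SimpleNamespace(
    save_directory="./roberta-small",
    max_length=512,
    vocab_size=52_000,
    num_attention_heads=12,
    num_hidden_layers=6,
    files="./corpus.txt",
    mlm_probability=0.15,
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_gpu_train_batch_size=16,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)
trainer = build(config)
trainer.train()
trainer.save_model(config.save_directory)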
tokenizer.enable_truncation(max_length=512)
print(tokenizer.encode("For it is in reality vain to profess"))

config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
tokenizer = RobertaTokenizerFast.from_pretrained(SAVE_MODEL, max_len=512)
model = RobertaForMaskedLM(config=config)
print(model.num_parameters())

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=PATH + "/kant.txt",
    block_size=128,
)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)
training_args = TrainingArguments(
    output_dir=SAVE_MODEL,
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64,
from transformers import RobertaConfig
from transformers import RobertaTokenizerFast
from transformers import RobertaForMaskedLM

config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
tokenizer = RobertaTokenizerFast.from_pretrained("./EsperBERTo", max_len=512)
model = RobertaForMaskedLM(config=config)
model.num_parameters()

from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./oscar.eo.txt",
    block_size=128,
)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

from transformers import Trainer, TrainingArguments
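# The snippet stops right after importing Trainer and TrainingArguments. A
# minimal continuation sketch, assuming the model is saved back to ./EsperBERTo
# (the hyperparameter values here are illustrative, not the original ones):
training_args = TrainingArguments(
    output_dir="./EsperBERTo",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
trainer.train()
trainer.save_model("./EsperBERTo")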
("<s>", tokenizer.token_to_id("<s>")), ) tokenizer.enable_truncation(max_length=512) print(tokenizer.encode("gostei muito dessa ideia".lower()).tokens) # Model type # -------------------------------------------------- config = RobertaConfig( vocab_size=vocab_size, max_position_embeddings=514, num_attention_heads=12, num_hidden_layers=8, type_vocab_size=1, ) model = RobertaForMaskedLM(config=config) print("Params: ", model.num_parameters()) tokenizer = RobertaTokenizerFast.from_pretrained("./BR_BERTo", max_len=512) # Dataset load # -------------------------------------------------- dataset = EsperantoDataset( tokenizer=tokenizer, file_path="./corpus.txt", length=corpus_length ) # Start training # -------------------------------------------------- data_collator = DataCollatorForLanguageModeling( tokenizer=tokenizer, mlm=True, mlm_probability=0.15 )
def main(argv):
    wandb.login()
    is_gpu = torch.cuda.is_available()

    config = RobertaConfig(
        vocab_size=FLAGS.vocab_size,
        max_position_embeddings=FLAGS.max_position_embeddings,
        num_attention_heads=FLAGS.num_attention_heads,
        num_hidden_layers=FLAGS.num_hidden_layers,
        type_vocab_size=FLAGS.type_vocab_size,
    )

    if FLAGS.tokenizer_path:
        tokenizer_path = FLAGS.tokenizer_path
    elif FLAGS.tokenizer_type.upper() == "BPE":
        tokenizer_path = FLAGS.output_tokenizer_dir
        if not os.path.isdir(tokenizer_path):
            os.makedirs(tokenizer_path)
        tokenizer = ByteLevelBPETokenizer()
        tokenizer.train(
            files=FLAGS.dataset_path,
            vocab_size=FLAGS.vocab_size,
            min_frequency=FLAGS.BPE_min_frequency,
            special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
        tokenizer.save_model(tokenizer_path)
    else:
        print("Please provide a tokenizer path if using the SMILES tokenizer")

    tokenizer = RobertaTokenizerFast.from_pretrained(
        tokenizer_path, max_len=FLAGS.max_tokenizer_len)
    model = RobertaForMaskedLM(config=config)
    model.num_parameters()

    dataset = RawTextDataset(tokenizer=tokenizer,
                             file_path=FLAGS.dataset_path,
                             block_size=FLAGS.tokenizer_block_size)
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=FLAGS.mlm_probability)

    training_args = TrainingArguments(
        output_dir=FLAGS.output_dir,
        overwrite_output_dir=FLAGS.overwrite_output_dir,
        num_train_epochs=FLAGS.num_train_epochs,
        per_device_train_batch_size=FLAGS.per_device_train_batch_size,
        save_steps=FLAGS.save_steps,
        save_total_limit=FLAGS.save_total_limit,
        fp16=is_gpu,  # fp16 only works on CUDA devices
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )
    trainer.train()
    trainer.save_model(FLAGS.model_name)
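# main() reads absl-style FLAGS that are defined elsewhere in the script. A
# hedged sketch of the definitions it appears to rely on: the flag names are
# taken from the usage above, but the defaults and help strings are illustrative
# assumptions only.
from absl import app, flags

FLAGS = flags.FLAGS
flags.DEFINE_string("dataset_path", None, "Path to the training text file")
flags.DEFINE_string("tokenizer_path", "", "Existing tokenizer directory")
flags.DEFINE_string("tokenizer_type", "BPE", "Tokenizer type, e.g. BPE or SMILES")
flags.DEFINE_string("output_tokenizer_dir", "./tokenizer", "Where to save a newly trained BPE tokenizer")
flags.DEFINE_string("output_dir", "./model", "Trainer output directory")
flags.DEFINE_string("model_name", "./model_final", "Where to save the final model")
flags.DEFINE_integer("vocab_size", 52_000, "Vocabulary size")
flags.DEFINE_integer("max_position_embeddings", 514, "Maximum position embeddings")
flags.DEFINE_integer("num_attention_heads", 12, "Number of attention heads")
flags.DEFINE_integer("num_hidden_layers", 6, "Number of hidden layers")
flags.DEFINE_integer("type_vocab_size", 1, "Token type vocabulary size")
flags.DEFINE_integer("max_tokenizer_len", 512, "Tokenizer maximum length")
flags.DEFINE_integer("tokenizer_block_size", 512, "Dataset block size")
flags.DEFINE_integer("BPE_min_frequency", 2, "Minimum token frequency for BPE training")
flags.DEFINE_float("mlm_probability", 0.15, "Masking probability")
flags.DEFINE_bool("overwrite_output_dir", True, "Overwrite the output directory")
flags.DEFINE_integer("num_train_epochs", 10, "Training epochs")
flags.DEFINE_integer("per_device_train_batch_size", 64, "Batch size per device")
flags.DEFINE_integer("save_steps", 10_000, "Checkpoint interval in steps")
flags.DEFINE_integer("save_total_limit", 2, "Maximum number of checkpoints kept")

if __name__ == "__main__":
    app.run(main)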
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling

# configs
config = RobertaConfig(
    vocab_size=52000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
tokenizer = RobertaTokenizerFast.from_pretrained("./latentbert", max_len=512)
model = RobertaForMaskedLM(config=config)
print('num params: {}'.format(model.num_parameters()))

# training dataset
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="../results_file.txt",
    block_size=128,
)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)
print('before trainer')

# trainer
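# The trainer block announced above is cut off in this excerpt. Once training
# has finished and the model has been saved (assumed here to go back into
# ./latentbert), a quick fill-mask sanity check might look like this; the
# example sentence is an arbitrary placeholder.
from transformers import pipeline

fill_mask = pipeline("fill-mask", model="./latentbert", tokenizer="./latentbert")
print(fill_mask("The model predicts the <mask> token."))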
def train(epoch, vocab_size, train_files_path, save_path, learning_rate,
          save_steps, per_gpu_train_batch_size, gradient_accumulation_steps):
    """Train a RoBERTa language model from scratch."""
    from transformers import RobertaConfig
    config = RobertaConfig(
        # tunable parameters
        vocab_size=vocab_size,
        max_position_embeddings=514,
        num_attention_heads=12,
        num_hidden_layers=12,
        type_vocab_size=1,
    )

    # Do not train a new tokenizer; reuse BERT's tokenizer directly.
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext", max_len=512)
    # In fact, hfl/chinese-roberta-wwm-ext/vocab.txt == chinese_L-12_H-768_A-12/vocab.txt (the vocabulary)

    from transformers import RobertaForMaskedLM
    """
    Library source:
    class RobertaForMaskedLM():
        def __init__(self, config):
            super().__init__(config)
            if not config.is_decoder:
                logger.warning("If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`")
            self.roberta = RobertaModel(config, add_pooling_layer=False)
            self.lm_head = RobertaLMHead(config)
            self.init_weights()
    As the source shows, RobertaForMaskedLM wraps the main roberta model
    (without the pooling layer) plus a language-modeling output head.
    """
    # Initialize the model
    model = RobertaForMaskedLM(config=config)
    print("Number of parameters: ", model.num_parameters())  # roughly 100M parameters

    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_files_path,
        block_size=512,
    )

    from transformers import DataCollatorForLanguageModeling
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )

    from transformers import Trainer, TrainingArguments
    training_args = TrainingArguments(
        output_dir=save_path,
        overwrite_output_dir=True,
        num_train_epochs=epoch,
        per_gpu_train_batch_size=per_gpu_train_batch_size,
        save_steps=save_steps,
        save_total_limit=2,
        gradient_accumulation_steps=gradient_accumulation_steps,  # 32*8=256 batch_size
        learning_rate=learning_rate,
        weight_decay=0.01,
        adam_epsilon=1e-6,
        warmup_steps=10000,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )
    trainer.train()
    trainer.save_model(save_path)
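# A hedged example invocation of train(): every path and hyperparameter below is
# illustrative rather than taken from the original script, and vocab_size should
# match the line count of the tokenizer's vocab.txt.
train(
    epoch=3,
    vocab_size=21_128,
    train_files_path="./corpus_zh.txt",
    save_path="./roberta_zh_mlm",
    learning_rate=1e-4,
    save_steps=10_000,
    per_gpu_train_batch_size=32,
    gradient_accumulation_steps=8,
)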
def main():
    # Main training script
    wandb.login()

    # verify file length
    fname = 'pubchem-10m.txt'

    def file_len(fname):
        with open(fname) as f:
            for i, l in enumerate(f):
                pass
        return i + 1

    print(file_len(fname))

    torch.cuda.is_available()  # checking if CUDA + Colab GPU works

    config = RobertaConfig(
        vocab_size=52_000,
        max_position_embeddings=512,
        num_attention_heads=12,
        num_hidden_layers=6,
        type_vocab_size=1,
    )
    tokenizer = RobertaTokenizerFast.from_pretrained(
        "seyonec/SMILES_tokenized_PubChem_shard00_160k", max_len=512)

    # test
    tokenizer.encode("[O-][N+](=O)c1cnc(s1)Sc1nnc(s1)N")

    model = RobertaForMaskedLM(config=config)
    model.num_parameters()

    dataset = RawTextDataset(tokenizer=tokenizer,
                             file_path="pubchem-10m.txt",
                             block_size=512)
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

    training_args = TrainingArguments(
        output_dir="PubChem_10M_SMILES_Tokenizer",
        overwrite_output_dir=True,
        num_train_epochs=10,
        per_gpu_train_batch_size=64,
        save_steps=10_000,
        save_total_limit=2,
        fp16=True,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )
    trainer.train()
    trainer.save_model("PubChem_10M_SMILES_Tokenizer")
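# RawTextDataset is used here and in the FLAGS-driven script above but is not
# defined in this excerpt. A minimal sketch, assuming one SMILES string per line
# and simple per-line tokenization (the real class may differ):
import torch
from torch.utils.data import Dataset

class RawTextDataset(Dataset):
    def __init__(self, tokenizer, file_path, block_size):
        with open(file_path, encoding="utf-8") as f:
            lines = [l.strip() for l in f if l.strip()]
        self.examples = [
            torch.tensor(tokenizer(line, truncation=True,
                                   max_length=block_size)["input_ids"])
            for line in lines
        ]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        return self.examples[idx]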