def test_model_for_pretraining_from_pretrained(self):
    logging.basicConfig(level=logging.INFO)
    for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
        config = AutoConfig.from_pretrained(model_name)
        self.assertIsNotNone(config)
        self.assertIsInstance(config, BertConfig)

        model = AutoModelForPreTraining.from_pretrained(model_name)
        model, loading_info = AutoModelForPreTraining.from_pretrained(model_name, output_loading_info=True)
        self.assertIsNotNone(model)
        self.assertIsInstance(model, BertForPreTraining)
        for value in loading_info.values():
            self.assertEqual(len(value), 0)
def test_model_for_pretraining_from_pretrained(self):
    for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
        config = AutoConfig.from_pretrained(model_name)
        self.assertIsNotNone(config)
        self.assertIsInstance(config, BertConfig)

        model = AutoModelForPreTraining.from_pretrained(model_name)
        model, loading_info = AutoModelForPreTraining.from_pretrained(model_name, output_loading_info=True)
        self.assertIsNotNone(model)
        self.assertIsInstance(model, BertForPreTraining)
        for key, value in loading_info.items():
            # Only one value should not be initialized and in the missing keys.
            self.assertEqual(len(value), 1 if key == "missing_keys" else 0)
def test_model_for_pretraining_from_pretrained(self):
    for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
        config = AutoConfig.from_pretrained(model_name)
        self.assertIsNotNone(config)
        self.assertIsInstance(config, BertConfig)

        model = AutoModelForPreTraining.from_pretrained(model_name)
        model, loading_info = AutoModelForPreTraining.from_pretrained(model_name, output_loading_info=True)
        self.assertIsNotNone(model)
        self.assertIsInstance(model, BertForPreTraining)

        # Only one value should not be initialized and in the missing keys.
        missing_keys = loading_info.pop("missing_keys")
        self.assertListEqual(["cls.predictions.decoder.bias"], missing_keys)
        for key, value in loading_info.items():
            self.assertEqual(len(value), 0)
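# --- Illustrative sketch (not part of the test suite) -----------------------------
# output_loading_info=True returns a dict of lists describing how checkpoint weights
# matched the model: missing_keys, unexpected_keys, error_msgs (newer transformers
# versions also report mismatched_keys). Model id below is an example.
model, loading_info = AutoModelForPreTraining.from_pretrained(
    "bert-base-uncased", output_loading_info=True
)
print(loading_info["missing_keys"])   # e.g. ["cls.predictions.decoder.bias"]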
def word_preds(datadir: str, ff_size: int):
    log.configure(os.path.join(datadir, "dabert-word-preds.log"), "daBERT word predictions")
    log("Loading metadata")
    with open(os.path.join(datadir, DatasetBuilder.metadata_file)) as f:
        metadata = json.load(f)
    log("Loading model")
    dabert = AutoModelForPreTraining.from_pretrained(daBERT).to(device)
    log("Loading data")
    dataloader = DataLoader(
        datadir,
        metadata,
        dict(),
        device,
    )
    loader = dataloader.get_dataloader(ff_size, None)
    log("Forward passing")
    correct_preds = np.zeros(len(loader))
    for i, batch in tqdm(enumerate(loader), total=len(loader)):
        logits = dabert(batch.words.ids).prediction_logits
        masked_logits = logits[batch.word_mask]
        preds = masked_logits.argmax(dim=1)
        correct_preds[i] = (preds == batch.word_mask_labels).float().mean().cpu()
    log(
        "MLM token prediction accuracy",
        "  Mean: %.4f %%" % (100 * correct_preds.mean()),
        "  Std.: %.4f %%" % (100 * correct_preds.std(ddof=1)),
    )
def export_model(
    hf_pretrained_model_name_or_path: str,
    output_base_path: str,
):
    """Retrieve model and tokenizer from Transformers and save all necessary data.

    Things saved:
    - Model weights
    - Model config JSON (corresponding to the Transformers model Config object)
    - Tokenizer data
    - JSON file pointing to paths for the above

    Args:
        hf_pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
            Can be either:

            - A string, the `model id` of a pretrained model configuration hosted inside a
              model repo on huggingface.co. Valid model ids can be located at the root-level,
              like ``bert-base-uncased``, or namespaced under a user or organization name,
              like ``dbmdz/bert-base-german-cased``.
            - A path to a `directory` containing a configuration file saved using the
              :meth:`~transformers.PretrainedConfig.save_pretrained` method, or the
              :meth:`~transformers.PreTrainedModel.save_pretrained` method, e.g.,
              ``./my_model_directory/``.
            - A path or url to a saved configuration JSON `file`, e.g.,
              ``./my_model_directory/configuration.json``.
        output_base_path: Base path to save output to.
    """
    model = AutoModelForPreTraining.from_pretrained(hf_pretrained_model_name_or_path)
    model_type = model.config_class.model_type
    model_fol_path = os.path.join(output_base_path, "model")
    model_path = os.path.join(model_fol_path, f"{model_type}.p")
    model_config_path = os.path.join(model_fol_path, f"{model_type}.json")
    tokenizer_fol_path = os.path.join(output_base_path, "tokenizer")

    os.makedirs(tokenizer_fol_path, exist_ok=True)
    os.makedirs(model_fol_path, exist_ok=True)

    torch.save(model.state_dict(), model_path)
    py_io.write_json(model.config.to_dict(), model_config_path)
    tokenizer = AutoTokenizer.from_pretrained(hf_pretrained_model_name_or_path)
    tokenizer.save_pretrained(tokenizer_fol_path)
    config = {
        "model_type": model_type,
        "model_path": model_path,
        "model_config_path": model_config_path,
    }
    py_io.write_json(config, os.path.join(output_base_path, "config.json"))
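# --- Hypothetical example call (model id and output directory are placeholders) ---
export_model(
    hf_pretrained_model_name_or_path="bert-base-uncased",
    output_base_path="./exports/bert-base-uncased",
)
# Writes model/<model_type>.p, model/<model_type>.json, tokenizer/, and a top-level
# config.json pointing at those paths.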
def test_model_for_pretraining_from_pretrained(self):
    # for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
    for model_name in ["bert-base-uncased"]:
        config = AutoConfig.from_pretrained(model_name)
        self.assertIsNotNone(config)
        self.assertIsInstance(config, BertConfig)

        model = TFAutoModelForPreTraining.from_pretrained(model_name, from_pt=True)
        self.assertIsNotNone(model)
        self.assertIsInstance(model, TFBertForPreTraining)

        model = AutoModelForPreTraining.from_pretrained(model_name, from_tf=True)
        self.assertIsNotNone(model)
        self.assertIsInstance(model, BertForPreTraining)
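# --- Illustrative sketch (not part of the test) ------------------------------------
# The from_pt / from_tf flags above convert checkpoints between frameworks on the fly.
# The same conversion can be done via an intermediate save; the local path below is
# hypothetical and both torch and tensorflow must be installed.
pt_model = AutoModelForPreTraining.from_pretrained("bert-base-uncased")
pt_model.save_pretrained("./bert-pt")
tf_model = TFAutoModelForPreTraining.from_pretrained("./bert-pt", from_pt=True)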
def __init__(self, model_name, num_epochs, batch_size, sentence_length, device, models_dir, patience=5):
    self.model_name = model_name
    # self.model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
    self.model = AutoModelForPreTraining.from_pretrained(model_name)
    self.num_epochs = num_epochs
    self.batch_size = batch_size
    self.device = device
    self.models_dir = models_dir
    self.patience = patience
    # self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
    self.tokenizer = AutoTokenizer.from_pretrained(model_name)
    AutoModelForPreTraining,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    PreTrainedTokenizerFast,
)

from transformers_interpret import BaseExplainer

DISTILBERT_MODEL = AutoModelForMaskedLM.from_pretrained("distilbert-base-uncased")
DISTILBERT_TOKENIZER = AutoTokenizer.from_pretrained("distilbert-base-uncased")

GPT2_MODEL = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")
GPT2_TOKENIZER = AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2")

BERT_MODEL = AutoModelForPreTraining.from_pretrained("lysandre/tiny-bert-random")
BERT_TOKENIZER = AutoTokenizer.from_pretrained("lysandre/tiny-bert-random")


class DummyExplainer(BaseExplainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def encode(self, text: str = None):
        return self.tokenizer.encode(text, add_special_tokens=False)

    def decode(self, input_ids):
        return self.tokenizer.convert_ids_to_tokens(input_ids[0])

    @property
    def word_attributions(self):
def get_model(self):
    from transformers import AutoModelForPreTraining

    _model = AutoModelForPreTraining.from_pretrained(self.pretrained_model_name_or_path)
    self.to_device(_model)
    return _model
def run_pretraining(args):
    if args.parallel and args.local_rank == -1:
        run_parallel_pretraining(args)
        return

    if args.local_rank == -1:
        if args.cpu:
            device = torch.device("cpu")
        else:
            device = torch.device("cuda")
        num_workers = 1
        worker_index = 0
    else:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        device = torch.device("cuda", args.local_rank)
        num_workers = torch.distributed.get_world_size()
        worker_index = torch.distributed.get_rank()

    if args.local_rank not in (-1, 0):
        logging.getLogger().setLevel(logging.WARN)

    logger.info(
        "Starting pretraining with the following arguments: %s", json.dumps(vars(args), indent=2, sort_keys=True)
    )

    if args.multilingual:
        dataset_dir_list = args.dataset_dir.split(",")
        dataset_list = [WikipediaPretrainingDataset(d) for d in dataset_dir_list]
    else:
        dataset_list = [WikipediaPretrainingDataset(args.dataset_dir)]

    bert_config = AutoConfig.from_pretrained(args.bert_model_name)

    dataset_size = sum([len(d) for d in dataset_list])
    num_train_steps_per_epoch = math.ceil(dataset_size / args.batch_size)
    num_train_steps = math.ceil(dataset_size / args.batch_size * args.num_epochs)
    train_batch_size = int(args.batch_size / args.gradient_accumulation_steps / num_workers)

    entity_vocab = dataset_list[0].entity_vocab

    config = LukeConfig(
        entity_vocab_size=entity_vocab.size,
        bert_model_name=args.bert_model_name,
        entity_emb_size=args.entity_emb_size,
        **bert_config.to_dict(),
    )
    model = LukePretrainingModel(config)

    global_step = args.global_step

    batch_generator_args = dict(
        batch_size=train_batch_size,
        masked_lm_prob=args.masked_lm_prob,
        masked_entity_prob=args.masked_entity_prob,
        whole_word_masking=args.whole_word_masking,
        unmasked_word_prob=args.unmasked_word_prob,
        random_word_prob=args.random_word_prob,
        unmasked_entity_prob=args.unmasked_entity_prob,
        random_entity_prob=args.random_entity_prob,
        mask_words_in_entity_span=args.mask_words_in_entity_span,
        num_workers=num_workers,
        worker_index=worker_index,
        skip=global_step * args.batch_size,
    )

    if args.multilingual:
        data_size_list = [len(d) for d in dataset_list]
        batch_generator = MultilingualBatchGenerator(
            dataset_dir_list,
            data_size_list,
            args.sampling_smoothing,
            **batch_generator_args,
        )
    else:
        batch_generator = LukePretrainingBatchGenerator(args.dataset_dir, **batch_generator_args)

    logger.info("Model configuration: %s", config)

    if args.fix_bert_weights:
        for param in model.parameters():
            param.requires_grad = False
        for param in model.entity_embeddings.parameters():
            param.requires_grad = True
        for param in model.entity_predictions.parameters():
            param.requires_grad = True

    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]

    optimizer = LukeAdamW(
        optimizer_parameters,
        lr=args.learning_rate,
        betas=(args.adam_b1, args.adam_b2),
        eps=args.adam_eps,
        grad_avg_device=torch.device("cpu") if args.grad_avg_on_cpu else device,
    )

    if args.fp16:
        from apex import amp

        if args.fp16_opt_level == "O2":
            model, optimizer = amp.initialize(
                model,
                optimizer,
                opt_level=args.fp16_opt_level,
                master_weights=args.fp16_master_weights,
                min_loss_scale=args.fp16_min_loss_scale,
                max_loss_scale=args.fp16_max_loss_scale,
            )
        else:
            model, optimizer = amp.initialize(
                model,
                optimizer,
                opt_level=args.fp16_opt_level,
                min_loss_scale=args.fp16_min_loss_scale,
                max_loss_scale=args.fp16_max_loss_scale,
            )

    if args.model_file is None:
        bert_model = AutoModelForPreTraining.from_pretrained(args.bert_model_name)
        bert_state_dict = bert_model.state_dict()
        model.load_bert_weights(bert_state_dict)
    else:
        model_state_dict = torch.load(args.model_file, map_location="cpu")
        model.load_state_dict(model_state_dict, strict=False)

    if args.optimizer_file is not None:
        optimizer.load_state_dict(torch.load(args.optimizer_file, map_location="cpu"))

    if args.amp_file is not None:
        amp.load_state_dict(torch.load(args.amp_file, map_location="cpu"))

    if args.lr_schedule == "warmup_constant":
        scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps)
    elif args.lr_schedule == "warmup_linear":
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=num_train_steps
        )
    else:
        raise RuntimeError(f"Invalid scheduler: {args.lr_schedule}")

    if args.scheduler_file is not None:
        scheduler.load_state_dict(torch.load(args.scheduler_file, map_location="cpu"))

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            broadcast_buffers=False,
            find_unused_parameters=True,
        )
    model.train()

    if args.local_rank == -1 or worker_index == 0:
        entity_vocab.save(os.path.join(args.output_dir, ENTITY_VOCAB_FILE))
        metadata = dict(
            model_config=config.to_dict(),
            max_seq_length=dataset_list[0].max_seq_length,
            max_entity_length=dataset_list[0].max_entity_length,
            max_mention_length=dataset_list[0].max_mention_length,
            arguments=vars(args),
        )
        with open(os.path.join(args.output_dir, "metadata.json"), "w") as metadata_file:
            json.dump(metadata, metadata_file, indent=2, sort_keys=True)

    def save_model(model, suffix):
        if args.local_rank != -1:
            model = model.module

        model_file = f"model_{suffix}.bin"
        torch.save(model.state_dict(), os.path.join(args.output_dir, model_file))
        optimizer_file = f"optimizer_{suffix}.bin"
        torch.save(optimizer.state_dict(), os.path.join(args.output_dir, optimizer_file))
        scheduler_file = f"scheduler_{suffix}.bin"
        torch.save(scheduler.state_dict(), os.path.join(args.output_dir, scheduler_file))
        metadata = dict(
            global_step=global_step, model_file=model_file, optimizer_file=optimizer_file, scheduler_file=scheduler_file
        )
        if args.fp16:
            amp_file = f"amp_{suffix}.bin"
            torch.save(amp.state_dict(), os.path.join(args.output_dir, amp_file))
            metadata["amp_file"] = amp_file
        with open(os.path.join(args.output_dir, f"metadata_{suffix}.json"), "w") as f:
            json.dump(metadata, f, indent=2, sort_keys=True)

    if args.local_rank == -1 or worker_index == 0:
        summary_writer = SummaryWriter(args.log_dir)
        pbar = tqdm(total=num_train_steps, initial=global_step)

    tr_loss = 0
    accumulation_count = 0
    results = []
    prev_error = False
    prev_step_time = time.time()
    prev_save_time = time.time()

    for batch in batch_generator.generate_batches():
        try:
            batch = {k: torch.from_numpy(v).to(device) for k, v in batch.items()}
            result = model(**batch)
            loss = result["loss"]
            result = {k: v.to("cpu").detach().numpy() for k, v in result.items()}

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            def maybe_no_sync():
                if (
                    hasattr(model, "no_sync")
                    and num_workers > 1
                    and accumulation_count + 1 != args.gradient_accumulation_steps
                ):
                    return model.no_sync()
                else:
                    return contextlib.ExitStack()

            with maybe_no_sync():
                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

        except RuntimeError:
            if prev_error:
                logger.exception("Consecutive errors have been observed. Exiting...")
                raise
            logger.exception("An unexpected error has occurred. Skipping a batch...")
            prev_error = True
            loss = None
            torch.cuda.empty_cache()
            continue

        accumulation_count += 1
        prev_error = False
        tr_loss += loss.item()
        loss = None
        results.append(result)

        if accumulation_count == args.gradient_accumulation_steps:
            if args.max_grad_norm != 0.0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            accumulation_count = 0

            summary = {}
            summary["learning_rate"] = max(scheduler.get_last_lr())
            summary["loss"] = tr_loss
            tr_loss = 0

            current_time = time.time()
            summary["batch_run_time"] = current_time - prev_step_time
            prev_step_time = current_time

            for name in ("masked_lm", "masked_entity"):
                try:
                    summary[name + "_loss"] = np.concatenate([r[name + "_loss"].flatten() for r in results]).mean()
                    correct = np.concatenate([r[name + "_correct"].flatten() for r in results]).sum()
                    total = np.concatenate([r[name + "_total"].flatten() for r in results]).sum()
                    if total > 0:
                        summary[name + "_acc"] = correct / total
                except KeyError:
                    continue

            results = []

            if args.local_rank == -1 or worker_index == 0:
                for (name, value) in summary.items():
                    summary_writer.add_scalar(name, value, global_step)
                desc = (
                    f"epoch: {int(global_step / num_train_steps_per_epoch)} "
                    f'loss: {summary["loss"]:.4f} '
                    f'time: {datetime.datetime.now().strftime("%H:%M:%S")}'
                )
                pbar.set_description(desc)
                pbar.update()

            global_step += 1

            if args.local_rank == -1 or worker_index == 0:
                if global_step == num_train_steps:
                    # save the final model
                    save_model(model, f"epoch{args.num_epochs}")
                    time.sleep(60)
                elif global_step % num_train_steps_per_epoch == 0:
                    # save the model at each epoch
                    epoch = int(global_step / num_train_steps_per_epoch)
                    save_model(model, f"epoch{epoch}")
                if args.save_interval_sec and time.time() - prev_save_time > args.save_interval_sec:
                    save_model(model, f"step{global_step:07}")
                    prev_save_time = time.time()
                if args.save_interval_steps and global_step % args.save_interval_steps == 0:
                    save_model(model, f"step{global_step}")

            if global_step == num_train_steps:
                break

    if args.local_rank == -1 or worker_index == 0:
        summary_writer.close()
def __init__(self, bert_model_name, dropout, linear_units):
    super(BERTBaseUncased, self).__init__()
    self.bert = AutoModelForPreTraining.from_pretrained(bert_model_name)
    self.pre_classifier = nn.Linear(30522, linear_units)
    self.bert_drop = nn.Dropout(dropout)
    self.classifier = nn.Linear(linear_units, 2)
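# --- Illustrative sketch (not from the original source) ----------------------------
# The Linear(30522, linear_units) head above implies the classifier consumes the
# vocab-sized MLM prediction logits. A forward pass consistent with those shapes,
# assuming dict-style model outputs; taking the [CLS] position is an assumption.
def forward(self, input_ids, attention_mask, token_type_ids):
    out = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
    cls_logits = out.prediction_logits[:, 0, :]   # (batch, 30522); positional choice is an assumption
    hidden = self.bert_drop(self.pre_classifier(cls_logits))
    return self.classifier(hidden)                # (batch, 2)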
def __init__(self, config, model_dir):
    super(LongformerForClassification, self).__init__(config)
    self.longformer = AutoModelForPreTraining.from_pretrained(model_dir, config=config)
    self.classify = torch.nn.Linear(config.hidden_size, config.num_labels)
    self.init_weights()
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if training_args.should_log else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if training_args.should_log: transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() logger.info(f"Training/evaluation parameters {training_args}") # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub # # For CSV/JSON files, this script will use the column called 'text' or the first column. You can easily tweak this # behavior (see below) # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. 
datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) if "validation" not in datasets.keys(): datasets["validation"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, ) datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, ) else: assert False # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config_kwargs = { "cache_dir": model_args.cache_dir, "revision": model_args.model_revision, "use_auth_token": True if model_args.use_auth_token else None, } if model_args.config_name: config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) elif model_args.model_name_or_path: config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) else: config = CONFIG_MAPPING[model_args.model_type]() logger.warning( "You are instantiating a new config instance from scratch.") tokenizer_kwargs = { "cache_dir": model_args.cache_dir, "use_fast": model_args.use_fast_tokenizer, "revision": model_args.model_revision, "use_auth_token": True if model_args.use_auth_token else None, } if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) elif model_args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( model_args.model_name_or_path, **tokenizer_kwargs) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) if model_args.model_name_or_path: model = AutoModelForPreTraining.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) else: logger.info("Training new model from scratch") model = AutoModelForMaskedLM.from_config(config) model.resize_token_embeddings(len(tokenizer)) # Preprocessing the datasets. # First we tokenize all the texts. if training_args.do_train: column_names = datasets["train"].column_names else: column_names = datasets["validation"].column_names text_column_name = "text" if "text" in column_names else column_names[0] if data_args.max_seq_length is None: max_seq_length = tokenizer.model_max_length if max_seq_length > 1024: logger.warning( f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx." ) max_seq_length = 1024 else: if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." 
) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) if data_args.line_by_line: pass else: # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts. # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more # efficient when it receives the `special_tokens_mask`. combined_wiki = pickle.load(open("combined_wiki.p", "rb")) def tokenize_function(examples): new_examples_text = [] new_examples_title = [] for article, title in zip(examples['text'], examples['title']): temp = article.split("Category:") assert len(temp) >= 1 if len(temp) == 1: assert title not in combined_wiki or len( combined_wiki[title]) == 0 new_examples_text.append(temp[0]) examples['text'] = new_examples_text new_examples_text = [] for article, title in zip(examples['text'], examples['title']): full_paragraph = [] for i, paragraph in enumerate(article.split('.\n')): paragraph += '.\n' paragraph = paragraph.split(' ') if len(full_paragraph) + len(paragraph) > 300: new_examples_text.append(' '.join(full_paragraph)) new_examples_title.append(title) full_paragraph = [] full_paragraph += paragraph if full_paragraph: new_examples_text.append(' '.join(full_paragraph)) new_examples_title.append(title) examples['text'] = new_examples_text examples['title'] = new_examples_title examples['category_labels'] = [] for title in examples['title']: zeros = torch.zeros(2211) zeros[0] = 1 if title in combined_wiki: for category_idx in combined_wiki[title]: if category_idx in [5, 8, 9]: continue zeros[category_idx] = 1 zeros[0] = 0 examples['category_labels'].append(zeros) return tokenizer(examples[text_column_name], return_special_tokens_mask=True, truncation=True, padding=True) tokenized_datasets = datasets.map( tokenize_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) # # # Main data processing function that will concatenate all texts from our dataset and generate chunks of # # max_seq_length. # def group_texts(examples): # # Concatenate all texts. # concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} # total_length = len(concatenated_examples[list(examples.keys())[0]]) # # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # # customize this part to your needs. # total_length = (total_length // max_seq_length) * max_seq_length # # Split by chunks of max_len. # result = { # k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)] # for k, t in concatenated_examples.items() # } # return result # # # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a # # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value # # might be slower to preprocess. # # # # To speed up this part, we use multiprocessing. 
See the documentation of the map method for more information: # # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map # # tokenized_datasets = tokenized_datasets.map( # group_texts, # batched=True, # num_proc=data_args.preprocessing_num_workers, # load_from_cache_file=not data_args.overwrite_cache, # ) if training_args.do_train: if "train" not in tokenized_datasets: raise ValueError("--do_train requires a train dataset") train_dataset = tokenized_datasets["train"] if data_args.max_train_samples is not None: train_dataset = train_dataset.select( range(data_args.max_train_samples)) if training_args.do_eval: if "validation" not in tokenized_datasets: raise ValueError("--do_eval requires a validation dataset") eval_dataset = tokenized_datasets["validation"] if data_args.max_eval_samples is not None: eval_dataset = eval_dataset.select( range(data_args.max_eval_samples)) # Data collator # This one will take care of randomly masking the tokens. pad_to_multiple_of_8 = data_args.line_by_line and training_args.fp16 and not data_args.pad_to_max_length data_collator = DataCollatorForLanguageModeling( tokenizer=tokenizer, mlm_probability=data_args.mlm_probability, pad_to_multiple_of=8 if pad_to_multiple_of_8 else None, ) # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, ) # Training if training_args.do_train: checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate() max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len( eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) try: perplexity = math.exp(metrics["eval_loss"]) except OverflowError: perplexity = float("inf") metrics["perplexity"] = perplexity trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) if training_args.push_to_hub: kwargs = { "finetuned_from": model_args.model_name_or_path, "tags": "fill-mask" } if data_args.dataset_name is not None: kwargs["dataset_tags"] = data_args.dataset_name if data_args.dataset_config_name is not None: kwargs["dataset_args"] = data_args.dataset_config_name kwargs[ "dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" else: kwargs["dataset"] = data_args.dataset_name trainer.push_to_hub(**kwargs)
def train( rank: int, world_size: int, *, resume: bool, location: str, name: str, quiet: bool, save_every: int, validate_every: int, post_command: str, explicit_args: set[str], params: Hyperparams, ): # Get filepath within path context fpath = lambda path: os.path.join(location, path) if isinstance(path, str) else os.path.join(location, *path) # Setup multi-gpu if used setup(rank, world_size) is_master = rank < 1 # Are we on the main node? is_distributed = rank != -1 # Are we performing distributed computing? num_workers = torch.distributed.get_world_size() if is_distributed else 1 # Update locations TrainResults.subfolder = name Hyperparams.subfolder = name # Setup logger log.configure( os.path.join(location, name, "pretraining-worker=%s.log" % (rank if is_distributed else 0)), "DaLUKE pretraining on node %i" % rank, log_commit = True, print_level = (Levels.INFO if quiet else Levels.DEBUG) if is_master else None, append = resume, # Append to existing log file if we are resuming training ) post_time, post_command = parse_post_command(post_command) execute_post_command = False if post_time: log("Quitting in %.2f h and running command '%s'" % ((post_time-time.time())/3600, post_command)) if resume: log("Resuming from %s" % name) # Load results and hyperparameters from earlier training res = TrainResults.load(location) # Close unended profiles close_tt(res.tt) TT.fuse(res.tt) res.tt = TT tmp_saved_pu = res.parameter_update loaded_params = Hyperparams.load(location) # Overwrite ff-size if given explicitly if "ff_size" in explicit_args: loaded_params.ff_size = params.ff_size params = loaded_params else: tmp_saved_pu = None log.section("Starting pretraining with the following hyperparameters", params) log("Training using %i workers" % num_workers) log("Reading metadata and entity vocabulary") with open(fpath(DatasetBuilder.metadata_file)) as f: metadata = json.load(f) with open(fpath(DatasetBuilder.entity_vocab_file)) as f: entity_vocab = json.load(f) log("Loaded metadata:", json.dumps(metadata, indent=4)) log(f"Loaded entity vocabulary of {len(entity_vocab)} entities") if params.ent_min_mention: log("Removing entities with less than %i mentions" % params.ent_min_mention) entity_vocab = { ent: info for ent, info in entity_vocab.items() if info["count"] >= params.ent_min_mention or ent in {"[PAD]", "[UNK]", "[MASK]"} } log("After filtering, entity vocab now has %i entities" % len(entity_vocab)) # Device should be cuda:rank or just cuda if single gpu, else cpu if is_distributed: device = torch.device("cuda", index=rank) else: device = torch.device("cuda" if torch.cuda.is_available() else "cpu") log.debug( "Hardware for this worker:", "CPU: %s" % cpuinfo.get_cpu_info()["brand_raw"], "GPU: %s" % (torch.cuda.get_device_name(device) if torch.cuda.is_available() else "NA"), sep="\t\n", ) if params.entity_loss_weight: log("Setting up loss function with entity loss weighting") # Don't weigh special tokens weights = torch.Tensor([0, 0, 0, *(1 / info["count"] for info in entity_vocab.values() if info["count"])]).to(device) entity_criterion = nn.CrossEntropyLoss(weight=weights) else: log("Setting up loss function without entity loss weighting") entity_criterion = nn.CrossEntropyLoss() word_criterion = nn.CrossEntropyLoss() loss_calculator = lambda w, e: params.word_ent_weight * w + (1 - params.word_ent_weight) * e # Load dataset and training results bert_config = AutoConfig.from_pretrained(metadata["base-model"]) if metadata["reduced-vocab"]: token_map_file = fpath(DatasetBuilder.token_map_file) 
log("Loading token map from '%s'" % token_map_file) token_map = np.load(token_map_file) tokenizer = AutoTokenizer.from_pretrained(metadata["base-model"]) *__, unk_id = get_special_ids(tokenizer) token_reduction = token_map_to_token_reduction(token_map, unk_id) else: token_map = None log("Building dataset") data = DataLoader( location, metadata, entity_vocab, device, params.word_mask_prob, params.word_unmask_prob, params.word_randword_prob, params.ent_mask_prob, vocab_size=metadata["vocab-size"], token_map=token_map, ent_min_mention=params.ent_min_mention, ) sampler = (DistributedSampler if is_distributed else RandomSampler)(data.train_examples) log("Built %i examples" % len(data)) loader = data.get_dataloader(params.ff_size, sampler) val_loader = data.get_dataloader(params.ff_size, SequentialSampler(data.val_examples), validation=True) # Number of subbatches in each parameter update (batch) grad_accumulation_steps = params.batch_size // (params.ff_size * num_workers) # How many full batches can be made from the dataset batches_in_data = len(data) // params.batch_size log( "Parameter updates: %i" % params.parameter_updates, "Subbatches per parameter update: %i" % grad_accumulation_steps, "Subbatches generated: %i" % len(loader), "Batches needed to cover dataset: %i" % batches_in_data, ) if not resume: # Calculate parameter differences, when at least 20k examples have been seen paramdiff_every = ceil(MIN_EXAMPLES_PER_PARAMDIFF / params.batch_size) log("Recalculating parameter differences every %i'th parameter update" % paramdiff_every) top_k = [1, 3, 10] log("Calculating top %s accuracies" % top_k) if validate_every: val_updates = unique(np.array( np.arange(-1, params.parameter_updates, validate_every).tolist() + [params.parameter_updates-1] ))[1:] else: val_updates = np.array([], dtype=int) res = TrainResults( runtime = np.zeros(params.parameter_updates), lr = np.zeros(params.parameter_updates), parameter_update = 0, losses = np.zeros(params.parameter_updates), scaled_loss = np.zeros(params.parameter_updates), top_k = top_k, w_losses = np.zeros(params.parameter_updates), e_losses = np.zeros(params.parameter_updates), w_accuracies = np.zeros((params.parameter_updates, len(top_k))), e_accuracies = np.zeros((params.parameter_updates, len(top_k))), val_param_updates = val_updates, val_losses = np.zeros(len(val_updates)), val_w_losses = np.zeros(len(val_updates)), val_e_losses = np.zeros(len(val_updates)), val_w_accuracies = np.zeros((len(val_updates), len(top_k))), val_e_accuracies = np.zeros((len(val_updates), len(top_k))), paramdiff_every = paramdiff_every, groups_to_slices = None, # Set later orig_params = None, paramdiff_1 = None, luke_exclusive_params = None, # Set later att_mats_from_base = None, # Set later tt = TT, ) save_pus = set(range(-1, params.parameter_updates, save_every)).union({params.parameter_updates-1}) log("Saving model at parameter updates: %s" % sorted(save_pus), "Validating at parameter updates: %s" % res.val_param_updates.tolist()) # Build model, possibly by loading previous weights log.section("Setting up model") bert_config = AutoConfig.from_pretrained(metadata["base-model"]) if params.ent_hidden_size is None: params.ent_hidden_size = bert_config.hidden_size else: assert params.ent_hidden_size <= bert_config.hidden_size,\ "Entity hidden size (%i) cannot be larger than hidden size in '%s' (%i)" % ( params.hidden_size, metadata["base-model"], bert_config.hidden_size, ) log("Initializing model") model_cls = BertAttentionPretrainTaskDaLUKE if params.bert_attention else 
PretrainTaskDaLUKE model = model_cls( bert_config, ent_vocab_size = len(entity_vocab), ent_embed_size = params.ent_embed_size, ent_hidden_size = params.ent_hidden_size, ent_intermediate_size = params.ent_intermediate_size, ).to(device) bert_config.vocab_size = metadata["vocab-size"] log("Bert config", bert_config.to_json_string()) if params.lukeinit: log("Initializing weights in accordance with LUKE") model.apply(lambda module: model.init_weights(module, bert_config.initializer_range)) # Load parameters from base model if not params.no_base_model: log("Loading base model parameters") with TT.profile("Loading base model parameters"): base_model = AutoModelForPreTraining.from_pretrained(metadata["base-model"]) new_weights = load_base_model_weights( model, base_model.state_dict(), params.bert_attention, ) if metadata["reduced-vocab"]: log("Removing unneeded token weights") reduced_model = model_cls( bert_config, ent_vocab_size = len(entity_vocab), ent_embed_size = params.ent_embed_size, ent_hidden_size = params.ent_hidden_size, ent_intermediate_size = params.ent_intermediate_size, ).to(device) copy_with_reduced_state_dict(token_reduction, model, reduced_model) model = reduced_model else: new_weights = set(model.state_dict()) # Initialize self-attention query matrices to BERT word query matrices att_mat_keys = set() if not params.bert_attention and not params.no_base_model: log("Initializing new attention matrices with%s PCA" % ("" if params.pcainit else "out")) att_mat_keys = model.init_special_attention(params.pcainit, device) if not resume: res.luke_exclusive_params = new_weights res.att_mats_from_base = att_mat_keys if is_master: res.orig_params = all_params(model).cpu().numpy() log("Pretraining model initialized with %s parameters" % thousand_seps(len(model))) # Unfixes params at this parameter update unfix_base_model_params_pu = round(params.bert_fix_prop * params.parameter_updates) log("Unfixing base model params after %i parameter updates" % unfix_base_model_params_pu) if resume: mpath = fpath((TrainResults.subfolder, MODEL_OUT.format(i=res.parameter_update))) log("Loading model from '%s'" % mpath) model.load_state_dict(torch.load(mpath, map_location=device)) log(f"Resuming training saved at parameter update {res.parameter_update}") else: res.groups_to_slices, t = all_params_groups_to_slices(model, bert_config.num_hidden_layers) log("Parameter groups and positions", t) res.paramdiff_1 = { name: np.zeros(ceil(params.parameter_updates/res.paramdiff_every)) for name in res.groups_to_slices } if is_distributed: model = DDP(model, device_ids=[rank], find_unused_parameters=True) non_ddp_model = model.module if is_distributed else model log("Setting up optimizer, scaler, and learning rate scheduler") optimizer = get_optimizer(non_ddp_model, params.weight_decay, params.lr) scaler = amp.GradScaler() if params.fp16 else None scheduler = get_lr_scheduler( optimizer, int(params.warmup_prop * params.parameter_updates), params.parameter_updates, unfix_base_model_params_pu, ) if resume: optimizer.load_state_dict(torch.load(fpath((TrainResults.subfolder, OPTIMIZER_OUT.format(i=res.parameter_update))), map_location=device)) scheduler.load_state_dict(torch.load(fpath((TrainResults.subfolder, SCHEDULER_OUT.format(i=res.parameter_update))), map_location=device)) if params.fp16: scaler.load_state_dict(torch.load(fpath((TrainResults.subfolder, SCALER_OUT.format(i=res.parameter_update))), map_location=device)) res.parameter_update += 1 # We saved the data at pu i, but should now commence pu i+1 
log.debug("Time distribution before starting training", TT) log_memory_stats(device) log.section(f"Training DaLUKE for {params.parameter_updates} parameter updates") model.zero_grad() # To avoid tracking of model parameter manipulation model.train() # Start with transfer learned weights locked fix_base_model_params(res.luke_exclusive_params, non_ddp_model, True) fixed_params = True # Save initial parameters if is_master and not resume: with TT.profile("Saving progress"): paths = save_training(location, params, model.module if is_distributed else model, res, optimizer, scheduler, scaler, -1) log.debug("Saved initial state to", *paths) batch_iter = iter(loader) for i in range(res.parameter_update, params.parameter_updates): TT.profile("Parameter update") res.parameter_update = i if i >= unfix_base_model_params_pu and fixed_params: log("Unfixing base model params") fix_base_model_params(res.luke_exclusive_params, model, False) fixed_params = False if is_distributed and i % batches_in_data == 0: sampler.set_epoch(i // batches_in_data) # Losses and accuracies for this parameter update t_loss, w_loss, e_loss, s_loss = 0, 0, 0, 0 w_accuracies = np.zeros((grad_accumulation_steps, len(res.top_k))) e_accuracies = np.zeros((grad_accumulation_steps, len(res.top_k))) # Loop over enough batches to make a parameter update for j in range(grad_accumulation_steps): TT.profile("Sub-batch") try: batch = next(batch_iter) except StopIteration: batch_iter = iter(loader) batch = next(batch_iter) TT.profile("FP and gradients") with amp.autocast() if params.fp16 else contextlib.ExitStack(): word_preds, ent_preds = model(batch) # Compute and backpropagate loss word_loss = word_criterion(word_preds, batch.word_mask_labels) ent_loss = entity_criterion(ent_preds, batch.ent_mask_labels) has_entities = not torch.isnan(ent_loss).item() ent_loss = torch.nan_to_num(ent_loss) loss = loss_calculator(word_loss, ent_loss) loss /= grad_accumulation_steps # Only sync parameters on grad updates, aka last pass of this loop with model.no_sync() if is_distributed and j < grad_accumulation_steps - 1 else contextlib.ExitStack(): if params.fp16: scaled_loss = scaler.scale(loss) scaled_loss.backward() s_loss += scaled_loss.item() else: loss.backward() t_loss += loss.item() w_loss += word_loss.item() / grad_accumulation_steps e_loss += ent_loss.item() / grad_accumulation_steps if has_entities else 0 if torch.cuda.is_available(): torch.cuda.synchronize(rank if is_distributed else None) TT.end_profile() # Save accuracy for statistics if is_master: with TT.profile("Training accuracy"): w_accuracies[j] = top_k_accuracy(batch.word_mask_labels, word_preds, res.top_k) e_accuracies[j] = top_k_accuracy(batch.ent_mask_labels, ent_preds, res.top_k) TT.end_profile() # Update model parameters with TT.profile("Parameter step"): if params.fp16: scaler.step(optimizer) scaler.update() else: optimizer.step() scheduler.step() model.zero_grad() # Calculate how much gradient has changed if is_master and i % res.paramdiff_every == 0: with torch.no_grad(), TT.profile("Parameter changes"): log.debug("Calculating parameter changes") orig_pars = torch.from_numpy(res.orig_params).to(device) current_pars = all_params(model.module if is_distributed else model) absdiff = torch.abs(current_pars-orig_pars) for blockname, slice_ in res.groups_to_slices.items(): j = i // res.paramdiff_every res.paramdiff_1[blockname][j] = absdiff[slice_].sum().item() del orig_pars, current_pars res.losses[i] = t_loss res.w_losses[i] = w_loss res.e_losses[i] = e_loss res.scaled_loss[i] = 
s_loss res.lr[i] = scheduler.get_last_lr()[0] res.w_accuracies[i] = np.mean(w_accuracies, axis=0) res.e_accuracies[i] = np.nanmean(e_accuracies, axis=0) res.runtime[i] = TT.end_profile() log.debug( "Performed parameter update %i / %i in %.2f s" % (i, params.parameter_updates-1, res.runtime[i]), f" Loss (total, word, entity, scaled): {t_loss:9.4f}, {w_loss:9.4f}, {e_loss:9.4f}, {s_loss:.4f}", f" Accuracy (word, entity): {100*res.w_accuracies[i, 0]:7.2f} %, {100*res.e_accuracies[i, 0]:7.2f} %", ) if i in res.val_param_updates and is_master: TT.profile("Model validation") log("Validating model") vi = res.val_param_updates.tolist().index(i) res.val_w_losses[vi], res.val_e_losses[vi], res.val_w_accuracies[vi], res.val_e_accuracies[vi] =\ validate_model(model, val_loader, word_criterion, entity_criterion, res.top_k) res.val_losses[vi] = loss_calculator(res.val_w_losses[vi], res.val_e_losses[vi]) log( "Validation loss:", " Total: %9.4f" % res.val_losses[vi], " Word: %9.4f" % res.val_w_losses[vi], " Entity: %9.4f" % res.val_e_losses[vi], "Validation accuracy:", " Word: %7.2f %%" % (100 * res.val_w_accuracies[vi, 0]), " Entity: %7.2f %%" % (100 * res.val_e_accuracies[vi, 0]), ) model.train() TT.end_profile() log.debug("Time distribution so far", TT) # Save results and model if is_master and i in save_pus: with TT.profile("Saving progress"): save_progress(location, i, tmp_saved_pu, save_pus, params, model.module if is_distributed else model, res, optimizer, scheduler, scaler) if i in save_pus: log_memory_stats(device) # If timed out, save, quit, and run resume command if post_time and time.time() > post_time: log_memory_stats(device) log.section("Time limit reached. Quitting and running command '%s'" % post_command) with TT.profile("Saving progress"): save_progress(location, i, tmp_saved_pu, save_pus, params, model.module if is_distributed else model, res, optimizer, scheduler, scaler) execute_post_command = True break log.debug("Time distribution", TT) # Clean up multi-gpu if used cleanup(rank) if is_master and execute_post_command: os.system(post_command)
import torch
from transformers import AutoTokenizer, AutoModelForPreTraining

path = '/home/halamri/summer2020/avsd-transofrmers/mlmAVSD/'
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForPreTraining.from_pretrained(path)

ques = 'Hey..how are you?'
ans = 'I will be just fine !'
input_ids = tokenizer.encode_plus(text=[ques, ans],
                                  add_special_tokens=True,
                                  max_length=200,
                                  pad_to_max_length=True,
                                  return_tensors='pt')
model.eval()
output = model(input_ids=input_ids.data['input_ids'],
               attention_mask=input_ids.data['attention_mask'],
               token_type_ids=input_ids.data['token_type_ids'])
print('Test..')
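# --- Illustrative sketch (not from the original source) ----------------------------
# For a BERT-style pre-training head, the forward pass above yields MLM logits and
# next-sentence logits; depending on the transformers version the model returns either
# a tuple or an output object with named fields.
if isinstance(output, tuple):
    prediction_logits, seq_relationship_logits = output[0], output[1]
else:
    prediction_logits = output.prediction_logits              # (batch, seq_len, vocab_size)
    seq_relationship_logits = output.seq_relationship_logits  # (batch, 2)
print(prediction_logits.shape, seq_relationship_logits.shape)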
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir)
            and training_args.do_train and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from "
            "another script, save it, and load it from here, using --tokenizer_name")

    if model_args.model_name_or_path:
        model = AutoModelForPreTraining.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the "
            "--mlm flag (masked language modeling).")

    if data_args.block_size <= 0:
        # Our input block size will be the max possible for the model
        data_args.block_size = tokenizer.max_len
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets
    train_dataset = get_dataset(data_args, tokenizer=tokenizer) if training_args.do_train else None
    eval_dataset = get_dataset(data_args, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None
    data_collator = DataCollatorForLanguageModelingAVSD(
        tokenizer=tokenizer,
        mlm=data_args.mlm,
        mlm_probability=data_args.mlm_probability)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        # prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)
                      else None)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    avsd_eval = True
    outputs = []
    if avsd_eval:
        outputs = []
        model = trainer.model
        data_loader = trainer.get_eval_dataloader(eval_dataset)
        for batch in tqdm(data_loader):
            for k, v in batch.items():
                batch[k] = v.to(training_args.device)
            with torch.no_grad():
                output = model(**batch)
                nsp_scores = output[1]
                # nsp_probs = F.softmax(nsp_scores, dim=1)
                nsp_scores = nsp_scores[:, 0].detach().cpu().tolist()
                outputs.extend(nsp_scores)

        results['avsd_train_set'] = outputs
        json.dump(
            results,
            open(
                '/home/halamri/summer2020/avsd-transofrmers/mlmAVSD/bertPredicitonResults_June15_corrected.txt',
                'w'))

    print('Done....')
def prepare_model_and_optimizer(args, device):
    global_step = 0
    args.resume_step = 0
    checkpoint = None

    # download model & vocab.
    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path)
    else:
        config = CONFIG_MAPPING[args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")
    config.dense_seq_output = args.dense_seq_output

    if args.model_name_or_path:
        model = AutoModelForPreTraining.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForPreTraining.from_config(config)

    ## Load from Pyt checkpoint - either given as init_checkpoint, or picked up from output_dir if found
    # if args.init_checkpoint is not None or found_resume_checkpoint(args):
    #     # Prepare model
    #     # model = BertForPreTraining(config)
    #     model = BertForPreTrainingSegmented(config)
    #     # for k, v in model.state_dict().items():
    #     #     print(f'model-k,len(v)={k}, {v.numel()}')
    #     # model = BertForPretraining(config)
    #     if args.init_checkpoint is None:  # finding checkpoint in output_dir
    #         assert False, "code path not tested with cuda graphs"
    #         checkpoint_str = "phase2_ckpt_*.pt" if args.phase2 else "phase1_ckpt_*.pt"
    #         model_names = [f for f in glob.glob(os.path.join(args.output_dir, checkpoint_str))]
    #         global_step = max([int(x.split('.pt')[0].split('_')[-1].strip()) for x in model_names])
    #         args.resume_step = global_step  # used for throughput computation
    #         resume_init_checkpoint = os.path.join(args.output_dir, checkpoint_str.replace("*", str(global_step)))
    #         print("Setting init checkpoint to %s - which is the latest in %s" % (resume_init_checkpoint, args.output_dir))
    #         checkpoint = torch.load(resume_init_checkpoint, map_location="cpu")
    #     else:
    #         checkpoint = torch.load(args.init_checkpoint, map_location="cpu")["model"]

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm']

    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay_rate},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}]

    optimizer = Lamb(optimizer_grouped_parameters,
                     lr=args.learning_rate,
                     betas=(args.opt_lamb_beta_1, args.opt_lamb_beta_2),
                     fused=True)

    if args.warmup_steps == 0:
        warmup_steps = int(args.max_steps * args.warmup_proportion)
        warmup_start = 0
    else:
        warmup_steps = args.warmup_steps
        warmup_start = args.start_warmup_step

    lr_scheduler = LinearWarmupPolyDecayScheduler(optimizer,
                                                  start_warmup_steps=warmup_start,
                                                  warmup_steps=warmup_steps,
                                                  total_steps=args.max_steps,
                                                  end_learning_rate=0.0,
                                                  degree=1.0)

    # if found_resume_checkpoint(args):
    #     assert False, "code path not tested with cuda graphs"
    #     optimizer.load_state_dict(checkpoint['optimizer'])  # restores m,v states (only if resuming checkpoint, not for init_checkpoint and init_tf_checkpoint for now)

    return model, optimizer, lr_scheduler, checkpoint, global_step
# based on https://github.com/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb
import os
from pathlib import Path

import torch
from transformers import AutoModelForPreTraining
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import LineByLineTextDataset
from transformers import Trainer, TrainingArguments

print(torch.cuda.is_available())

tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")
model = AutoModelForPreTraining.from_pretrained('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext')

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="../results_file.txt",
    block_size=128,
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)
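# --- Hypothetical continuation following the referenced notebook -------------------
# The snippet stops before the training loop; output_dir and hyperparameters below
# are placeholders, adjust them to your setup.
training_args = TrainingArguments(
    output_dir="./pubmedbert-mlm",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    save_steps=10_000,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

trainer.train()
trainer.save_model("./pubmedbert-mlm")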