def convert_fairseq_mbart_checkpoint_from_disk(checkpoint_path, hf_config_path="facebook/mbart-large-en-ro"):
    state_dict = torch.load(checkpoint_path, map_location="cpu")["model"]
    remove_ignore_keys_(state_dict)
    vocab_size = state_dict["encoder.embed_tokens.weight"].shape[0]
    mbart_config = MBartConfig.from_pretrained(hf_config_path, vocab_size=vocab_size)
    state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"]
    model = BartForConditionalGeneration(mbart_config)
    model.model.load_state_dict(state_dict)
    return model
def test_mbart_enro_config(self):
    mbart_models = ["facebook/mbart-large-en-ro"]
    expected = {"scale_embedding": True, "output_past": True}
    for name in mbart_models:
        config = MBartConfig.from_pretrained(name)
        for k, v in expected.items():
            try:
                self.assertEqual(v, getattr(config, k))
            except AssertionError as e:
                e.args += (name, k)
                raise
def convert_fairseq_mbart_checkpoint_from_disk(
    checkpoint_path, hf_config_path="facebook/mbart-large-en-ro", finetuned=False, mbart_50=False
):
    state_dict = torch.load(checkpoint_path, map_location="cpu")["model"]
    remove_ignore_keys_(state_dict)
    vocab_size = state_dict["encoder.embed_tokens.weight"].shape[0]
    mbart_config = MBartConfig.from_pretrained(hf_config_path, vocab_size=vocab_size)
    if mbart_50 and finetuned:
        mbart_config.activation_function = "relu"
    state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"]
    model = MBartForConditionalGeneration(mbart_config)
    model.model.load_state_dict(state_dict)
    if finetuned:
        model.lm_head = make_linear_from_emb(model.model.shared)
    return model
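# Hypothetical usage sketch of the converter above; the checkpoint and output
# paths are placeholders, not files shipped with the script.
converted = convert_fairseq_mbart_checkpoint_from_disk(
    "/path/to/fairseq/model.pt", hf_config_path="facebook/mbart-large-50", finetuned=True, mbart_50=True
)
converted.save_pretrained("./mbart50-converted")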
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    check_output_dir(training_args)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.parallel_mode == ParallelMode.DISTRIBUTED),
        training_args.fp16,
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = MBartConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout", "attention_dropout")
    for p in extra_model_params:
        if getattr(training_args, p, None):
            assert hasattr(config, p), f"({config.__class__.__name__}) doesn't have a `{p}` attribute"
            setattr(config, p, getattr(training_args, p))

    tokenizer = MBartTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    # model = MBartForConditionalGeneration.from_pretrained(
    #     model_args.model_name_or_path,
    #     from_tf=".ckpt" in model_args.model_name_or_path,
    #     config=config,
    #     cache_dir=model_args.cache_dir,
    # )
    # model = MBartForConditionalGeneration(config)
    # model = MBartForConditionalGeneration.from_pretrained(model_args.config_name)

    # Tiny randomly-initialized MBart (the pretrained loading above is commented out).
    model_config = MBartConfig(
        vocab_size=300,
        d_model=10,
        encoder_layers=1,
        decoder_layers=1,
        encoder_attention_heads=1,
        decoder_attention_heads=1,
        encoder_ffn_dim=10,
        decoder_ffn_dim=10,
        max_position_embeddings=512,
    )
    model = MBartModel(config=model_config)

    # use task specific params
    use_task_specific_params(model, data_args.task)

    # set num_beams for evaluation
    if data_args.eval_beams is None:
        data_args.eval_beams = model.config.num_beams

    # set decoder_start_token_id for MBart
    if model.config.decoder_start_token_id is None and isinstance(tokenizer, MBartTokenizer):
        assert (
            data_args.tgt_lang is not None and data_args.src_lang is not None
        ), "mBart requires --tgt_lang and --src_lang"
        model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.tgt_lang]

    if model_args.freeze_embeds:
        freeze_embeds(model)
    if model_args.freeze_encoder:
        freeze_params(model.get_encoder())
        assert_all_frozen(model.get_encoder())

    dataset_class = Seq2SeqDataset

    # Get datasets
    train_dataset = (
        dataset_class(
            tokenizer,
            type_path="train",
            data_dir=data_args.data_dir,
            n_obs=data_args.n_train,
            max_target_length=data_args.max_target_length,
            max_source_length=data_args.max_source_length,
            prefix=model.config.prefix or "",
        )
        if training_args.do_train
        else None
    )
    eval_dataset = (
        dataset_class(
            tokenizer,
            type_path="val",
            data_dir=data_args.data_dir,
            n_obs=data_args.n_val,
            max_target_length=data_args.val_max_target_length,
            max_source_length=data_args.max_source_length,
            prefix=model.config.prefix or "",
        )
        if training_args.do_eval or training_args.evaluation_strategy != EvaluationStrategy.NO
        else None
    )
    test_dataset = (
        dataset_class(
            tokenizer,
            type_path="test",
            data_dir=data_args.data_dir,
            n_obs=data_args.n_test,
            max_target_length=data_args.test_max_target_length,
            max_source_length=data_args.max_source_length,
            prefix=model.config.prefix or "",
        )
        if training_args.do_predict
        else None
    )

    # Initialize our Trainer
    compute_metrics_fn = (
        build_compute_metrics_fn(data_args.task, tokenizer) if training_args.predict_with_generate else None
    )
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=Seq2SeqDataCollator(tokenizer, data_args, training_args.tpu_num_cores),
        compute_metrics=compute_metrics_fn,
        tokenizer=tokenizer,
    )

    all_metrics = {}

    # Training
    if training_args.do_train:
        logger.info("*** Train ***")

        train_result = trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        metrics = train_result.metrics
        metrics["train_n_objs"] = data_args.n_train

        trainer.save_model()  # this also saves the tokenizer

        if trainer.is_world_process_zero():
            handle_metrics("train", metrics, training_args.output_dir)
            all_metrics.update(metrics)

            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
            trainer.state.save_to_json(os.path.join(training_args.output_dir, "trainer_state.json"))

            # For convenience, we also re-save the tokenizer to the same directory,
            # so that you can share your model easily on huggingface.co/models =)
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate(
            metric_key_prefix="val", max_length=data_args.val_max_target_length, num_beams=data_args.eval_beams
        )
        metrics["val_n_objs"] = data_args.n_val
        metrics["val_loss"] = round(metrics["val_loss"], 4)

        if trainer.is_world_process_zero():
            handle_metrics("val", metrics, training_args.output_dir)
            all_metrics.update(metrics)

    if training_args.do_predict:
        logger.info("*** Predict ***")

        test_output = trainer.predict(
            test_dataset=test_dataset,
            metric_key_prefix="test",
            max_length=data_args.val_max_target_length,
            num_beams=data_args.eval_beams,
        )
        metrics = test_output.metrics
        metrics["test_n_objs"] = data_args.n_test

        if trainer.is_world_process_zero():
            metrics["test_loss"] = round(metrics["test_loss"], 4)
            handle_metrics("test", metrics, training_args.output_dir)
            all_metrics.update(metrics)

            if training_args.predict_with_generate:
                test_preds = tokenizer.batch_decode(
                    test_output.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
                )
                test_preds = lmap(str.strip, test_preds)
                write_txt_file(test_preds, os.path.join(training_args.output_dir, "test_generations.txt"))

    if trainer.is_world_process_zero():
        save_json(all_metrics, os.path.join(training_args.output_dir, "all_results.json"))
    return all_metrics
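# Conventional entry-point guard so the script can be run directly; this line
# is an assumed addition, not part of the excerpt above.
if __name__ == "__main__":
    main()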
class TrainerConfig(DefaultArgs):
    tgt_file: str = 'data/parallel/IITB.en-hi.en'
    src_file: str = 'data/parallel/IITB.en-hi.hi'
    single_file: bool = False
    src_lang: str = 'hi_IN'
    max_length: int = 40
    max_target_length: int = 40
    tr_max_samples: int = -1
    val_max_samples: int = -1

    finetuned_id: str = "dummy"
    save_specific: bool = False
    load_specific_path: str = None  # "specific-layers"

    batch_size: int = 32
    lr: float = 1e-3
    model_id: str = "facebook/mbart-large-cc25"  # "vasudevgupta/mbart-iitb-hin-eng"
    tokenizer_id: str = "facebook/mbart-large-cc25"

    base_dir: str = "base_dir"
    tb_grads: str = "tb_grads"
    tb_params: str = "tb_params"

    test_size: float = .03
    random_seed: int = 7232114
    num_workers: int = 2
    max_pred_length: int = 40
    tgt_lang: str = 'en_XX'

    # control adapter from here
    # manually switch off layers in case you want to freeze
    load_adapter_path: str = None
    save_adapter_path: str = None
    enc_ffn_adapter: bool = False
    dec_ffn_adapter: bool = False
    enc_self_attn_adapter: bool = False
    dec_self_attn_adapter: bool = False
    cross_attn_adapter: bool = False
    enc_tok_embed_adapter: bool = False
    dec_tok_embed_adapter: bool = False

    # trainable-status of some parts of network
    embed_grad: bool = True
    pos_embed_grad: bool = True
    enc_ffn_grad: bool = True
    dec_ffn_grad: bool = True
    enc_attn_grad: bool = True
    dec_attn_grad: bool = True
    cross_attn_grad: bool = True
    enc_norm_grad: bool = True
    dec_norm_grad: bool = True
    cross_attn_norm_grad: bool = True

    # args used in torch_trainer
    max_epochs: int = 5
    accumulation_steps: int = 1
    save_epoch_dir: str = None
    early_stop_n: int = None
    map_location: torch.device = torch.device("cuda:0")
    save_dir: str = None
    load_dir: str = None
    tpus: int = 0
    precision: str = 'float32'
    fast_dev_run: bool = False

    # all these args will be invalid if you run sweep
    project_name: str = 'transformers-adapters'
    wandb_run_name: str = None
    wandb_off: bool = False
    wandb_resume: bool = False
    wandb_run_id: str = None

    # bart inside config
    bart_config: MBartConfig = field(repr=False, default=MBartConfig.from_pretrained(model_id))

    # adapter inside config
    enc_ffn_adapter_config: AdapterConfig = field(repr=False, default=AdapterConfig(input_size=1024))
    dec_ffn_adapter_config: AdapterConfig = field(repr=False, default=AdapterConfig(input_size=1024))
    enc_self_attn_adapter_config: AdapterConfig = field(repr=False, default=AdapterConfig(input_size=1024))
    dec_self_attn_adapter_config: AdapterConfig = field(repr=False, default=AdapterConfig(input_size=1024))
    cross_attn_adapter_config: AdapterConfig = field(repr=False, default=AdapterConfig(input_size=1024))
    dec_tok_embed_adapter_config: AdapterConfig = field(
        repr=False, default=AdapterConfig(input_size=1024, add_layer_norm_after=False)
    )
    enc_tok_embed_adapter_config: AdapterConfig = field(
        repr=False, default=AdapterConfig(input_size=1024, add_layer_norm_after=False)
    )
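# Hypothetical instantiation of TrainerConfig with a few overrides; assumes
# DefaultArgs supplies the dataclass machinery so keyword arguments work.
args = TrainerConfig(batch_size=16, lr=5e-4, enc_ffn_adapter=True, dec_ffn_adapter=True)
print(args.model_id, args.batch_size)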
def main(args_dict=None):
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))

    if args_dict is not None:
        model_args, data_args, training_args = parser.parse_dict(args_dict)
    elif len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty."
            " Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Set project name
    os.environ["WANDB_PROJECT"] = "multilingual_zeroshot"

    num_labels = 3
    labels = ['entailment', 'neutral', 'contradiction']

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = MBartConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        dropout=model_args.dropout,
        attention_dropout=model_args.attention_dropout,
        finetuning_task="mnli",
        cache_dir=model_args.cache_dir,
    )
    tokenizer = MBartTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = MBartForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    columns = ['input_ids', 'attention_mask', 'labels']
    map_fn = get_mnli_map_fn(data_args.lang, data_args.max_seq_length, tokenizer)

    train_dataset = nlp.load_dataset("multi_nli", split="train")
    train_dataset = train_dataset.map(map_fn, batched=True, batch_size=512)
    train_dataset.set_format(type='torch', columns=columns)

    eval_dataset = nlp.load_dataset("multi_nli", split="validation_matched") if training_args.do_eval else None
    eval_dataset = eval_dataset.map(map_fn, batched=True, batch_size=512)
    eval_dataset.set_format(type='torch', columns=columns)

    def compute_metrics_fn(p: EvalPrediction):
        preds = np.argmax(p.predictions, axis=1)
        return glue_compute_metrics("classification", preds, p.label_ids)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics_fn,
        data_collator=DataCollator(tokenizer),
    )

    # disable wandb console logs
    logging.getLogger('wandb.run_manager').setLevel(logging.WARNING)

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        mis_matched_dataset = nlp.load_dataset("multi_nli", split="validation_mismatched")
        mis_matched_dataset = mis_matched_dataset.map(map_fn, batched=True, batch_size=512)
        mis_matched_dataset.set_format(type='torch', columns=columns)
        eval_datasets = [eval_dataset, mis_matched_dataset]

        for eval_dataset in eval_datasets:
            trainer.compute_metrics = compute_metrics_fn
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)

            output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
            if trainer.is_world_master():
                with open(output_eval_file, "w") as writer:
                    logger.info("***** Eval results *****")
                    for key, value in eval_result.items():
                        logger.info("  %s = %s", key, value)
                        writer.write("%s = %s\n" % (key, value))

            eval_results.update(eval_result)
def main(params):
    """Evaluates a finetuned model on the test or validation dataset."""

    # load model and tokenizer
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50")
    config = MBartConfig.from_pretrained("facebook/mbart-large-50")
    model = MBartForConditionalGeneration(config).to(device)

    # `logging` here is the project's checkpoint utility module, not the stdlib logger
    checkpoint_location = params.location + '/' + params.name + '/checkpoint/checkpoint'
    model, _, _, _ = logging.load_checkpoint(checkpoint_location, device, model)

    def pipeline(dataset, langs, batch_size, max_len):
        cols = ['input_ids_' + l for l in langs]

        def tokenize_fn(example):
            """apply tokenization"""
            l_tok = []
            for lang in langs:
                encoded = tokenizer.encode(example[lang])
                encoded[0] = tokenizer.lang_code_to_id[LANG_CODES[lang]]
                l_tok.append(encoded)
            return {'input_ids_' + l: tok for l, tok in zip(langs, l_tok)}

        def pad_seqs(examples):
            """Apply padding"""
            ex_langs = list(zip(*[tuple(ex[col] for col in cols) for ex in examples]))
            ex_langs = tuple(pad_sequence(x, batch_first=True, max_len=max_len) for x in ex_langs)
            return ex_langs

        dataset = filter_languages(dataset, langs)
        dataset = dataset.map(tokenize_fn)
        dataset.set_format(type='torch', columns=cols)

        num_examples = len(dataset)
        print('-'.join(langs) + ' : {} examples.'.format(num_examples))

        dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, collate_fn=pad_seqs)
        return dataloader, num_examples

    # load data
    if params.split == 'val':
        test_dataset = load_dataset('ted_multi', split='validation')
    elif params.split == 'test':
        test_dataset = load_dataset('ted_multi', split='test')
    elif params.split == 'combine':
        test_dataset = load_dataset('ted_multi', split='validation+test')
    else:
        raise NotImplementedError

    # preprocess splits for each direction
    test_dataloaders = {}
    for l1, l2 in combinations(params.langs, 2):
        test_dataloaders[l1 + '-' + l2], _ = pipeline(test_dataset, [l1, l2], params.batch_size, params.max_len)

    # evaluate the model
    def evaluate(x, y, y_code, bleu):
        y_inp, y_tar = y[:, :-1].contiguous(), y[:, 1:].contiguous()
        enc_mask = (x != 0)
        x, y_inp, y_tar, enc_mask = to_devices((x, y_inp, y_tar, enc_mask), device)

        model.eval()
        y_pred = model.generate(
            input_ids=x,
            decoder_start_token_id=y_code,
            attention_mask=enc_mask,
            max_length=x.size(1) + 1,
            num_beams=params.num_beams,
            length_penalty=params.length_penalty,
            early_stopping=True,
        )
        bleu(y_pred[:, 1:], y_tar)

    test_results = {}
    for direction, loader in test_dataloaders.items():
        alt_direction = '-'.join(reversed(direction.split('-')))
        bleu1, bleu2 = BLEU(), BLEU()
        bleu1.set_excluded_indices([0, 2])
        bleu2.set_excluded_indices([0, 2])

        x_code = tokenizer.lang_code_to_id[LANG_CODES[direction.split('-')[0]]]
        y_code = tokenizer.lang_code_to_id[LANG_CODES[direction.split('-')[-1]]]

        start_ = time.time()
        for i, (x, y) in enumerate(loader):
            if params.test_batches is not None:
                if i > params.test_batches:
                    break

            evaluate(x, y, y_code, bleu1)
            if not params.single_direction:
                evaluate(y, x, x_code, bleu2)

            if i % params.verbose == 0:
                bl1, bl2 = bleu1.get_metric(), bleu2.get_metric()
                print('Batch {} Bleu1 {:.4f} Bleu2 {:.4f} in {:.4f} secs per batch'.format(
                    i, bl1, bl2, (time.time() - start_) / (i + 1)))

        bl1, bl2 = bleu1.get_metric(), bleu2.get_metric()
        test_results[direction] = [bl1]
        test_results[alt_direction] = [bl2]
        print(direction, bl1, bl2)

    # save test_results
    pd.DataFrame(test_results).to_csv(params.location + '/' + params.name + '/test_results.csv', index=False)
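# Minimal sketch of two helpers the script above relies on but does not define
# here; the LANG_CODES contents and the pad_sequence signature are assumptions.
# pad_value=0 matches the script's `enc_mask = (x != 0)` padding convention.
import torch

LANG_CODES = {'en': 'en_XX', 'fr': 'fr_XX', 'de': 'de_DE'}  # extend as needed

def pad_sequence(seqs, batch_first=True, max_len=None, pad_value=0):
    """Right-pad a list of 1-D token tensors to a common (or fixed) length."""
    max_len = max_len or max(len(s) for s in seqs)
    out = torch.full((len(seqs), max_len), pad_value, dtype=torch.long)
    for i, s in enumerate(seqs):
        length = min(len(s), max_len)
        out[i, :length] = s[:length]
    return out if batch_first else out.t()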
import torch
from transformers import MBartTokenizer, BartForConditionalGeneration, MBartConfig, \
    DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset

model_name = 'facebook/mbart-large-cc25'
tokenizer_name = 'facebook/mbart-large-cc25'

config = MBartConfig.from_pretrained(model_name)
tokenizer = MBartTokenizer.from_pretrained(tokenizer_name)
config.attention_type = 'performer'  # custom attribute; only honored by a performer-patched model class

# pass the modified config so the attention_type override actually reaches the model
model = BartForConditionalGeneration.from_pretrained(model_name, config=config)

# intended layout: the 12 encoder layers on GPU 0, the 12 decoder layers on GPU 1
device_maps_flat = {
    model_name: {
        "encoder": {0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]},
        "decoder": {1: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]},
    },
}
model.parallelize(device_maps_flat[model_name])

wiki = load_dataset("wikipedia", "20200501.frr", split='train[:3%]')
train_encodings = tokenizer(wiki['text'], padding=True, truncation=True, return_tensors="pt")
train_encodings = train_encodings.to("cuda:0")  # BatchEncoding.to returns a new object
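# Hypothetical continuation using the otherwise-unused imports above
# (DataCollatorForLanguageModeling, Trainer, TrainingArguments); the output
# directory, batch size, and mlm_probability are assumed values, not the
# original author's settings.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
training_args = TrainingArguments(output_dir="mbart-performer-out", per_device_train_batch_size=2, num_train_epochs=1)
train_dataset = wiki.map(lambda ex: tokenizer(ex['text'], truncation=True), batched=True,
                         remove_columns=wiki.column_names)
trainer = Trainer(model=model, args=training_args, data_collator=data_collator, train_dataset=train_dataset)
trainer.train()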
def convert_wav2vec2_checkpoint(
    checkpoint_path,
    pytorch_dump_folder_path,
    dict_path,
    config_yaml_path,
    encoder_config_path,
    decoder_config_path,
    add_adapter,
    adapter_kernel_size,
    adapter_stride,
    decoder_start_token_id,
    encoder_output_dim,
):
    """
    Copy/paste/tweak model's weights to transformers design.
    """
    # load configs
    encoder_config = Wav2Vec2Config.from_pretrained(
        encoder_config_path,
        add_adapter=True,
        adapter_stride=adapter_stride,
        adapter_kernel_size=adapter_kernel_size,
        use_auth_token=True,
        output_hidden_size=encoder_output_dim,
    )
    decoder_config = MBartConfig.from_pretrained(decoder_config_path)

    # load model
    model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
        [checkpoint_path],
        arg_overrides={
            "config_yaml": config_yaml_path,
            "data": "/".join(dict_path.split("/")[:-1]),
            "w2v_path": checkpoint_path,
            "load_pretrained_decoder_from": None,
        },
    )
    model = model[0].eval()

    # load feature extractor
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(encoder_config_path, use_auth_token=True)

    # set weights for wav2vec2 encoder
    hf_encoder = Wav2Vec2Model(encoder_config)

    recursively_load_weights_wav2vec2(model.encoder, hf_encoder)

    # load decoder weights
    hf_decoder = MBartForCausalLM(decoder_config)
    missing_keys, unexpected_keys = hf_decoder.model.decoder.load_state_dict(model.decoder.state_dict(), strict=False)
    logger.warning(f"The following keys are missing when loading the decoder weights: {missing_keys}")
    logger.warning(f"The following keys are unexpected when loading the decoder weights: {unexpected_keys}")

    hf_wav2vec = SpeechEncoderDecoderModel(encoder=hf_encoder, decoder=hf_decoder)
    hf_wav2vec.config.tie_word_embeddings = False

    tokenizer = MBart50Tokenizer(dict_path)
    tokenizer.save_pretrained(pytorch_dump_folder_path)

    config = hf_wav2vec.config.to_dict()
    config["pad_token_id"] = tokenizer.pad_token_id
    config["bos_token_id"] = tokenizer.bos_token_id
    config["eos_token_id"] = tokenizer.eos_token_id
    config["tokenizer_class"] = "mbart50"
    config["feature_extractor_type"] = "wav2vec2"

    config["decoder_start_token_id"] = tokenizer.eos_token_id
    config["forced_bos_token_id"] = 250004
    config["forced_eos_token_id"] = tokenizer.eos_token_id

    hf_wav2vec.config = SpeechEncoderDecoderConfig.from_dict(config)
    hf_wav2vec.save_pretrained(pytorch_dump_folder_path)
    feature_extractor.save_pretrained(pytorch_dump_folder_path)
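# Hypothetical direct call to the converter above; every path, checkpoint id,
# and dimension here is a placeholder, not a value shipped with the script.
convert_wav2vec2_checkpoint(
    checkpoint_path="/path/to/fairseq/checkpoint.pt",
    pytorch_dump_folder_path="./wav2vec2-mbart50",
    dict_path="/path/to/dict.txt",
    config_yaml_path="/path/to/config.yaml",
    encoder_config_path="facebook/wav2vec2-xls-r-1b",
    decoder_config_path="facebook/mbart-large-50-one-to-many-mmt",
    add_adapter=True,
    adapter_kernel_size=3,
    adapter_stride=2,
    decoder_start_token_id=250004,
    encoder_output_dim=1024,
)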