def get_tokenizer(name):
    if name == 'mbart':
        from transformers import MBartTokenizer
        return MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")
    elif name == 'mbert':
        from transformers import BertTokenizer
        return BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
    raise ValueError(f"Unknown tokenizer name: {name}")
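# A minimal usage sketch for get_tokenizer above, assuming the checkpoints can
# be downloaded; the printed ids end with </s> plus mBART's language-code suffix.
tokenizer = get_tokenizer('mbart')
print(tokenizer.encode("Hello world"))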
def test_mbart_dataset_truncation():
    tokenizer = MBartTokenizer.from_pretrained(MBART_TINY)
    tmp_dir = make_test_data_dir()
    max_len_source = max(len(tokenizer.encode(a)) for a in ARTICLES)
    max_len_target = max(len(tokenizer.encode(a)) for a in SUMMARIES)
    trunc = 4
    src_lang, tgt_lang = "ro_RO", "de_DE"  # NOT WHAT IT WAS TRAINED ON
    train_dataset = MBartDataset(
        tokenizer,
        data_dir=tmp_dir,
        type_path="train",
        max_source_length=trunc,
        max_target_length=1000,  # ignored
        src_lang=src_lang,
        tgt_lang=tgt_lang,
    )
    dataloader = DataLoader(train_dataset, batch_size=2, collate_fn=train_dataset.collate_fn)
    for batch in dataloader:
        assert isinstance(batch, dict)
        assert batch["attention_mask"].shape == batch["input_ids"].shape
        # show that articles were trimmed.
        assert batch["input_ids"].shape[1] == trunc
        # show that targets are the same len
        assert batch["decoder_input_ids"].shape[1] == trunc
        # check language codes in correct place
        assert batch["decoder_input_ids"][0, 0].item() == tokenizer.lang_code_to_id[tgt_lang]
        assert batch["decoder_input_ids"][0, -1].item() == tokenizer.eos_token_id
        assert batch["input_ids"][0, -2].item() == tokenizer.eos_token_id
        assert batch["input_ids"][0, -1].item() == tokenizer.lang_code_to_id[src_lang]
        assert max_len_target > trunc  # Truncated
        assert max_len_source > trunc
        break  # No need to test every batch
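# Hedged companion sketch for the assertions above: mBART appends
# [</s>, src_lang_code] to every source sequence, and the collated decoder
# input starts with the target language code. Assumes the full
# facebook/mbart-large-cc25 checkpoint rather than MBART_TINY.
from transformers import MBartTokenizer

_tok = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25", src_lang="ro_RO")
_enc = _tok("Salut lume")
assert _enc["input_ids"][-2] == _tok.eos_token_id
assert _enc["input_ids"][-1] == _tok.lang_code_to_id["ro_RO"]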
def __init__(self, hparams: Namespace):
    super().__init__()
    self.hparams = hparams
    self.tokenizer = MBartTokenizer.from_pretrained(self.hparams.model_checkpoint)
    data: Dict[str, Dict[str, Dict[str, List[Dict[str, Dict[str, str]]]]]] = load_json(self.hparams.data)
    if self.hparams.dataset == "both":
        # Merge the dialogues of every dataset into a single per-split mapping.
        self.datasets = {
            split: {
                dialogue_id: turns
                for dset_name, dset in data.items()
                for dialogue_id, turns in dset[split].items()
            }
            for split in ["train", "test", "val"]
        }
    else:
        self.datasets = data[self.hparams.dataset]
def predict(model_name, test_file, output_file, batch_size,
            max_source_tokens_count, max_target_tokens_count, use_cuda):
    tokenizer = MBartTokenizer.from_pretrained(model_name)
    test_dataset = MBartSummarizationDataset(test_file, tokenizer,
                                             max_source_tokens_count,
                                             max_target_tokens_count)
    device = torch.device("cuda:0") if use_cuda else torch.device("cpu")
    model = MBartForConditionalGeneration.from_pretrained(model_name)
    model.to(device)
    predictions = []
    # Batch the dataset so batch_size is actually used and generate() receives
    # 2-D tensors.
    loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)
    for batch in loader:
        summaries = model.generate(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            num_beams=5,
            length_penalty=1.0,
            max_length=max_target_tokens_count + 2,
            min_length=5,
            no_repeat_ngram_size=0,
            early_stopping=True)
        for s in summaries:
            p = tokenizer.decode(s, skip_special_tokens=True,
                                 clean_up_tokenization_spaces=False)
            predictions.append(p)
    with open(output_file, "w") as w:
        for p in predictions:
            w.write(p.strip() + "\n")
def test_special_tokens_unaffected_by_save_load(self):
    tmpdirname = tempfile.mkdtemp()
    original_special_tokens = self.tokenizer.fairseq_tokens_to_ids
    self.tokenizer.save_pretrained(tmpdirname)
    new_tok = MBartTokenizer.from_pretrained(tmpdirname)
    self.assertDictEqual(new_tok.fairseq_tokens_to_ids, original_special_tokens)
def setUpClass(cls):
    checkpoint_name = "mbart-large-en-ro"
    cls.tokenizer = MBartTokenizer.from_pretrained(checkpoint_name)
    cls.pad_token_id = 1
    net_input = {
        "input_ids": _long_tensor([
            [3493, 3060, 621, 104064, 1810, 100, 142, 566, 13158, 6889, 5, 2, 250004],
            [64511, 7, 765, 2837, 45188, 297, 4049, 237, 10, 122122, 5, 2, 250004],
        ]),
        "decoder_input_ids": _long_tensor([
            [250020, 31952, 144, 9019, 242307, 21980, 55749, 11, 5, 2, 1, 1],
            [250020, 884, 9019, 96, 9, 916, 86792, 36, 18743, 15596, 5, 2],
        ]),
        "generation_mode": False,
    }
    net_input["attention_mask"] = net_input["input_ids"].ne(cls.pad_token_id)
    cls.net_input = net_input
    return cls
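# Hedged companion check for the hard-coded ids above: in the shared mBART-25
# vocabulary, 250004 and 250020 should be the en_XX and ro_RO language codes.
# Assumes the same class fixture as the setUpClass sketch above.
def test_lang_code_ids(self):
    self.assertEqual(self.tokenizer.lang_code_to_id["en_XX"], 250004)
    self.assertEqual(self.tokenizer.lang_code_to_id["ro_RO"], 250020)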
def main():
    print('hello')
    model = Generator()
    tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")
    TEXT = 'This is a test'
    ids = tokenizer.encode(TEXT)
    outputs = model.generate(input_ids=torch.tensor([ids]),
                             decoder_start_token_id=250004)  # 250004 is the en_XX language code
    print(tokenizer.batch_decode(outputs))
def _test_TFMbart(self, size, large=False):
    from transformers import MBartTokenizer, TFMBartModel
    tokenizer = MBartTokenizer.from_pretrained(size)
    model = TFMBartModel.from_pretrained(size)
    input_dict = tokenizer("Hello, my dog is cute", return_tensors="tf")
    spec, input_dict = self.spec_and_pad(input_dict, max_length=128)
    outputs = ["last_hidden_state"]
    self.run_test(model, input_dict, input_signature=spec, outputs=outputs, large=large, rtol=1.2)
def get_summarization_agents():
    agents = {
        "model": MBartForConditionalGeneration.from_pretrained("vasudevgupta/mbart-summarizer-interiit"),
        "tokenizer": MBartTokenizer.from_pretrained("facebook/mbart-large-cc25"),
    }
    return agents
def load_tokenizer(self):
    if self.model in ["xlm-roberta-base"]:
        return XLMRobertaTokenizer.from_pretrained(self.model, strip_accents=False)
    elif self.model in ['bert-base-multilingual-cased']:
        return BertTokenizer.from_pretrained(self.model, strip_accents=False)
    elif self.model in ['facebook/mbart-large-cc25']:
        return MBartTokenizer.from_pretrained('facebook/mbart-large-cc25', strip_accents=False)
    raise ValueError(f"No tokenizer configured for model {self.model!r}")
def __init__(self, hparams: Namespace):
    super().__init__()
    self.hparams = hparams
    self.tokenizer = MBartTokenizer.from_pretrained(self.hparams.model_checkpoint)
    self.model = MBartForConditionalGeneration.from_pretrained(self.hparams.model_checkpoint)
def generate_summaries_or_translations(
    examples: List[str],
    out_file: str,
    model_name: str,
    batch_size: int = 8,
    device: str = DEFAULT_DEVICE,
    fp16=False,
    task="summarization",
    prefix=None,
    **generate_kwargs,
) -> Dict:
    """Save model.generate results to <out_file>, and return how long it took."""
    fout = Path(out_file).open("w", encoding="utf-8")
    model_name = str(model_name)
    # model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
    model = MBartForConditionalGeneration.from_pretrained(model_name).to(device)
    if fp16:
        model = model.half()
    tokenizer = MBartTokenizer.from_pretrained(model_name)
    # tokenizer = AutoTokenizer.from_pretrained(model_name)
    # logger.info(f"Inferred tokenizer type: {tokenizer.__class__}")  # if this is wrong, check config.model_type.
    start_time = time.time()
    # update config with task specific params
    use_task_specific_params(model, task)
    if prefix is None:
        prefix = getattr(model.config, "prefix", "") or ""
    for examples_chunk in tqdm(list(chunks(examples, batch_size))):
        examples_chunk = [prefix + text for text in examples_chunk]
        batch = tokenizer(examples_chunk, return_tensors="pt", truncation=True, padding="longest").to(device)
        summaries = model.generate(
            input_ids=batch.input_ids,
            attention_mask=batch.attention_mask,
            **generate_kwargs,
        )
        dec = tokenizer.batch_decode(summaries, skip_special_tokens=True, clean_up_tokenization_spaces=False)
        for hypothesis in dec:
            fout.write(hypothesis + "\n")
            fout.flush()
    fout.close()
    runtime = int(time.time() - start_time)  # seconds
    n_obs = len(examples)
    return dict(n_obs=n_obs, runtime=runtime, seconds_per_sample=round(runtime / n_obs, 4))
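# A hedged usage sketch for the function above; the file name is a placeholder
# and the (large) checkpoint is assumed to be downloadable:
stats = generate_summaries_or_translations(
    examples=["UN Chief Says There Is No Military Solution in Syria"],
    out_file="hypothesis.txt",
    model_name="facebook/mbart-large-cc25",
    batch_size=1,
    device="cpu",
    num_beams=5,  # forwarded to model.generate via **generate_kwargs
)
print(stats)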
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_path = os.path.join(bolt.ARTIFACT_DIR, 'MBart_translation.pt')
    tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")
    model = MBartForConditionalGeneration.from_pretrained('facebook/mbart-large-cc25')
    print("loading model")
    # map_location lets a GPU-saved checkpoint load on a CPU-only machine
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    print("model loaded")
    sentence = "i love you"
    result = translate(sentence, tokenizer, model, 3, device)
    print(result)
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25')
    model = MBartForConditionalGeneration.from_pretrained('facebook/mbart-large-cc25')
    model.to(device)  # keep the model on the same device as the batches
    # example_english_phrase = ["I love you", 'you hate i']
    # expected_translation_chinese = ["我中意你", '你憎我']
    print("Loading and processing data")
    en, yue = read_file("../MARIAN/en2yue/train.en", "../MARIAN/en2yue/train.yue")
    val_en, val_yue = read_file("../MARIAN/en2yue/val.en", '../MARIAN/en2yue/val.yue')
    train_dataset = token_(tokenizer, en, yue)
    loader = create_data_loader(train_dataset, 8)
    val_dataset = token_(tokenizer, val_en, val_yue)
    val_loader = create_data_loader(val_dataset, 8)
    EPOCHS = 10
    optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
    total_steps = len(loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    resultdir = bolt.ARTIFACT_DIR
    MODEL_SAVE_PATH = os.path.join(resultdir, 'MBart_translation.pt')
    print("Start training")
    best_val_loss = float('inf')
    for epoch in range(EPOCHS):
        print(f'Epoch {epoch + 1}/{EPOCHS}')
        print('-' * 30)
        train_loss = train_epoch(model, loader, optimizer, scheduler, device)
        val_loss = evaluate_epoch(model, val_loader, device)
        print(f'Train_loss: {train_loss} | Val_loss: {val_loss}')
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), MODEL_SAVE_PATH)
        bolt.send_metrics({
            "Total_train_loss": train_loss,
            "Total_val_loss": val_loss
        })
def setUpClass(cls):
    cls.tokenizer: MBartTokenizer = MBartTokenizer.from_pretrained(
        cls.checkpoint_name, src_lang="en_XX", tgt_lang="ro_RO"
    )
    cls.pad_token_id = 1
    return cls
import torch
from transformers import (MBartTokenizer, MBartForConditionalGeneration, MBartConfig,
                          DataCollatorForLanguageModeling, Trainer, TrainingArguments)
from datasets import load_dataset

model_name = 'facebook/mbart-large-cc25'
tokenizer_name = 'facebook/mbart-large-cc25'

config = MBartConfig.from_pretrained(model_name)
tokenizer = MBartTokenizer.from_pretrained(tokenizer_name)
config.attention_type = 'performer'  # custom attribute; requires a fork that supports performer attention
# Pass the modified config so attention_type actually takes effect, and use the
# mBART model class for an mBART checkpoint.
model = MBartForConditionalGeneration.from_pretrained(model_name, config=config)

device_maps_flat = {
    model_name: {
        "encoder": {0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]},
        "decoder": {1: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]},
    },
}
model.parallelize(device_maps_flat[model_name])

wiki = load_dataset("wikipedia", "20200501.frr", split='train[:3%]')
train_encodings = tokenizer(wiki['text'], padding=True, truncation=True, return_tensors="pt")
train_encodings = train_encodings.to("cuda:0")  # .to() returns a new BatchEncoding
def train_MBart(data_path, tokenizer, output_path):
    model_config = MBartConfig(vocab_size=300, d_model=10, encoder_layers=1, decoder_layers=1,
                               encoder_attention_heads=1, decoder_attention_heads=1,
                               encoder_ffn_dim=10, decoder_ffn_dim=10, max_position_embeddings=512)
    model = MBartModel(config=model_config)
    sentences = {}  # associates lang_id with a list of sentences
    # read data files and separate language data into different lists
    lang_id = 0  # counter for languages in dataset
    for sentence_file in os.listdir(data_path):
        with open(os.path.join(data_path, sentence_file), 'r') as data:
            sentences[lang_id] = []
            for line in data:
                sentences[lang_id].append(line)
        lang_id += 1
    # create token sequences to pass into the model (assumes exactly two language files)
    src_lang, tgt_lang = (sentences[lang_id] for lang_id in sentences)
    batch = tokenizer.prepare_seq2seq_batch(src_texts=src_lang, tgt_texts=tgt_lang, return_tensors='pt')
    model(input_ids=batch['input_ids'], decoder_input_ids=batch['labels'])
    model.save_pretrained(output_path)

if __name__ == '__main__':
    args = get_args()
    # create a new MBartTokenizer from the file produced by google/sentencepiece
    tokenizer = MBartTokenizer.from_pretrained(args.model_file)
    train_MBart(args.data_path, tokenizer, args.output_path)
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    check_output_dir(training_args)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.parallel_mode == ParallelMode.DISTRIBUTED),
        training_args.fp16,
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = MBartConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout", "attention_dropout")
    for p in extra_model_params:
        if getattr(training_args, p, None):
            assert hasattr(config, p), f"({config.__class__.__name__}) doesn't have a `{p}` attribute"
            setattr(config, p, getattr(training_args, p))

    tokenizer = MBartTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    # model = MBartForConditionalGeneration.from_pretrained(
    #     model_args.model_name_or_path,
    #     from_tf=".ckpt" in model_args.model_name_or_path,
    #     config=config,
    #     cache_dir=model_args.cache_dir,
    # )
    # model = MBartForConditionalGeneration(config)
    # model = MBartForConditionalGeneration.from_pretrained(model_args.config_name)
    model_config = MBartConfig(vocab_size=300, d_model=10, encoder_layers=1, decoder_layers=1,
                               encoder_attention_heads=1, decoder_attention_heads=1,
                               encoder_ffn_dim=10, decoder_ffn_dim=10, max_position_embeddings=512)
    model = MBartModel(config=model_config)

    # use task specific params
    use_task_specific_params(model, data_args.task)

    # set num_beams for evaluation
    if data_args.eval_beams is None:
        data_args.eval_beams = model.config.num_beams

    # set decoder_start_token_id for MBart
    if model.config.decoder_start_token_id is None and isinstance(tokenizer, MBartTokenizer):
        assert (
            data_args.tgt_lang is not None and data_args.src_lang is not None
        ), "mBart requires --tgt_lang and --src_lang"
        model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.tgt_lang]

    if model_args.freeze_embeds:
        freeze_embeds(model)
    if model_args.freeze_encoder:
        freeze_params(model.get_encoder())
        assert_all_frozen(model.get_encoder())

    dataset_class = Seq2SeqDataset

    # Get datasets
    train_dataset = (
        dataset_class(
            tokenizer,
            type_path="train",
            data_dir=data_args.data_dir,
            n_obs=data_args.n_train,
            max_target_length=data_args.max_target_length,
            max_source_length=data_args.max_source_length,
            prefix=model.config.prefix or "",
        )
        if training_args.do_train
        else None
    )
    eval_dataset = (
        dataset_class(
            tokenizer,
            type_path="val",
            data_dir=data_args.data_dir,
            n_obs=data_args.n_val,
            max_target_length=data_args.val_max_target_length,
            max_source_length=data_args.max_source_length,
            prefix=model.config.prefix or "",
        )
        if training_args.do_eval or training_args.evaluation_strategy != EvaluationStrategy.NO
        else None
    )
    test_dataset = (
        dataset_class(
            tokenizer,
            type_path="test",
            data_dir=data_args.data_dir,
            n_obs=data_args.n_test,
            max_target_length=data_args.test_max_target_length,
            max_source_length=data_args.max_source_length,
            prefix=model.config.prefix or "",
        )
        if training_args.do_predict
        else None
    )

    # Initialize our Trainer
    compute_metrics_fn = (
        build_compute_metrics_fn(data_args.task, tokenizer) if training_args.predict_with_generate else None
    )
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=Seq2SeqDataCollator(tokenizer, data_args, training_args.tpu_num_cores),
        compute_metrics=compute_metrics_fn,
        tokenizer=tokenizer,
    )

    all_metrics = {}
    # Training
    if training_args.do_train:
        logger.info("*** Train ***")
        train_result = trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        metrics = train_result.metrics
        metrics["train_n_objs"] = data_args.n_train
        trainer.save_model()  # this also saves the tokenizer
        if trainer.is_world_process_zero():
            handle_metrics("train", metrics, training_args.output_dir)
            all_metrics.update(metrics)
            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
            trainer.state.save_to_json(os.path.join(training_args.output_dir, "trainer_state.json"))
            # For convenience, we also re-save the tokenizer to the same directory,
            # so that you can share your model easily on huggingface.co/models =)
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate(
            metric_key_prefix="val", max_length=data_args.val_max_target_length, num_beams=data_args.eval_beams
        )
        metrics["val_n_objs"] = data_args.n_val
        metrics["val_loss"] = round(metrics["val_loss"], 4)
        if trainer.is_world_process_zero():
            handle_metrics("val", metrics, training_args.output_dir)
            all_metrics.update(metrics)

    if training_args.do_predict:
        logger.info("*** Predict ***")
        test_output = trainer.predict(
            test_dataset=test_dataset,
            metric_key_prefix="test",
            max_length=data_args.val_max_target_length,
            num_beams=data_args.eval_beams,
        )
        metrics = test_output.metrics
        metrics["test_n_objs"] = data_args.n_test
        if trainer.is_world_process_zero():
            metrics["test_loss"] = round(metrics["test_loss"], 4)
            handle_metrics("test", metrics, training_args.output_dir)
            all_metrics.update(metrics)
            if training_args.predict_with_generate:
                test_preds = tokenizer.batch_decode(
                    test_output.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
                )
                test_preds = lmap(str.strip, test_preds)
                write_txt_file(test_preds, os.path.join(training_args.output_dir, "test_generations.txt"))

    if trainer.is_world_process_zero():
        save_json(all_metrics, os.path.join(training_args.output_dir, "all_results.json"))

    return all_metrics
def train(model_name,
          train_file,
          val_file,
          batch_size,
          output_dir,
          learning_rate,
          logging_steps,
          eval_steps,
          save_steps,
          warmup_steps,
          num_train_epochs,
          gradient_accumulation_steps,
          max_grad_norm,
          weight_decay,
          max_source_tokens_count,
          max_target_tokens_count,
          fp16_opt_level,
          fp16=False):
    if fp16:
        print("Using FP16")
    tokenizer = MBartTokenizer.from_pretrained(model_name)
    train_dataset = MBartSummarizationDataset(train_file, tokenizer,
                                              max_source_tokens_count,
                                              max_target_tokens_count)
    val_dataset = MBartSummarizationDataset(val_file, tokenizer,
                                            max_source_tokens_count,
                                            max_target_tokens_count)
    model = MBartForConditionalGeneration.from_pretrained(model_name)
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        do_train=True,
        do_eval=True,
        overwrite_output_dir=True,
        logging_steps=logging_steps,
        save_steps=save_steps,
        eval_steps=eval_steps,
        learning_rate=learning_rate,
        warmup_steps=warmup_steps,
        num_train_epochs=num_train_epochs,
        gradient_accumulation_steps=gradient_accumulation_steps,
        max_grad_norm=max_grad_norm,
        weight_decay=weight_decay,
        fp16=fp16,
        fp16_opt_level=fp16_opt_level,
        label_smoothing_factor=0.1,
        evaluation_strategy="steps")
    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=train_dataset,
                      eval_dataset=val_dataset)
    trainer.train()
def setUpClass(cls):
    checkpoint_name = "facebook/mbart-large-en-ro"
    cls.tokenizer = MBartTokenizer.from_pretrained(checkpoint_name)
    cls.pad_token_id = 1
    return cls
parser = argparse.ArgumentParser()
parser.add_argument("--config", type=str, default="main", help="configurations defined in config.py")
p_args = parser.parse_args()
args = getattr(config, p_args.config)
print(args)

## use this for running sweep
# wandb.init(config=args.__dict__)
# args = wandb.config
# print(dict(args))

tokenizer = MBartTokenizer.from_pretrained(args.tokenizer_id)
if args.load_dir:
    bart = MBartForConditionalGeneration(args.bart_config)
    print(f"model is loaded from {args.load_dir}")
else:
    bart = MBartForConditionalGeneration.from_pretrained(args.model_id)
    print(f"model is loaded from {args.model_id}")

print("====Working on layers freezing====")
bart.ffn_requires_grad_(args.enc_ffn_grad, args.dec_ffn_grad)
bart.attn_requires_grad_(args.enc_attn_grad, args.dec_attn_grad, args.cross_attn_grad)
bart.embed_requires_grad_(args.embed_grad, args.pos_embed_grad)
bart.norm_requires_grad_(args.enc_norm_grad, args.dec_norm_grad, args.cross_attn_norm_grad)
def setUpClass(cls):
    cls.tokenizer: MBartTokenizer = MBartTokenizer.from_pretrained(cls.checkpoint_name)
    cls.pad_token_id = 1
    return cls
def setup(self, process_on_fly=True, n_augment=0):
    if process_on_fly:
        data = load_dataset("csv", data_files=self.file_path)["train"]
        data = data.map(lambda x: {"article_length": len(x["Text"].split())})
        data = data.map(lambda x: {"summary_length": len(x["Headline"].split())})
        data = data.map(lambda x: {"CleanedText": preprocess_article(x["cleaned"], self.sep_token)})
        data = data.map(lambda x: {"CleanedHeadline": x["Headline"]})
        fn_kwargs = {
            "model": MBartForConditionalGeneration.from_pretrained("vasudevgupta/mbart-iitb-hin-eng"),
            "tokenizer": MBartTokenizer.from_pretrained("vasudevgupta/mbart-iitb-hin-eng"),
            "max_pred_length": 32,
        }
        data = data.map(translate, fn_kwargs=fn_kwargs)
        data.to_csv(f"cleaned-{self.file_path}")
    else:
        data = load_dataset("csv", data_files=f"cleaned-{self.file_path}")["train"]

    data = data.filter(lambda x: x["article_length"] > 32 and x["summary_length"] > 1)
    removed_samples = data.filter(
        lambda x: type(x["CleanedHeadline"]) != str or type(x["CleanedText"]) != str)
    print(removed_samples["CleanedHeadline"])
    print(removed_samples["CleanedText"])
    data = data.filter(lambda x: type(x["CleanedHeadline"]) == str and type(x["CleanedText"]) == str)
    print("Dataset", data)
    # print("Samples with article length > 560 are", data.filter(lambda x: x["article_length"] > 560))

    data = data.train_test_split(test_size=600, shuffle=True, seed=self.seed)
    tr_dataset = data["train"].map(lambda x: {"split": "TRAIN"})
    val_dataset = data["test"].map(lambda x: {"split": "VALIDATION"})

    if n_augment > 0:
        print("AUGMENTING")
        tr_dataset = tr_dataset.map(lambda x: {"augmentation_status": "Not Augmented"})
        val_dataset = val_dataset.map(lambda x: {"augmentation_status": "Not Augmented"})
        noisy_dataset = tr_dataset.filter(lambda x: x["Mobile_Tech_Flag"] == 1)
        noisy_datasets = []
        for _ in range(n_augment):
            noisy_datasets.append(
                noisy_dataset.map(lambda x: {"CleanedText": get_noisy_sent(x["CleanedText"].split())}))
        noisy_dataset = concatenate_datasets(noisy_datasets)
        noisy_dataset = noisy_dataset.map(lambda x: {"augmentation_status": "Augmented"})
        tr_dataset = concatenate_datasets([noisy_dataset, tr_dataset])

    return tr_dataset, val_dataset
    return features


if __name__ == '__main__':

    class args:
        batch_size: int = 2
        process_on_fly: bool = False
        num_workers: int = 2
        max_length: int = 512
        max_target_length: int = 20
        file_path: str = "data/dev_data_article.csv"
        seed: int = 42
        n_augment: int = 1

    tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")
    dl = DataLoader(tokenizer, args)
    tr_dataset, val_dataset = dl.setup(process_on_fly=args.process_on_fly, n_augment=args.n_augment)
    print(val_dataset)
    print(tr_dataset)
    dataset = concatenate_datasets([tr_dataset, val_dataset]).sort('Text_ID')
    print(dataset)
    print(tr_dataset[:2])
    tr_dataset = dl.train_dataloader(tr_dataset)
    val_dataset = dl.val_dataloader(val_dataset)
def main(args_dict=None):
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if args_dict is not None:
        model_args, data_args, training_args = parser.parse_dict(args_dict)
    elif len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir)
            and training_args.do_train and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Set project name
    os.environ["WANDB_PROJECT"] = "multilingual_zeroshot"

    num_labels = 3
    labels = ['entailment', 'neutral', 'contradiction']

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = MBartConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        dropout=model_args.dropout,
        attention_dropout=model_args.attention_dropout,
        finetuning_task="mnli",
        cache_dir=model_args.cache_dir,
    )
    tokenizer = MBartTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = MBartForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    columns = ['input_ids', 'attention_mask', 'labels']
    map_fn = get_mnli_map_fn(data_args.lang, data_args.max_seq_length, tokenizer)

    train_dataset = nlp.load_dataset("multi_nli", split="train")
    train_dataset = train_dataset.map(map_fn, batched=True, batch_size=512)
    train_dataset.set_format(type='torch', columns=columns)

    eval_dataset = (nlp.load_dataset("multi_nli", split="validation_matched")
                    if training_args.do_eval else None)
    eval_dataset = eval_dataset.map(map_fn, batched=True, batch_size=512)
    eval_dataset.set_format(type='torch', columns=columns)

    def compute_metrics_fn(p: EvalPrediction):
        preds = np.argmax(p.predictions, axis=1)
        return glue_compute_metrics("classification", preds, p.label_ids)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics_fn,
        data_collator=DataCollator(tokenizer),
    )

    # disable wandb console logs
    logging.getLogger('wandb.run_manager').setLevel(logging.WARNING)

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path
                      if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        # Loop to handle MNLI double evaluation (matched, mis-matched)
        mis_matched_dataset = nlp.load_dataset("multi_nli", split="validation_mismatched")
        mis_matched_dataset = mis_matched_dataset.map(map_fn, batched=True, batch_size=512)
        mis_matched_dataset.set_format(type='torch', columns=columns)
        eval_datasets = [eval_dataset, mis_matched_dataset]

        for eval_dataset in eval_datasets:
            trainer.compute_metrics = compute_metrics_fn
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)
            output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
            if trainer.is_world_master():
                with open(output_eval_file, "w") as writer:
                    logger.info("***** Eval results *****")
                    for key, value in eval_result.items():
                        logger.info("  %s = %s", key, value)
                        writer.write("%s = %s\n" % (key, value))
            eval_results.update(eval_result)
def setUp(self):
    super().setUp()
    # We have a SentencePiece fixture for testing
    tokenizer = MBartTokenizer(SAMPLE_VOCAB, keep_accents=True)
    tokenizer.save_pretrained(self.tmpdirname)
def test_full_tokenizer(self):
    tokenizer = MBartTokenizer(SAMPLE_VOCAB, keep_accents=True)

    tokens = tokenizer.tokenize("This is a test")
    self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])
    self.assertListEqual(
        tokenizer.convert_tokens_to_ids(tokens),
        [value + tokenizer.fairseq_offset for value in [285, 46, 10, 170, 382]],
    )

    tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
    self.assertListEqual(
        tokens,
        [
            SPIECE_UNDERLINE + "I",
            SPIECE_UNDERLINE + "was",
            SPIECE_UNDERLINE + "b",
            "or",
            "n",
            SPIECE_UNDERLINE + "in",
            SPIECE_UNDERLINE + "",
            "9",
            "2",
            "0",
            "0",
            "0",
            ",",
            SPIECE_UNDERLINE + "and",
            SPIECE_UNDERLINE + "this",
            SPIECE_UNDERLINE + "is",
            SPIECE_UNDERLINE + "f",
            "al",
            "s",
            "é",
            ".",
        ],
    )
    ids = tokenizer.convert_tokens_to_ids(tokens)
    self.assertListEqual(
        ids,
        [
            value + tokenizer.fairseq_offset
            for value in [8, 21, 84, 55, 24, 19, 7, 2, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 2, 4]
            #                                      ^ unk: 2 + 1 = 3                          unk: 2 + 1 = 3 ^
        ],
    )
    back_tokens = tokenizer.convert_ids_to_tokens(ids)
    self.assertListEqual(
        back_tokens,
        [
            SPIECE_UNDERLINE + "I",
            SPIECE_UNDERLINE + "was",
            SPIECE_UNDERLINE + "b",
            "or",
            "n",
            SPIECE_UNDERLINE + "in",
            SPIECE_UNDERLINE + "",
            "<unk>",
            "2",
            "0",
            "0",
            "0",
            ",",
            SPIECE_UNDERLINE + "and",
            SPIECE_UNDERLINE + "this",
            SPIECE_UNDERLINE + "is",
            SPIECE_UNDERLINE + "f",
            "al",
            "s",
            "<unk>",
            ".",
        ],
    )
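# The fairseq_offset used above exists because mBART reserves the first ids for
# fairseq's <s>/<pad>/</s>/<unk>, shifting every SentencePiece id by a constant.
# A minimal extra check, assuming the same SAMPLE_VOCAB fixture as above:
def test_fairseq_offset_is_constant_shift(self):
    tokenizer = MBartTokenizer(SAMPLE_VOCAB, keep_accents=True)
    piece_id = tokenizer.sp_model.PieceToId("▁This")
    self.assertEqual(tokenizer.convert_tokens_to_ids(["▁This"])[0],
                     piece_id + tokenizer.fairseq_offset)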
TGT_DATA = "./data_tgt_de.txt"
SRC_DATA = "./data_source_hsb.txt"

from transformers import MBartForConditionalGeneration, MBartTokenizer, MBartModel, MBartConfig

# Read from the data files
src_txts = []
tgt_txts = []
with open(SRC_DATA) as f:
    for line in f:
        src_txts.append(line)
with open(TGT_DATA) as f:
    for line in f:
        tgt_txts.append(line)

tokenizer = MBartTokenizer.from_pretrained('./tokenizer_de_hsb.model')
# NB: en_XX/ro_RO are placeholder language codes; the data is actually hsb -> de.
batch = tokenizer.prepare_seq2seq_batch(src_texts=src_txts, src_lang="en_XX",
                                        tgt_texts=tgt_txts, tgt_lang="ro_RO",
                                        return_tensors="pt")

config = MBartConfig()
model = MBartModel(config)
model(input_ids=batch['input_ids'], decoder_input_ids=batch['labels'])  # forward pass
model.save_pretrained('./trained_model')
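# prepare_seq2seq_batch is deprecated in recent transformers releases; a hedged
# equivalent using the __call__ API (assumes transformers >= 4.22, where
# text_target is available, and keeps the same placeholder language codes):
tokenizer.src_lang, tokenizer.tgt_lang = "en_XX", "ro_RO"
batch = tokenizer(src_txts, text_target=tgt_txts, padding=True, return_tensors="pt")
model(input_ids=batch["input_ids"], decoder_input_ids=batch["labels"])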