def test_multilingual_translation(self):
    model = MBartForConditionalGeneration.from_pretrained(
        "facebook/mbart-large-50-many-to-many-mmt")
    tokenizer = MBart50TokenizerFast.from_pretrained(
        "facebook/mbart-large-50-many-to-many-mmt")

    translator = pipeline(task="translation", model=model, tokenizer=tokenizer)

    # Missing src_lang, tgt_lang
    with self.assertRaises(ValueError):
        translator("This is a test")

    outputs = translator("This is a test", src_lang="en_XX", tgt_lang="ar_AR")
    self.assertEqual(outputs, [{"translation_text": "هذا إختبار"}])

    outputs = translator("This is a test", src_lang="en_XX", tgt_lang="hi_IN")
    self.assertEqual(outputs, [{"translation_text": "यह एक परीक्षण है"}])

    # src_lang, tgt_lang can also be defined at pipeline instantiation time
    translator = pipeline(task="translation", model=model, tokenizer=tokenizer,
                          src_lang="en_XX", tgt_lang="ar_AR")
    outputs = translator("This is a test")
    self.assertEqual(outputs, [{"translation_text": "هذا إختبار"}])
def predict(model_name, test_file, output_file, batch_size,
            max_source_tokens_count, max_target_tokens_count, use_cuda):
    tokenizer = MBartTokenizer.from_pretrained(model_name)
    test_dataset = MBartSummarizationDataset(test_file, tokenizer,
                                             max_source_tokens_count,
                                             max_target_tokens_count)
    device = torch.device("cuda:0") if use_cuda else torch.device("cpu")

    model = MBartForConditionalGeneration.from_pretrained(model_name)
    model.to(device)

    predictions = []
    for batch in test_dataset:
        summaries = model.generate(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            num_beams=5,
            length_penalty=1.0,
            max_length=max_target_tokens_count + 2,
            min_length=5,
            no_repeat_ngram_size=0,
            early_stopping=True)
        for s in summaries:
            p = tokenizer.decode(s, skip_special_tokens=True,
                                 clean_up_tokenization_spaces=False)
            predictions.append(p)

    with open(output_file, "w") as w:
        for p in predictions:
            w.write(p.strip() + "\n")
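# A minimal invocation sketch for predict() above (hedged: the checkpoint name,
# file paths, and length limits are placeholders, not values from the original project).
predict(
    model_name="path/to/finetuned-mbart-checkpoint",
    test_file="test.jsonl",
    output_file="predictions.txt",
    batch_size=4,
    max_source_tokens_count=600,
    max_target_tokens_count=160,
    use_cuda=True,
)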
def __init__(self) -> None:
    self.model = MBartForConditionalGeneration.from_pretrained(
        "facebook/mbart-large-50-many-to-many-mmt"
    )
    self.tokenizer = MBart50TokenizerFast.from_pretrained(
        "facebook/mbart-large-50-many-to-many-mmt"
    )
def __init__(
    self,
    model_or_path: str = "facebook/mbart-large-50-many-to-many-mmt",
    tokenizer_path: str = None,
    device: str = "auto",
    model_options: dict = None,
    tokenizer_options: dict = None,
):
    """
    Instantiates a multilingual transformer model for translation.

    {{params}}
        {{model_or_path}} The path or the name of the model. Equivalent to the
            first argument of AutoModel.from_pretrained().
        {{device}} "cpu", "gpu" or "auto". If set to "auto", tries to select a
            GPU when available and falls back to CPU otherwise.
        {{tokenizer_path}} The path to the tokenizer, only if it is different
            from `model_or_path`; otherwise, leave it as `None`.
        {{model_options}} The keyword arguments passed to the transformer model,
            which is an mBART-large model for conditional generation.
        {{tokenizer_options}} The keyword arguments passed to the tokenizer,
            which is an mBART-50 fast tokenizer.
    """
    self.model_or_path = model_or_path
    self.device = _select_device(device)

    # Resolve default values
    tokenizer_path = tokenizer_path or self.model_or_path
    model_options = model_options or {}
    tokenizer_options = tokenizer_options or {}

    self.tokenizer = MBart50TokenizerFast.from_pretrained(
        tokenizer_path, **tokenizer_options)

    if model_or_path.endswith(".pt"):
        self.bart_model = torch.load(model_or_path,
                                     map_location=self.device).eval()
    else:
        self.bart_model = (MBartForConditionalGeneration.from_pretrained(
            self.model_or_path, **model_options).to(self.device).eval())
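# Usage sketch for the constructor above. The enclosing class is not shown in this
# snippet, so `Translator` is a placeholder name; the options dicts are illustrative
# and are forwarded verbatim to the respective from_pretrained() calls.
translator = Translator(
    model_or_path="facebook/mbart-large-50-many-to-many-mmt",
    device="auto",                 # resolved by _select_device()
    model_options={},              # e.g. cache_dir or revision kwargs, if desired
    tokenizer_options={},
)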
def model(self):
    """Only load the model if needed."""
    model = MBartForConditionalGeneration.from_pretrained(
        self.checkpoint_name).to(torch_device)
    if "cuda" in torch_device:
        model = model.half()
    return model
def get_summarization_agents():
    agents = {
        "model": MBartForConditionalGeneration.from_pretrained(
            "vasudevgupta/mbart-summarizer-interiit"),
        "tokenizer": MBartTokenizer.from_pretrained("facebook/mbart-large-cc25"),
    }
    return agents
def __init__(self, src_lang, tgt_lang):
    super().__init__()
    self.batch_size = 16
    self.lr = 3e-5
    self.src_lang = src_lang
    self.tgt_lang = tgt_lang
    self.model = MBartForConditionalGeneration.from_pretrained(
        "facebook/mbart-large-en-ro"
    )
def __init__(self):
    try:
        # Use Facebook's many-to-many multilingual translation model
        model_name = "facebook/mbart-large-50-many-to-many-mmt"
        self.model = MBartForConditionalGeneration.from_pretrained(model_name)
        self.tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
    except Exception as e:
        logging.error(f"Error initializing model. {e}")
def __init__(self):
    self.model = MBartForConditionalGeneration.from_pretrained(
        'facebook/mbart-large-50-many-to-many-mmt')
    self.tokenizer = MBart50TokenizerFast.from_pretrained(
        'facebook/mbart-large-50-many-to-many-mmt')
    self.supported_langs = [
        'en_XX', 'gu_IN', 'hi_IN', 'bn_IN', 'ml_IN', 'mr_IN', 'ta_IN', 'te_IN'
    ]
def __init__(
    self,
    hparams: Namespace,
):
    super().__init__()
    self.hparams = hparams
    self.tokenizer = MBartTokenizer.from_pretrained(
        self.hparams.model_checkpoint)
    self.model = MBartForConditionalGeneration.from_pretrained(
        self.hparams.model_checkpoint)
def __init__(self, config):
    model_name = config.get("model_name", None)
    model_path = config.get("model_path", None)
    device = config.get("device", 0)  # default on GPU 0

    self.tokenizer = MBart50TokenizerFast.from_pretrained(model_path)
    self.model = MBartForConditionalGeneration.from_pretrained(model_path)
    self.model.eval()
    self.model.half()

    self.device = torch.device(
        "cpu" if device < 0 else "cuda:{}".format(device))
    if self.device.type == "cuda":
        self.model = self.model.to(self.device)
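# Illustrative config for the constructor above (keys taken from the snippet; the
# checkpoint path is a placeholder). Note that the snippet puts the model in half
# precision unconditionally, so a CUDA device (device >= 0) is effectively assumed.
config = {
    "model_name": "mbart50-mmt",              # read but unused by this constructor
    "model_path": "/path/to/mbart50-checkpoint",
    "device": 0,                              # -1 selects CPU, >= 0 selects cuda:<device>
}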
def generate_summaries_or_translations(
    examples: List[str],
    out_file: str,
    model_name: str,
    batch_size: int = 8,
    device: str = DEFAULT_DEVICE,
    fp16=False,
    task="summarization",
    prefix=None,
    **generate_kwargs,
) -> Dict:
    """Save model.generate results to <out_file>, and return how long it took."""
    fout = Path(out_file).open("w", encoding="utf-8")
    model_name = str(model_name)
    # model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
    model = MBartForConditionalGeneration.from_pretrained(model_name).to(device)
    if fp16:
        model = model.half()

    tokenizer = MBartTokenizer.from_pretrained(model_name)
    # tokenizer = AutoTokenizer.from_pretrained(model_name)
    # logger.info(f"Inferred tokenizer type: {tokenizer.__class__}")  # if this is wrong, check config.model_type.

    start_time = time.time()
    # update config with task specific params
    use_task_specific_params(model, task)
    if prefix is None:
        prefix = prefix or getattr(model.config, "prefix", "") or ""
    for examples_chunk in tqdm(list(chunks(examples, batch_size))):
        examples_chunk = [prefix + text for text in examples_chunk]
        batch = tokenizer(examples_chunk, return_tensors="pt",
                          truncation=True, padding="longest").to(device)
        summaries = model.generate(
            input_ids=batch.input_ids,
            attention_mask=batch.attention_mask,
            # **generate_kwargs,
        )
        dec = tokenizer.batch_decode(summaries, skip_special_tokens=True,
                                     clean_up_tokenization_spaces=False)
        for hypothesis in dec:
            fout.write(hypothesis + "\n")
            fout.flush()
    fout.close()
    runtime = int(time.time() - start_time)  # seconds
    n_obs = len(examples)
    return dict(n_obs=n_obs, runtime=runtime,
                seconds_per_sample=round(runtime / n_obs, 4))
def __init__(self, cfg_path, cfg_name):
    """
    Constructor of BartForSeq2SeqLM

    Args:
        cfg_path (str): parent path
        cfg_name (str): config file name
    """
    super().__init__(**self.load_args(cfg_path, cfg_name))
    self.model = MBartForConditionalGeneration.from_pretrained(
        "facebook/mbart-large-cc25")
    if self.precision == 16:
        self.model = self.model.half()
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_path = os.path.join(bolt.ARTIFACT_DIR, 'MBart_translation.pt')

    tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")
    model = MBartForConditionalGeneration.from_pretrained(
        'facebook/mbart-large-cc25')

    print("loading model")
    model.load_state_dict(torch.load(model_path))
    print("model loaded")

    sentences_lst = "i love you"
    result = translate(sentences_lst, tokenizer, model, 3, device)
    print(result)
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25')
    model = MBartForConditionalGeneration.from_pretrained('facebook/mbart-large-cc25')

    # example_english_phrase = ["I love you", 'you hate i']
    # expected_translation_chinese = ["我中意你", '你憎我']

    print("Loading and processing data")
    en, yue = read_file("../MARIAN/en2yue/train.en", "../MARIAN/en2yue/train.yue")
    val_en, val_yue = read_file("../MARIAN/en2yue/val.en", '../MARIAN/en2yue/val.yue')

    train_dataset = token_(tokenizer, en, yue)
    loader = create_data_loader(train_dataset, 8)
    val_dataset = token_(tokenizer, val_en, val_yue)
    val_loader = create_data_loader(val_dataset, 8)

    EPOCHS = 10
    optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
    total_steps = len(loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)

    resultdir = bolt.ARTIFACT_DIR
    MODEL_SAVE_PATH = os.path.join(resultdir, 'MBart_translation.pt')

    print("Start training")
    best_val_loss = float('inf')
    for epoch in range(EPOCHS):
        print(f'Epoch {epoch + 1}/{EPOCHS}')
        print('-' * 30)
        train_loss = train_epoch(model, loader, optimizer, scheduler, device)
        val_loss = evaluate_epoch(model, val_loader, device)
        print(f'Train_loss: {train_loss} | Val_loss: {val_loss}')
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), MODEL_SAVE_PATH)
        bolt.send_metrics({
            "Total_train_loss": train_loss,
            "Total_val_loss": val_loss
        })
def load(self, path):
    """
    Loads a model specified by path.

    Args:
        path: model path

    Returns:
        (model, tokenizer)
    """
    if path.startswith("Helsinki-NLP"):
        model = MarianMTModel.from_pretrained(path)
        tokenizer = MarianTokenizer.from_pretrained(path)
    else:
        model = MBartForConditionalGeneration.from_pretrained(path)
        tokenizer = MBart50TokenizerFast.from_pretrained(path)

    # Apply model initialization routines
    model = self.prepare(model)

    return (model, tokenizer)
from transformers import MBartForConditionalGeneration, MBartTokenizer

model = MBartForConditionalGeneration.from_pretrained(
    "/data00/wuwei.ai/code/transformers/examples/seq2seq/zhen_finetune_04/best_tfmr"
)
# model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-cc25")
# model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro")
tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")

article = "中国人民站起来了!"
batch = tokenizer.prepare_seq2seq_batch(src_texts=[article],
                                        src_lang='zh_CN', tgt_lang='en_XX')
translated_tokens = model.generate(**batch)
translation = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
print(translation)
def from_pretrained(cls, model_name_or_path, *model_args, **kwargs):
    kwargs.pop("from_pt", None)
    model = MBartForConditionalGeneration.from_pretrained(
        model_name_or_path, *model_args, **kwargs)
    use_cache = model.config.use_cache

    origin_forward = model.forward

    def forward(*args, **kwargs):
        outputs = origin_forward(*args, **kwargs)
        if outputs.past_key_values is not None:
            # Multiply by 1.0 to workaround a bug in OpenVINO 2022.1 with
            # dynamic shapes inputs connected to model outputs:
            past_key_values = []
            for i in range(12):
                past_key_values.append((
                    outputs.past_key_values[i][0],
                    outputs.past_key_values[i][1],
                    outputs.past_key_values[i][2] * 1.0,
                    outputs.past_key_values[i][3] * 1.0,
                ))
            outputs.past_key_values = tuple(past_key_values)
        return Seq2SeqLMOutput(
            logits=outputs.logits,
            past_key_values=outputs.past_key_values,
        )

    model.forward = lambda *args, **kwargs: forward(*args, **kwargs)

    # Create a separate network for the encoder - it will be called just once.
    encoder = load_ov_model_from_pytorch(model.get_encoder())

    inputs = {
        "input_ids": None,
        "attention_mask": torch.zeros([1, 11], dtype=torch.int32),
        "decoder_input_ids": torch.zeros([1, 1 if use_cache else 11], dtype=torch.int32),
        "decoder_attention_mask": None,
        "head_mask": None,
        "decoder_head_mask": None,
        "cross_attn_head_mask": None,
        "encoder_outputs": [torch.zeros([1, 11, 1024], dtype=torch.float32)],
    }
    net = load_ov_model_from_pytorch(model, inputs)

    # Fix for the 2022.1 release
    if is_openvino_api_2:
        net.inputs[2].get_tensor().set_names(set(["encoder_outputs"]))

    if use_cache:
        inputs["past_key_values"] = [[
            torch.zeros([1, 16, 1, 64], dtype=torch.float32),
            torch.zeros([1, 16, 1, 64], dtype=torch.float32),
            torch.zeros([1, 16, 11, 64], dtype=torch.float32),
            torch.zeros([1, 16, 11, 64], dtype=torch.float32),
        ]] * 12
        net_past = load_ov_model_from_pytorch(model, inputs)
    else:
        net_past = None

    return OVMBartForConditionalGeneration(model.config, encoder, net, net_past)
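# Usage sketch for the factory method above (hedged: OVMBartForConditionalGeneration
# and load_ov_model_from_pytorch come from the surrounding OpenVINO integration, not
# from transformers itself; the model id is illustrative).
ov_model = OVMBartForConditionalGeneration.from_pretrained(
    "facebook/mbart-large-50-many-to-many-mmt")
# The returned wrapper holds separate OpenVINO networks for the encoder and for the
# decoder (with and without cached past_key_values), traced from the PyTorch model.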
def download_model():
    model_name = "facebook/mbart-large-50-many-to-many-mmt"
    model = MBartForConditionalGeneration.from_pretrained(model_name)
    tokenizer = MBart50Tokenizer.from_pretrained(model_name)
    return model, tokenizer
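# Example of using the downloaded pair (a minimal sketch; the sentence and the target
# language code are illustrative, and the forced_bos_token_id pattern is the standard
# mBART-50 translation recipe used elsewhere in this collection).
model, tokenizer = download_model()
tokenizer.src_lang = "en_XX"
encoded = tokenizer("This is a test", return_tensors="pt")
generated = model.generate(
    **encoded, forced_bos_token_id=tokenizer.lang_code_to_id["fr_XX"])
print(tokenizer.batch_decode(generated, skip_special_tokens=True))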
# hf-experiments
# @author Loreto Parisi (loretoparisi at gmail dot com)
# Copyright (c) 2020-2021 Loreto Parisi (loretoparisi at gmail dot com)
# HF: https://huggingface.co/facebook/mbart-large-50-one-to-many-mmt

import os

from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

article_en = "The head of the United Nations says there is no military solution in Syria"

model = MBartForConditionalGeneration.from_pretrained(
    "facebook/mbart-large-50-one-to-many-mmt",
    cache_dir=os.getenv("cache_dir", "../../models"))
tokenizer = MBart50TokenizerFast.from_pretrained(
    "facebook/mbart-large-50-one-to-many-mmt",
    src_lang="en_XX",
    cache_dir=os.getenv("cache_dir", "../../models"))

model_inputs = tokenizer(article_en, return_tensors="pt")

# translate from English to Hindi
generated_tokens = model.generate(
    **model_inputs,
    forced_bos_token_id=tokenizer.lang_code_to_id["hi_IN"])
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
# => 'संयुक्त राष्ट्र के नेता कहते हैं कि सीरिया में कोई सैन्य समाधान नहीं है'

# translate from English to Chinese
generated_tokens = model.generate(
    **model_inputs,
    forced_bos_token_id=tokenizer.lang_code_to_id["zh_CN"])
decoded = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
# => '联合国首脑说,叙利亚没有军事解决办法'
print(decoded)
    length = len(data)
    num_batch = int(np.ceil(length / bs))
    for i in range(num_batch):
        begin = i * bs
        stop = min((i + 1) * bs, length)
        source = src[begin:stop]
        target = tgt[begin:stop]
        sources = tokenizer(source, return_tensors='pt', max_length=ml,
                            padding=True, truncation=True)
        targets = tokenizer(target, return_tensors='pt', max_length=256,
                            padding=True, truncation=True)
        tar_ids = targets['input_ids']
        tar_mask = targets['attention_mask']
        src_ids = sources['input_ids']
        src_mask = sources['attention_mask']
        if peg:
            prefix = torch.tensor([0]).unsqueeze(0).repeat_interleave(tar_ids.shape[0], 0)
            tar_ids = torch.cat((prefix, tar_ids), 1)
            prefix = torch.tensor([1]).unsqueeze(0).repeat_interleave(tar_mask.shape[0], 0)
            tar_mask = torch.cat((prefix, tar_mask), 1)
        yield src_ids, tar_ids, src_mask, tar_mask


if __name__ == '__main__':
    from transformers import MBartForConditionalGeneration, MBartTokenizer

    model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro",
                                                          use_cache=False)
    tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro")
    gen_bt(4, tokenizer, 'val', dataset='wmt', shuffle=False)
def setup(self, process_on_fly=True, n_augment=0):
    if process_on_fly:
        data = load_dataset("csv", data_files=self.file_path)["train"]
        data = data.map(
            lambda x: {"article_length": len(x["Text"].split())})
        data = data.map(
            lambda x: {"summary_length": len(x["Headline"].split())})
        data = data.map(lambda x: {
            "CleanedText": preprocess_article(x["cleaned"], self.sep_token)
        })
        data = data.map(lambda x: {"CleanedHeadline": x["Headline"]})
        fn_kwargs = {
            "model": MBartForConditionalGeneration.from_pretrained(
                "vasudevgupta/mbart-iitb-hin-eng"),
            "tokenizer": MBartTokenizer.from_pretrained(
                "vasudevgupta/mbart-iitb-hin-eng"),
            "max_pred_length": 32,
        }
        data = data.map(translate, fn_kwargs=fn_kwargs)
        data.to_csv(f"cleaned-{self.file_path}")
    else:
        data = load_dataset(
            "csv", data_files=f"cleaned-{self.file_path}")["train"]

    data = data.filter(
        lambda x: x["article_length"] > 32 and x["summary_length"] > 1)

    removed_samples = data.filter(lambda x: type(x["CleanedHeadline"]) != str
                                  or type(x["CleanedText"]) != str)
    print(removed_samples["CleanedHeadline"])
    print(removed_samples["CleanedText"])

    data = data.filter(lambda x: type(x["CleanedHeadline"]) == str
                       and type(x["CleanedText"]) == str)
    print("Dataset", data)
    # print("Samples with article length > 560 are", data.filter(lambda x: x["article_length"] > 560))

    data = data.train_test_split(test_size=600, shuffle=True, seed=self.seed)
    tr_dataset = data["train"].map(lambda x: {"split": "TRAIN"})
    val_dataset = data["test"].map(lambda x: {"split": "VALIDATION"})

    if n_augment > 0:
        print("AUGMENTING")
        tr_dataset = tr_dataset.map(
            lambda x: {"augmentation_status": "Not Augmented"})
        val_dataset = val_dataset.map(
            lambda x: {"augmentation_status": "Not Augmented"})

        noisy_dataset = tr_dataset.filter(
            lambda x: x["Mobile_Tech_Flag"] == 1)
        noisy_datasets = []
        for _ in range(n_augment):
            noisy_datasets.append(
                noisy_dataset.map(lambda x: {
                    "CleanedText": get_noisy_sent(x["CleanedText"].split())
                }))
        noisy_dataset = concatenate_datasets(noisy_datasets)
        noisy_dataset = noisy_dataset.map(
            lambda x: {"augmentation_status": "Augmented"})
        tr_dataset = concatenate_datasets([noisy_dataset, tr_dataset])

    return tr_dataset, val_dataset
def __init__(self):
    self._model: MBartForConditionalGeneration = MBartForConditionalGeneration.from_pretrained(
        MODEL_PATH)
def __init__(self, model_path: str, device: str = 'cuda') -> None:
    self.device = device
    self.model = MBartForConditionalGeneration.from_pretrained(model_path).to(device)
    self.tokenizer = MBart50Tokenizer.from_pretrained(model_path)
args = getattr(config, p_args.config)
print(args)

## use this for running sweep
# wandb.init(config=args.__dict__)
# args = wandb.config
# print(dict(args))

tokenizer = MBartTokenizer.from_pretrained(args.tokenizer_id)

if args.load_dir:
    bart = MBartForConditionalGeneration(args.bart_config)
    print(f"model is loaded from {args.load_dir}")
else:
    bart = MBartForConditionalGeneration.from_pretrained(args.model_id)
    print(f"model is loaded from {args.model_id}")

print("====Working on layers freezing====")
bart.ffn_requires_grad_(args.enc_ffn_grad, args.dec_ffn_grad)
bart.attn_requires_grad_(args.enc_attn_grad, args.dec_attn_grad,
                         args.cross_attn_grad)
bart.embed_requires_grad_(args.embed_grad, args.pos_embed_grad)
bart.norm_requires_grad_(args.enc_norm_grad, args.dec_norm_grad,
                         args.cross_attn_norm_grad)

print("====Working on adding adapters====")
bart.add_adapter_(
    args.enc_ffn_adapter,
    args.dec_ffn_adapter,
    args.enc_self_attn_adapter,
    args.dec_self_attn_adapter,
    args.cross_attn_adapter,
    args.enc_tok_embed_adapter,
    args.dec_tok_embed_adapter,
def get_pipeline():
    model = MBartForConditionalGeneration.from_pretrained(
        "facebook/mbart-large-50-many-to-many-mmt")
    tokenizer = MBart50TokenizerFast.from_pretrained(
        "facebook/mbart-large-50-many-to-many-mmt")
    return model, tokenizer
def main(params):
    """Finetunes the mBart50 model on some languages and then evaluates the BLEU
    score for each direction."""

    if params.wandb:
        wandb.init(project='mnmt', entity='nlp-mnmt-project', group='finetuning',
                   config={k: v for k, v in params.__dict__.items()
                           if isinstance(v, (float, int, str, list))})

    new_root_path = params.location
    new_name = params.name
    logger = logging.TrainLogger(params)
    logger.make_dirs()
    logger.save_params()

    # load model and tokenizer
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50")
    model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50").to(device)
    optimizer = torch.optim.Adam(model.parameters())

    # scale in terms of max lr
    lr_scale = params.max_lr * np.sqrt(params.warmup_steps)
    scheduler = WarmupDecay(optimizer, params.warmup_steps, 1, lr_scale=lr_scale)

    # set dropout
    model.config.dropout = params.dropout
    model.config.attention_dropout = params.dropout

    def pipeline(dataset, langs, batch_size, max_len):

        cols = ['input_ids_' + l for l in langs]

        def tokenize_fn(example):
            """apply tokenization"""
            l_tok = []
            for lang in langs:
                encoded = tokenizer.encode(example[lang])
                encoded[0] = tokenizer.lang_code_to_id[LANG_CODES[lang]]
                l_tok.append(encoded)
            return {'input_ids_' + l: tok for l, tok in zip(langs, l_tok)}

        def pad_seqs(examples):
            """Apply padding"""
            ex_langs = list(zip(*[tuple(ex[col] for col in cols) for ex in examples]))
            ex_langs = tuple(pad_sequence(x, batch_first=True, max_len=max_len)
                             for x in ex_langs)
            return ex_langs

        dataset = filter_languages(dataset, langs)
        dataset = dataset.map(tokenize_fn)
        dataset.set_format(type='torch', columns=cols)
        num_examples = len(dataset)
        print('-'.join(langs) + ' : {} examples.'.format(num_examples))
        dataloader = torch.utils.data.DataLoader(dataset,
                                                 batch_size=batch_size,
                                                 collate_fn=pad_seqs)
        return dataloader, num_examples

    # load data
    dataset = load_dataset('ted_multi')
    train_dataset = dataset['train']
    test_dataset = dataset['validation' if params.split == 'val' else 'test']

    # preprocess splits for each direction
    num_train_examples = {}
    train_dataloaders, val_dataloaders, test_dataloaders = {}, {}, {}
    for l1, l2 in combinations(params.langs, 2):
        train_dataloaders[l1 + '-' + l2], num_train_examples[l1 + '-' + l2] = pipeline(
            train_dataset, [l1, l2], params.batch_size, params.max_len)
        test_dataloaders[l1 + '-' + l2], _ = pipeline(
            test_dataset, [l1, l2], params.batch_size, params.max_len)

    # print dataset sizes
    for direction, num in num_train_examples.items():
        print(direction, ': {} examples.'.format(num))

    def freeze_layers(layers, unfreeze=False):
        for n in layers:
            for parameter in model.model.encoder.layers[n].parameters():
                parameter.requires_grad = unfreeze

    # define loss function
    if params.label_smoothing is not None:
        loss_object = LabelSmoothingLoss(params.label_smoothing)
        loss_fn = lambda out, tar: loss_object(out.logits, tar)
    else:
        loss_fn = lambda out, tar: out.loss

    # train the model
    _target = torch.tensor(1.0).to(device)

    def train_step(x, y, aux=False):

        y_inp, y_tar = y[:, :-1].contiguous(), y[:, 1:].contiguous()
        enc_mask, dec_mask = (x != 0), (y_inp != 0)

        x, y_inp, y_tar, enc_mask, dec_mask = to_devices(
            (x, y_inp, y_tar, enc_mask, dec_mask), device)

        model.train()
        if aux:
            freeze_layers(params.frozen_layers, unfreeze=True)
        output = model(input_ids=x, decoder_input_ids=y_inp,
                       labels=y_tar, attention_mask=enc_mask,
                       decoder_attention_mask=dec_mask)
        optimizer.zero_grad()
        loss = loss_fn(output, y_tar)
        loss.backward(retain_graph=aux)

        if aux:
            freeze_layers(params.frozen_layers)

        torch.set_grad_enabled(aux)

        x_enc = output.encoder_last_hidden_state
        y_enc = model.model.encoder(y_inp, attention_mask=dec_mask)['last_hidden_state']
        x_enc = torch.max(x_enc + -999 * (1 - enc_mask.type(x_enc.dtype)).unsqueeze(-1), dim=1)[0]
        y_enc = torch.max(y_enc + -999 * (1 - dec_mask.type(y_enc.dtype)).unsqueeze(-1), dim=1)[0]
        aux_loss = F.cosine_embedding_loss(x_enc, y_enc, _target)
        scaled_aux_loss = params.aux_strength * aux_loss

        torch.set_grad_enabled(True)
        if aux:
            scaled_aux_loss.backward()

        optimizer.step()
        scheduler.step()

        accuracy = accuracy_fn(output.logits, y_tar)

        return loss.item(), aux_loss.item(), accuracy.item()

    # prepare iterators
    iterators = {direction: iter(loader) for direction, loader in train_dataloaders.items()}

    # compute sampling probabilities (and set zero shot directions to 0)
    num_examples = num_train_examples.copy()
    zero_shots = [(params.zero_shot[i] + '-' + params.zero_shot[i + 1])
                  for i in range(0, len(params.zero_shot), 2)]
    for d in zero_shots:
        num_examples[d] = 0
    directions, num_examples = list(num_examples.keys()), np.array(list(num_examples.values()))
    dir_dist = (num_examples ** params.temp) / ((num_examples ** params.temp).sum())

    # train
    losses, aux_losses, accs = [], [], []
    start_ = time.time()
    for i in range(params.train_steps):

        # sample a direction
        direction = directions[int(np.random.choice(len(num_examples), p=dir_dist))]
        try:  # check iterator is not exhausted
            x, y = next(iterators[direction])
        except StopIteration:
            iterators[direction] = iter(train_dataloaders[direction])
            x, y = next(iterators[direction])
        x, y = get_direction(x, y, sample=not params.single_direction)

        # train on the direction
        loss, aux_loss, acc = train_step(x, y, aux=params.auxiliary)
        losses.append(loss)
        aux_losses.append(aux_loss)
        accs.append(acc)

        if i % params.verbose == 0:
            print('Batch {} Loss {:.4f} Aux Loss {:.4f} Acc {:.4f} in {:.4f} secs per batch'.format(
                i, np.mean(losses[-params.verbose:]), np.mean(aux_losses[-params.verbose:]),
                np.mean(accs[-params.verbose:]), (time.time() - start_) / (i + 1)))
        if params.wandb:
            wandb.log({'train_loss': loss, 'aux_loss': aux_loss, 'train_acc': acc})

    # save results
    if params.save:
        logger.save_model(params.train_steps, model, optimizer, scheduler=scheduler)

    train_results = {'loss': [np.mean(losses)],
                     'aux_loss': [np.mean(aux_losses)],
                     'accuracy': [np.mean(accs)]}
    pd.DataFrame(train_results).to_csv(logger.root_path + '/train_results.csv', index=False)

    # evaluate the model
    def evaluate(x, y, y_code, bleu):
        y_inp, y_tar = y[:, :-1].contiguous(), y[:, 1:].contiguous()
        enc_mask = (x != 0)
        x, y_inp, y_tar, enc_mask = to_devices(
            (x, y_inp, y_tar, enc_mask), device)

        model.eval()
        y_pred = model.generate(input_ids=x, decoder_start_token_id=y_code,
                                attention_mask=enc_mask,
                                max_length=params.max_len + 1,
                                num_beams=params.num_beams,
                                length_penalty=params.length_penalty,
                                early_stopping=True)
        bleu(y_pred[:, 1:], y_tar)

    test_results = {}
    for direction, loader in test_dataloaders.items():
        alt_direction = '-'.join(reversed(direction.split('-')))
        bleu1, bleu2 = BLEU(), BLEU()
        bleu1.set_excluded_indices([0, 2])
        bleu2.set_excluded_indices([0, 2])
        x_code = tokenizer.lang_code_to_id[LANG_CODES[direction.split('-')[0]]]
        y_code = tokenizer.lang_code_to_id[LANG_CODES[direction.split('-')[-1]]]

        start_ = time.time()
        for i, (x, y) in enumerate(loader):
            if params.test_batches is not None:
                if i > params.test_batches:
                    break

            evaluate(x, y, y_code, bleu1)
            if not params.single_direction:
                evaluate(y, x, x_code, bleu2)

            if i % params.verbose == 0:
                bl1, bl2 = bleu1.get_metric(), bleu2.get_metric()
                print('Batch {} Bleu1 {:.4f} Bleu2 {:.4f} in {:.4f} secs per batch'.format(
                    i, bl1, bl2, (time.time() - start_) / (i + 1)))
                if params.wandb:
                    wandb.log({'Bleu1': bl1, 'Bleu2': bl2})

        test_results[direction] = [bleu1.get_metric()]
        test_results[alt_direction] = [bleu2.get_metric()]

    # save test_results
    pd.DataFrame(test_results).to_csv(logger.root_path + '/test_results.csv', index=False)

    if params.wandb:
        wandb.finish()
def train(
    model_name,
    train_file,
    val_file,
    batch_size,
    output_dir,
    learning_rate,
    logging_steps,
    eval_steps,
    save_steps,
    warmup_steps,
    num_train_epochs,
    gradient_accumulation_steps,
    max_grad_norm,
    weight_decay,
    max_source_tokens_count,
    max_target_tokens_count,
    fp16_opt_level,
    fp16=False
):
    if fp16:
        print("Using FP16")

    tokenizer = MBartTokenizer.from_pretrained(model_name)
    train_dataset = MBartSummarizationDataset(
        train_file, tokenizer, max_source_tokens_count, max_target_tokens_count)
    val_dataset = MBartSummarizationDataset(
        val_file, tokenizer, max_source_tokens_count, max_target_tokens_count)

    model = MBartForConditionalGeneration.from_pretrained(model_name)

    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        do_train=True,
        do_eval=True,
        overwrite_output_dir=True,
        logging_steps=logging_steps,
        save_steps=save_steps,
        eval_steps=eval_steps,
        learning_rate=learning_rate,
        warmup_steps=warmup_steps,
        num_train_epochs=num_train_epochs,
        gradient_accumulation_steps=gradient_accumulation_steps,
        max_grad_norm=max_grad_norm,
        weight_decay=weight_decay,
        fp16=fp16,
        fp16_opt_level=fp16_opt_level,
        label_smoothing_factor=0.1,
        evaluation_strategy="steps"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )
    trainer.train()
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
import pandas as pd

#########
# Mbart50
#########
path_to_new_dataset = '../../../03_dataset/task_01/subtask1-document/additional_training_data'

model = MBartForConditionalGeneration.from_pretrained(
    "facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained(
    "facebook/mbart-large-50-many-to-many-mmt")

translate_sentence = 'I like icecream.'

# translate English to German
tokenizer.src_lang = "en_XX"
encoded_hi = tokenizer(translate_sentence, return_tensors="pt")
generated_tokens = model.generate(
    **encoded_hi,
    forced_bos_token_id=tokenizer.lang_code_to_id["de_DE"])
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

# ACLED (EN)
acled_en_pos = pd.read_json(f"{path_to_new_dataset}/acled_eng.json",
                            lines=True).rename(columns={
                                "notes": "text",
                                "label": "label"
                            })
acled_en_pos_select = acled_en_pos[:6928]
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    check_output_dir(training_args)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.parallel_mode == ParallelMode.DISTRIBUTED),
        training_args.fp16,
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = MBartConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout",
                          "attention_dropout")
    for p in extra_model_params:
        if getattr(training_args, p, None):
            assert hasattr(
                config, p
            ), f"({config.__class__.__name__}) doesn't have a `{p}` attribute"
            setattr(config, p, getattr(training_args, p))

    tokenizer = MBartTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    # model = MBartForConditionalGeneration.from_pretrained(
    #     model_args.model_name_or_path,
    #     from_tf=".ckpt" in model_args.model_name_or_path,
    #     config=config,
    #     cache_dir=model_args.cache_dir,
    # )
    # model = MBartForConditionalGeneration(config)
    model = MBartForConditionalGeneration.from_pretrained(
        model_args.config_name)

    # use task specific params
    use_task_specific_params(model, data_args.task)

    # set num_beams for evaluation
    if data_args.eval_beams is None:
        data_args.eval_beams = model.config.num_beams

    # set decoder_start_token_id for MBart
    if model.config.decoder_start_token_id is None and isinstance(
            tokenizer, MBartTokenizer):
        assert (data_args.tgt_lang is not None and data_args.src_lang
                is not None), "mBart requires --tgt_lang and --src_lang"
        model.config.decoder_start_token_id = tokenizer.lang_code_to_id[
            data_args.tgt_lang]

    if model_args.freeze_embeds:
        freeze_embeds(model)
    if model_args.freeze_encoder:
        freeze_params(model.get_encoder())
        assert_all_frozen(model.get_encoder())

    dataset_class = Seq2SeqDataset

    # Get datasets
    train_dataset = (dataset_class(
        tokenizer,
        type_path="train",
        data_dir=data_args.data_dir,
        n_obs=data_args.n_train,
        max_target_length=data_args.max_target_length,
        max_source_length=data_args.max_source_length,
        prefix=model.config.prefix or "",
    ) if training_args.do_train else None)
    eval_dataset = (dataset_class(
        tokenizer,
        type_path="val",
        data_dir=data_args.data_dir,
        n_obs=data_args.n_val,
        max_target_length=data_args.val_max_target_length,
        max_source_length=data_args.max_source_length,
        prefix=model.config.prefix or "",
    ) if training_args.do_eval
      or training_args.evaluation_strategy != EvaluationStrategy.NO else None)
    test_dataset = (dataset_class(
        tokenizer,
        type_path="test",
        data_dir=data_args.data_dir,
        n_obs=data_args.n_test,
        max_target_length=data_args.test_max_target_length,
        max_source_length=data_args.max_source_length,
        prefix=model.config.prefix or "",
    ) if training_args.do_predict else None)

    # Initialize our Trainer
    compute_metrics_fn = (build_compute_metrics_fn(data_args.task, tokenizer)
                          if training_args.predict_with_generate else None)
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=Seq2SeqDataCollator(tokenizer, data_args,
                                          training_args.tpu_num_cores),
        compute_metrics=compute_metrics_fn,
        tokenizer=tokenizer,
    )

    all_metrics = {}

    # Training
    if training_args.do_train:
        logger.info("*** Train ***")

        train_result = trainer.train(
            model_path=model_args.model_name_or_path
            if os.path.isdir(model_args.model_name_or_path) else None)
        metrics = train_result.metrics
        metrics["train_n_objs"] = data_args.n_train

        trainer.save_model()  # this also saves the tokenizer

        if trainer.is_world_process_zero():
            handle_metrics("train", metrics, training_args.output_dir)
            all_metrics.update(metrics)

            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
            trainer.state.save_to_json(
                os.path.join(training_args.output_dir, "trainer_state.json"))

            # For convenience, we also re-save the tokenizer to the same directory,
            # so that you can share your model easily on huggingface.co/models =)
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate(metric_key_prefix="val",
                                   max_length=data_args.val_max_target_length,
                                   num_beams=data_args.eval_beams)
        metrics["val_n_objs"] = data_args.n_val
        metrics["val_loss"] = round(metrics["val_loss"], 4)

        if trainer.is_world_process_zero():
            handle_metrics("val", metrics, training_args.output_dir)
            all_metrics.update(metrics)

    if training_args.do_predict:
        logger.info("*** Predict ***")

        test_output = trainer.predict(
            test_dataset=test_dataset,
            metric_key_prefix="test",
            max_length=data_args.val_max_target_length,
            num_beams=data_args.eval_beams,
        )
        metrics = test_output.metrics
        metrics["test_n_objs"] = data_args.n_test

        if trainer.is_world_process_zero():
            metrics["test_loss"] = round(metrics["test_loss"], 4)
            handle_metrics("test", metrics, training_args.output_dir)
            all_metrics.update(metrics)

            if training_args.predict_with_generate:
                test_preds = tokenizer.batch_decode(
                    test_output.predictions,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=True)
                test_preds = lmap(str.strip, test_preds)
                write_txt_file(
                    test_preds,
                    os.path.join(training_args.output_dir, "test_generations.txt"))

    if trainer.is_world_process_zero():
        save_json(all_metrics,
                  os.path.join(training_args.output_dir, "all_results.json"))

    return all_metrics