def prepare_config_and_inputs(self):
    """Create the decoder test inputs and the matching ``MarianConfig``.

    Returns:
        A 4-tuple ``(config, input_ids, attention_mask, lm_labels)``.
        ``attention_mask`` is ``None`` unless ``self.use_attention_mask`` is
        truthy, and ``lm_labels`` is ``None`` unless ``self.use_labels`` is
        truthy.
    """
    input_ids = ids_tensor(
        [self.batch_size, self.decoder_seq_length], self.vocab_size
    )

    # The optional inputs are only generated when the tester asks for them.
    attention_mask = (
        ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)
        if self.use_attention_mask
        else None
    )
    lm_labels = (
        ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
        if self.use_labels
        else None
    )

    # Collect the settings first so the constructor call stays short.
    config_kwargs = {
        "vocab_size": self.vocab_size,
        "d_model": self.d_model,
        "decoder_layers": self.decoder_layers,
        "decoder_ffn_dim": self.decoder_ffn_dim,
        "encoder_attention_heads": self.encoder_attention_heads,
        "decoder_attention_heads": self.decoder_attention_heads,
        "eos_token_id": self.eos_token_id,
        "bos_token_id": self.bos_token_id,
        "use_cache": self.use_cache,
        "pad_token_id": self.pad_token_id,
        "decoder_start_token_id": self.decoder_start_token_id,
        "max_position_embeddings": self.max_position_embeddings,
        "is_encoder_decoder": self.is_encoder_decoder,
    }
    config = MarianConfig(**config_kwargs)

    return config, input_ids, attention_mask, lm_labels
def prepare_config_and_inputs(self):
    """Create a ``MarianConfig`` and the input dictionary built from it.

    Returns:
        A pair ``(config, inputs_dict)`` where ``inputs_dict`` is whatever
        ``prepare_marian_inputs_dict`` produces for the generated ids.
    """
    # All positions except the last one, clipped into [3, vocab_size].
    leading_ids = np.clip(
        ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size),
        3,
        self.vocab_size,
    )
    # The final column is the constant 2 (presumably the EOS id -- confirm).
    last_column = 2 * np.ones((self.batch_size, 1), dtype=np.int64)
    input_ids = np.concatenate((leading_ids, last_column), -1)

    # NOTE(review): 1 and 2 are presumably the pad and decoder-start ids;
    # confirm against the signature of shift_tokens_right.
    decoder_input_ids = shift_tokens_right(input_ids, 1, 2)

    config_kwargs = {
        "vocab_size": self.vocab_size,
        "d_model": self.hidden_size,
        "encoder_layers": self.num_hidden_layers,
        "decoder_layers": self.num_hidden_layers,
        "encoder_attention_heads": self.num_attention_heads,
        "decoder_attention_heads": self.num_attention_heads,
        "encoder_ffn_dim": self.intermediate_size,
        "decoder_ffn_dim": self.intermediate_size,
        "dropout": self.hidden_dropout_prob,
        "attention_dropout": self.attention_probs_dropout_prob,
        "max_position_embeddings": self.max_position_embeddings,
        "eos_token_id": self.eos_token_id,
        "bos_token_id": self.bos_token_id,
        "pad_token_id": self.pad_token_id,
        "initializer_range": self.initializer_range,
        "use_cache": False,
    }
    config = MarianConfig(**config_kwargs)

    return config, prepare_marian_inputs_dict(config, input_ids, decoder_input_ids)
def prepare_config_and_inputs(self):
    """Create a ``MarianConfig`` and the input dictionary built from it.

    Returns:
        A pair ``(config, inputs_dict)`` where ``inputs_dict`` is whatever
        ``prepare_marian_inputs_dict`` produces for the generated ids.
    """
    # Encoder ids are clamped to a minimum of 3. They are generated once:
    # an earlier unclamped draw was discarded without ever being read.
    input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(3)
    input_ids[:, -1] = self.eos_token_id  # Eos Token

    decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

    config = MarianConfig(
        vocab_size=self.vocab_size,
        d_model=self.hidden_size,
        encoder_layers=self.num_hidden_layers,
        decoder_layers=self.num_hidden_layers,
        encoder_attention_heads=self.num_attention_heads,
        decoder_attention_heads=self.num_attention_heads,
        encoder_ffn_dim=self.intermediate_size,
        decoder_ffn_dim=self.intermediate_size,
        dropout=self.hidden_dropout_prob,
        attention_dropout=self.attention_probs_dropout_prob,
        max_position_embeddings=self.max_position_embeddings,
        eos_token_id=self.eos_token_id,
        bos_token_id=self.bos_token_id,
        pad_token_id=self.pad_token_id,
        decoder_start_token_id=self.decoder_start_token_id,
    )
    inputs_dict = prepare_marian_inputs_dict(config, input_ids, decoder_input_ids)
    return config, inputs_dict
def __init__(self, source_dir, eos_token_id=0):
    """Load a Marian ``.npz`` checkpoint and derive the ``MarianConfig`` for it.

    Args:
        source_dir: Directory containing the checkpoint and ``decoder.yml``.
            It is joined with ``/``, so it must support that operator
            (presumably a ``pathlib.Path``).
        eos_token_id: Id stored as both ``eos_token_id`` and
            ``forced_eos_token_id`` in the resulting config.

    Raises:
        ValueError: If the two ``dim-vocabs`` entries differ, if the
            checkpoint contains a ``Wpos`` or ``Wtype`` key, or if the hidden
            size / ``dim-emb`` is not 512.
    """
    npz_path = find_model_file(source_dir)
    self.state_dict = np.load(npz_path)
    cfg = load_config_from_state_dict(self.state_dict)
    if cfg["dim-vocabs"][0] != cfg["dim-vocabs"][1]:
        raise ValueError(
            f"Source and target vocab sizes differ: {cfg['dim-vocabs']}"
        )
    if "Wpos" in self.state_dict:
        raise ValueError("Wpos key in state dictionary")
    self.state_dict = dict(self.state_dict)

    # The pad token is the last row of the embedding matrix returned by
    # add_emb_entries, so the vocabulary size is pad_token_id + 1.
    self.wemb, self.final_bias = add_emb_entries(
        self.state_dict["Wemb"], self.state_dict[BIAS_KEY], 1
    )
    self.pad_token_id = self.wemb.shape[0] - 1
    cfg["vocab_size"] = self.pad_token_id + 1
    # self.state_dict['Wemb'].sha
    self.state_keys = list(self.state_dict.keys())
    if "Wtype" in self.state_dict:
        raise ValueError("Wtype key in state dictionary")
    self._check_layer_entries()
    self.source_dir = source_dir
    self.cfg = cfg

    # The first FFN weight must be 2-D; only its leading dimension is checked.
    hidden_size, _ = self.state_dict["encoder_l1_ffn_W1"].shape
    if hidden_size != 512 or cfg["dim-emb"] != 512:
        # The config key is "dim-emb". Using "dim_emb" here raised KeyError
        # and hid the real error.
        raise ValueError(
            f"Hidden size {hidden_size} and configured size {cfg['dim-emb']} mismatched or not 512"
        )

    # Process decoder.yml
    decoder_yml = cast_marian_config(load_yaml(source_dir / "decoder.yml"))
    check_marian_cfg_assumptions(cfg)
    self.hf_config = MarianConfig(
        vocab_size=cfg["vocab_size"],
        decoder_layers=cfg["dec-depth"],
        encoder_layers=cfg["enc-depth"],
        decoder_attention_heads=cfg["transformer-heads"],
        encoder_attention_heads=cfg["transformer-heads"],
        decoder_ffn_dim=cfg["transformer-dim-ffn"],
        encoder_ffn_dim=cfg["transformer-dim-ffn"],
        d_model=cfg["dim-emb"],
        activation_function=cfg["transformer-aan-activation"],
        pad_token_id=self.pad_token_id,
        eos_token_id=eos_token_id,
        forced_eos_token_id=eos_token_id,
        bos_token_id=0,
        max_position_embeddings=cfg["dim-emb"],
        scale_embedding=True,
        normalize_embedding="n" in cfg["transformer-preprocess"],
        static_position_embeddings=not cfg["transformer-train-position-embeddings"],
        dropout=0.1,  # see opus-mt-train repo/transformer-dropout param.
        # default: add_final_layer_norm=False,
        num_beams=decoder_yml["beam-size"],
        decoder_start_token_id=self.pad_token_id,
        bad_words_ids=[[self.pad_token_id]],
        max_length=512,
    )
def __init__(self, parent):
    """Store a small, fixed ``MarianConfig`` on ``self.config``.

    Args:
        parent: Accepted for interface compatibility; not used here.
    """
    tiny_settings = {
        "vocab_size": 99,
        "d_model": 24,
        "encoder_layers": 2,
        "decoder_layers": 2,
        "encoder_attention_heads": 2,
        "decoder_attention_heads": 2,
        "encoder_ffn_dim": 32,
        "decoder_ffn_dim": 32,
        "max_position_embeddings": 48,
        "add_final_layer_norm": True,
    }
    self.config = MarianConfig(**tiny_settings)
def __init__(self, source_dir):
    """Load a Marian ``.npz`` checkpoint and derive the ``MarianConfig`` for it.

    Args:
        source_dir: Directory containing the checkpoint and ``decoder.yml``.
            It is joined with ``/``, so it must support that operator
            (presumably a ``pathlib.Path``).

    Raises:
        ValueError: If the two ``dim-vocabs`` entries differ, if the
            checkpoint contains a ``Wpos`` or ``Wtype`` key, or if the hidden
            size and ``dim-emb`` are not both 512.
    """
    npz_path = find_model_file(source_dir)
    self.state_dict = np.load(npz_path)
    cfg = load_config_from_state_dict(self.state_dict)

    # Explicit raises instead of ``assert``: these validate external data and
    # must keep running under ``python -O``.
    if cfg["dim-vocabs"][0] != cfg["dim-vocabs"][1]:
        raise ValueError(
            f"Source and target vocab sizes differ: {cfg['dim-vocabs']}"
        )
    if "Wpos" in self.state_dict:
        raise ValueError("found Wpos key")
    self.state_dict = dict(self.state_dict)

    # The pad token is the last row of the embedding matrix returned by
    # add_emb_entries, so the vocabulary size is pad_token_id + 1.
    self.wemb, self.final_bias = add_emb_entries(
        self.state_dict["Wemb"], self.state_dict[BIAS_KEY], 1
    )
    self.pad_token_id = self.wemb.shape[0] - 1
    cfg["vocab_size"] = self.pad_token_id + 1
    # self.state_dict['Wemb'].sha
    self.state_keys = list(self.state_dict.keys())
    if "Wtype" in self.state_dict:
        raise ValueError("found Wtype key")
    self._check_layer_entries()
    self.source_dir = source_dir
    self.cfg = cfg

    # The first FFN weight must be 2-D; only its leading dimension is checked.
    hidden_size, _ = self.state_dict["encoder_l1_ffn_W1"].shape
    if not (hidden_size == cfg["dim-emb"] == 512):
        raise ValueError(
            f"Hidden size {hidden_size} and configured size {cfg['dim-emb']} mismatched or not 512"
        )

    # Process decoder.yml
    decoder_yml = cast_marian_config(load_yaml(source_dir / "decoder.yml"))
    # TODO: what are normalize and word-penalty?
    check_marian_cfg_assumptions(cfg)
    self.hf_config = MarianConfig(
        vocab_size=cfg["vocab_size"],
        decoder_layers=cfg["dec-depth"],
        encoder_layers=cfg["enc-depth"],
        decoder_attention_heads=cfg["transformer-heads"],
        encoder_attention_heads=cfg["transformer-heads"],
        decoder_ffn_dim=cfg["transformer-dim-ffn"],
        encoder_ffn_dim=cfg["transformer-dim-ffn"],
        d_model=cfg["dim-emb"],
        activation_function=cfg["transformer-aan-activation"],
        pad_token_id=self.pad_token_id,
        eos_token_id=0,
        bos_token_id=0,
        max_position_embeddings=cfg["dim-emb"],
        scale_embedding=True,
        normalize_embedding="n" in cfg["transformer-preprocess"],
        static_position_embeddings=not cfg["transformer-train-position-embeddings"],
        dropout=0.1,  # see opus-mt-train repo/transformer-dropout param.
        # default: add_final_layer_norm=False,
        num_beams=decoder_yml["beam-size"],
    )
def get_config(self):
    """Return a ``MarianConfig`` populated from this tester's attributes.

    The encoder and decoder share the same layer count, head count and
    feed-forward size.
    """
    layer_count = self.num_hidden_layers
    head_count = self.num_attention_heads
    ffn_dim = self.intermediate_size

    config_kwargs = {
        "vocab_size": self.vocab_size,
        "d_model": self.hidden_size,
        "encoder_layers": layer_count,
        "decoder_layers": layer_count,
        "encoder_attention_heads": head_count,
        "decoder_attention_heads": head_count,
        "encoder_ffn_dim": ffn_dim,
        "decoder_ffn_dim": ffn_dim,
        "dropout": self.hidden_dropout_prob,
        "attention_dropout": self.attention_probs_dropout_prob,
        "max_position_embeddings": self.max_position_embeddings,
        "eos_token_id": self.eos_token_id,
        "bos_token_id": self.bos_token_id,
        "pad_token_id": self.pad_token_id,
        "decoder_start_token_id": self.decoder_start_token_id,
    }
    return MarianConfig(**config_kwargs)
def bulk_update_local_configs(models, update_dict=DEFAULT_UPDATE_DICT, save_dir=MIRROR_DIR):
    """Apply ``update_dict`` to each model's config and save it under ``save_dir``.

    For every slug the config is loaded from ``Helsinki-NLP/<slug>``, each
    ``update_dict`` entry is set as an attribute, and the result is written to
    ``save_dir / 'Helsinki-NLP' / slug``.

    Args:
        models: Iterable of model slugs; each must start with ``'opus-mt'``.
        update_dict: Mapping of config attribute name to new value. It is only
            read, never modified.
        save_dir: Root of the local mirror. It must support ``/``,
            ``.exists()`` and ``.mkdir()`` (presumably a ``pathlib.Path``).

    Returns:
        The list of slugs whose config could not be loaded (``OSError``).

    Raises:
        ValueError: If a slug does not start with ``'opus-mt'``.
    """
    failures = []
    for slug in tqdm_nice(models):
        if not slug.startswith('opus-mt'):
            raise ValueError(f"expected an 'opus-mt' slug, got {slug!r}")
        try:
            cfg = MarianConfig.from_pretrained(f'Helsinki-NLP/{slug}')
        except OSError:
            # Best effort: remember the failure and keep going.
            failures.append(slug)
            continue
        for k, v in update_dict.items():
            setattr(cfg, k, v)
        # if a new value depends on a cfg value, add code here
        # e.g. cfg.decoder_start_token_id = cfg.pad_token_id

        dest_dir = save_dir / 'Helsinki-NLP' / slug
        if not dest_dir.exists():
            print(f'making {dest_dir}')
            # parents=True: the intermediate 'Helsinki-NLP' directory may not
            # exist yet on a fresh mirror.
            dest_dir.mkdir(parents=True, exist_ok=True)
        cfg.save_pretrained(dest_dir)
        # Internal sanity check that the saved config round-trips.
        assert cfg.from_pretrained(dest_dir).model_type == 'marian'
    return failures
def __init__(self, source_dir, eos_token_id=0):
    """Load a Marian ``.npz`` checkpoint, its tokenizer, and derive the ``MarianConfig``.

    Supports checkpoints with shared (``Wemb``) or separate
    (``encoder_Wemb`` / ``decoder_Wemb``) embedding matrices.

    Args:
        source_dir: Directory containing the checkpoint and ``decoder.yml``.
            It is joined with ``/``, so it must support that operator
            (presumably a ``pathlib.Path``).
        eos_token_id: Fallback EOS id, used only when the loaded tokenizer
            does not provide a non-``None`` ``eos_token_id``.

    Raises:
        ValueError: If the two ``dim-vocabs`` entries differ, if the
            checkpoint contains a ``Wpos`` or ``Wtype`` key, if the embedding
            vocabulary size does not match the tokenizer's, or if the hidden
            size does not match ``dim-emb``.
    """
    npz_path = find_model_file(source_dir)
    self.state_dict = np.load(npz_path)
    cfg = load_config_from_state_dict(self.state_dict)
    if cfg["dim-vocabs"][0] != cfg["dim-vocabs"][1]:
        raise ValueError(
            f"Source and target vocab sizes differ: {cfg['dim-vocabs']}"
        )
    if "Wpos" in self.state_dict:
        raise ValueError("Wpos key in state dictionary")
    self.state_dict = dict(self.state_dict)

    # "tied-embeddings-all" implies both of the narrower tying flags.
    if cfg["tied-embeddings-all"]:
        cfg["tied-embeddings-src"] = True
        cfg["tied-embeddings"] = True
    self.share_encoder_decoder_embeddings = cfg["tied-embeddings-src"]

    # create the tokenizer here because we need to know the eos_token_id
    self.source_dir = source_dir
    self.tokenizer = self.load_tokenizer()

    # Prefer the tokenizer's EOS id. Otherwise keep the caller-supplied
    # argument (default 0) instead of discarding it.
    tokenizer_eos_token_id = getattr(self.tokenizer, "eos_token_id", None)
    if tokenizer_eos_token_id is not None:
        eos_token_id = tokenizer_eos_token_id

    if cfg["tied-embeddings-src"]:
        self.wemb, self.final_bias = add_emb_entries(
            self.state_dict["Wemb"], self.state_dict[BIAS_KEY], 1
        )
        self.pad_token_id = self.wemb.shape[0] - 1
        cfg["vocab_size"] = self.pad_token_id + 1
    else:
        self.wemb, _ = add_emb_entries(
            self.state_dict["encoder_Wemb"], self.state_dict[BIAS_KEY], 1
        )
        self.dec_wemb, self.final_bias = add_emb_entries(
            self.state_dict["decoder_Wemb"], self.state_dict[BIAS_KEY], 1
        )
        # still assuming that vocab size is same for encoder and decoder
        self.pad_token_id = self.wemb.shape[0] - 1
        cfg["vocab_size"] = self.pad_token_id + 1
        cfg["decoder_vocab_size"] = self.pad_token_id + 1

    if cfg["vocab_size"] != self.tokenizer.vocab_size:
        # Report the same quantity that was compared.
        raise ValueError(
            f"Original vocab size {cfg['vocab_size']} and new vocab size {self.tokenizer.vocab_size} mismatched."
        )

    # self.state_dict['Wemb'].sha
    self.state_keys = list(self.state_dict.keys())
    if "Wtype" in self.state_dict:
        raise ValueError("Wtype key in state dictionary")
    self._check_layer_entries()
    self.cfg = cfg

    # The first FFN weight must be 2-D; only its leading dimension is checked.
    hidden_size, _ = self.state_dict["encoder_l1_ffn_W1"].shape
    if hidden_size != cfg["dim-emb"]:
        # The config key is "dim-emb". Using "dim_emb" here raised KeyError
        # and hid the real error.
        raise ValueError(
            f"Hidden size {hidden_size} and configured size {cfg['dim-emb']} mismatched"
        )

    # Process decoder.yml
    decoder_yml = cast_marian_config(load_yaml(source_dir / "decoder.yml"))
    check_marian_cfg_assumptions(cfg)
    self.hf_config = MarianConfig(
        vocab_size=cfg["vocab_size"],
        decoder_vocab_size=cfg.get("decoder_vocab_size", cfg["vocab_size"]),
        share_encoder_decoder_embeddings=cfg["tied-embeddings-src"],
        decoder_layers=cfg["dec-depth"],
        encoder_layers=cfg["enc-depth"],
        decoder_attention_heads=cfg["transformer-heads"],
        encoder_attention_heads=cfg["transformer-heads"],
        decoder_ffn_dim=cfg["transformer-dim-ffn"],
        encoder_ffn_dim=cfg["transformer-dim-ffn"],
        d_model=cfg["dim-emb"],
        activation_function=cfg["transformer-ffn-activation"],
        pad_token_id=self.pad_token_id,
        eos_token_id=eos_token_id,
        forced_eos_token_id=eos_token_id,
        bos_token_id=0,
        max_position_embeddings=cfg["dim-emb"],
        scale_embedding=True,
        normalize_embedding="n" in cfg["transformer-preprocess"],
        static_position_embeddings=not cfg["transformer-train-position-embeddings"],
        tie_word_embeddings=cfg["tied-embeddings"],
        dropout=0.1,  # see opus-mt-train repo/transformer-dropout param.
        # default: add_final_layer_norm=False,
        num_beams=decoder_yml["beam-size"],
        decoder_start_token_id=self.pad_token_id,
        bad_words_ids=[[self.pad_token_id]],
        max_length=512,
    )
def config(self):
    """Return the ``MarianConfig`` of the ``sshleifer/tiny-marian-en-de`` checkpoint."""
    return MarianConfig.from_pretrained("sshleifer/tiny-marian-en-de")
def main(args):
    """Train a Marian translation model on a parallel CSV and print sample translations.

    Steps: read the two language columns, split into train/test, tokenize,
    drop very short pairs, train a freshly initialised ``MarianMTModel`` from
    the config in ``args.output_dir``, save model and tokenizer to
    ``f"{args.output_dir}_best"``, then translate two sample sentences.

    Args:
        args: Parsed options. Attributes read here: ``input_fname``,
            ``source_lang``, ``target_lang``, ``valid_pct``, ``seed``,
            ``output_dir``, ``max_input_length``, ``max_target_length``,
            ``learning_rate``, ``warmup_ratio``, ``batch_size``,
            ``weight_decay``, ``save_total_limit``, ``num_train_epochs``,
            ``fp16`` and ``metric_tokenize``.
    """
    src_lang, tgt_lang = args.source_lang, args.target_lang

    df = pd.read_csv(args.input_fname, encoding='utf-8')[[src_lang, tgt_lang]]
    logging.info(f'Loaded {df.shape}')

    # convert to dictionary
    pairs = [
        {src_lang: src, tgt_lang: tgt}
        for src, tgt in df.itertuples(index=False, name=None)
    ]
    train_dataset = Dataset.from_dict({'translation': pairs})
    raw_datasets = train_dataset.train_test_split(test_size=args.valid_pct, seed=args.seed)
    logging.info(f'Datasets created {raw_datasets}')

    tokenizer = MarianTokenizer.from_pretrained(args.output_dir)
    logging.info(f'Tokenizer loaded from {args.output_dir}')

    # tokenize datasets
    tokenized_datasets = raw_datasets.map(
        partial(
            preprocess_function,
            tokenizer=tokenizer,
            max_input_length=args.max_input_length,
            max_target_length=args.max_target_length,
            source_lang=src_lang,
            target_lang=tgt_lang,
        ),
        batched=True,
    )
    logging.info(f'Tokenized datasets: {tokenized_datasets}')

    # Filter out pairs where either side is too short. The language keys come
    # from args rather than being hard-coded to 'zh'/'th', so other language
    # pairs work too.
    # NOTE(review): len() is applied to the raw 'translation' entries, which
    # are presumably strings, so this counts characters rather than tokens.
    def both_sides_long_enough(example):
        pair = example['translation']
        return len(pair[src_lang]) > 2 and len(pair[tgt_lang]) > 2

    tokenized_datasets = tokenized_datasets.filter(both_sides_long_enough)
    logging.info(
        f'Tokenized datasets when filtered out less than 2 tokens per sequence: {tokenized_datasets}'
    )

    # The model is built from the config only; no pretrained weights are loaded.
    config = MarianConfig.from_pretrained(args.output_dir)
    model = MarianMTModel(config)
    logging.info(f'Loaded model from {args.output_dir}')

    training_args = Seq2SeqTrainingArguments(
        args.output_dir,
        evaluation_strategy="epoch",
        load_best_model_at_end=True,
        learning_rate=args.learning_rate,
        warmup_ratio=args.warmup_ratio,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        weight_decay=args.weight_decay,
        save_total_limit=args.save_total_limit,
        num_train_epochs=args.num_train_epochs,
        predict_with_generate=True,
        fp16=args.fp16,
        seed=args.seed,
    )
    logging.info(f'Training congig {training_args}')

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
    trainer = Seq2SeqTrainer(
        model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        # `metric` is not defined in this function; it is looked up in the
        # enclosing module scope.
        compute_metrics=partial(
            compute_metrics,
            tokenizer=tokenizer,
            metric=metric,
            metric_tokenize=args.metric_tokenize,
        ),
    )
    logging.info('Trainer created')

    trainer.train()

    model.save_pretrained(f"{args.output_dir}_best")
    tokenizer.save_pretrained(f"{args.output_dir}_best")
    logging.info(f'Best model saved')

    # Smoke test on CPU. The sample sentences are Chinese regardless of the
    # configured source language.
    model.cpu()
    src_text = ['我爱你', '国王有很多心事。我明白']
    translated = model.generate(
        **tokenizer(src_text, return_tensors="pt", padding=True))
    print([tokenizer.decode(t, skip_special_tokens=True) for t in translated])