Example #1
    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)

        attention_mask = None
        if self.use_attention_mask:
            attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)

        lm_labels = None
        if self.use_labels:
            lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)

        config = MarianConfig(
            vocab_size=self.vocab_size,
            d_model=self.d_model,
            decoder_layers=self.decoder_layers,
            decoder_ffn_dim=self.decoder_ffn_dim,
            encoder_attention_heads=self.encoder_attention_heads,
            decoder_attention_heads=self.decoder_attention_heads,
            eos_token_id=self.eos_token_id,
            bos_token_id=self.bos_token_id,
            use_cache=self.use_cache,
            pad_token_id=self.pad_token_id,
            decoder_start_token_id=self.decoder_start_token_id,
            max_position_embeddings=self.max_position_embeddings,
            is_encoder_decoder=self.is_encoder_decoder,
        )

        return (
            config,
            input_ids,
            attention_mask,
            lm_labels,
        )
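
The tuple returned above feeds the decoder-only Marian tests. A minimal, self-contained sketch of that flow, assuming a hypothetical ids_tensor stand-in for the test helper of the same name and illustrative hyperparameters that are not taken from any real checkpoint:

import torch
from transformers import MarianConfig, MarianForCausalLM

def ids_tensor(shape, vocab_size):
    # stand-in for the test helper: random token ids in [0, vocab_size)
    return torch.randint(0, vocab_size, shape, dtype=torch.long)

# illustrative, assumed values only
config = MarianConfig(vocab_size=99, d_model=16, decoder_layers=2, decoder_ffn_dim=32,
                      decoder_attention_heads=4, max_position_embeddings=32,
                      pad_token_id=0, eos_token_id=2, decoder_start_token_id=0)
input_ids = ids_tensor([2, 7], config.vocab_size)
attention_mask = ids_tensor([2, 7], 2)  # random 0/1 mask, as in the example above
lm_labels = ids_tensor([2, 7], config.vocab_size)

model = MarianForCausalLM(config).eval()
outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=lm_labels)
print(outputs.loss, outputs.logits.shape)  # scalar loss, torch.Size([2, 7, 99])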
Example #2
    def prepare_config_and_inputs(self):
        input_ids = np.clip(ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size), 3, self.vocab_size)
        input_ids = np.concatenate((input_ids, 2 * np.ones((self.batch_size, 1), dtype=np.int64)), -1)

        decoder_input_ids = shift_tokens_right(input_ids, 1, 2)

        config = MarianConfig(
            vocab_size=self.vocab_size,
            d_model=self.hidden_size,
            encoder_layers=self.num_hidden_layers,
            decoder_layers=self.num_hidden_layers,
            encoder_attention_heads=self.num_attention_heads,
            decoder_attention_heads=self.num_attention_heads,
            encoder_ffn_dim=self.intermediate_size,
            decoder_ffn_dim=self.intermediate_size,
            dropout=self.hidden_dropout_prob,
            attention_dropout=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings,
            eos_token_id=self.eos_token_id,
            bos_token_id=self.bos_token_id,
            pad_token_id=self.pad_token_id,
            initializer_range=self.initializer_range,
            use_cache=False,
        )
        inputs_dict = prepare_marian_inputs_dict(config, input_ids, decoder_input_ids)
        return config, inputs_dict
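
shift_tokens_right(input_ids, 1, 2) above builds the decoder inputs by shifting the targets one position to the right, with 1 as the pad id and 2 as the decoder start id. A minimal NumPy sketch consistent with that call order (not necessarily the library's exact implementation):

import numpy as np

def shift_tokens_right(input_ids, pad_token_id, decoder_start_token_id):
    # shift every sequence one step to the right and prepend the decoder start token
    shifted = np.zeros_like(input_ids)
    shifted[:, 1:] = input_ids[:, :-1]
    shifted[:, 0] = decoder_start_token_id
    # replace any label-masking value (-100) with the pad token so the ids stay valid
    return np.where(shifted == -100, pad_token_id, shifted)

print(shift_tokens_right(np.array([[5, 6, 7, 2]]), 1, 2))  # [[2 5 6 7]]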
Example #3
    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length],
                               self.vocab_size).clamp(3)
        input_ids[:, -1] = self.eos_token_id  # Eos Token

        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length],
                                       self.vocab_size)

        config = MarianConfig(
            vocab_size=self.vocab_size,
            d_model=self.hidden_size,
            encoder_layers=self.num_hidden_layers,
            decoder_layers=self.num_hidden_layers,
            encoder_attention_heads=self.num_attention_heads,
            decoder_attention_heads=self.num_attention_heads,
            encoder_ffn_dim=self.intermediate_size,
            decoder_ffn_dim=self.intermediate_size,
            dropout=self.hidden_dropout_prob,
            attention_dropout=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings,
            eos_token_id=self.eos_token_id,
            bos_token_id=self.bos_token_id,
            pad_token_id=self.pad_token_id,
            decoder_start_token_id=self.decoder_start_token_id,
        )
        inputs_dict = prepare_marian_inputs_dict(config, input_ids,
                                                 decoder_input_ids)
        return config, inputs_dict
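
prepare_marian_inputs_dict is a local test helper rather than public API; a plausible sketch of what it assembles, assuming the attention masks are derived from config.pad_token_id:

def prepare_marian_inputs_dict(config, input_ids, decoder_input_ids,
                               attention_mask=None, decoder_attention_mask=None):
    # hypothetical sketch: build padding masks from the configured pad token
    if attention_mask is None:
        attention_mask = input_ids.ne(config.pad_token_id)
    if decoder_attention_mask is None:
        decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id)
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "decoder_input_ids": decoder_input_ids,
        "decoder_attention_mask": decoder_attention_mask,
    }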
Example #4
    def __init__(self, source_dir, eos_token_id=0):
        npz_path = find_model_file(source_dir)
        self.state_dict = np.load(npz_path)
        cfg = load_config_from_state_dict(self.state_dict)
        if cfg["dim-vocabs"][0] != cfg["dim-vocabs"][1]:
            raise ValueError("Vocab sizes for source and target do not match")
        if "Wpos" in self.state_dict:
            raise ValueError("Wpos key in state dictionary")
        self.state_dict = dict(self.state_dict)
        self.wemb, self.final_bias = add_emb_entries(self.state_dict["Wemb"],
                                                     self.state_dict[BIAS_KEY],
                                                     1)
        self.pad_token_id = self.wemb.shape[0] - 1
        cfg["vocab_size"] = self.pad_token_id + 1
        # self.state_dict['Wemb'].sha
        self.state_keys = list(self.state_dict.keys())
        if "Wtype" in self.state_dict:
            raise ValueError("Wtype key in state dictionary")
        self._check_layer_entries()
        self.source_dir = source_dir
        self.cfg = cfg
        hidden_size, intermediate_shape = self.state_dict[
            "encoder_l1_ffn_W1"].shape
        if hidden_size != 512 or cfg["dim-emb"] != 512:
            raise ValueError(
                f"Hidden size {hidden_size} and configured size {cfg['dim-emb']} mismatched or not 512"
            )

        # Process decoder.yml
        decoder_yml = cast_marian_config(load_yaml(source_dir / "decoder.yml"))
        check_marian_cfg_assumptions(cfg)
        self.hf_config = MarianConfig(
            vocab_size=cfg["vocab_size"],
            decoder_layers=cfg["dec-depth"],
            encoder_layers=cfg["enc-depth"],
            decoder_attention_heads=cfg["transformer-heads"],
            encoder_attention_heads=cfg["transformer-heads"],
            decoder_ffn_dim=cfg["transformer-dim-ffn"],
            encoder_ffn_dim=cfg["transformer-dim-ffn"],
            d_model=cfg["dim-emb"],
            activation_function=cfg["transformer-aan-activation"],
            pad_token_id=self.pad_token_id,
            eos_token_id=eos_token_id,
            forced_eos_token_id=eos_token_id,
            bos_token_id=0,
            max_position_embeddings=cfg["dim-emb"],
            scale_embedding=True,
            normalize_embedding="n" in cfg["transformer-preprocess"],
            static_position_embeddings=not cfg[
                "transformer-train-position-embeddings"],
            dropout=0.1,  # see opus-mt-train repo/transformer-dropout param.
            # default: add_final_layer_norm=False,
            num_beams=decoder_yml["beam-size"],
            decoder_start_token_id=self.pad_token_id,
            bad_words_ids=[[self.pad_token_id]],
            max_length=512,
        )
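
The __init__ above ends by mapping Marian's npz/YAML metadata onto a MarianConfig. That mapping can be sanity-checked independently of any checkpoint by instantiating a randomly initialized MarianMTModel from a comparable config and inspecting the embedding shape; the values below are illustrative assumptions, not read from a real OPUS model:

from transformers import MarianConfig, MarianMTModel

config = MarianConfig(
    vocab_size=58101,           # cfg["vocab_size"] after appending the pad entry
    d_model=512,                # cfg["dim-emb"]
    encoder_layers=6,           # cfg["enc-depth"]
    decoder_layers=6,           # cfg["dec-depth"]
    encoder_attention_heads=8,  # cfg["transformer-heads"]
    decoder_attention_heads=8,
    encoder_ffn_dim=2048,       # cfg["transformer-dim-ffn"]
    decoder_ffn_dim=2048,
    max_position_embeddings=512,
    pad_token_id=58100,
    eos_token_id=0,
    decoder_start_token_id=58100,
)
model = MarianMTModel(config)
print(model.get_input_embeddings().weight.shape)  # torch.Size([58101, 512])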
Example #5

    def __init__(self, parent):
        self.config = MarianConfig(
            vocab_size=99,
            d_model=24,
            encoder_layers=2,
            decoder_layers=2,
            encoder_attention_heads=2,
            decoder_attention_heads=2,
            encoder_ffn_dim=32,
            decoder_ffn_dim=32,
            max_position_embeddings=48,
            add_final_layer_norm=True,
        )
Example #6
    def __init__(self, source_dir):
        npz_path = find_model_file(source_dir)
        self.state_dict = np.load(npz_path)
        cfg = load_config_from_state_dict(self.state_dict)
        assert cfg["dim-vocabs"][0] == cfg["dim-vocabs"][1]
        assert "Wpos" not in self.state_dict
        self.state_dict = dict(self.state_dict)
        self.wemb, self.final_bias = add_emb_entries(self.state_dict["Wemb"],
                                                     self.state_dict[BIAS_KEY],
                                                     1)
        self.pad_token_id = self.wemb.shape[0] - 1
        cfg["vocab_size"] = self.pad_token_id + 1
        # self.state_dict['Wemb'].sha
        self.state_keys = list(self.state_dict.keys())
        if "Wtype" in self.state_dict:
            raise ValueError("found Wtype key")
        self._check_layer_entries()
        self.source_dir = source_dir
        self.cfg = cfg
        hidden_size, intermediate_shape = self.state_dict[
            "encoder_l1_ffn_W1"].shape
        assert hidden_size == cfg["dim-emb"] == 512

        # Process decoder.yml
        decoder_yml = cast_marian_config(load_yaml(source_dir / "decoder.yml"))
        # TODO: what are normalize and word-penalty?
        check_marian_cfg_assumptions(cfg)
        self.hf_config = MarianConfig(
            vocab_size=cfg["vocab_size"],
            decoder_layers=cfg["dec-depth"],
            encoder_layers=cfg["enc-depth"],
            decoder_attention_heads=cfg["transformer-heads"],
            encoder_attention_heads=cfg["transformer-heads"],
            decoder_ffn_dim=cfg["transformer-dim-ffn"],
            encoder_ffn_dim=cfg["transformer-dim-ffn"],
            d_model=cfg["dim-emb"],
            activation_function=cfg["transformer-aan-activation"],
            pad_token_id=self.pad_token_id,
            eos_token_id=0,
            bos_token_id=0,
            max_position_embeddings=cfg["dim-emb"],
            scale_embedding=True,
            normalize_embedding="n" in cfg["transformer-preprocess"],
            static_position_embeddings=not cfg[
                "transformer-train-position-embeddings"],
            dropout=0.1,  # see opus-mt-train repo/transformer-dropout param.
            # default: add_final_layer_norm=False,
            num_beams=decoder_yml["beam-size"],
        )
Example #7
    def get_config(self):
        return MarianConfig(
            vocab_size=self.vocab_size,
            d_model=self.hidden_size,
            encoder_layers=self.num_hidden_layers,
            decoder_layers=self.num_hidden_layers,
            encoder_attention_heads=self.num_attention_heads,
            decoder_attention_heads=self.num_attention_heads,
            encoder_ffn_dim=self.intermediate_size,
            decoder_ffn_dim=self.intermediate_size,
            dropout=self.hidden_dropout_prob,
            attention_dropout=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings,
            eos_token_id=self.eos_token_id,
            bos_token_id=self.bos_token_id,
            pad_token_id=self.pad_token_id,
            decoder_start_token_id=self.decoder_start_token_id,
        )
Example #8

def bulk_update_local_configs(models,
                              update_dict=DEFAULT_UPDATE_DICT,
                              save_dir=MIRROR_DIR):
    failures = []
    for slug in tqdm_nice(models):
        assert slug.startswith('opus-mt')
        try:
            cfg = MarianConfig.from_pretrained(f'Helsinki-NLP/{slug}')
        except OSError:
            failures.append(slug)
            continue
        for k, v in update_dict.items():
            setattr(cfg, k, v)
        # if a new value depends on a cfg value, add code here
        # e.g. cfg.decoder_start_token_id = cfg.pad_token_id

        dest_dir = (save_dir / 'Helsinki-NLP' / slug)
        if not dest_dir.exists():
            print(f'making {dest_dir}')
            dest_dir.mkdir(exist_ok=True)
        cfg.save_pretrained(dest_dir)
        assert cfg.from_pretrained(dest_dir).model_type == 'marian'
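
A hedged usage sketch for bulk_update_local_configs; the slugs, override dict, and mirror path below are placeholders. The function calls mkdir without parents=True, so the intermediate Helsinki-NLP directory is created up front here:

from pathlib import Path

save_dir = Path("/tmp/marian_mirror")  # placeholder mirror location
(save_dir / "Helsinki-NLP").mkdir(parents=True, exist_ok=True)
bulk_update_local_configs(
    ["opus-mt-en-de", "opus-mt-de-en"],  # example slugs only
    update_dict={"max_length": 512, "num_beams": 6},
    save_dir=save_dir,
)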
Example #9

    def __init__(self, source_dir, eos_token_id=0):
        npz_path = find_model_file(source_dir)
        self.state_dict = np.load(npz_path)
        cfg = load_config_from_state_dict(self.state_dict)
        if cfg["dim-vocabs"][0] != cfg["dim-vocabs"][1]:
            raise ValueError
        if "Wpos" in self.state_dict:
            raise ValueError("Wpos key in state dictionary")
        self.state_dict = dict(self.state_dict)
        if cfg["tied-embeddings-all"]:
            cfg["tied-embeddings-src"] = True
            cfg["tied-embeddings"] = True
        self.share_encoder_decoder_embeddings = cfg["tied-embeddings-src"]

        # create the tokenizer here because we need to know the eos_token_id
        self.source_dir = source_dir
        self.tokenizer = self.load_tokenizer()
        # retrieve EOS token and set correctly
        tokenizer_has_eos_token_id = (hasattr(self.tokenizer, "eos_token_id")
                                      and self.tokenizer.eos_token_id
                                      is not None)
        eos_token_id = self.tokenizer.eos_token_id if tokenizer_has_eos_token_id else 0

        if cfg["tied-embeddings-src"]:
            self.wemb, self.final_bias = add_emb_entries(
                self.state_dict["Wemb"], self.state_dict[BIAS_KEY], 1)
            self.pad_token_id = self.wemb.shape[0] - 1
            cfg["vocab_size"] = self.pad_token_id + 1
        else:
            self.wemb, _ = add_emb_entries(self.state_dict["encoder_Wemb"],
                                           self.state_dict[BIAS_KEY], 1)
            self.dec_wemb, self.final_bias = add_emb_entries(
                self.state_dict["decoder_Wemb"], self.state_dict[BIAS_KEY], 1)
            # still assuming that vocab size is same for encoder and decoder
            self.pad_token_id = self.wemb.shape[0] - 1
            cfg["vocab_size"] = self.pad_token_id + 1
            cfg["decoder_vocab_size"] = self.pad_token_id + 1

        if cfg["vocab_size"] != self.tokenizer.vocab_size:
            raise ValueError(
                f"Original vocab size {cfg['vocab_size']} and new vocab size {self.tokenizer.vocab_size} mismatched."
            )

        # self.state_dict['Wemb'].sha
        self.state_keys = list(self.state_dict.keys())
        if "Wtype" in self.state_dict:
            raise ValueError("Wtype key in state dictionary")
        self._check_layer_entries()
        self.cfg = cfg
        hidden_size, intermediate_shape = self.state_dict[
            "encoder_l1_ffn_W1"].shape
        if hidden_size != cfg["dim-emb"]:
            raise ValueError(
                f"Hidden size {hidden_size} and configured size {cfg['dim-emb']} mismatched"
            )

        # Process decoder.yml
        decoder_yml = cast_marian_config(load_yaml(source_dir / "decoder.yml"))
        check_marian_cfg_assumptions(cfg)
        self.hf_config = MarianConfig(
            vocab_size=cfg["vocab_size"],
            decoder_vocab_size=cfg.get("decoder_vocab_size",
                                       cfg["vocab_size"]),
            share_encoder_decoder_embeddings=cfg["tied-embeddings-src"],
            decoder_layers=cfg["dec-depth"],
            encoder_layers=cfg["enc-depth"],
            decoder_attention_heads=cfg["transformer-heads"],
            encoder_attention_heads=cfg["transformer-heads"],
            decoder_ffn_dim=cfg["transformer-dim-ffn"],
            encoder_ffn_dim=cfg["transformer-dim-ffn"],
            d_model=cfg["dim-emb"],
            activation_function=cfg["transformer-ffn-activation"],
            pad_token_id=self.pad_token_id,
            eos_token_id=eos_token_id,
            forced_eos_token_id=eos_token_id,
            bos_token_id=0,
            max_position_embeddings=cfg["dim-emb"],
            scale_embedding=True,
            normalize_embedding="n" in cfg["transformer-preprocess"],
            static_position_embeddings=not cfg[
                "transformer-train-position-embeddings"],
            tie_word_embeddings=cfg["tied-embeddings"],
            dropout=0.1,  # see opus-mt-train repo/transformer-dropout param.
            # default: add_final_layer_norm=False,
            num_beams=decoder_yml["beam-size"],
            decoder_start_token_id=self.pad_token_id,
            bad_words_ids=[[self.pad_token_id]],
            max_length=512,
        )
Example #10
    def config(self):
        config = MarianConfig.from_pretrained("sshleifer/tiny-marian-en-de")
        return config
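
sshleifer/tiny-marian-en-de is a miniature checkpoint meant for fast tests. A small smoke test built on the fixture above; the model is randomly initialized from the config, so the output is meaningless, but the shapes and API are exercised. The dummy ids are an assumption built only from token ids the config guarantees to exist:

import torch
from transformers import MarianConfig, MarianMTModel

config = MarianConfig.from_pretrained("sshleifer/tiny-marian-en-de")
model = MarianMTModel(config).eval()  # random weights are fine for a shape/smoke test

# dummy two-token source sequence using ids known to be in range
input_ids = torch.tensor([[config.eos_token_id, config.pad_token_id]])
generated = model.generate(input_ids, max_length=8, num_beams=1)
print(generated.shape)  # (1, <= 8)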
Example #11
def main(args):
    df = pd.read_csv(args.input_fname,
                     encoding='utf-8')[[args.source_lang, args.target_lang]]
    logging.info(f'Loaded dataframe with shape {df.shape}')

    # convert the dataframe rows to a list of translation pair dicts
    j = {'translation': []}
    for i in df.itertuples():
        j['translation'] += [{args.source_lang: i[1], args.target_lang: i[2]}]

    train_dataset = Dataset.from_dict(j)
    raw_datasets = train_dataset.train_test_split(test_size=args.valid_pct,
                                                  seed=args.seed)
    logging.info(f'Datasets created {raw_datasets}')

    tokenizer = MarianTokenizer.from_pretrained(args.output_dir)
    logging.info(f'Tokenizer loaded from {args.output_dir}')

    # tokenize datasets
    tokenized_datasets = raw_datasets.map(
        partial(preprocess_function,
                tokenizer=tokenizer,
                max_input_length=args.max_input_length,
                max_target_length=args.max_target_length,
                source_lang=args.source_lang,
                target_lang=args.target_lang),
        batched=True,
    )
    logging.info(f'Tokenized datasets: {tokenized_datasets}')

    # filter out pairs whose source or target string is too short
    tokenized_datasets = tokenized_datasets.filter(
        lambda example: len(example['translation'][args.source_lang]) > 2)
    tokenized_datasets = tokenized_datasets.filter(
        lambda example: len(example['translation'][args.target_lang]) > 2)
    logging.info(
        f'Tokenized datasets after filtering out sequences shorter than 3 characters: {tokenized_datasets}'
    )

    config = MarianConfig.from_pretrained(args.output_dir)
    model = MarianMTModel(config)
    logging.info(f'Initialized model from config in {args.output_dir}')

    training_args = Seq2SeqTrainingArguments(
        args.output_dir,
        evaluation_strategy="epoch",
        load_best_model_at_end=True,
        learning_rate=args.learning_rate,
        warmup_ratio=args.warmup_ratio,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        weight_decay=args.weight_decay,
        save_total_limit=args.save_total_limit,
        num_train_epochs=args.num_train_epochs,
        predict_with_generate=True,
        fp16=args.fp16,
        seed=args.seed,
    )
    logging.info(f'Training config {training_args}')

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
    trainer = Seq2SeqTrainer(
        model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=partial(compute_metrics,
                                tokenizer=tokenizer,
                                metric=metric,
                                metric_tokenize=args.metric_tokenize),
    )
    logging.info('Trainer created')

    trainer.train()

    model.save_pretrained(f"{args.output_dir}_best")
    tokenizer.save_pretrained(f"{args.output_dir}_best")
    logging.info('Best model saved')

    model.cpu()
    src_text = ['我爱你', '国王有很多心事。我明白']
    translated = model.generate(
        **tokenizer(src_text, return_tensors="pt", padding=True))
    print([tokenizer.decode(t, skip_special_tokens=True) for t in translated])
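
The script above relies on a preprocess_function that is passed through functools.partial but not shown. A plausible sketch with the same keyword parameters as the call site; the body is an assumption, and older transformers versions would wrap the target tokenization in tokenizer.as_target_tokenizer() instead of using text_target:

def preprocess_function(examples, tokenizer, max_input_length, max_target_length,
                        source_lang, target_lang):
    # with batched=True, examples['translation'] is a list of {source: ..., target: ...} dicts
    inputs = [pair[source_lang] for pair in examples['translation']]
    targets = [pair[target_lang] for pair in examples['translation']]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs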