Example #1
 def from_pretrained(cls, model_name_or_path, checkpoint_file='model.pt', data_name_or_path='.', bpe='gpt2', **kwargs):
     from fairseq import hub_utils
     x = hub_utils.from_pretrained(
         model_name_or_path,
         checkpoint_file,
         data_name_or_path,
         archive_map=cls.hub_models(),
         bpe=bpe,
         load_checkpoint_heads=True,
         **kwargs,
     )
     return RobertaHubInterface(x['args'], x['task'], x['models'][0])
Example #2
def umberto_wikipedia_uncased(**kwargs):
    from fairseq import hub_utils
    from fairseq.models.roberta.hub_interface import RobertaHubInterface
    x = hub_utils.from_pretrained(
        model_name_or_path='https://mxmdownloads.s3.amazonaws.com/umberto/umberto.wikipedia.uncased.tar.gz',
        checkpoint_file='model.pt',
        data_name_or_path='.',
        bpe='sentencepiece',
        load_checkpoint_heads=True,
        **kwargs,
    )
    return RobertaHubInterface(x['args'], x['task'], x['models'][0])
Example #3
    def load_model(cls, model_name: str, lang: str, **kwargs):
        from fairseq import hub_utils

        ckpt_dir = download_or_load(model_name, lang)
        tok_path = download_or_load(f"tokenizers/bpe32k.{lang}.zip", lang)

        x = hub_utils.from_pretrained(ckpt_dir,
                                      "model.pt",
                                      ckpt_dir,
                                      load_checkpoint_heads=True,
                                      **kwargs)
        return BrainRobertaHubInterface(x["args"], x["task"], x["models"][0],
                                        tok_path)
Example #4
    def __init__(
        self,
        sent_tokenizer,
        device: str,
        ext_model_name: str,
        config,
    ):
        super().__init__(config)
        ckpt_dir = download_or_load(f"bert/{ext_model_name}", config.lang)
        tok_path = download_or_load(
            f"tokenizers/bpe32k.{config.lang}.zip",
            config.lang,
        )

        x = hub_utils.from_pretrained(
            ckpt_dir,
            "model.pt",
            load_checkpoint_heads=True,
        )

        wrapper = BrainRobertaHubInterface(
            x["args"],
            x["task"],
            x["models"][0],
            tok_path,
        )

        clf_dict = torch.load(
            f"{ckpt_dir}/classifier.pt",
            map_location=device,
        )

        classifier_size = 768 if "base" in config.n_model else 1024

        self._device = device
        self._classifier = nn.Linear(classifier_size, 1).to(device).eval()
        self._classifier.load_state_dict(clf_dict)
        self._model = wrapper.model.encoder.sentence_encoder.to(device).eval()

        if "cuda" in device.type:
            self._model = self._model.half()
            self._classifier = self._classifier.half()

        self._tokenizer = BertSumTokenizer(
            bpe=wrapper.bpe,
            dictionary=wrapper.task.source_dictionary,
            sent_tokenizer=sent_tokenizer,
        )
Example #5
    def load_model(cls, model_name: str, lang: str, **kwargs):
        """
        Load pre-trained model as RobertaHubInterface.
        :param model_name: model name from available_models
        :return: pre-trained model
        """
        from fairseq import hub_utils

        ckpt_dir = download_or_load(model_name, lang)
        x = hub_utils.from_pretrained(
            ckpt_dir,
            "model.pt",
            load_checkpoint_heads=True,
            **kwargs,
        )
        return JabertaHubInterface(x["args"], x["task"], x["models"][0])
Example #6
 def _load_model(self, path: str, bpe: str, bpe_filename:str) -> RobertaHubInterface:
     if path == "xlmr.large" or path == "xlmr.base":
         return hub.load("pytorch/fairseq", path, force_reload=True)
     else:
         checkpoint_file = "model.pt" if os.path.exists(os.path.join(path, "model.pt")) else "checkpoint_best.pt"
         loaded = hub_utils.from_pretrained(
             model_name_or_path=path,
             checkpoint_file=checkpoint_file,
             data_name_or_path=path,
             bpe=bpe,
             sentencepiece_vocab=os.path.join(path, bpe_filename),
             sentencepiece_model=os.path.join(path, bpe_filename),
             load_checkpoint_heads=True,
             archive_map=RobertaModel.hub_models(),
             cpu=False
         )
         return RobertaHubInterface(loaded['args'], loaded['task'], loaded['models'][0])
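For reference, the first branch above is the standard torch.hub path. A minimal usage sketch, assuming network access to the public xlmr.base checkpoint (the input sentence is a placeholder):

import torch

# downloads and caches the public XLM-R base checkpoint via torch.hub
xlmr = torch.hub.load("pytorch/fairseq", "xlmr.base")
xlmr.eval()

tokens = xlmr.encode("Hello world!")       # sentencepiece tokenization -> tensor of token ids
features = xlmr.extract_features(tokens)   # last-layer features, shape (1, seq_len, 768)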
Example #7
 def from_pretrained(
         cls,
         model_name_or_path,
         checkpoint_file="model.pt",
         data_name_or_path=".",
         config_yaml="config.yaml",
         **kwargs,
 ):
     from fairseq import hub_utils
     x = hub_utils.from_pretrained(
         model_name_or_path,
         checkpoint_file,
         data_name_or_path,
         archive_map=cls.hub_models(),
         config_yaml=config_yaml,
         **kwargs,
     )
     return S2THubInterface(x["args"], x["task"], x["models"][0])
Example #8
 def from_pretrained(
     cls,
     model_name_or_path,
     checkpoint_file='model.pt',
     data_name_or_path='.',
     bpe='bert',
     **kwargs,
 ):
     x = hub_utils.from_pretrained(
         model_name_or_path,
         checkpoint_file,
         data_name_or_path,
         archive_map=cls.hub_models(),
         bpe=bpe,
         load_checkpoint_heads=True,
         **kwargs,
     )
     return ProphetNetHubInterface(x['args'], x['task'], x['models'][0])
Example #9
    def from_pretrained(cls,
                        model_name_or_path,
                        checkpoint_file="model.pt",
                        data_name_or_path=".",
                        bpe="sentencepiece",
                        **kwargs):
        from fairseq import hub_utils

        x = hub_utils.from_pretrained(
            model_name_or_path,
            checkpoint_file,
            data_name_or_path,
            archive_map=cls.hub_models(),
            bpe=bpe,
            load_checkpoint_heads=True,
            **kwargs,
        )
        return RobertaHubInterface(x["args"], x["task"], x["models"][0])
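These wrappers all return a RobertaHubInterface, and the stock fairseq RobertaModel.from_pretrained follows the same pattern. A minimal sketch, assuming a local checkpoint directory (all paths are placeholders):

from fairseq.models.roberta import RobertaModel

roberta = RobertaModel.from_pretrained(
    "/path/to/checkpoints",                  # placeholder: directory containing model.pt
    checkpoint_file="model.pt",
    data_name_or_path="/path/to/data-bin",   # placeholder: fairseq dictionary location
)
roberta.eval()

tokens = roberta.encode("Hello world!")
features = roberta.extract_features(tokens)                          # (1, seq_len, hidden_dim)
top3 = roberta.fill_mask("The capital of France is <mask>.", topk=3)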
Example #10
    def from_pretrained(
        cls,
        model_name_or_path,
        checkpoint_file="model.pt",
        data_name_or_path=".",
        **kwargs,
    ):
        """
        Load a :class:`~fairseq.models.FairseqModel` from a pre-trained model
        file. Downloads and caches the pre-trained model file if needed.

        The base implementation returns a
        :class:`~fairseq.hub_utils.GeneratorHubInterface`, which can be used to
        generate translations or sample from language models. The underlying
        :class:`~fairseq.models.FairseqModel` can be accessed via the
        *generator.models* attribute.

        Other models may override this to implement custom hub interfaces.

        Args:
            model_name_or_path (str): either the name of a pre-trained model to
                load or a path/URL to a pre-trained model state dict
            checkpoint_file (str, optional): colon-separated list of checkpoint
                files in the model archive to ensemble (default: 'model.pt')
            data_name_or_path (str, optional): point args.data to the archive
                at the given path/URL. Can start with '.' or './' to reuse the
                model archive path.
        """
        from fairseq import hub_utils

        x = hub_utils.from_pretrained(
            model_name_or_path,
            checkpoint_file,
            data_name_or_path,
            archive_map=cls.hub_models(),
            **kwargs,
        )

        cls.upgrade_args(x["args"])

        logger.info(x["args"])
        return hub_utils.GeneratorHubInterface(x["args"], x["task"],
                                               x["models"])
Example #11
 def evaluate_task(self):
     checkpoints_output_dir = os.path.join("checkpoints", self.model_name,
                                           self.task.spec().output_path())
     checkpoint_file = "checkpoint_last.pt" if self.task.spec(
     ).no_dev_set else "checkpoint_best.pt"
     loaded = hub_utils.from_pretrained(
         model_name_or_path=checkpoints_output_dir,
         checkpoint_file=checkpoint_file,
         data_name_or_path=self.task_output_dir,
         bpe="sentencepiece",
         sentencepiece_vocab=os.path.join(self.model_dir,
                                          "sentencepiece.bpe.model"),
         load_checkpoint_heads=True,
         archive_map=RobertaModel.hub_models())
     roberta = RobertaHubInterface(loaded['args'], loaded['task'],
                                   loaded['models'][0])
     evaluator = TaskEvaluator(self.task, self.task_id, roberta,
                               self.input_dir, checkpoints_output_dir)
     return evaluator.evaluate()
Example #12
    def from_pretrained(
        cls,
        model_name_or_path,
        checkpoint_file="model.pt",
        data_name_or_path=".",
        bpe="gpt2",
        **kwargs,
    ):
        from fairseq import hub_utils

        x = hub_utils.from_pretrained(
            model_name_or_path,
            checkpoint_file,
            data_name_or_path,
            archive_map=cls.hub_models(),
            bpe=bpe,
            load_checkpoint_heads=True,
            **kwargs,
        )
        return cls(x["args"], x["task"])
Example #13
 def evaluate_task(self):
     checkpoints_output_dir = os.path.join("checkpoints", self.model_name, self.task.spec().output_path())
     checkpoint_file = "checkpoint_last.pt" if self.task.spec().no_dev_set else "checkpoint_best.pt"
     model_classes = {"roberta": (RobertaModel, RobertaHubInterface), "bart": (BARTModel, CustomBARTHubInterface)}
     arch_type = self.arch.split("_")[0]
     model_class = model_classes[arch_type][0]
     spm_path = os.path.join(self.model_dir, "sentencepiece.bpe.model")
     loaded = hub_utils.from_pretrained(
         model_name_or_path=checkpoints_output_dir,
         checkpoint_file=checkpoint_file,
         data_name_or_path=self.task_output_dir,
         bpe="sentencepiece",
         sentencepiece_model=spm_path,
         sentencepiece_vocab=spm_path,
         load_checkpoint_heads=True,
         archive_map=model_class.hub_models()
     )
     model_interface = model_classes[arch_type][1]
     model = model_interface(loaded['args'], loaded['task'], loaded['models'][0])
     evaluator = TaskEvaluator(self.task, self.task_id, model, self.input_dir, checkpoints_output_dir)
     return evaluator.evaluate()
Example #14
 def from_pretrained(
     cls,
     model_name_or_path,
     checkpoint_file="model.pt",
     data_name_or_path=".",
     config_yaml="config.yaml",
     vocoder: str = "griffin_lim",
     fp16: bool = False,
     **kwargs,
 ):
     from fairseq import hub_utils
     x = hub_utils.from_pretrained(
         model_name_or_path,
         checkpoint_file,
         data_name_or_path,
         archive_map=cls.hub_models(),
         config_yaml=config_yaml,
         vocoder=vocoder,
         fp16=fp16,
         **kwargs,
     )
     return TTSHubInterface(x["args"], x["task"], x["models"][0])
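The returned TTSHubInterface also exposes static helpers for inference. A minimal sketch based on the publicly documented usage, assuming the facebook/fastspeech2-en-ljspeech checkpoint is fetched from the Hugging Face hub:

from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface

models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
    "facebook/fastspeech2-en-ljspeech",
    arg_overrides={"vocoder": "hifigan", "fp16": False},
)
model = models[0]
TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
generator = task.build_generator([model], cfg)

sample = TTSHubInterface.get_model_input(task, "Hello, this is a test run.")
wav, rate = TTSHubInterface.get_prediction(task, model, generator, sample)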
Example #15
    def from_pretrained(
        cls,
        model_name_or_path,
        checkpoint_file="model.pt",
        data_name_or_path=".",
        bpe="gpt2",
        sample_break_mode="eos",
        **kwargs,
    ):
        from fairseq import hub_utils

        x = hub_utils.from_pretrained(
            model_name_or_path,
            checkpoint_file,
            data_name_or_path,
            archive_map=cls.hub_models(),
            bpe=bpe,
            load_checkpoint_heads=True,
            sample_break_mode=sample_break_mode,
            **kwargs,
        )
        return BARTHubInterface(x["args"], x["task"], x["models"][0])
Example #16
 def from_pretrained(
     cls,
     model_name_or_path,
     checkpoint_file='model.pt',
     data_name_or_path='.',
     bpe='gpt2',
     **kwargs,
 ):
     from fairseq import hub_utils
     x = hub_utils.from_pretrained(
         model_name_or_path,
         checkpoint_file,
         data_name_or_path,
         archive_map=cls.hub_models(),
         bpe=bpe,
         load_checkpoint_heads=True,
         **kwargs,
     )
     if 'mbart' in model_name_or_path:
         x['args'].sentencepiece_vocab = os.path.join(
             model_name_or_path, 'sentence.bpe.model')
     return BARTHubInterface(x['args'], x['task'], x['models'][0])
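A minimal sketch of exercising the returned BARTHubInterface, assuming the public bart.large.cnn summarization checkpoint via torch.hub (the document text is a placeholder):

import torch

bart = torch.hub.load("pytorch/fairseq", "bart.large.cnn")
bart.eval()

document = "Scientists announced a new result today. ..."   # placeholder input
summaries = bart.sample(
    [document],
    beam=4, lenpen=2.0, max_len_b=140, min_len=55, no_repeat_ngram_size=3,
)
print(summaries[0])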
Example #17
    def load_model(cls, model_name: str, lang: str, **kwargs):
        """
        Load pre-trained model as RobertaHubInterface.
        :param model_name: model name from available_models
        :return: pre-trained model
        """
        from fairseq import hub_utils

        # cache directory is treated as the home directory for both model and data files
        ckpt_dir = download_or_load(model_name, lang)
        x = hub_utils.from_pretrained(
            ckpt_dir,
            "model.pt",
            ckpt_dir,
            load_checkpoint_heads=True,
            **kwargs,
        )
        return SegmentBertHubInterface(
            x["args"],
            x["task"],
            x["models"][0],
            lang,
        )
Example #18
    def from_pretrained(
        cls,
        model_name_or_path,
        sentencepiece_model="spm_256000.model",
        checkpoint_file="model.pt",
        data_name_or_path=".",
        bpe="sentencepiece",
        layernorm_embedding=True,
        **kwargs,
    ):
        from fairseq import hub_utils

        x = hub_utils.from_pretrained(
            model_name_or_path,
            checkpoint_file,
            data_name_or_path,
            archive_map=cls.hub_models(),
            bpe=bpe,
            load_checkpoint_heads=True,
            sentencepiece_model=os.path.join(model_name_or_path, sentencepiece_model),
            **kwargs,
        )
        return GENREHubInterface(x["args"], x["task"], x["models"][0])
Example #19
    def initialize(self, ctx):
        self.manifest = ctx.manifest

        properties = ctx.system_properties
        model_dir = properties.get("model_dir")
        self.device = torch.device("cuda:" +
                                   str(properties.get("gpu_id")) if torch.cuda.
                                   is_available() else "cpu")
        logger.debug('Will load from {0}'.format(model_dir))
        # Read model serialize/pt file
        x = hub_utils.from_pretrained(model_dir,
                                      "model.pt",
                                      DATA_PATH,
                                      load_checkpoint_heads=True)
        model_interface = BrainRobertaHubInterface(
            x["args"],
            x["task"],
            x["models"][0],
            model_dir,
        ).to(self.device)

        tagger = mecab.MeCab()

        self.model = PororoBertMrc(
            model_interface, tagger, postprocess_span,
            TaskConfig("mrc", "ko", "brainbert.base.ko.korquad"))

        # Read the mapping file, index to object name
        # mapping_file_path = os.path.join(model_dir, "index_to_name.json")

        # if os.path.isfile(mapping_file_path):
        #     with open(mapping_file_path) as f:
        #         self.mapping = json.load(f)
        # else:
        #     logger.warning('Missing the index_to_name.json file. Inference output will not include class name.')

        self.initialized = True
Example #20
    def from_pretrained(
        cls,
        model_path: str,
        sentencepiece_prefix: str,
        dictionary_path: str,
    ):
        x = hub_utils.from_pretrained("./",
                                      checkpoint_file=model_path,
                                      archive_map={},
                                      data_name_or_path=dictionary_path,
                                      task="multilingual_translation")

        sp_models = {
            lang: SentencePieceProcessor(
                model_file=f"{sentencepiece_prefix}.{lang}.model")
            for lang in x["task"].langs
        }

        return cls(
            models=x["models"],
            task=x["task"],
            cfg=x["args"],
            sp_models=sp_models,
        )
Example #21
    def from_pretrained(cls,
                        model_name_or_path,
                        checkpoint_file="model.pt",
                        data_name_or_path=".",
                        bpe="hf_byte_bpe",
                        bpe_vocab="vocab.json",
                        bpe_merges="merges.txt",
                        bpe_add_prefix_space=False,
                        **kwargs):
        from fairseq import hub_utils

        x = hub_utils.from_pretrained(
            model_name_or_path,
            checkpoint_file,
            data_name_or_path,
            archive_map=cls.hub_models(),
            bpe=bpe,
            load_checkpoint_heads=True,
            bpe_vocab=bpe_vocab,
            bpe_merges=bpe_merges,
            bpe_add_prefix_space=bpe_add_prefix_space,
            **kwargs,
        )
        return RobertaHubInterface(x["args"], x["task"], x["models"][0])
Example #22
    def from_pretrained(cls,
                        model_name_or_path,
                        checkpoint_file='model.pt',
                        data_name_or_path='.',
                        bpe='hf_byte_bpe',
                        bpe_vocab='vocab.json',
                        bpe_merges='merges.txt',
                        bpe_add_prefix_space=False,
                        **kwargs):
        from fairseq import hub_utils

        x = hub_utils.from_pretrained(
            model_name_or_path,
            checkpoint_file,
            data_name_or_path,
            archive_map=cls.hub_models(),
            bpe=bpe,
            load_checkpoint_heads=True,
            bpe_vocab=bpe_vocab,
            bpe_merges=bpe_merges,
            bpe_add_prefix_space=bpe_add_prefix_space,
            **kwargs,
        )
        return RobertaHubInterface(x['args'], x['task'], x['models'][0])
Example #23
from fairseq import hub_utils  # needed for hub_utils.from_pretrained below
from fairseq.models.roberta import RobertaHubInterface, RobertaModel
import mecab
from pororo.models.brainbert.BrainRoBERTa import BrainRobertaHubInterface
from pororo.tasks.machine_reading_comprehension import PororoBertMrc
from pororo.tasks.utils.base import TaskConfig
from pororo.tasks.utils.download_utils import download_or_load
from pororo.tasks.utils.tokenizer import CustomTokenizer
from pororo.utils import postprocess_span
import torch

ckpt_dir = download_or_load("bert/brainbert.base.ko.korquad", "ko")
tok_path = download_or_load(f"tokenizers/bpe32k.ko.zip", "ko")

x = hub_utils.from_pretrained(
    ckpt_dir,
    "model.pt",
    ckpt_dir,
    load_checkpoint_heads=True
)
model = BrainRobertaHubInterface(
    x["args"],
    x["task"],
    x["models"][0],
    tok_path,
).to(torch.device("cuda"))

tagger = mecab.MeCab()
final = PororoBertMrc(model, tagger, postprocess_span, TaskConfig("mrc", "ko", "brainbert.base.ko.korquad"))

print(final("이름이 뭐야?", "이름은 시리야."))
Example #24
def to_pytorch(fsmt_checkpoint_path, save_path):
    assert os.path.exists(fsmt_checkpoint_path)
    os.makedirs(save_path, exist_ok=True)
    print(f"Writing results to {save_path}")
    checkpoint_file = basename(fsmt_checkpoint_path)
    fsmt_folder_path = dirname(fsmt_checkpoint_path)
    cls = fairseq.model_parallel.models.transformer.ModelParallelTransformerModel
    models = cls.hub_models()
    kw = {"bpe": "fastbpe", "tokenizer": "moses"}
    data_name_or_path = "."
    print(f"using checkpoint {checkpoint_file}")
    chkpt = hub_utils.from_pretrained(fsmt_folder_path,
                                      checkpoint_file,
                                      data_name_or_path,
                                      archive_map=models,
                                      **kw)
    args = vars(chkpt["args"]["model"])
    src_lang = args["source_lang"]
    tgt_lang = args["target_lang"]
    data_root = dirname(save_path)
    model_dir = basename(save_path)
    src_dict_file = os.path.join(fsmt_folder_path, f"dict.{src_lang}.txt")
    tgt_dict_file = os.path.join(fsmt_folder_path, f"dict.{tgt_lang}.txt")
    src_dict = Dictionary.load(src_dict_file)
    src_vocab = rewrite_dict_keys(src_dict.indices)
    s_src_vocab = len(src_vocab)
    src_vocab_file = os.path.join(save_path, "vocab-src.json")
    print(
        f"Generating {src_vocab_file} of {s_src_vocab} of {src_lang} records")
    with open(src_vocab_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(src_vocab, ensure_ascii=False, indent=json_indent))
    do_lower_case = True
    for k in src_vocab.keys():
        if not k.islower():
            do_lower_case = False
            break
    tgt_dict = Dictionary.load(tgt_dict_file)
    tgt_vocab = rewrite_dict_keys(tgt_dict.indices)
    s_tgt_vocab = len(tgt_vocab)
    tgt_vocab_file = os.path.join(save_path, "vocab-tgt.json")
    print(
        f"Generating {tgt_vocab_file} of {s_tgt_vocab} of {tgt_lang} records")
    with open(tgt_vocab_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(tgt_vocab, ensure_ascii=False, indent=json_indent))
    merges_file = os.path.join(save_path, VOCAB_FS["merges_file"])
    for fn in ["bpecodes",
               "code"]:  # older fairseq called the merges file "code"
        fsmt_merges_file = os.path.join(fsmt_folder_path, fn)
        if os.path.exists(fsmt_merges_file):
            break
    with open(fsmt_merges_file, encoding="utf-8") as fin:
        merges = fin.read()
    merges = re.sub(r" \d+$", "", merges, 0, re.M)  # remove frequency number
    print(f"Generating {merges_file}")
    with open(merges_file, "w", encoding="utf-8") as fout:
        fout.write(merges)
    fsmt_model_config_file = os.path.join(save_path, "config.json")
    assert args[
        "bpe"] == "fastbpe", f"need to extend tokenizer to support bpe={args['bpe']}"
    assert (args["tokenizer"] == "moses"
            ), f"need to extend tokenizer to support bpe={args['tokenizer']}"

    model_conf = {
        "archs": ["FSMTForConditionalGeneration"],
        "model_type": "fsmt",
        "drop_act": args["drop_act"],
        "act_fun": "relu",
        "drop_attn": args["drop_attn"],
        "d_hidden": args["decoder_embed_dim"],
        "drop": args["drop"],
        "init_std": 0.02,
        "n_pos": args["max_source_positions"],
        "n_lays": args["n_enc_lays"],
        "s_src_vocab": s_src_vocab,
        "s_tgt_vocab": s_tgt_vocab,
        "langs": [src_lang, tgt_lang],
        "n_enc_heads": args["n_enc_heads"],
        "d_enc_ffn": args["encoder_ffn_embed_dim"],
        "drop_enc": args["drop_enc"],
        "n_enc_lays": args["n_enc_lays"],
        "n_dec_heads": args["n_dec_heads"],
        "d_dec_ffn": args["decoder_ffn_embed_dim"],
        "drop_dec": args["drop_dec"],
        "n_dec_lays": args["n_dec_lays"],
        "BOS": 0,
        "PAD": 1,
        "EOS": 2,
        "is_enc_dec": True,
        "scale": not args["no_scale_embedding"],
        "tie_word_embeds": args["share_all_embeddings"],
    }
    model_conf["n_beams"] = 5
    model_conf["early_stop"] = False
    if model_dir in best_score_hparams and "len_penalty" in best_score_hparams[
            model_dir]:
        model_conf["len_penalty"] = best_score_hparams[model_dir][
            "len_penalty"]
    else:
        model_conf["len_penalty"] = 1.0
    print(f"Generating {fsmt_model_config_file}")
    with open(fsmt_model_config_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(model_conf, ensure_ascii=False, indent=json_indent))
    fsmt_tokenizer_config_file = os.path.join(save_path, TOKENIZER_CONFIG_FILE)
    tokenizer_conf = {
        "langs": [src_lang, tgt_lang],
        "model_max_length": 1024,
        "do_lower_case": do_lower_case,
    }
    print(f"Generating {fsmt_tokenizer_config_file}")
    with open(fsmt_tokenizer_config_file, "w", encoding="utf-8") as f:
        f.write(
            json.dumps(tokenizer_conf, ensure_ascii=False, indent=json_indent))
    model = chkpt["models"][0]
    model_state_dict = model.state_dict()
    model_state_dict = OrderedDict(
        ("model." + k, v) for k, v in model_state_dict.items())
    ignore_keys = [
        "model.model",
        "model.encoder.version",
        "model.decoder.version",
        "model.encoder_embed_tokens.weight",
        "model.decoder_embed_tokens.weight",
        "model.encoder.embed_positions._float_tensor",
        "model.decoder.embed_positions._float_tensor",
    ]
    for k in ignore_keys:
        model_state_dict.pop(k, None)
    config = PreTrained.from_pretrained(save_path)
    model_new = ForConditionalGen(config)
    model_new.load_state_dict(model_state_dict, strict=False)
    pytorch_weights_dump_path = os.path.join(save_path, WEIGHTS_NAME)
    print(f"Generating {pytorch_weights_dump_path}")
    torch.save(model_state_dict, pytorch_weights_dump_path)
    print("Conversion is done!")
    print("\nLast step is to upload the files to s3")
    print(f"cd {data_root}")
    print(f"transformers-cli upload {model_dir}")
Example #25
from fairseq import hub_utils
from fairseq.models.roberta import RobertaHubInterface, RobertaModel

import os
from tqdm import tqdm

model_path = "polish_roberta_large_no_finetune"
loaded = hub_utils.from_pretrained(model_name_or_path=model_path,
                                   data_name_or_path=model_path,
                                   bpe="sentencepiece",
                                   sentencepiece_vocab=os.path.join(
                                       model_path, "sentencepiece.bpe.model"),
                                   load_checkpoint_heads=True,
                                   archive_map=RobertaModel.hub_models(),
                                   cpu=False)
roberta = RobertaHubInterface(loaded['args'], loaded['task'],
                              loaded['models'][0])

roberta.eval()
roberta.cuda()

preds = roberta.fill_mask('Ala <mask>, kota', topk=3)
#import pdb; pdb.set_trace()


def predict(f_in_path, f_out_path):
    f_in = open(f_in_path, 'r', newline='\n')
    f_out = open(f_out_path, 'w', newline='\n')

    for line in tqdm(f_in, total=19986):
Example #26
def convert_fsmt_checkpoint_to_pytorch(fsmt_checkpoint_path, pytorch_dump_folder_path):

    # prep
    assert os.path.exists(fsmt_checkpoint_path)
    os.makedirs(pytorch_dump_folder_path, exist_ok=True)
    print(f"Writing results to {pytorch_dump_folder_path}")

    # handle various types of models

    checkpoint_file = basename(fsmt_checkpoint_path)
    fsmt_folder_path = dirname(fsmt_checkpoint_path)

    cls = fairseq.model_parallel.models.transformer.ModelParallelTransformerModel
    models = cls.hub_models()
    kwargs = {"bpe": "fastbpe", "tokenizer": "moses"}
    data_name_or_path = "."
    # note: since the model dump is old, fairseq has upgraded its model some
    # time later, and it does a whole lot of rewrites and splits on the saved
    # weights, therefore we can't use torch.load() directly on the model file.
    # see: upgrade_state_dict(state_dict) in fairseq_model.py
    print(f"using checkpoint {checkpoint_file}")
    chkpt = hub_utils.from_pretrained(
        fsmt_folder_path, checkpoint_file, data_name_or_path, archive_map=models, **kwargs
    )

    args = vars(chkpt["args"]["model"])

    src_lang = args["source_lang"]
    tgt_lang = args["target_lang"]

    data_root = dirname(pytorch_dump_folder_path)
    model_dir = basename(pytorch_dump_folder_path)

    # dicts
    src_dict_file = os.path.join(fsmt_folder_path, f"dict.{src_lang}.txt")
    tgt_dict_file = os.path.join(fsmt_folder_path, f"dict.{tgt_lang}.txt")

    src_dict = Dictionary.load(src_dict_file)
    src_vocab = rewrite_dict_keys(src_dict.indices)
    src_vocab_size = len(src_vocab)
    src_vocab_file = os.path.join(pytorch_dump_folder_path, "vocab-src.json")
    print(f"Generating {src_vocab_file} of {src_vocab_size} of {src_lang} records")
    with open(src_vocab_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(src_vocab, ensure_ascii=False, indent=json_indent))

    # detect whether this is a do_lower_case situation, which can be derived by checking whether we
    # have at least one upcase letter in the source vocab
    do_lower_case = True
    for k in src_vocab.keys():
        if not k.islower():
            do_lower_case = False
            break

    tgt_dict = Dictionary.load(tgt_dict_file)
    tgt_vocab = rewrite_dict_keys(tgt_dict.indices)
    tgt_vocab_size = len(tgt_vocab)
    tgt_vocab_file = os.path.join(pytorch_dump_folder_path, "vocab-tgt.json")
    print(f"Generating {tgt_vocab_file} of {tgt_vocab_size} of {tgt_lang} records")
    with open(tgt_vocab_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(tgt_vocab, ensure_ascii=False, indent=json_indent))

    # merges_file (bpecodes)
    merges_file = os.path.join(pytorch_dump_folder_path, VOCAB_FILES_NAMES["merges_file"])
    for fn in ["bpecodes", "code"]:  # older fairseq called the merges file "code"
        fsmt_merges_file = os.path.join(fsmt_folder_path, fn)
        if os.path.exists(fsmt_merges_file):
            break
    with open(fsmt_merges_file, encoding="utf-8") as fin:
        merges = fin.read()
    merges = re.sub(r" \d+$", "", merges, 0, re.M)  # remove frequency number
    print(f"Generating {merges_file}")
    with open(merges_file, "w", encoding="utf-8") as fout:
        fout.write(merges)

    # model config
    fsmt_model_config_file = os.path.join(pytorch_dump_folder_path, "config.json")

    # validate bpe/tokenizer config, as currently it's hardcoded to moses+fastbpe -
    # may have to modify the tokenizer if a different type is used by a future model
    assert args["bpe"] == "fastbpe", f"need to extend tokenizer to support bpe={args['bpe']}"
    assert args["tokenizer"] == "moses", f"need to extend tokenizer to support bpe={args['tokenizer']}"

    model_conf = {
        "architectures": ["FSMTForConditionalGeneration"],
        "model_type": "fsmt",
        "activation_dropout": args["activation_dropout"],
        "activation_function": "relu",
        "attention_dropout": args["attention_dropout"],
        "d_model": args["decoder_embed_dim"],
        "dropout": args["dropout"],
        "init_std": 0.02,
        "max_position_embeddings": args["max_source_positions"],
        "num_hidden_layers": args["encoder_layers"],
        "src_vocab_size": src_vocab_size,
        "tgt_vocab_size": tgt_vocab_size,
        "langs": [src_lang, tgt_lang],
        "encoder_attention_heads": args["encoder_attention_heads"],
        "encoder_ffn_dim": args["encoder_ffn_embed_dim"],
        "encoder_layerdrop": args["encoder_layerdrop"],
        "encoder_layers": args["encoder_layers"],
        "decoder_attention_heads": args["decoder_attention_heads"],
        "decoder_ffn_dim": args["decoder_ffn_embed_dim"],
        "decoder_layerdrop": args["decoder_layerdrop"],
        "decoder_layers": args["decoder_layers"],
        "bos_token_id": 0,
        "pad_token_id": 1,
        "eos_token_id": 2,
        "is_encoder_decoder": True,
        "scale_embedding": not args["no_scale_embedding"],
        "tie_word_embeddings": args["share_all_embeddings"],
    }

    # good hparam defaults to start with
    model_conf["num_beams"] = 5
    model_conf["early_stopping"] = False
    if model_dir in best_score_hparams and "length_penalty" in best_score_hparams[model_dir]:
        model_conf["length_penalty"] = best_score_hparams[model_dir]["length_penalty"]
    else:
        model_conf["length_penalty"] = 1.0

    print(f"Generating {fsmt_model_config_file}")
    with open(fsmt_model_config_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(model_conf, ensure_ascii=False, indent=json_indent))

    # tokenizer config
    fsmt_tokenizer_config_file = os.path.join(pytorch_dump_folder_path, TOKENIZER_CONFIG_FILE)

    tokenizer_conf = {
        "langs": [src_lang, tgt_lang],
        "model_max_length": 1024,
        "do_lower_case": do_lower_case,
    }

    print(f"Generating {fsmt_tokenizer_config_file}")
    with open(fsmt_tokenizer_config_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(tokenizer_conf, ensure_ascii=False, indent=json_indent))

    # model
    model = chkpt["models"][0]
    model_state_dict = model.state_dict()

    # rename keys to start with 'model.'
    model_state_dict = OrderedDict(("model." + k, v) for k, v in model_state_dict.items())

    # remove unneeded keys
    ignore_keys = [
        "model.model",
        "model.encoder.version",
        "model.decoder.version",
        "model.encoder_embed_tokens.weight",
        "model.decoder_embed_tokens.weight",
        "model.encoder.embed_positions._float_tensor",
        "model.decoder.embed_positions._float_tensor",
    ]
    for k in ignore_keys:
        model_state_dict.pop(k, None)

    config = FSMTConfig.from_pretrained(pytorch_dump_folder_path)
    model_new = FSMTForConditionalGeneration(config)

    # check that it loads ok
    model_new.load_state_dict(model_state_dict, strict=False)

    # save
    pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
    print(f"Generating {pytorch_weights_dump_path}")
    torch.save(model_state_dict, pytorch_weights_dump_path)

    print("Conversion is done!")
    print("\nLast step is to upload the files to s3")
    print(f"cd {data_root}")
    print(f"transformers-cli upload {model_dir}")
Example #27
 def from_pretrained(path, id2label, **kwargs):
     x = hub_utils.from_pretrained(path)
     return RobertaClassifier(x['args'], x['task'], x['models'][0], path,
                              id2label)
model_path = os.path.join(root_path,"checkpoints/")
checkpoint_file = "checkpoint77.pt"
checkpoint_file = "checkpoint94.pt"
checkpoint_file = "checkpoint127.pt"
checkpoint_file = "checkpoint_best.pt"


vocab_model_file="wikipedia_upper_voc_32000_sen10000000.model"
vocab_path = os.path.join(root_path, "vocab", vocab_model_file)
#%%

loaded = hub_utils.from_pretrained(
    model_name_or_path=model_path,
    checkpoint_file=checkpoint_file,
    data_name_or_path='./',
    bpe="sentencepiece",
    sentencepiece_vocab=vocab_path,
    load_checkpoint_heads=True,
    archive_map=RobertaModel.hub_models(),
    cpu=True
)
roberta = RobertaHubInterface(loaded['args'], loaded['task'], loaded['models'][0])
roberta.eval()

#%%

def print_mask(s, predicted):

    print(s)
    for p in predicted:
        print(f'\t{p[2]} - {p[0]} - confidence {p[1]}')
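A hypothetical call tying print_mask to the interface loaded above; the masked sentence is an arbitrary placeholder:

sentence = "Ala ma <mask>."
print_mask(sentence, roberta.fill_mask(sentence, topk=3))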
    def __init__(self, args, task):
        super(BertRanker, self).__init__(args, task)

        init_model = getattr(args, "pretrained_model", "")
        self.joint_layers = nn.ModuleList()
        if os.path.isfile(init_model):
            print(f"initialize weight from {init_model}")

            from fairseq import hub_utils

            x = hub_utils.from_pretrained(
                os.path.dirname(init_model),
                checkpoint_file=os.path.basename(init_model),
            )

            in_state_dict = x["models"][0].state_dict()
            init_args = x["args"].model

            num_positional_emb = init_args.max_positions + task.dictionary.pad(
            ) + 1

            # follow the setup in roberta
            self.model = TransformerSentenceEncoder(
                padding_idx=task.dictionary.pad(),
                vocab_size=len(task.dictionary),
                num_encoder_layers=getattr(args, "encoder_layers",
                                           init_args.encoder_layers),
                embedding_dim=init_args.encoder_embed_dim,
                ffn_embedding_dim=init_args.encoder_ffn_embed_dim,
                num_attention_heads=init_args.encoder_attention_heads,
                dropout=init_args.dropout,
                attention_dropout=init_args.attention_dropout,
                activation_dropout=init_args.activation_dropout,
                num_segments=2,  # add language embeddings
                max_seq_len=num_positional_emb,
                offset_positions_by_padding=False,
                encoder_normalize_before=True,
                apply_bert_init=True,
                activation_fn=init_args.activation_fn,
                freeze_embeddings=args.freeze_embeddings,
                n_trans_layers_to_freeze=args.n_trans_layers_to_freeze,
            )

            # still need to learn segment embeddings as we added a second language embedding
            if args.freeze_embeddings:
                for p in self.model.segment_embeddings.parameters():
                    p.requires_grad = False

            update_init_roberta_model_state(in_state_dict)
            print("loading weights from the pretrained model")
            self.model.load_state_dict(
                in_state_dict,
                strict=False)  # ignore mismatch in language embeddings

            ffn_embedding_dim = init_args.encoder_ffn_embed_dim
            num_attention_heads = init_args.encoder_attention_heads
            dropout = init_args.dropout
            attention_dropout = init_args.attention_dropout
            activation_dropout = init_args.activation_dropout
            activation_fn = init_args.activation_fn

            classifier_embed_dim = getattr(args, "embed_dim",
                                           init_args.encoder_embed_dim)
            if classifier_embed_dim != init_args.encoder_embed_dim:
                self.transform_layer = nn.Linear(init_args.encoder_embed_dim,
                                                 classifier_embed_dim)
        else:
            self.model = TransformerSentenceEncoder(
                padding_idx=task.dictionary.pad(),
                vocab_size=len(task.dictionary),
                num_encoder_layers=args.encoder_layers,
                embedding_dim=args.embed_dim,
                ffn_embedding_dim=args.ffn_embed_dim,
                num_attention_heads=args.attention_heads,
                dropout=args.dropout,
                attention_dropout=args.attention_dropout,
                activation_dropout=args.activation_dropout,
                max_seq_len=task.max_positions()
                if task.max_positions() else args.tokens_per_sample,
                num_segments=2,
                offset_positions_by_padding=False,
                encoder_normalize_before=args.encoder_normalize_before,
                apply_bert_init=args.apply_bert_init,
                activation_fn=args.activation_fn,
            )

            classifier_embed_dim = args.embed_dim
            ffn_embedding_dim = args.ffn_embed_dim
            num_attention_heads = args.attention_heads
            dropout = args.dropout
            attention_dropout = args.attention_dropout
            activation_dropout = args.activation_dropout
            activation_fn = args.activation_fn

        self.joint_classification = args.joint_classification
        if args.joint_classification == "sent":
            if args.joint_normalize_before:
                self.joint_layer_norm = LayerNorm(classifier_embed_dim)
            else:
                self.joint_layer_norm = None

            self.joint_layers = nn.ModuleList([
                TransformerSentenceEncoderLayer(
                    embedding_dim=classifier_embed_dim,
                    ffn_embedding_dim=ffn_embedding_dim,
                    num_attention_heads=num_attention_heads,
                    dropout=dropout,
                    attention_dropout=attention_dropout,
                    activation_dropout=activation_dropout,
                    activation_fn=activation_fn,
                ) for _ in range(args.num_joint_layers)
            ])

        self.classifier = RobertaClassificationHead(
            classifier_embed_dim,
            classifier_embed_dim,
            1,  # num_classes
            "tanh",
            args.classifier_dropout,
        )